diff --git a/llvm.spec b/llvm.spec
index 1769d0ad3de..7d89ae2fc97 100644
--- a/llvm.spec
+++ b/llvm.spec
@@ -1,4 +1,4 @@
-### RPM external llvm 12.0.1
+### RPM external llvm 14.0.6
 ## INITENV +PATH LD_LIBRARY_PATH %{i}/lib64
 ## INITENV +PATH PYTHON3PATH %{i}/lib64/python%{cms_python3_major_minor_version}/site-packages
@@ -7,10 +7,10 @@
 Requires: gcc zlib python3
 Requires: cuda
 AutoReq: no
-%define llvmCommit 9f4ab770e61b68d2037cc7cda1f868a8ba52da85
-%define llvmBranch cms/release/12.x/fed4134
-%define iwyuCommit 5db414ac448004fe019871c977905cb7c2cff23f
-%define iwyuBranch clang_11
+%define llvmCommit d88fe2a1cafb6621210a7a0ae968c1e8e797d2bb
+%define llvmBranch cms/release/14.x/f28c006
+%define iwyuCommit d888efc52646dcf3e4e3a56af13aa23dd26abde0
+%define iwyuBranch master
 Source0: git+https://github.com/cms-externals/llvm-project.git?obj=%{llvmBranch}/%{llvmCommit}&export=llvm-%{realversion}-%{llvmCommit}&module=llvm-%{realversion}-%{llvmCommit}&output=/llvm-%{realversion}-%{llvmCommit}.tgz
 Source1: git+https://github.com/include-what-you-use/include-what-you-use.git?obj=%{iwyuBranch}/%{iwyuCommit}&export=iwyu-%{realversion}-%{iwyuCommit}&module=iwyu-%{realversion}-%{iwyuCommit}&output=/iwyu-%{realversion}-%{iwyuCommit}.tgz
diff --git a/pip/llvmlite.file b/pip/llvmlite.file
index 4491e0ac3b2..28d8ef95165 100644
--- a/pip/llvmlite.file
+++ b/pip/llvmlite.file
@@ -1,7 +1,5 @@
 Requires: llvm
-Patch0: py3-llvmlite-fpic-flag
-Patch1: py3-llvmlite-version
-Patch3: py3-llvmlite-removeMethod
+Patch0: py3-llvmlite-14
-%define source0 git+https://github.com/numba/llvmlite?obj=release0.35/v%{realversion}&export=llvmlite-%{realversion}&output=/source.tar.gz
+%define source0 git+https://github.com/numba/llvmlite?obj=main/%{realversion}&export=llvmlite-%{realversion}&output=/source.tar.gz
 %define PipPreBuild export LLVM_CONFIG=${LLVM_ROOT}/bin/llvm-config
diff --git a/pip/requirements.txt b/pip/requirements.txt
index 007c6b4776c..f9e4b706ee2 100644
--- a/pip/requirements.txt
+++ b/pip/requirements.txt
@@ -151,7 +151,7 @@ kiwisolver==1.4.4
 law==0.1.7
 lazy-object-proxy==1.7.1
 lizard==1.17.10
-llvmlite==0.38.1
+llvmlite==778380378bb856b10d4d77f45aa9386f8de4d940
 lockfile==0.12.2
 luigi==3.1.1
 lxml==4.9.1
diff --git a/py3-dxr.spec b/py3-dxr.spec
index b95a6edf1b7..c276e728ba4 100644
--- a/py3-dxr.spec
+++ b/py3-dxr.spec
@@ -2,7 +2,7 @@
 ## INITENV +PATH PYTHON3PATH %i/${PYTHON3_LIB_SITE_PACKAGES}
 Requires: zlib llvm sqlite
 Requires: py3-Jinja2 py3-parsimonious py3-pysqlite3 py3-Pygments
-%define dxrCommit de41946bc5601d100efb44780f11db71dafaeb1e
+%define dxrCommit e79425eded8ca0ae882b4ccceaf27ae1aab446d3
 %define branch cms/6ea764102a/py3
 Source0: git+https://github.com/cms-externals/dxr.git?obj=%{branch}/%{dxrCommit}&export=dxr-%{dxrCommit}&module=dxr-%dxrCommit&output=/dxr-%{dxrCommit}.tgz
diff --git a/py3-llvmlite-14.patch b/py3-llvmlite-14.patch
new file mode 100644
index 00000000000..088c40da230
--- /dev/null
+++ b/py3-llvmlite-14.patch
@@ -0,0 +1,3129 @@
+From c37e824380fec443edb24c914b1767dcff496d38 Mon Sep 17 00:00:00 2001
+From: Andre Masella
+Date: Tue, 5 Apr 2022 15:22:21 -0400
+Subject: [PATCH] Update to LLVM 12-14
+
+Modify llvmlite to support LLVM 11-14 and modify conda recipe to build LLVM14.
+Also lift over all patches to LLVM versions as required.
+---
+ ...-Limit-size-of-non-GlobalValue-name.patch} |    0
+ ...tch => llvm11-consecutive_registers.patch} |    0
+ ...-entrypoints-in-add-TLI-mappings.ll.patch} |    0
+ ...atch => llvm11-intel-D47188-svml-VF.patch} |    0
+ ...o-static.patch => llvm11-lto-static.patch} |    0
+ ...ing.patch => llvm11-partial-testing.patch} |    0
+ ...t-Limit-size-of-non-GlobalValue-name.patch |   49 +
+ .../llvm12-consecutive_registers.patch        |  181 ++
+ conda-recipes/llvm12-lto-static.patch         |   12 +
+ conda-recipes/llvm13-lto-static.patch         |   12 +
+ .../llvm14-remove-use-of-clonefile.patch      |   54 +
+ conda-recipes/llvm14-svml.patch               | 2192 +++++++++++++++++
+ conda-recipes/llvmdev/bld.bat                 |   45 +-
+ conda-recipes/llvmdev/build.sh                |   24 +-
+ conda-recipes/llvmdev/meta.yaml               |   33 +-
+ conda-recipes/llvmdev/numba-3016.ll           |   80 -
+ conda-recipes/llvmlite/bld.bat                |    5 +-
+ conda-recipes/llvmlite/meta.yaml              |   10 +-
+ ffi/Makefile.freebsd                          |    2 +-
+ ffi/Makefile.osx                              |    4 +-
+ ffi/build.py                                  |   15 +-
+ ffi/passmanagers.cpp                          |    9 +-
+ ffi/targets.cpp                               |    8 +
+ ffi/value.cpp                                 |   13 +-
+ llvmlite/binding/passmanagers.py              |    3 +-
+ llvmlite/tests/test_binding.py                |    2 +-
+ 26 files changed, 2583 insertions(+), 170 deletions(-)
+ rename conda-recipes/{0001-Revert-Limit-size-of-non-GlobalValue-name.patch => llvm11-0001-Revert-Limit-size-of-non-GlobalValue-name.patch} (100%)
+ rename conda-recipes/{llvm_11_consecutive_registers.patch => llvm11-consecutive_registers.patch} (100%)
+ rename conda-recipes/{expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch => llvm11-expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch} (100%)
+ rename conda-recipes/{intel-D47188-svml-VF.patch => llvm11-intel-D47188-svml-VF.patch} (100%)
+ rename conda-recipes/{llvm-lto-static.patch => llvm11-lto-static.patch} (100%)
+ rename conda-recipes/{partial-testing.patch => llvm11-partial-testing.patch} (100%)
+ create mode 100644 conda-recipes/llvm12-0001-Revert-Limit-size-of-non-GlobalValue-name.patch
+ create mode 100644 conda-recipes/llvm12-consecutive_registers.patch
+ create mode 100644 conda-recipes/llvm12-lto-static.patch
+ create mode 100644 conda-recipes/llvm13-lto-static.patch
+ create mode 100644 conda-recipes/llvm14-remove-use-of-clonefile.patch
+ create mode 100644 conda-recipes/llvm14-svml.patch
+ delete mode 100644 conda-recipes/llvmdev/numba-3016.ll
+
+diff --git a/conda-recipes/0001-Revert-Limit-size-of-non-GlobalValue-name.patch b/conda-recipes/llvm11-0001-Revert-Limit-size-of-non-GlobalValue-name.patch
+similarity index 100%
+rename from conda-recipes/0001-Revert-Limit-size-of-non-GlobalValue-name.patch
+rename to conda-recipes/llvm11-0001-Revert-Limit-size-of-non-GlobalValue-name.patch
+diff --git a/conda-recipes/llvm_11_consecutive_registers.patch b/conda-recipes/llvm11-consecutive_registers.patch
+similarity index 100%
+rename from conda-recipes/llvm_11_consecutive_registers.patch
+rename to conda-recipes/llvm11-consecutive_registers.patch
+diff --git a/conda-recipes/expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch b/conda-recipes/llvm11-expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch
+similarity index 100%
+rename from conda-recipes/expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch
+rename to conda-recipes/llvm11-expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch
+diff --git a/conda-recipes/intel-D47188-svml-VF.patch b/conda-recipes/llvm11-intel-D47188-svml-VF.patch
+similarity index 100%
+rename from conda-recipes/intel-D47188-svml-VF.patch
+rename to conda-recipes/llvm11-intel-D47188-svml-VF.patch
+diff --git
a/conda-recipes/llvm-lto-static.patch b/conda-recipes/llvm11-lto-static.patch +similarity index 100% +rename from conda-recipes/llvm-lto-static.patch +rename to conda-recipes/llvm11-lto-static.patch +diff --git a/conda-recipes/partial-testing.patch b/conda-recipes/llvm11-partial-testing.patch +similarity index 100% +rename from conda-recipes/partial-testing.patch +rename to conda-recipes/llvm11-partial-testing.patch +diff --git a/conda-recipes/llvm12-0001-Revert-Limit-size-of-non-GlobalValue-name.patch b/conda-recipes/llvm12-0001-Revert-Limit-size-of-non-GlobalValue-name.patch +new file mode 100644 +index 000000000..9b722d36c +--- /dev/null ++++ b/conda-recipes/llvm12-0001-Revert-Limit-size-of-non-GlobalValue-name.patch +@@ -0,0 +1,49 @@ ++diff -ur a/lib/IR/Value.cpp b/lib/IR/Value.cpp ++--- a/lib/IR/Value.cpp 2021-04-06 12:38:18.000000000 -0400 +++++ b/lib/IR/Value.cpp 2022-03-31 15:39:31.000000000 -0400 ++@@ -38,10 +38,6 @@ ++ ++ using namespace llvm; ++ ++-static cl::opt NonGlobalValueMaxNameSize( ++- "non-global-value-max-name-size", cl::Hidden, cl::init(1024), ++- cl::desc("Maximum size for the name of non-global values.")); ++- ++ //===----------------------------------------------------------------------===// ++ // Value Class ++ //===----------------------------------------------------------------------===// ++@@ -319,11 +315,6 @@ ++ if (getName() == NameRef) ++ return; ++ ++- // Cap the size of non-GlobalValue names. ++- if (NameRef.size() > NonGlobalValueMaxNameSize && !isa(this)) ++- NameRef = ++- NameRef.substr(0, std::max(1u, (unsigned)NonGlobalValueMaxNameSize)); ++- ++ assert(!getType()->isVoidTy() && "Cannot assign a name to void values!"); ++ ++ // Get the symbol table to update for this object. ++diff -ur a/test/Bitcode/value-with-long-name.ll b/test/Bitcode/value-with-long-name.ll ++deleted file mode 1000644 ++--- a/test/Bitcode/value-with-long-name.ll +++++ /dev/null ++@@ -1,18 +0,0 @@ ++-; Check the size of generated variable when no option is set ++-; RUN: opt -S %s -O2 -o - | FileCheck -check-prefix=CHECK-LONG %s ++-; CHECK-LONG: %{{[a-z]{4}[a-z]+}} ++- ++-; Then check we correctly cap the size of newly generated non-global values name ++-; Force the size to be small so that the check works on release and debug build ++-; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=0 | FileCheck -check-prefix=CHECK-SHORT %s ++-; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=1 | FileCheck -check-prefix=CHECK-SHORT %s ++-; CHECK-SHORT-NOT: %{{[a-z][a-z]+}} ++- ++-define i32 @f(i32 %a, i32 %b) { ++- %c = add i32 %a, %b ++- %d = add i32 %c, %a ++- %e = add i32 %d, %b ++- ret i32 %e ++-} ++- ++- +diff --git a/conda-recipes/llvm12-consecutive_registers.patch b/conda-recipes/llvm12-consecutive_registers.patch +new file mode 100644 +index 000000000..cc60217bd +--- /dev/null ++++ b/conda-recipes/llvm12-consecutive_registers.patch +@@ -0,0 +1,181 @@ ++diff -ur a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h ++--- a/include/llvm/CodeGen/TargetLowering.h 2021-04-06 12:38:18.000000000 -0400 +++++ b/include/llvm/CodeGen/TargetLowering.h 2022-03-31 15:52:45.000000000 -0400 ++@@ -3975,7 +3975,8 @@ ++ /// must be passed in a block of consecutive registers. 
++ virtual bool ++ functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, ++- bool isVarArg) const { +++ bool isVarArg, +++ const DataLayout &DL) const { ++ return false; ++ } ++ ++diff -ur a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp ++--- a/lib/CodeGen/SelectionDAG/FastISel.cpp 2021-04-06 12:38:18.000000000 -0400 +++++ b/lib/CodeGen/SelectionDAG/FastISel.cpp 2022-03-31 15:52:45.000000000 -0400 ++@@ -1087,7 +1087,7 @@ ++ if (Arg.IsByVal) ++ FinalType = cast(Arg.Ty)->getElementType(); ++ bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( ++- FinalType, CLI.CallConv, CLI.IsVarArg); +++ FinalType, CLI.CallConv, CLI.IsVarArg, DL); ++ ++ ISD::ArgFlagsTy Flags; ++ if (Arg.IsZExt) ++diff -ur a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp ++--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp 2021-04-06 12:38:18.000000000 -0400 +++++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp 2022-03-31 15:52:45.000000000 -0400 ++@@ -1851,7 +1851,7 @@ ++ ++ bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( ++ I.getOperand(0)->getType(), F->getCallingConv(), ++- /*IsVarArg*/ false); +++ /*IsVarArg*/ false, DL); ++ ++ ISD::NodeType ExtendKind = ISD::ANY_EXTEND; ++ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, ++@@ -9229,7 +9229,7 @@ ++ CLI.IsTailCall = false; ++ } else { ++ bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( ++- CLI.RetTy, CLI.CallConv, CLI.IsVarArg); +++ CLI.RetTy, CLI.CallConv, CLI.IsVarArg, DL); ++ for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { ++ ISD::ArgFlagsTy Flags; ++ if (NeedsRegBlock) { ++@@ -9289,7 +9289,7 @@ ++ if (Args[i].IsByVal) ++ FinalType = cast(Args[i].Ty)->getElementType(); ++ bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( ++- FinalType, CLI.CallConv, CLI.IsVarArg); +++ FinalType, CLI.CallConv, CLI.IsVarArg, DL); ++ for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++ ++Value) { ++ EVT VT = ValueVTs[Value]; ++@@ -9830,7 +9830,7 @@ ++ if (Arg.hasAttribute(Attribute::ByVal)) ++ FinalType = Arg.getParamByValType(); ++ bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters( ++- FinalType, F.getCallingConv(), F.isVarArg()); +++ FinalType, F.getCallingConv(), F.isVarArg(), DL); ++ for (unsigned Value = 0, NumValues = ValueVTs.size(); ++ Value != NumValues; ++Value) { ++ EVT VT = ValueVTs[Value]; ++diff -ur a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp ++--- a/lib/Target/AArch64/AArch64ISelLowering.cpp 2021-04-06 12:38:18.000000000 -0400 +++++ b/lib/Target/AArch64/AArch64ISelLowering.cpp 2022-03-31 15:52:45.000000000 -0400 ++@@ -30,6 +30,7 @@ ++ #include "llvm/ADT/Triple.h" ++ #include "llvm/ADT/Twine.h" ++ #include "llvm/Analysis/VectorUtils.h" +++#include "llvm/CodeGen/Analysis.h" ++ #include "llvm/CodeGen/CallingConvLower.h" ++ #include "llvm/CodeGen/MachineBasicBlock.h" ++ #include "llvm/CodeGen/MachineFrameInfo.h" ++@@ -16455,15 +16456,17 @@ ++ } ++ ++ bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( ++- Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { ++- if (Ty->isArrayTy()) ++- return true; ++- ++- const TypeSize &TySize = Ty->getPrimitiveSizeInBits(); ++- if (TySize.isScalable() && TySize.getKnownMinSize() > 128) ++- return true; +++ Type *Ty, CallingConv::ID CallConv, bool isVarArg, +++ const DataLayout &DL) const { +++ if (!Ty->isArrayTy()) { +++ const 
TypeSize &TySize = Ty->getPrimitiveSizeInBits(); +++ return TySize.isScalable() && TySize.getKnownMinSize() > 128; +++ } ++ ++- return false; +++ // All non aggregate members of the type must have the same type +++ SmallVector ValueVTs; +++ ComputeValueVTs(*this, DL, Ty, ValueVTs); +++ return is_splat(ValueVTs); ++ } ++ ++ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, ++diff -ur a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h ++--- a/lib/Target/AArch64/AArch64ISelLowering.h 2021-04-06 12:38:18.000000000 -0400 +++++ b/lib/Target/AArch64/AArch64ISelLowering.h 2022-03-31 15:52:45.000000000 -0400 ++@@ -770,9 +770,10 @@ ++ MachineMemOperand::Flags getTargetMMOFlags( ++ const Instruction &I) const override; ++ ++- bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, ++- CallingConv::ID CallConv, ++- bool isVarArg) const override; +++ bool functionArgumentNeedsConsecutiveRegisters( +++ Type *Ty, CallingConv::ID CallConv, bool isVarArg, +++ const DataLayout &DL) const override; +++ ++ /// Used for exception handling on Win64. ++ bool needsFixedCatchObjects() const override; ++ ++diff -ur a/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/lib/Target/AArch64/GISel/AArch64CallLowering.cpp ++--- a/lib/Target/AArch64/GISel/AArch64CallLowering.cpp 2021-04-06 12:38:18.000000000 -0400 +++++ b/lib/Target/AArch64/GISel/AArch64CallLowering.cpp 2022-03-31 15:52:45.000000000 -0400 ++@@ -259,7 +259,7 @@ ++ assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch"); ++ ++ bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( ++- OrigArg.Ty, CallConv, false); +++ OrigArg.Ty, CallConv, false, DL); ++ for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) { ++ Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx); ++ SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0], ++diff -ur a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp ++--- a/lib/Target/ARM/ARMCallLowering.cpp 2021-04-06 12:38:18.000000000 -0400 +++++ b/lib/Target/ARM/ARMCallLowering.cpp 2022-03-31 15:52:45.000000000 -0400 ++@@ -218,7 +218,7 @@ ++ ++ bool NeedsConsecutiveRegisters = ++ TLI.functionArgumentNeedsConsecutiveRegisters( ++- SplitTy, F.getCallingConv(), F.isVarArg()); +++ SplitTy, F.getCallingConv(), F.isVarArg(), DL); ++ if (NeedsConsecutiveRegisters) { ++ Flags.setInConsecutiveRegs(); ++ if (i == e - 1) ++diff -ur a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp ++--- a/lib/Target/ARM/ARMISelLowering.cpp 2021-04-06 12:38:18.000000000 -0400 +++++ b/lib/Target/ARM/ARMISelLowering.cpp 2022-03-31 15:52:45.000000000 -0400 ++@@ -19269,7 +19269,8 @@ ++ /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when ++ /// passing according to AAPCS rules. ++ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( ++- Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { +++ Type *Ty, CallingConv::ID CallConv, bool isVarArg, +++ const DataLayout &DL) const { ++ if (getEffectiveCallingConv(CallConv, isVarArg) != ++ CallingConv::ARM_AAPCS_VFP) ++ return false; ++diff -ur a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h ++--- a/lib/Target/ARM/ARMISelLowering.h 2021-04-06 12:38:18.000000000 -0400 +++++ b/lib/Target/ARM/ARMISelLowering.h 2022-03-31 15:52:45.000000000 -0400 ++@@ -578,7 +578,8 @@ ++ /// Returns true if an argument of type Ty needs to be passed in a ++ /// contiguous block of registers in calling convention CallConv. 
++ bool functionArgumentNeedsConsecutiveRegisters( ++- Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; +++ Type *Ty, CallingConv::ID CallConv, bool isVarArg, +++ const DataLayout &DL) const override; ++ ++ /// If a physical register, this returns the register that receives the ++ /// exception address on entry to an EH pad. ++diff -ur a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h ++--- a/lib/Target/PowerPC/PPCISelLowering.h 2021-04-06 12:38:18.000000000 -0400 +++++ b/lib/Target/PowerPC/PPCISelLowering.h 2022-03-31 15:52:45.000000000 -0400 ++@@ -998,7 +998,8 @@ ++ /// Returns true if an argument of type Ty needs to be passed in a ++ /// contiguous block of registers in calling convention CallConv. ++ bool functionArgumentNeedsConsecutiveRegisters( ++- Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override { +++ Type *Ty, CallingConv::ID CallConv, bool isVarArg, +++ const DataLayout &DL) const override { ++ // We support any array type as "consecutive" block in the parameter ++ // save area. The element type defines the alignment requirement and ++ // whether the argument should go in GPRs, FPRs, or VRs if available. +diff --git a/conda-recipes/llvm12-lto-static.patch b/conda-recipes/llvm12-lto-static.patch +new file mode 100644 +index 000000000..76cc55def +--- /dev/null ++++ b/conda-recipes/llvm12-lto-static.patch +@@ -0,0 +1,12 @@ ++diff -ur a/tools/lto/CMakeLists.txt b/tools/lto/CMakeLists.txt ++--- llvm-12.0.0.src-orig/tools/lto/CMakeLists.txt 2021-04-06 12:38:18.000000000 -0400 +++++ llvm-12.0.0.src/tools/lto/CMakeLists.txt 2022-03-31 15:46:00.000000000 -0400 ++@@ -21,7 +21,7 @@ ++ ++ set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/lto.exports) ++ ++-add_llvm_library(LTO SHARED INSTALL_WITH_TOOLCHAIN ${SOURCES} DEPENDS +++add_llvm_library(LTO INSTALL_WITH_TOOLCHAIN ${SOURCES} DEPENDS ++ intrinsics_gen) ++ ++ install(FILES ${LLVM_MAIN_INCLUDE_DIR}/llvm-c/lto.h +diff --git a/conda-recipes/llvm13-lto-static.patch b/conda-recipes/llvm13-lto-static.patch +new file mode 100644 +index 000000000..b8a624250 +--- /dev/null ++++ b/conda-recipes/llvm13-lto-static.patch +@@ -0,0 +1,12 @@ ++diff -ur llvm-13.0.0.src-orig/tools/lto/CMakeLists.txt llvm-13.0.0.src/tools/lto/CMakeLists.txt ++--- llvm-13.0.0.src-orig/tools/lto/CMakeLists.txt 2021-09-24 12:18:10.000000000 -0400 +++++ llvm-13.0.0.src/tools/lto/CMakeLists.txt 2022-03-31 17:07:07.000000000 -0400 ++@@ -25,7 +25,7 @@ ++ set(LTO_LIBRARY_TYPE MODULE) ++ set(LTO_LIBRARY_NAME libLTO) ++ else() ++- set(LTO_LIBRARY_TYPE SHARED) +++ set(LTO_LIBRARY_TYPE STATIC) ++ set(LTO_LIBRARY_NAME LTO) ++ endif() ++ +diff --git a/conda-recipes/llvm14-remove-use-of-clonefile.patch b/conda-recipes/llvm14-remove-use-of-clonefile.patch +new file mode 100644 +index 000000000..6ef9c9d61 +--- /dev/null ++++ b/conda-recipes/llvm14-remove-use-of-clonefile.patch +@@ -0,0 +1,54 @@ ++diff -ur a/llvm-14.0.6.src/lib/Support/Unix/Path.inc b/llvm-14.0.6.src/lib/Support/Unix/Path.inc ++--- a/llvm-14.0.6.src/lib/Support/Unix/Path.inc 2022-03-14 05:44:55.000000000 -0400 +++++ b/llvm-14.0.6.src/lib/Support/Unix/Path.inc 2022-09-19 11:30:59.000000000 -0400 ++@@ -1462,6 +1462,7 @@ ++ std::error_code copy_file(const Twine &From, const Twine &To) { ++ std::string FromS = From.str(); ++ std::string ToS = To.str(); +++ /* ++ #if __has_builtin(__builtin_available) ++ if (__builtin_available(macos 10.12, *)) { ++ // Optimistically try to use clonefile() and handle errors, rather than ++@@ -1490,6 +1491,7 @@ ++ // cheaper. 
++ } ++ #endif +++ */ ++ if (!copyfile(FromS.c_str(), ToS.c_str(), /*State=*/NULL, COPYFILE_DATA)) ++ return std::error_code(); ++ return std::error_code(errno, std::generic_category()); ++diff -ur a/llvm-14.0.6.src/unittests/Support/Path.cpp b/llvm-14.0.6.src/unittests/Support/Path.cpp ++--- a/llvm-14.0.6.src/unittests/Support/Path.cpp 2022-03-14 05:44:55.000000000 -0400 +++++ b/llvm-14.0.6.src/unittests/Support/Path.cpp 2022-09-19 11:33:07.000000000 -0400 ++@@ -2267,15 +2267,15 @@ ++ ++ EXPECT_EQ(fs::setPermissions(TempPath, fs::set_uid_on_exe), NoError); ++ EXPECT_TRUE(CheckPermissions(fs::set_uid_on_exe)); ++- +++#if !defined(__APPLE__) ++ EXPECT_EQ(fs::setPermissions(TempPath, fs::set_gid_on_exe), NoError); ++ EXPECT_TRUE(CheckPermissions(fs::set_gid_on_exe)); ++- +++#endif ++ // Modern BSDs require root to set the sticky bit on files. ++ // AIX and Solaris without root will mask off (i.e., lose) the sticky bit ++ // on files. ++ #if !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) && \ ++- !defined(_AIX) && !(defined(__sun__) && defined(__svr4__)) +++ !defined(_AIX) && !(defined(__sun__) && defined(__svr4__)) && !defined(__APPLE__) ++ EXPECT_EQ(fs::setPermissions(TempPath, fs::sticky_bit), NoError); ++ EXPECT_TRUE(CheckPermissions(fs::sticky_bit)); ++ ++@@ -2297,10 +2297,12 @@ ++ EXPECT_TRUE(CheckPermissions(fs::all_perms)); ++ #endif // !FreeBSD && !NetBSD && !OpenBSD && !AIX ++ +++#if !defined(__APPLE__) ++ EXPECT_EQ(fs::setPermissions(TempPath, fs::all_perms & ~fs::sticky_bit), ++ NoError); ++ EXPECT_TRUE(CheckPermissions(fs::all_perms & ~fs::sticky_bit)); ++ #endif +++#endif ++ } ++ ++ #ifdef _WIN32 +diff --git a/conda-recipes/llvm14-svml.patch b/conda-recipes/llvm14-svml.patch +new file mode 100644 +index 000000000..cdce26b34 +--- /dev/null ++++ b/conda-recipes/llvm14-svml.patch +@@ -0,0 +1,2192 @@ ++From bc2dcd190b7148d04772fa7fcd18b5200b758d4a Mon Sep 17 00:00:00 2001 ++From: Ivan Butygin ++Date: Sun, 24 Jul 2022 20:31:29 +0200 ++Subject: [PATCH] Fixes vectorizer and extends SVML support ++ ++Patch was updated to fix SVML calling convention issues uncovered by llvm 10. ++In previous versions of patch SVML calling convention was selected based on ++compilation settings. So if you try to call 256bit vector function from avx512 ++code function will be called with avx512 cc which is incorrect. To fix this ++SVML cc was separated into 3 different cc for 128, 256 and 512bit vector lengths ++which are selected based on actual input vector length. ++ ++Original patch merged several fixes: ++ ++1. https://reviews.llvm.org/D47188 patch fixes the problem with improper calls ++to SVML library as it has non-standard calling conventions. So accordingly it ++has SVML calling conventions definitions and code to set CC to the vectorized ++calls. As SVML provides several implementations for the math functions we also ++took into consideration fast attribute and select more fast implementation in ++such case. This work is based on original Matt Masten's work. ++Author: Denis Nagorny ++ ++2. https://reviews.llvm.org/D53035 patch implements support to legalize SVML ++calls by breaking down the illegal vector call instruction into multiple legal ++vector call instructions during code generation. Currently the vectorizer does ++not check legality of the generated SVML (or any VECLIB) call instructions, and ++this can lead to potential problems even during vector type legalization. 
This ++patch addresses this issue by adding a legality check during code generation and ++replaces the illegal SVML call with corresponding legalized instructions. ++(RFC: http://lists.llvm.org/pipermail/llvm-dev/2018-June/124357.html) ++Author: Karthik Senthil ++--- ++ .../include/llvm/Analysis/TargetLibraryInfo.h | 22 +- ++ llvm/include/llvm/AsmParser/LLToken.h | 3 + ++ llvm/include/llvm/IR/CMakeLists.txt | 4 + ++ llvm/include/llvm/IR/CallingConv.h | 5 + ++ llvm/include/llvm/IR/SVML.td | 62 +++ ++ llvm/lib/Analysis/CMakeLists.txt | 1 + ++ llvm/lib/Analysis/TargetLibraryInfo.cpp | 55 +- ++ llvm/lib/AsmParser/LLLexer.cpp | 3 + ++ llvm/lib/AsmParser/LLParser.cpp | 6 + ++ llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 2 +- ++ llvm/lib/IR/AsmWriter.cpp | 3 + ++ llvm/lib/IR/Verifier.cpp | 3 + ++ llvm/lib/Target/X86/X86CallingConv.td | 70 +++ ++ llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +- ++ llvm/lib/Target/X86/X86RegisterInfo.cpp | 46 ++ ++ llvm/lib/Target/X86/X86Subtarget.h | 3 + ++ .../Transforms/Utils/InjectTLIMappings.cpp | 2 +- ++ .../Transforms/Vectorize/LoopVectorize.cpp | 269 +++++++++ ++ .../Generic/replace-intrinsics-with-veclib.ll | 4 +- ++ .../LoopVectorize/X86/svml-calls-finite.ll | 24 +- ++ .../LoopVectorize/X86/svml-calls.ll | 108 ++-- ++ .../LoopVectorize/X86/svml-legal-calls.ll | 513 ++++++++++++++++++ ++ .../LoopVectorize/X86/svml-legal-codegen.ll | 61 +++ ++ llvm/test/Transforms/Util/add-TLI-mappings.ll | 18 +- ++ llvm/utils/TableGen/CMakeLists.txt | 1 + ++ llvm/utils/TableGen/SVMLEmitter.cpp | 110 ++++ ++ llvm/utils/TableGen/TableGen.cpp | 8 +- ++ llvm/utils/TableGen/TableGenBackends.h | 1 + ++ llvm/utils/vim/syntax/llvm.vim | 1 + ++ 29 files changed, 1341 insertions(+), 70 deletions(-) ++ create mode 100644 llvm/include/llvm/IR/SVML.td ++ create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll ++ create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll ++ create mode 100644 llvm/utils/TableGen/SVMLEmitter.cpp ++ ++diff --git a/llvm-14.0.6.src/include/llvm/Analysis/TargetLibraryInfo.h b/llvm-14.0.6.src/include/llvm/Analysis/TargetLibraryInfo.h ++index 17d1e3f770c14..110ff08189867 100644 ++--- a/llvm-14.0.6.src/include/llvm/Analysis/TargetLibraryInfo.h +++++ b/llvm-14.0.6.src/include/llvm/Analysis/TargetLibraryInfo.h ++@@ -39,6 +39,12 @@ struct VecDesc { ++ NotLibFunc ++ }; ++ +++enum SVMLAccuracy { +++ SVML_DEFAULT, +++ SVML_HA, +++ SVML_EP +++}; +++ ++ /// Implementation of the target library information. ++ /// ++ /// This class constructs tables that hold the target library information and ++@@ -157,7 +163,7 @@ class TargetLibraryInfoImpl { ++ /// Return true if the function F has a vector equivalent with vectorization ++ /// factor VF. ++ bool isFunctionVectorizable(StringRef F, const ElementCount &VF) const { ++- return !getVectorizedFunction(F, VF).empty(); +++ return !getVectorizedFunction(F, VF, false).empty(); ++ } ++ ++ /// Return true if the function F has a vector equivalent with any ++@@ -166,7 +172,10 @@ class TargetLibraryInfoImpl { ++ ++ /// Return the name of the equivalent of F, vectorized with factor VF. If no ++ /// such mapping exists, return the empty string. 
++- StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const; +++ std::string getVectorizedFunction(StringRef F, const ElementCount &VF, bool IsFast) const; +++ +++ Optional getVectorizedFunctionCallingConv( +++ StringRef F, const FunctionType &FTy, const DataLayout &DL) const; ++ ++ /// Set to true iff i32 parameters to library functions should have signext ++ /// or zeroext attributes if they correspond to C-level int or unsigned int, ++@@ -326,8 +335,13 @@ class TargetLibraryInfo { ++ bool isFunctionVectorizable(StringRef F) const { ++ return Impl->isFunctionVectorizable(F); ++ } ++- StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const { ++- return Impl->getVectorizedFunction(F, VF); +++ std::string getVectorizedFunction(StringRef F, const ElementCount &VF, bool IsFast) const { +++ return Impl->getVectorizedFunction(F, VF, IsFast); +++ } +++ +++ Optional getVectorizedFunctionCallingConv( +++ StringRef F, const FunctionType &FTy, const DataLayout &DL) const { +++ return Impl->getVectorizedFunctionCallingConv(F, FTy, DL); ++ } ++ ++ /// Tests if the function is both available and a candidate for optimized code ++diff --git a/llvm-14.0.6.src/include/llvm/AsmParser/LLToken.h b/llvm-14.0.6.src/include/llvm/AsmParser/LLToken.h ++index 78ebb35e0ea4d..3ffb57db8b18b 100644 ++--- a/llvm-14.0.6.src/include/llvm/AsmParser/LLToken.h +++++ b/llvm-14.0.6.src/include/llvm/AsmParser/LLToken.h ++@@ -133,6 +133,9 @@ enum Kind { ++ kw_fastcc, ++ kw_coldcc, ++ kw_intel_ocl_bicc, +++ kw_intel_svmlcc128, +++ kw_intel_svmlcc256, +++ kw_intel_svmlcc512, ++ kw_cfguard_checkcc, ++ kw_x86_stdcallcc, ++ kw_x86_fastcallcc, ++diff --git a/llvm-14.0.6.src/include/llvm/IR/CMakeLists.txt b/llvm-14.0.6.src/include/llvm/IR/CMakeLists.txt ++index 0498fc269b634..23bb3de41bc1a 100644 ++--- a/llvm-14.0.6.src/include/llvm/IR/CMakeLists.txt +++++ b/llvm-14.0.6.src/include/llvm/IR/CMakeLists.txt ++@@ -20,3 +20,7 @@ tablegen(LLVM IntrinsicsX86.h -gen-intrinsic-enums -intrinsic-prefix=x86) ++ tablegen(LLVM IntrinsicsXCore.h -gen-intrinsic-enums -intrinsic-prefix=xcore) ++ tablegen(LLVM IntrinsicsVE.h -gen-intrinsic-enums -intrinsic-prefix=ve) ++ add_public_tablegen_target(intrinsics_gen) +++ +++set(LLVM_TARGET_DEFINITIONS SVML.td) +++tablegen(LLVM SVML.inc -gen-svml) +++add_public_tablegen_target(svml_gen) ++diff --git a/llvm-14.0.6.src/include/llvm/IR/CallingConv.h b/llvm-14.0.6.src/include/llvm/IR/CallingConv.h ++index fd28542465225..096eea1a8e19b 100644 ++--- a/llvm-14.0.6.src/include/llvm/IR/CallingConv.h +++++ b/llvm-14.0.6.src/include/llvm/IR/CallingConv.h ++@@ -252,6 +252,11 @@ namespace CallingConv { ++ /// M68k_INTR - Calling convention used for M68k interrupt routines. ++ M68k_INTR = 101, ++ +++ /// Intel_SVML - Calling conventions for Intel Short Math Vector Library +++ Intel_SVML128 = 102, +++ Intel_SVML256 = 103, +++ Intel_SVML512 = 104, +++ ++ /// The highest possible calling convention ID. Must be some 2^k - 1. ++ MaxID = 1023 ++ }; ++diff --git a/llvm-14.0.6.src/include/llvm/IR/SVML.td b/llvm-14.0.6.src/include/llvm/IR/SVML.td ++new file mode 100644 ++index 0000000000000..5af710404c9d9 ++--- /dev/null +++++ b/llvm-14.0.6.src/include/llvm/IR/SVML.td ++@@ -0,0 +1,62 @@ +++//===-- Intel_SVML.td - Defines SVML call variants ---------*- tablegen -*-===// +++// +++// The LLVM Compiler Infrastructure +++// +++// This file is distributed under the University of Illinois Open Source +++// License. See LICENSE.TXT for details. 
+++// +++//===----------------------------------------------------------------------===// +++// +++// This file is used by TableGen to define the different typs of SVML function +++// variants used with -fveclib=SVML. +++// +++//===----------------------------------------------------------------------===// +++ +++class SvmlVariant; +++ +++def sin : SvmlVariant; +++def cos : SvmlVariant; +++def pow : SvmlVariant; +++def exp : SvmlVariant; +++def log : SvmlVariant; +++def acos : SvmlVariant; +++def acosh : SvmlVariant; +++def asin : SvmlVariant; +++def asinh : SvmlVariant; +++def atan2 : SvmlVariant; +++def atan : SvmlVariant; +++def atanh : SvmlVariant; +++def cbrt : SvmlVariant; +++def cdfnorm : SvmlVariant; +++def cdfnorminv : SvmlVariant; +++def cosd : SvmlVariant; +++def cosh : SvmlVariant; +++def erf : SvmlVariant; +++def erfc : SvmlVariant; +++def erfcinv : SvmlVariant; +++def erfinv : SvmlVariant; +++def exp10 : SvmlVariant; +++def exp2 : SvmlVariant; +++def expm1 : SvmlVariant; +++def hypot : SvmlVariant; +++def invsqrt : SvmlVariant; +++def log10 : SvmlVariant; +++def log1p : SvmlVariant; +++def log2 : SvmlVariant; +++def sind : SvmlVariant; +++def sinh : SvmlVariant; +++def sqrt : SvmlVariant; +++def tan : SvmlVariant; +++def tanh : SvmlVariant; +++ +++// TODO: SVML does not currently provide _ha and _ep variants of these fucnctions. +++// We should call the default variant of these functions in all cases instead. +++ +++// def nearbyint : SvmlVariant; +++// def logb : SvmlVariant; +++// def floor : SvmlVariant; +++// def fmod : SvmlVariant; +++// def ceil : SvmlVariant; +++// def trunc : SvmlVariant; +++// def rint : SvmlVariant; +++// def round : SvmlVariant; ++diff --git a/llvm-14.0.6.src/lib/Analysis/CMakeLists.txt b/llvm-14.0.6.src/lib/Analysis/CMakeLists.txt ++index aec84124129f4..98286e166fbe2 100644 ++--- a/llvm-14.0.6.src/lib/Analysis/CMakeLists.txt +++++ b/llvm-14.0.6.src/lib/Analysis/CMakeLists.txt ++@@ -150,6 +150,7 @@ add_llvm_component_library(LLVMAnalysis ++ DEPENDS ++ intrinsics_gen ++ ${MLDeps} +++ svml_gen ++ ++ LINK_LIBS ++ ${MLLinkDeps} ++diff --git a/llvm-14.0.6.src/lib/Analysis/TargetLibraryInfo.cpp b/llvm-14.0.6.src/lib/Analysis/TargetLibraryInfo.cpp ++index 02923c2c7eb14..83abde28a62a4 100644 ++--- a/llvm-14.0.6.src/lib/Analysis/TargetLibraryInfo.cpp +++++ b/llvm-14.0.6.src/lib/Analysis/TargetLibraryInfo.cpp ++@@ -110,6 +110,11 @@ bool TargetLibraryInfoImpl::isCallingConvCCompatible(Function *F) { ++ F->getFunctionType()); ++ } ++ +++static std::string svmlMangle(StringRef FnName, const bool IsFast) { +++ std::string FullName = FnName.str(); +++ return IsFast ? FullName : FullName + "_ha"; +++} +++ ++ /// Initialize the set of available library functions based on the specified ++ /// target triple. This should be carefully written so that a missing target ++ /// triple gets a sane set of defaults. 
++@@ -1876,8 +1881,9 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( ++ } ++ case SVML: { ++ const VecDesc VecFuncs[] = { ++- #define TLI_DEFINE_SVML_VECFUNCS ++- #include "llvm/Analysis/VecFuncs.def" +++ #define GET_SVML_VARIANTS +++ #include "llvm/IR/SVML.inc" +++ #undef GET_SVML_VARIANTS ++ }; ++ addVectorizableFunctions(VecFuncs); ++ break; ++@@ -1897,20 +1903,51 @@ bool TargetLibraryInfoImpl::isFunctionVectorizable(StringRef funcName) const { ++ return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName; ++ } ++ ++-StringRef ++-TargetLibraryInfoImpl::getVectorizedFunction(StringRef F, ++- const ElementCount &VF) const { +++std::string TargetLibraryInfoImpl::getVectorizedFunction(StringRef F, +++ const ElementCount &VF, +++ bool IsFast) const { +++ bool FromSVML = ClVectorLibrary == SVML; ++ F = sanitizeFunctionName(F); ++ if (F.empty()) ++- return F; +++ return F.str(); ++ std::vector::const_iterator I = ++ llvm::lower_bound(VectorDescs, F, compareWithScalarFnName); ++ while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) { ++- if (I->VectorizationFactor == VF) ++- return I->VectorFnName; +++ if (I->VectorizationFactor == VF) { +++ if (FromSVML) { +++ return svmlMangle(I->VectorFnName, IsFast); +++ } +++ return I->VectorFnName.str(); +++ } ++ ++I; ++ } ++- return StringRef(); +++ return std::string(); +++} +++ +++static CallingConv::ID getSVMLCallingConv(const DataLayout &DL, const FunctionType &FType) +++{ +++ assert(isa(FType.getReturnType())); +++ auto *VecCallRetType = cast(FType.getReturnType()); +++ auto TypeBitWidth = DL.getTypeSizeInBits(VecCallRetType); +++ if (TypeBitWidth == 128) { +++ return CallingConv::Intel_SVML128; +++ } else if (TypeBitWidth == 256) { +++ return CallingConv::Intel_SVML256; +++ } else if (TypeBitWidth == 512) { +++ return CallingConv::Intel_SVML512; +++ } else { +++ llvm_unreachable("Invalid vector width"); +++ } +++ return 0; // not reachable +++} +++ +++Optional +++TargetLibraryInfoImpl::getVectorizedFunctionCallingConv( +++ StringRef F, const FunctionType &FTy, const DataLayout &DL) const { +++ if (F.startswith("__svml")) { +++ return getSVMLCallingConv(DL, FTy); +++ } +++ return {}; ++ } ++ ++ TargetLibraryInfo TargetLibraryAnalysis::run(const Function &F, ++diff --git a/llvm-14.0.6.src/lib/AsmParser/LLLexer.cpp b/llvm-14.0.6.src/lib/AsmParser/LLLexer.cpp ++index e3bf41c9721b6..4f9dccd4e0724 100644 ++--- a/llvm-14.0.6.src/lib/AsmParser/LLLexer.cpp +++++ b/llvm-14.0.6.src/lib/AsmParser/LLLexer.cpp ++@@ -603,6 +603,9 @@ lltok::Kind LLLexer::LexIdentifier() { ++ KEYWORD(spir_kernel); ++ KEYWORD(spir_func); ++ KEYWORD(intel_ocl_bicc); +++ KEYWORD(intel_svmlcc128); +++ KEYWORD(intel_svmlcc256); +++ KEYWORD(intel_svmlcc512); ++ KEYWORD(x86_64_sysvcc); ++ KEYWORD(win64cc); ++ KEYWORD(x86_regcallcc); ++diff --git a/llvm-14.0.6.src/lib/AsmParser/LLParser.cpp b/llvm-14.0.6.src/lib/AsmParser/LLParser.cpp ++index 432ec151cf8ae..3bd6ee61024b8 100644 ++--- a/llvm-14.0.6.src/lib/AsmParser/LLParser.cpp +++++ b/llvm-14.0.6.src/lib/AsmParser/LLParser.cpp ++@@ -1781,6 +1781,9 @@ void LLParser::parseOptionalDLLStorageClass(unsigned &Res) { ++ /// ::= 'ccc' ++ /// ::= 'fastcc' ++ /// ::= 'intel_ocl_bicc' +++/// ::= 'intel_svmlcc128' +++/// ::= 'intel_svmlcc256' +++/// ::= 'intel_svmlcc512' ++ /// ::= 'coldcc' ++ /// ::= 'cfguard_checkcc' ++ /// ::= 'x86_stdcallcc' ++@@ -1850,6 +1853,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) { ++ case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break; ++ case 
lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break; ++ case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break; +++ case lltok::kw_intel_svmlcc128:CC = CallingConv::Intel_SVML128; break; +++ case lltok::kw_intel_svmlcc256:CC = CallingConv::Intel_SVML256; break; +++ case lltok::kw_intel_svmlcc512:CC = CallingConv::Intel_SVML512; break; ++ case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break; ++ case lltok::kw_win64cc: CC = CallingConv::Win64; break; ++ case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break; ++diff --git a/llvm-14.0.6.src/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm-14.0.6.src/lib/CodeGen/ReplaceWithVeclib.cpp ++index 0ff045fa787e8..175651949ef85 100644 ++--- a/llvm-14.0.6.src/lib/CodeGen/ReplaceWithVeclib.cpp +++++ b/llvm-14.0.6.src/lib/CodeGen/ReplaceWithVeclib.cpp ++@@ -157,7 +157,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, ++ // and the exact vector width of the call operands in the ++ // TargetLibraryInfo. ++ const std::string TLIName = ++- std::string(TLI.getVectorizedFunction(ScalarName, VF)); +++ std::string(TLI.getVectorizedFunction(ScalarName, VF, CI.getFastMathFlags().isFast())); ++ ++ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `" ++ << ScalarName << "` and vector width " << VF << ".\n"); ++diff --git a/llvm-14.0.6.src/lib/IR/AsmWriter.cpp b/llvm-14.0.6.src/lib/IR/AsmWriter.cpp ++index 179754e275b03..c4e95752c97e8 100644 ++--- a/llvm-14.0.6.src/lib/IR/AsmWriter.cpp +++++ b/llvm-14.0.6.src/lib/IR/AsmWriter.cpp ++@@ -306,6 +306,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { ++ case CallingConv::X86_RegCall: Out << "x86_regcallcc"; break; ++ case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break; ++ case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break; +++ case CallingConv::Intel_SVML128: Out << "intel_svmlcc128"; break; +++ case CallingConv::Intel_SVML256: Out << "intel_svmlcc256"; break; +++ case CallingConv::Intel_SVML512: Out << "intel_svmlcc512"; break; ++ case CallingConv::ARM_APCS: Out << "arm_apcscc"; break; ++ case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break; ++ case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break; ++diff --git a/llvm-14.0.6.src/lib/IR/Verifier.cpp b/llvm-14.0.6.src/lib/IR/Verifier.cpp ++index 989d01e2e3950..bae7382a36e13 100644 ++--- a/llvm-14.0.6.src/lib/IR/Verifier.cpp +++++ b/llvm-14.0.6.src/lib/IR/Verifier.cpp ++@@ -2457,6 +2457,9 @@ void Verifier::visitFunction(const Function &F) { ++ case CallingConv::Fast: ++ case CallingConv::Cold: ++ case CallingConv::Intel_OCL_BI: +++ case CallingConv::Intel_SVML128: +++ case CallingConv::Intel_SVML256: +++ case CallingConv::Intel_SVML512: ++ case CallingConv::PTX_Kernel: ++ case CallingConv::PTX_Device: ++ Assert(!F.isVarArg(), "Calling convention does not support varargs or " ++diff --git a/llvm-14.0.6.src/lib/Target/X86/X86CallingConv.td b/llvm-14.0.6.src/lib/Target/X86/X86CallingConv.td ++index 4dd8a6cdd8982..12e65521215e4 100644 ++--- a/llvm-14.0.6.src/lib/Target/X86/X86CallingConv.td +++++ b/llvm-14.0.6.src/lib/Target/X86/X86CallingConv.td ++@@ -498,6 +498,21 @@ def RetCC_X86_64 : CallingConv<[ ++ CCDelegateTo ++ ]>; ++ +++// Intel_SVML return-value convention. 
+++def RetCC_Intel_SVML : CallingConv<[ +++ // Vector types are returned in XMM0,XMM1 +++ CCIfType<[v4f32, v2f64], +++ CCAssignToReg<[XMM0,XMM1]>>, +++ +++ // 256-bit FP vectors +++ CCIfType<[v8f32, v4f64], +++ CCAssignToReg<[YMM0,YMM1]>>, +++ +++ // 512-bit FP vectors +++ CCIfType<[v16f32, v8f64], +++ CCAssignToReg<[ZMM0,ZMM1]>> +++]>; +++ ++ // This is the return-value convention used for the entire X86 backend. ++ let Entry = 1 in ++ def RetCC_X86 : CallingConv<[ ++@@ -505,6 +520,10 @@ def RetCC_X86 : CallingConv<[ ++ // Check if this is the Intel OpenCL built-ins calling convention ++ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, ++ +++ CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo>, +++ CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo>, +++ CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo>, +++ ++ CCIfSubtarget<"is64Bit()", CCDelegateTo>, ++ CCDelegateTo ++ ]>; ++@@ -1064,6 +1083,30 @@ def CC_Intel_OCL_BI : CallingConv<[ ++ CCDelegateTo ++ ]>; ++ +++// X86-64 Intel Short Vector Math Library calling convention. +++def CC_Intel_SVML : CallingConv<[ +++ +++ // The SSE vector arguments are passed in XMM registers. +++ CCIfType<[v4f32, v2f64], +++ CCAssignToReg<[XMM0, XMM1, XMM2]>>, +++ +++ // The 256-bit vector arguments are passed in YMM registers. +++ CCIfType<[v8f32, v4f64], +++ CCAssignToReg<[YMM0, YMM1, YMM2]>>, +++ +++ // The 512-bit vector arguments are passed in ZMM registers. +++ CCIfType<[v16f32, v8f64], +++ CCAssignToReg<[ZMM0, ZMM1, ZMM2]>> +++]>; +++ +++def CC_X86_32_Intr : CallingConv<[ +++ CCAssignToStack<4, 4> +++]>; +++ +++def CC_X86_64_Intr : CallingConv<[ +++ CCAssignToStack<8, 8> +++]>; +++ ++ //===----------------------------------------------------------------------===// ++ // X86 Root Argument Calling Conventions ++ //===----------------------------------------------------------------------===// ++@@ -1115,6 +1158,9 @@ def CC_X86_64 : CallingConv<[ ++ let Entry = 1 in ++ def CC_X86 : CallingConv<[ ++ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, +++ CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo>, +++ CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo>, +++ CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo>, ++ CCIfSubtarget<"is64Bit()", CCDelegateTo>, ++ CCDelegateTo ++ ]>; ++@@ -1227,3 +1273,27 @@ def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, ++ (sequence "R%u", 12, 15))>; ++ def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE, ++ (sequence "XMM%u", 8, 15))>; +++ +++// SVML calling convention +++def CSR_32_Intel_SVML : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE)>; +++def CSR_32_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_32_Intel_SVML, +++ K4, K5, K6, K7)>; +++ +++def CSR_64_Intel_SVML_NoSSE : CalleeSavedRegs<(add RBX, RSI, RDI, RBP, RSP, R12, R13, R14, R15)>; +++ +++def CSR_64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, +++ (sequence "XMM%u", 8, 15))>; +++def CSR_Win64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, +++ (sequence "XMM%u", 6, 15))>; +++ +++def CSR_64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, +++ (sequence "YMM%u", 8, 15))>; +++def CSR_Win64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, +++ (sequence "YMM%u", 6, 15))>; +++ +++def CSR_64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, +++ (sequence "ZMM%u", 16, 31), +++ K4, K5, K6, K7)>; +++def CSR_Win64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, +++ (sequence "ZMM%u", 6, 21), +++ K4, K5, K6, K7)>; ++diff --git 
a/llvm-14.0.6.src/lib/Target/X86/X86ISelLowering.cpp b/llvm-14.0.6.src/lib/Target/X86/X86ISelLowering.cpp ++index 8bb7e81e19bbd..1780ce3fc6467 100644 ++--- a/llvm-14.0.6.src/lib/Target/X86/X86ISelLowering.cpp +++++ b/llvm-14.0.6.src/lib/Target/X86/X86ISelLowering.cpp ++@@ -3788,7 +3788,8 @@ void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) { ++ // FIXME: Only some x86_32 calling conventions support AVX512. ++ if (Subtarget.useAVX512Regs() && ++ (is64Bit() || (CallConv == CallingConv::X86_VectorCall || ++- CallConv == CallingConv::Intel_OCL_BI))) +++ CallConv == CallingConv::Intel_OCL_BI || +++ CallConv == CallingConv::Intel_SVML512))) ++ VecVT = MVT::v16f32; ++ else if (Subtarget.hasAVX()) ++ VecVT = MVT::v8f32; ++diff --git a/llvm-14.0.6.src/lib/Target/X86/X86RegisterInfo.cpp b/llvm-14.0.6.src/lib/Target/X86/X86RegisterInfo.cpp ++index 130cb61cdde24..9eec3b25ca9f2 100644 ++--- a/llvm-14.0.6.src/lib/Target/X86/X86RegisterInfo.cpp +++++ b/llvm-14.0.6.src/lib/Target/X86/X86RegisterInfo.cpp ++@@ -272,6 +272,42 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, ++ } ++ } ++ +++namespace { +++std::pair getSVMLRegMaskAndSaveList( +++ bool Is64Bit, bool IsWin64, CallingConv::ID CC) { +++ assert(CC >= CallingConv::Intel_SVML128 && CC <= CallingConv::Intel_SVML512); +++ unsigned Abi = CC - CallingConv::Intel_SVML128 ; // 0 - 128, 1 - 256, 2 - 512 +++ +++ const std::pair Abi64[] = { +++ std::make_pair(CSR_64_Intel_SVML_RegMask, CSR_64_Intel_SVML_SaveList), +++ std::make_pair(CSR_64_Intel_SVML_AVX_RegMask, CSR_64_Intel_SVML_AVX_SaveList), +++ std::make_pair(CSR_64_Intel_SVML_AVX512_RegMask, CSR_64_Intel_SVML_AVX512_SaveList), +++ }; +++ +++ const std::pair AbiWin64[] = { +++ std::make_pair(CSR_Win64_Intel_SVML_RegMask, CSR_Win64_Intel_SVML_SaveList), +++ std::make_pair(CSR_Win64_Intel_SVML_AVX_RegMask, CSR_Win64_Intel_SVML_AVX_SaveList), +++ std::make_pair(CSR_Win64_Intel_SVML_AVX512_RegMask, CSR_Win64_Intel_SVML_AVX512_SaveList), +++ }; +++ +++ const std::pair Abi32[] = { +++ std::make_pair(CSR_32_Intel_SVML_RegMask, CSR_32_Intel_SVML_SaveList), +++ std::make_pair(CSR_32_Intel_SVML_RegMask, CSR_32_Intel_SVML_SaveList), +++ std::make_pair(CSR_32_Intel_SVML_AVX512_RegMask, CSR_32_Intel_SVML_AVX512_SaveList), +++ }; +++ +++ if (Is64Bit) { +++ if (IsWin64) { +++ return AbiWin64[Abi]; +++ } else { +++ return Abi64[Abi]; +++ } +++ } else { +++ return Abi32[Abi]; +++ } +++} +++} +++ ++ const MCPhysReg * ++ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { ++ assert(MF && "MachineFunction required"); ++@@ -327,6 +363,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { ++ return CSR_64_Intel_OCL_BI_SaveList; ++ break; ++ } +++ case CallingConv::Intel_SVML128: +++ case CallingConv::Intel_SVML256: +++ case CallingConv::Intel_SVML512: { +++ return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).second; +++ } ++ case CallingConv::HHVM: ++ return CSR_64_HHVM_SaveList; ++ case CallingConv::X86_RegCall: ++@@ -449,6 +490,11 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, ++ return CSR_64_Intel_OCL_BI_RegMask; ++ break; ++ } +++ case CallingConv::Intel_SVML128: +++ case CallingConv::Intel_SVML256: +++ case CallingConv::Intel_SVML512: { +++ return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).first; +++ } ++ case CallingConv::HHVM: ++ return CSR_64_HHVM_RegMask; ++ case CallingConv::X86_RegCall: ++diff --git a/llvm-14.0.6.src/lib/Target/X86/X86Subtarget.h b/llvm-14.0.6.src/lib/Target/X86/X86Subtarget.h ++index 
5d773f0c57dfb..6bdf5bc6f3fe9 100644 ++--- a/llvm-14.0.6.src/lib/Target/X86/X86Subtarget.h +++++ b/llvm-14.0.6.src/lib/Target/X86/X86Subtarget.h ++@@ -916,6 +916,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { ++ case CallingConv::X86_ThisCall: ++ case CallingConv::X86_VectorCall: ++ case CallingConv::Intel_OCL_BI: +++ case CallingConv::Intel_SVML128: +++ case CallingConv::Intel_SVML256: +++ case CallingConv::Intel_SVML512: ++ return isTargetWin64(); ++ // This convention allows using the Win64 convention on other targets. ++ case CallingConv::Win64: ++diff --git a/llvm-14.0.6.src/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm-14.0.6.src/lib/Transforms/Utils/InjectTLIMappings.cpp ++index 047bf5569ded3..59897785f156c 100644 ++--- a/llvm-14.0.6.src/lib/Transforms/Utils/InjectTLIMappings.cpp +++++ b/llvm-14.0.6.src/lib/Transforms/Utils/InjectTLIMappings.cpp ++@@ -92,7 +92,7 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) { ++ ++ auto AddVariantDecl = [&](const ElementCount &VF) { ++ const std::string TLIName = ++- std::string(TLI.getVectorizedFunction(ScalarName, VF)); +++ std::string(TLI.getVectorizedFunction(ScalarName, VF, CI.getFastMathFlags().isFast())); ++ if (!TLIName.empty()) { ++ std::string MangledName = ++ VFABI::mangleTLIVectorName(TLIName, ScalarName, CI.arg_size(), VF); ++diff --git a/llvm-14.0.6.src/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm-14.0.6.src/lib/Transforms/Vectorize/LoopVectorize.cpp ++index 46ff0994e04e7..f472af5e1a835 100644 ++--- a/llvm-14.0.6.src/lib/Transforms/Vectorize/LoopVectorize.cpp +++++ b/llvm-14.0.6.src/lib/Transforms/Vectorize/LoopVectorize.cpp ++@@ -712,6 +712,27 @@ class InnerLoopVectorizer { ++ virtual void printDebugTracesAtStart(){}; ++ virtual void printDebugTracesAtEnd(){}; ++ +++ /// Check legality of given SVML call instruction \p VecCall generated for +++ /// scalar call \p Call. If illegal then the appropriate legal instruction +++ /// is returned. +++ Value *legalizeSVMLCall(CallInst *VecCall, CallInst *Call); +++ +++ /// Returns the legal VF for a call instruction \p CI using TTI information +++ /// and vector type. +++ ElementCount getLegalVFForCall(CallInst *CI); +++ +++ /// Partially vectorize a given call \p Call by breaking it down into multiple +++ /// calls of \p LegalCall, decided by the variant VF \p LegalVF. +++ Value *partialVectorizeCall(CallInst *Call, CallInst *LegalCall, +++ unsigned LegalVF); +++ +++ /// Generate shufflevector instruction for a vector value \p V based on the +++ /// current \p Part and a smaller VF \p LegalVF. +++ Value *generateShuffleValue(Value *V, unsigned LegalVF, unsigned Part); +++ +++ /// Combine partially vectorized calls stored in \p CallResults. +++ Value *combinePartialVecCalls(SmallVectorImpl &CallResults); +++ ++ /// The original loop. 
++ Loop *OrigLoop; ++ ++@@ -4596,6 +4617,17 @@ static bool mayDivideByZero(Instruction &I) { ++ return !CInt || CInt->isZero(); ++ } ++ +++static void setVectorFunctionCallingConv(CallInst &CI, const DataLayout &DL, +++ const TargetLibraryInfo &TLI) { +++ Function *VectorF = CI.getCalledFunction(); +++ FunctionType *FTy = VectorF->getFunctionType(); +++ StringRef VFName = VectorF->getName(); +++ auto CC = TLI.getVectorizedFunctionCallingConv(VFName, *FTy, DL); +++ if (CC) { +++ CI.setCallingConv(*CC); +++ } +++} +++ ++ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, ++ VPUser &ArgOperands, ++ VPTransformState &State) { ++@@ -4664,9 +4696,246 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, ++ if (isa(V)) ++ V->copyFastMathFlags(CI); ++ +++ const DataLayout &DL = V->getModule()->getDataLayout(); +++ setVectorFunctionCallingConv(*V, DL, *TLI); +++ +++ // Perform legalization of SVML call instruction only if original call +++ // was not Intrinsic +++ if (!UseVectorIntrinsic && +++ (V->getCalledFunction()->getName()).startswith("__svml")) { +++ // assert((V->getCalledFunction()->getName()).startswith("__svml")); +++ LLVM_DEBUG(dbgs() << "LV(SVML): Vector call inst:"; V->dump()); +++ auto *LegalV = cast(legalizeSVMLCall(V, CI)); +++ LLVM_DEBUG(dbgs() << "LV: Completed SVML legalization.\n LegalV: "; +++ LegalV->dump()); +++ State.set(Def, LegalV, Part); +++ addMetadata(LegalV, &I); +++ } else { ++ State.set(Def, V, Part); ++ addMetadata(V, &I); +++ } +++ } +++} +++ +++//===----------------------------------------------------------------------===// +++// Implementation of functions for SVML vector call legalization. +++//===----------------------------------------------------------------------===// +++// +++// Unlike other VECLIBs, SVML needs to be used with target-legal +++// vector types. Otherwise, link failures and/or runtime failures +++// will occur. A motivating example could be - +++// +++// double *a; +++// float *b; +++// #pragma clang loop vectorize_width(8) +++// for(i = 0; i < N; ++i) { +++// a[i] = sin(i); // Legal SVML VF must be 4 or below on AVX +++// b[i] = cosf(i); // VF can be 8 on AVX since 8 floats can fit in YMM +++// } +++// +++// Current implementation of vector code generation in LV is +++// driven based on a single VF (in InnerLoopVectorizer::VF). This +++// inhibits the flexibility of adjusting/choosing different VF +++// for different instructions. +++// +++// Due to this limitation it is much more straightforward to +++// first generate the illegal sin8 (svml_sin8 for SVML vector +++// library) call and then legalize it than trying to avoid +++// generating illegal code from the beginning. +++// +++// A solution for this problem is to check legality of the +++// call instruction right after generating it in vectorizer and +++// if it is illegal we split the call arguments and issue multiple +++// calls to match the legal VF. This is demonstrated currently for +++// the SVML vector library calls (non-intrinsic version only). +++// +++// Future directions and extensions: +++// 1) This legalization example shows us that a good direction +++// for the VPlan framework would be to model the vector call +++// instructions in a way that legal VF for each call is chosen +++// correctly within vectorizer and illegal code generation is +++// avoided. +++// 2) This logic can also be extended to general vector functions +++// i.e. legalization OpenMP decalre simd functions. 
The +++// requirements needed for this will be documented soon. +++ +++Value *InnerLoopVectorizer::legalizeSVMLCall(CallInst *VecCall, +++ CallInst *Call) { +++ ElementCount LegalVF = getLegalVFForCall(VecCall); +++ +++ assert(LegalVF.getKnownMinValue() > 1 && +++ "Legal VF for SVML call must be greater than 1 to vectorize"); +++ +++ if (LegalVF == VF) +++ return VecCall; +++ else if (LegalVF.getKnownMinValue() > VF.getKnownMinValue()) +++ // TODO: handle case when we are underfilling vectors +++ return VecCall; +++ +++ // Legal VF for this SVML call is smaller than chosen VF, break it down into +++ // smaller call instructions +++ +++ // Convert args, types and return type to match legal VF +++ SmallVector NewTys; +++ SmallVector NewArgs; +++ +++ for (Value *ArgOperand : Call->args()) { +++ Type *Ty = ToVectorTy(ArgOperand->getType(), LegalVF); +++ NewTys.push_back(Ty); +++ NewArgs.push_back(UndefValue::get(Ty)); ++ } +++ +++ // Construct legal vector function +++ const VFShape Shape = +++ VFShape::get(*Call, LegalVF /*EC*/, false /*HasGlobalPred*/); +++ Function *LegalVectorF = VFDatabase(*Call).getVectorizedFunction(Shape); +++ assert(LegalVectorF != nullptr && "Can't create legal vector function."); +++ +++ LLVM_DEBUG(dbgs() << "LV(SVML): LegalVectorF: "; LegalVectorF->dump()); +++ +++ SmallVector OpBundles; +++ Call->getOperandBundlesAsDefs(OpBundles); +++ auto LegalV = std::unique_ptr(CallInst::Create(LegalVectorF, NewArgs, OpBundles)); +++ +++ if (isa(LegalV)) +++ LegalV->copyFastMathFlags(Call); +++ +++ const DataLayout &DL = VecCall->getModule()->getDataLayout(); +++ // Set SVML calling conventions +++ setVectorFunctionCallingConv(*LegalV, DL, *TLI); +++ +++ LLVM_DEBUG(dbgs() << "LV(SVML): LegalV: "; LegalV->dump()); +++ +++ Value *LegalizedCall = partialVectorizeCall(VecCall, LegalV.get(), LegalVF.getKnownMinValue()); +++ +++ LLVM_DEBUG(dbgs() << "LV(SVML): LegalizedCall: "; LegalizedCall->dump()); +++ +++ // Remove the illegal call from Builder +++ VecCall->eraseFromParent(); +++ +++ return LegalizedCall; +++} +++ +++ElementCount InnerLoopVectorizer::getLegalVFForCall(CallInst *CI) { +++ const DataLayout DL = CI->getModule()->getDataLayout(); +++ FunctionType *CallFT = CI->getFunctionType(); +++ // All functions that need legalization should have a vector return type. +++ // This is true for all SVML functions that are currently supported. +++ assert(isa(CallFT->getReturnType()) && +++ "Return type of call that needs legalization is not a vector."); +++ auto *VecCallRetType = cast(CallFT->getReturnType()); +++ Type *ElemType = VecCallRetType->getElementType(); +++ +++ unsigned TypeBitWidth = DL.getTypeSizeInBits(ElemType); +++ unsigned VectorBitWidth = TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector); +++ unsigned LegalVF = VectorBitWidth / TypeBitWidth; +++ +++ LLVM_DEBUG(dbgs() << "LV(SVML): Type Bit Width: " << TypeBitWidth << "\n"); +++ LLVM_DEBUG(dbgs() << "LV(SVML): Current VL: " << VF << "\n"); +++ LLVM_DEBUG(dbgs() << "LV(SVML): Vector Bit Width: " << VectorBitWidth +++ << "\n"); +++ LLVM_DEBUG(dbgs() << "LV(SVML): Legal Target VL: " << LegalVF << "\n"); +++ +++ return ElementCount::getFixed(LegalVF); +++} +++ +++// Partial vectorization of a call instruction is achieved by making clones of +++// \p LegalCall and overwriting its argument operands with shufflevector +++// equivalent decided based on \p LegalVF and current Part being filled. 
+++Value *InnerLoopVectorizer::partialVectorizeCall(CallInst *Call, +++ CallInst *LegalCall, +++ unsigned LegalVF) { +++ unsigned NumParts = VF.getKnownMinValue() / LegalVF; +++ LLVM_DEBUG(dbgs() << "LV(SVML): NumParts: " << NumParts << "\n"); +++ SmallVector CallResults; +++ +++ for (unsigned Part = 0; Part < NumParts; ++Part) { +++ auto *ClonedCall = cast(LegalCall->clone()); +++ +++ // Update the arg operand of cloned call to shufflevector +++ for (unsigned i = 0, ie = Call->arg_size(); i != ie; ++i) { +++ auto *NewOp = generateShuffleValue(Call->getArgOperand(i), LegalVF, Part); +++ ClonedCall->setArgOperand(i, NewOp); +++ } +++ +++ LLVM_DEBUG(dbgs() << "LV(SVML): ClonedCall: "; ClonedCall->dump()); +++ +++ auto *PartialVecCall = Builder.Insert(ClonedCall); +++ CallResults.push_back(PartialVecCall); +++ } +++ +++ return combinePartialVecCalls(CallResults); +++} +++ +++Value *InnerLoopVectorizer::generateShuffleValue(Value *V, unsigned LegalVF, +++ unsigned Part) { +++ // Example: +++ // Consider the following vector code - +++ // %1 = sitofp <4 x i32> %0 to <4 x double> +++ // %2 = call <4 x double> @__svml_sin4(<4 x double> %1) +++ // +++ // If the LegalVF is 2, we partially vectorize the sin4 call by invoking +++ // generateShuffleValue on the operand %1 +++ // If Part = 1, output value is - +++ // %shuffle = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32> +++ // and if Part = 2, output is - +++ // %shuffle7 =shufflevector <4 x double> %1, <4 x double> undef, <2 x i32> +++ +++ assert(isa(V->getType()) && +++ "Cannot generate shuffles for non-vector values."); +++ SmallVector ShuffleMask; +++ Value *Undef = UndefValue::get(V->getType()); +++ +++ unsigned ElemIdx = Part * LegalVF; +++ +++ for (unsigned K = 0; K < LegalVF; K++) +++ ShuffleMask.push_back(static_cast(ElemIdx + K)); +++ +++ auto *ShuffleInst = +++ Builder.CreateShuffleVector(V, Undef, ShuffleMask, "shuffle"); +++ +++ return ShuffleInst; +++} +++ +++// Results of the calls executed by smaller legal call instructions must be +++// combined to match the original VF for later use. This is done by constructing +++// shufflevector instructions in a cumulative fashion. 
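Again for illustration only, a self-contained C++ sketch of the cumulative combining strategy described in the comment above. Partial results are modelled here as vectors of lane indices, and each round concatenates adjacent pairs (the shufflevector with mask 0..NumElems-1 in the real code), doubling the width until a single VF-wide value in the original lane order remains. The concrete numbers (LegalVF = 2, four partial results) are an assumed example, matching the __svml_exp2 splits in the finite-call tests.

// Standalone sketch of combinePartialVecCalls' pairwise merging:
// 4 x <2 x double>  ->  2 x <4 x double>  ->  1 x <8 x double>.
#include <cstdio>
#include <vector>

int main() {
  unsigned LegalVF = 2, NumRegs = 4;
  std::vector<std::vector<unsigned>> Results(NumRegs);
  for (unsigned R = 0; R < NumRegs; ++R)
    for (unsigned K = 0; K < LegalVF; ++K)
      Results[R].push_back(R * LegalVF + K);   // lanes produced by each partial call

  unsigned NumElems = LegalVF * 2;             // width of each combined value
  while (NumRegs > 1) {
    for (unsigned I = 0; I < NumRegs; I += 2) {
      std::vector<unsigned> Combined;          // models shufflevector(Results[I], Results[I+1])
      for (unsigned J = 0; J < NumElems; ++J)
        Combined.push_back(J < NumElems / 2 ? Results[I][J]
                                            : Results[I + 1][J - NumElems / 2]);
      Results.push_back(Combined);
    }
    Results.erase(Results.begin(), Results.begin() + NumRegs);  // drop consumed inputs
    NumElems *= 2;
    NumRegs /= 2;
  }
  for (unsigned Lane : Results.front())
    std::printf("%u ", Lane);                  // prints 0 1 2 3 4 5 6 7
  std::printf("\n");
  return 0;
}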
+++Value *InnerLoopVectorizer::combinePartialVecCalls( +++ SmallVectorImpl &CallResults) { +++ assert(isa(CallResults[0]->getType()) && +++ "Cannot combine calls with non-vector results."); +++ auto *CallType = cast(CallResults[0]->getType()); +++ +++ Value *CombinedShuffle; +++ unsigned NumElems = CallType->getElementCount().getKnownMinValue() * 2; +++ unsigned NumRegs = CallResults.size(); +++ +++ assert(NumRegs >= 2 && isPowerOf2_32(NumRegs) && +++ "Number of partial vector calls to combine must be a power of 2 " +++ "(atleast 2^1)"); +++ +++ while (NumRegs > 1) { +++ for (unsigned I = 0; I < NumRegs; I += 2) { +++ SmallVector ShuffleMask; +++ for (unsigned J = 0; J < NumElems; J++) +++ ShuffleMask.push_back(static_cast(J)); +++ +++ CombinedShuffle = Builder.CreateShuffleVector( +++ CallResults[I], CallResults[I + 1], ShuffleMask, "combined"); +++ LLVM_DEBUG(dbgs() << "LV(SVML): CombinedShuffle:"; +++ CombinedShuffle->dump()); +++ CallResults.push_back(CombinedShuffle); +++ } +++ +++ SmallVector::iterator Start = CallResults.begin(); +++ SmallVector::iterator End = Start + NumRegs; +++ CallResults.erase(Start, End); +++ +++ NumElems *= 2; +++ NumRegs /= 2; +++ } +++ +++ return CombinedShuffle; ++ } ++ ++ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { ++diff --git a/llvm-14.0.6.src/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll b/llvm-14.0.6.src/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll ++index df8b7c498bd00..63a36549f18fd 100644 ++--- a/llvm-14.0.6.src/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll +++++ b/llvm-14.0.6.src/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll ++@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-linux-gnu" ++ define <4 x double> @exp_v4(<4 x double> %in) { ++ ; SVML-LABEL: define {{[^@]+}}@exp_v4 ++ ; SVML-SAME: (<4 x double> [[IN:%.*]]) { ++-; SVML-NEXT: [[TMP1:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[IN]]) +++; SVML-NEXT: [[TMP1:%.*]] = call <4 x double> @__svml_exp4_ha(<4 x double> [[IN]]) ++ ; SVML-NEXT: ret <4 x double> [[TMP1]] ++ ; ++ ; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_v4 ++@@ -37,7 +37,7 @@ declare <4 x double> @llvm.exp.v4f64(<4 x double>) #0 ++ define <4 x float> @exp_f32(<4 x float> %in) { ++ ; SVML-LABEL: define {{[^@]+}}@exp_f32 ++ ; SVML-SAME: (<4 x float> [[IN:%.*]]) { ++-; SVML-NEXT: [[TMP1:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[IN]]) +++; SVML-NEXT: [[TMP1:%.*]] = call <4 x float> @__svml_expf4_ha(<4 x float> [[IN]]) ++ ; SVML-NEXT: ret <4 x float> [[TMP1]] ++ ; ++ ; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_f32 ++diff --git a/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll ++index a6e191c3d6923..d6e2e11106949 100644 ++--- a/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll +++++ b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll ++@@ -39,7 +39,8 @@ for.end: ; preds = %for.body ++ declare double @__exp_finite(double) #0 ++ ++ ; CHECK-LABEL: @exp_f64 ++-; CHECK: <4 x double> @__svml_exp4 +++; CHECK: <2 x double> @__svml_exp2 +++; CHECK: <2 x double> @__svml_exp2 ++ ; CHECK: ret ++ define void @exp_f64(double* nocapture %varray) { ++ entry: ++@@ -99,7 +100,8 @@ for.end: ; preds = %for.body ++ declare double @__log_finite(double) #0 ++ ++ ; CHECK-LABEL: @log_f64 ++-; CHECK: <4 x double> @__svml_log4 +++; CHECK: <2 x double> @__svml_log2 +++; CHECK: <2 x double> @__svml_log2 ++ ; CHECK: ret ++ define void @log_f64(double* 
nocapture %varray) { ++ entry: ++@@ -159,7 +161,8 @@ for.end: ; preds = %for.body ++ declare double @__pow_finite(double, double) #0 ++ ++ ; CHECK-LABEL: @pow_f64 ++-; CHECK: <4 x double> @__svml_pow4 +++; CHECK: <2 x double> @__svml_pow2 +++; CHECK: <2 x double> @__svml_pow2 ++ ; CHECK: ret ++ define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { ++ entry: ++@@ -190,7 +193,8 @@ declare float @__exp2f_finite(float) #0 ++ ++ define void @exp2f_finite(float* nocapture %varray) { ++ ; CHECK-LABEL: @exp2f_finite( ++-; CHECK: call <4 x float> @__svml_exp2f4(<4 x float> %{{.*}}) +++; CHECK: call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> %{{.*}}) +++; CHECK: call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> %{{.*}}) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -219,7 +223,8 @@ declare double @__exp2_finite(double) #0 ++ ++ define void @exp2_finite(double* nocapture %varray) { ++ ; CHECK-LABEL: @exp2_finite( ++-; CHECK: call <4 x double> @__svml_exp24(<4 x double> {{.*}}) +++; CHECK: call intel_svmlcc128 <2 x double> @__svml_exp22_ha(<2 x double> {{.*}}) +++; CHECK: call intel_svmlcc128 <2 x double> @__svml_exp22_ha(<2 x double> {{.*}}) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -276,7 +281,8 @@ for.end: ; preds = %for.body ++ declare double @__log2_finite(double) #0 ++ ++ ; CHECK-LABEL: @log2_f64 ++-; CHECK: <4 x double> @__svml_log24 +++; CHECK: <2 x double> @__svml_log22 +++; CHECK: <2 x double> @__svml_log22 ++ ; CHECK: ret ++ define void @log2_f64(double* nocapture %varray) { ++ entry: ++@@ -333,7 +339,8 @@ for.end: ; preds = %for.body ++ declare double @__log10_finite(double) #0 ++ ++ ; CHECK-LABEL: @log10_f64 ++-; CHECK: <4 x double> @__svml_log104 +++; CHECK: <2 x double> @__svml_log102 +++; CHECK: <2 x double> @__svml_log102 ++ ; CHECK: ret ++ define void @log10_f64(double* nocapture %varray) { ++ entry: ++@@ -390,7 +397,8 @@ for.end: ; preds = %for.body ++ declare double @__sqrt_finite(double) #0 ++ ++ ; CHECK-LABEL: @sqrt_f64 ++-; CHECK: <4 x double> @__svml_sqrt4 +++; CHECK: <2 x double> @__svml_sqrt2 +++; CHECK: <2 x double> @__svml_sqrt2 ++ ; CHECK: ret ++ define void @sqrt_f64(double* nocapture %varray) { ++ entry: ++diff --git a/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls.ll b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls.ll ++index 42c280df6ad02..088bbdcf1aa4a 100644 ++--- a/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls.ll +++++ b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-calls.ll ++@@ -48,7 +48,7 @@ declare float @llvm.exp2.f32(float) #0 ++ ++ define void @sin_f64(double* nocapture %varray) { ++ ; CHECK-LABEL: @sin_f64( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -71,7 +71,7 @@ for.end: ++ ++ define void @sin_f32(float* nocapture %varray) { ++ ; CHECK-LABEL: @sin_f32( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -94,7 +94,7 @@ for.end: ++ ++ define void @sin_f64_intrinsic(double* nocapture %varray) { ++ ; CHECK-LABEL: @sin_f64_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x 
double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -117,7 +117,7 @@ for.end: ++ ++ define void @sin_f32_intrinsic(float* nocapture %varray) { ++ ; CHECK-LABEL: @sin_f32_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -140,7 +140,7 @@ for.end: ++ ++ define void @cos_f64(double* nocapture %varray) { ++ ; CHECK-LABEL: @cos_f64( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -163,7 +163,7 @@ for.end: ++ ++ define void @cos_f32(float* nocapture %varray) { ++ ; CHECK-LABEL: @cos_f32( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -186,7 +186,7 @@ for.end: ++ ++ define void @cos_f64_intrinsic(double* nocapture %varray) { ++ ; CHECK-LABEL: @cos_f64_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -209,7 +209,7 @@ for.end: ++ ++ define void @cos_f32_intrinsic(float* nocapture %varray) { ++ ; CHECK-LABEL: @cos_f32_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -232,7 +232,7 @@ for.end: ++ ++ define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { ++ ; CHECK-LABEL: @pow_f64( ++-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) +++; CHECK: [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -257,7 +257,7 @@ for.end: ++ ++ define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) { ++ ; CHECK-LABEL: @pow_f64_intrinsic( ++-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) +++; CHECK: [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -282,7 +282,7 @@ for.end: ++ ++ define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { ++ ; CHECK-LABEL: @pow_f32( ++-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) +++; CHECK: [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -307,7 +307,7 @@ for.end: ++ ++ define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) { ++ ; CHECK-LABEL: @pow_f32_intrinsic( ++-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) +++; CHECK: [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) ++ ; 
CHECK: ret void ++ ; ++ entry: ++@@ -332,7 +332,7 @@ for.end: ++ ++ define void @exp_f64(double* nocapture %varray) { ++ ; CHECK-LABEL: @exp_f64( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -355,7 +355,7 @@ for.end: ++ ++ define void @exp_f32(float* nocapture %varray) { ++ ; CHECK-LABEL: @exp_f32( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -378,7 +378,7 @@ for.end: ++ ++ define void @exp_f64_intrinsic(double* nocapture %varray) { ++ ; CHECK-LABEL: @exp_f64_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -401,7 +401,7 @@ for.end: ++ ++ define void @exp_f32_intrinsic(float* nocapture %varray) { ++ ; CHECK-LABEL: @exp_f32_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -424,7 +424,7 @@ for.end: ++ ++ define void @log_f64(double* nocapture %varray) { ++ ; CHECK-LABEL: @log_f64( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -447,7 +447,7 @@ for.end: ++ ++ define void @log_f32(float* nocapture %varray) { ++ ; CHECK-LABEL: @log_f32( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -470,7 +470,7 @@ for.end: ++ ++ define void @log_f64_intrinsic(double* nocapture %varray) { ++ ; CHECK-LABEL: @log_f64_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -493,7 +493,7 @@ for.end: ++ ++ define void @log_f32_intrinsic(float* nocapture %varray) { ++ ; CHECK-LABEL: @log_f32_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -516,7 +516,7 @@ for.end: ++ ++ define void @log2_f64(double* nocapture %varray) { ++ ; CHECK-LABEL: @log2_f64( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log24_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -539,7 +539,7 @@ for.end: ++ ++ define void @log2_f32(float* nocapture %varray) { ++ ; CHECK-LABEL: @log2_f32( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log2f4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -562,7 +562,7 @@ for.end: ++ ++ define void 
@log2_f64_intrinsic(double* nocapture %varray) { ++ ; CHECK-LABEL: @log2_f64_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log24_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -585,7 +585,7 @@ for.end: ++ ++ define void @log2_f32_intrinsic(float* nocapture %varray) { ++ ; CHECK-LABEL: @log2_f32_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log2f4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -608,7 +608,7 @@ for.end: ++ ++ define void @log10_f64(double* nocapture %varray) { ++ ; CHECK-LABEL: @log10_f64( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log104_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -631,7 +631,7 @@ for.end: ++ ++ define void @log10_f32(float* nocapture %varray) { ++ ; CHECK-LABEL: @log10_f32( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log10f4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -654,7 +654,7 @@ for.end: ++ ++ define void @log10_f64_intrinsic(double* nocapture %varray) { ++ ; CHECK-LABEL: @log10_f64_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log104_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -677,7 +677,7 @@ for.end: ++ ++ define void @log10_f32_intrinsic(float* nocapture %varray) { ++ ; CHECK-LABEL: @log10_f32_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log10f4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -700,7 +700,7 @@ for.end: ++ ++ define void @sqrt_f64(double* nocapture %varray) { ++ ; CHECK-LABEL: @sqrt_f64( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sqrt4(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sqrt4_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -723,7 +723,7 @@ for.end: ++ ++ define void @sqrt_f32(float* nocapture %varray) { ++ ; CHECK-LABEL: @sqrt_f32( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sqrtf4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sqrtf4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -746,7 +746,7 @@ for.end: ++ ++ define void @exp2_f64(double* nocapture %varray) { ++ ; CHECK-LABEL: @exp2_f64( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -769,7 +769,7 @@ for.end: ++ ++ define void @exp2_f32(float* nocapture %varray) { ++ ; CHECK-LABEL: @exp2_f32( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -792,7 +792,7 @@ for.end: ++ ++ define void @exp2_f64_intrinsic(double* 
nocapture %varray) { ++ ; CHECK-LABEL: @exp2_f64_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24_ha(<4 x double> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -815,7 +815,7 @@ for.end: ++ ++ define void @exp2_f32_intrinsic(float* nocapture %varray) { ++ ; CHECK-LABEL: @exp2_f32_intrinsic( ++-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]]) +++; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> [[TMP4:%.*]]) ++ ; CHECK: ret void ++ ; ++ entry: ++@@ -836,4 +836,44 @@ for.end: ++ ret void ++ } ++ +++; CHECK-LABEL: @atan2_finite +++; CHECK: intel_svmlcc256 <4 x double> @__svml_atan24( +++; CHECK: intel_svmlcc256 <4 x double> @__svml_atan24( +++; CHECK: ret +++ +++declare double @__atan2_finite(double, double) local_unnamed_addr #0 +++ +++define void @atan2_finite([100 x double]* nocapture %varray) local_unnamed_addr #0 { +++entry: +++ br label %for.cond1.preheader +++ +++for.cond1.preheader: ; preds = %for.inc7, %entry +++ %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc7 ] +++ %0 = trunc i64 %indvars.iv19 to i32 +++ %conv = sitofp i32 %0 to double +++ br label %for.body3 +++ +++for.body3: ; preds = %for.body3, %for.cond1.preheader +++ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] +++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +++ %1 = trunc i64 %indvars.iv.next to i32 +++ %conv4 = sitofp i32 %1 to double +++ %call = tail call fast double @__atan2_finite(double %conv, double %conv4) +++ %arrayidx6 = getelementptr inbounds [100 x double], [100 x double]* %varray, i64 %indvars.iv19, i64 %indvars.iv +++ store double %call, double* %arrayidx6, align 8 +++ %exitcond = icmp eq i64 %indvars.iv.next, 100 +++ br i1 %exitcond, label %for.inc7, label %for.body3, !llvm.loop !5 +++ +++for.inc7: ; preds = %for.body3 +++ %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 +++ %exitcond21 = icmp eq i64 %indvars.iv.next20, 100 +++ br i1 %exitcond21, label %for.end9, label %for.cond1.preheader +++ +++for.end9: ; preds = %for.inc7 +++ ret void +++} +++ ++ attributes #0 = { nounwind readnone } +++!5 = distinct !{!5, !6, !7} +++!6 = !{!"llvm.loop.vectorize.width", i32 8} +++!7 = !{!"llvm.loop.vectorize.enable", i1 true} ++diff --git a/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll ++new file mode 100644 ++index 0000000000000..326c763994343 ++--- /dev/null +++++ b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll ++@@ -0,0 +1,513 @@ +++; Check legalization of SVML calls, including intrinsic versions (like @llvm..). 
+++ +++; RUN: opt -vector-library=SVML -inject-tli-mappings -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s +++ +++target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +++target triple = "x86_64-unknown-linux-gnu" +++ +++declare double @sin(double) #0 +++declare float @sinf(float) #0 +++declare double @llvm.sin.f64(double) #0 +++declare float @llvm.sin.f32(float) #0 +++ +++declare double @cos(double) #0 +++declare float @cosf(float) #0 +++declare double @llvm.cos.f64(double) #0 +++declare float @llvm.cos.f32(float) #0 +++ +++declare double @pow(double, double) #0 +++declare float @powf(float, float) #0 +++declare double @llvm.pow.f64(double, double) #0 +++declare float @llvm.pow.f32(float, float) #0 +++ +++declare double @exp(double) #0 +++declare float @expf(float) #0 +++declare double @llvm.exp.f64(double) #0 +++declare float @llvm.exp.f32(float) #0 +++ +++declare double @log(double) #0 +++declare float @logf(float) #0 +++declare double @llvm.log.f64(double) #0 +++declare float @llvm.log.f32(float) #0 +++ +++ +++define void @sin_f64(double* nocapture %varray) { +++; CHECK-LABEL: @sin_f64( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]]) +++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to double +++ %call = tail call double @sin(double %conv) +++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv +++ store double %call, double* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @sin_f32(float* nocapture %varray) { +++; CHECK-LABEL: @sin_f32( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to float +++ %call = tail call float @sinf(float %conv) +++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv +++ store float %call, float* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @sin_f64_intrinsic(double* nocapture %varray) { +++; CHECK-LABEL: @sin_f64_intrinsic( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]]) +++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to double +++ %call = tail call double @llvm.sin.f64(double %conv) +++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv +++ store double %call, double* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @sin_f32_intrinsic(float* 
nocapture %varray) { +++; CHECK-LABEL: @sin_f32_intrinsic( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to float +++ %call = tail call float @llvm.sin.f32(float %conv) +++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv +++ store float %call, float* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @cos_f64(double* nocapture %varray) { +++; CHECK-LABEL: @cos_f64( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]]) +++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to double +++ %call = tail call double @cos(double %conv) +++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv +++ store double %call, double* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @cos_f32(float* nocapture %varray) { +++; CHECK-LABEL: @cos_f32( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to float +++ %call = tail call float @cosf(float %conv) +++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv +++ store float %call, float* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @cos_f64_intrinsic(double* nocapture %varray) { +++; CHECK-LABEL: @cos_f64_intrinsic( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]]) +++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to double +++ %call = tail call double @llvm.cos.f64(double %conv) +++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv +++ store double %call, double* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @cos_f32_intrinsic(float* nocapture %varray) { +++; CHECK-LABEL: @cos_f32_intrinsic( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to float +++ %call = tail 
call float @llvm.cos.f32(float %conv) +++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv +++ store float %call, float* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { +++; CHECK-LABEL: @pow_f64( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]]) +++; CHECK: [[TMP4:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to double +++ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv +++ %tmp1 = load double, double* %arrayidx, align 4 +++ %tmp2 = tail call double @pow(double %conv, double %tmp1) +++ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv +++ store double %tmp2, double* %arrayidx2, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) { +++; CHECK-LABEL: @pow_f64_intrinsic( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]]) +++; CHECK: [[TMP4:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to double +++ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv +++ %tmp1 = load double, double* %arrayidx, align 4 +++ %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1) +++ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv +++ store double %tmp2, double* %arrayidx2, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { +++; CHECK-LABEL: @pow_f32( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[WIDE_LOAD:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to float +++ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv +++ %tmp1 = load float, float* %arrayidx, align 4 +++ %tmp2 = tail call float @powf(float %conv, float %tmp1) +++ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv +++ store float %tmp2, float* %arrayidx2, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) { +++; CHECK-LABEL: @pow_f32_intrinsic( +++; CHECK: [[TMP1:%.*]] = 
call intel_svmlcc256 <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[TMP3:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to float +++ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv +++ %tmp1 = load float, float* %arrayidx, align 4 +++ %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1) +++ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv +++ store float %tmp2, float* %arrayidx2, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @exp_f64(double* nocapture %varray) { +++; CHECK-LABEL: @exp_f64( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]]) +++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to double +++ %call = tail call double @exp(double %conv) +++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv +++ store double %call, double* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @exp_f32(float* nocapture %varray) { +++; CHECK-LABEL: @exp_f32( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to float +++ %call = tail call float @expf(float %conv) +++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv +++ store float %call, float* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @exp_f64_intrinsic(double* nocapture %varray) { +++; CHECK-LABEL: @exp_f64_intrinsic( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]]) +++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to double +++ %call = tail call double @llvm.exp.f64(double %conv) +++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv +++ store double %call, double* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @exp_f32_intrinsic(float* nocapture %varray) { +++; CHECK-LABEL: @exp_f32_intrinsic( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = 
trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to float +++ %call = tail call float @llvm.exp.f32(float %conv) +++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv +++ store float %call, float* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @log_f64(double* nocapture %varray) { +++; CHECK-LABEL: @log_f64( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]]) +++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to double +++ %call = tail call double @log(double %conv) +++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv +++ store double %call, double* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @log_f32(float* nocapture %varray) { +++; CHECK-LABEL: @log_f32( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to float +++ %call = tail call float @logf(float %conv) +++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv +++ store float %call, float* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @log_f64_intrinsic(double* nocapture %varray) { +++; CHECK-LABEL: @log_f64_intrinsic( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]]) +++; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to double +++ %call = tail call double @llvm.log.f64(double %conv) +++ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv +++ store double %call, double* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: +++ ret void +++} +++ +++define void @log_f32_intrinsic(float* nocapture %varray) { +++; CHECK-LABEL: @log_f32_intrinsic( +++; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]]) +++; CHECK: ret void +++; +++entry: +++ br label %for.body +++ +++for.body: +++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +++ %tmp = trunc i64 %iv to i32 +++ %conv = sitofp i32 %tmp to float +++ %call = tail call float @llvm.log.f32(float %conv) +++ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv +++ store float %call, float* %arrayidx, align 4 +++ %iv.next = add nuw nsw i64 %iv, 1 +++ %exitcond = icmp eq i64 %iv.next, 1000 +++ br i1 %exitcond, label %for.end, label %for.body +++ +++for.end: 
+++ ret void +++} +++ +++attributes #0 = { nounwind readnone } +++ ++diff --git a/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll ++new file mode 100644 ++index 0000000000000..9422653445dc2 ++--- /dev/null +++++ b/llvm-14.0.6.src/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll ++@@ -0,0 +1,61 @@ +++; Check that vector codegen splits illegal sin8 call to two sin4 calls on AVX for double datatype. +++; The C code used to generate this test: +++ +++; #include +++; +++; void foo(double *a, int N){ +++; int i; +++; #pragma clang loop vectorize_width(8) +++; for (i=0;i [[I0:%.*]] to <8 x double> +++; CHECK-NEXT: [[S1:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> +++; CHECK-NEXT: [[I2:%.*]] = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[S1]]) +++; CHECK-NEXT: [[S2:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> +++; CHECK-NEXT: [[I3:%.*]] = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[S2]]) +++; CHECK-NEXT: [[comb:%combined.*]] = shufflevector <4 x double> [[I2]], <4 x double> [[I3]], <8 x i32> +++; CHECK: store <8 x double> [[comb]], <8 x double>* [[TMP:%.*]], align 8 +++ +++ +++target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +++target triple = "x86_64-unknown-linux-gnu" +++ +++; Function Attrs: nounwind uwtable +++define dso_local void @foo(double* nocapture %a, i32 %N) local_unnamed_addr #0 { +++entry: +++ %cmp5 = icmp sgt i32 %N, 0 +++ br i1 %cmp5, label %for.body.preheader, label %for.end +++ +++for.body.preheader: ; preds = %entry +++ %wide.trip.count = zext i32 %N to i64 +++ br label %for.body +++ +++for.body: ; preds = %for.body, %for.body.preheader +++ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] +++ %0 = trunc i64 %indvars.iv to i32 +++ %conv = sitofp i32 %0 to double +++ %call = tail call fast double @sin(double %conv) #2 +++ %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv +++ store double %call, double* %arrayidx, align 8, !tbaa !2 +++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +++ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count +++ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6 +++ +++for.end: ; preds = %for.body, %entry +++ ret void +++} +++ +++; Function Attrs: nounwind +++declare dso_local double @sin(double) local_unnamed_addr #1 +++ +++!2 = !{!3, !3, i64 0} +++!3 = !{!"double", !4, i64 0} +++!4 = !{!"omnipotent char", !5, i64 0} +++!5 = !{!"Simple C/C++ TBAA"} +++!6 = distinct !{!6, !7} +++!7 = !{!"llvm.loop.vectorize.width", i32 8} ++diff --git a/llvm-14.0.6.src/test/Transforms/Util/add-TLI-mappings.ll b/llvm-14.0.6.src/test/Transforms/Util/add-TLI-mappings.ll ++index e8c83c4d9bd1f..615fdc29176a2 100644 ++--- a/llvm-14.0.6.src/test/Transforms/Util/add-TLI-mappings.ll +++++ b/llvm-14.0.6.src/test/Transforms/Util/add-TLI-mappings.ll ++@@ -12,12 +12,12 @@ target triple = "x86_64-unknown-linux-gnu" ++ ++ ; COMMON-LABEL: @llvm.compiler.used = appending global ++ ; SVML-SAME: [6 x i8*] [ ++-; SVML-SAME: i8* bitcast (<2 x double> (<2 x double>)* @__svml_sin2 to i8*), ++-; SVML-SAME: i8* bitcast (<4 x double> (<4 x double>)* @__svml_sin4 to i8*), ++-; SVML-SAME: i8* bitcast (<8 x double> (<8 x double>)* @__svml_sin8 to i8*), ++-; SVML-SAME: i8* bitcast (<4 x float> (<4 x float>)* @__svml_log10f4 to i8*), ++-; SVML-SAME: i8* bitcast (<8 x float> (<8 x float>)* 
@__svml_log10f8 to i8*), ++-; SVML-SAME: i8* bitcast (<16 x float> (<16 x float>)* @__svml_log10f16 to i8*) +++; SVML-SAME: i8* bitcast (<2 x double> (<2 x double>)* @__svml_sin2_ha to i8*), +++; SVML-SAME: i8* bitcast (<4 x double> (<4 x double>)* @__svml_sin4_ha to i8*), +++; SVML-SAME: i8* bitcast (<8 x double> (<8 x double>)* @__svml_sin8_ha to i8*), +++; SVML-SAME: i8* bitcast (<4 x float> (<4 x float>)* @__svml_log10f4_ha to i8*), +++; SVML-SAME: i8* bitcast (<8 x float> (<8 x float>)* @__svml_log10f8_ha to i8*), +++; SVML-SAME: i8* bitcast (<16 x float> (<16 x float>)* @__svml_log10f16_ha to i8*) ++ ; MASSV-SAME: [2 x i8*] [ ++ ; MASSV-SAME: i8* bitcast (<2 x double> (<2 x double>)* @__sind2 to i8*), ++ ; MASSV-SAME: i8* bitcast (<4 x float> (<4 x float>)* @__log10f4 to i8*) ++@@ -59,9 +59,9 @@ declare float @llvm.log10.f32(float) #0 ++ attributes #0 = { nounwind readnone } ++ ++ ; SVML: attributes #[[SIN]] = { "vector-function-abi-variant"= ++-; SVML-SAME: "_ZGV_LLVM_N2v_sin(__svml_sin2), ++-; SVML-SAME: _ZGV_LLVM_N4v_sin(__svml_sin4), ++-; SVML-SAME: _ZGV_LLVM_N8v_sin(__svml_sin8)" } +++; SVML-SAME: "_ZGV_LLVM_N2v_sin(__svml_sin2_ha), +++; SVML-SAME: _ZGV_LLVM_N4v_sin(__svml_sin4_ha), +++; SVML-SAME: _ZGV_LLVM_N8v_sin(__svml_sin8_ha)" } ++ ++ ; MASSV: attributes #[[SIN]] = { "vector-function-abi-variant"= ++ ; MASSV-SAME: "_ZGV_LLVM_N2v_sin(__sind2)" } ++diff --git a/llvm-14.0.6.src/utils/TableGen/CMakeLists.txt b/llvm-14.0.6.src/utils/TableGen/CMakeLists.txt ++index 97df6a55d1b59..199e0285c9e5d 100644 ++--- a/llvm-14.0.6.src/utils/TableGen/CMakeLists.txt +++++ b/llvm-14.0.6.src/utils/TableGen/CMakeLists.txt ++@@ -47,6 +47,7 @@ add_tablegen(llvm-tblgen LLVM ++ SearchableTableEmitter.cpp ++ SubtargetEmitter.cpp ++ SubtargetFeatureInfo.cpp +++ SVMLEmitter.cpp ++ TableGen.cpp ++ Types.cpp ++ X86DisassemblerTables.cpp ++diff --git a/llvm-14.0.6.src/utils/TableGen/SVMLEmitter.cpp b/llvm-14.0.6.src/utils/TableGen/SVMLEmitter.cpp ++new file mode 100644 ++index 0000000000000..a5aeea48db28b ++--- /dev/null +++++ b/llvm-14.0.6.src/utils/TableGen/SVMLEmitter.cpp ++@@ -0,0 +1,110 @@ +++//===------ SVMLEmitter.cpp - Generate SVML function variants -------------===// +++// +++// The LLVM Compiler Infrastructure +++// +++// This file is distributed under the University of Illinois Open Source +++// License. See LICENSE.TXT for details. +++// +++//===----------------------------------------------------------------------===// +++// +++// This tablegen backend emits the scalar to svml function map for TLI. +++// +++//===----------------------------------------------------------------------===// +++ +++#include "CodeGenTarget.h" +++#include "llvm/Support/Format.h" +++#include "llvm/TableGen/Error.h" +++#include "llvm/TableGen/Record.h" +++#include "llvm/TableGen/TableGenBackend.h" +++#include +++#include +++ +++using namespace llvm; +++ +++#define DEBUG_TYPE "SVMLVariants" +++#include "llvm/Support/Debug.h" +++ +++namespace { +++ +++class SVMLVariantsEmitter { +++ +++ RecordKeeper &Records; +++ +++private: +++ void emitSVMLVariants(raw_ostream &OS); +++ +++public: +++ SVMLVariantsEmitter(RecordKeeper &R) : Records(R) {} +++ +++ void run(raw_ostream &OS); +++}; +++} // End anonymous namespace +++ +++/// \brief Emit the set of SVML variant function names. +++// The default is to emit the high accuracy SVML variants until a mechanism is +++// introduced to allow a selection of different variants through precision +++// requirements specified by the user. 
This code generates mappings to svml +++// that are in the scalar form of llvm intrinsics, math library calls, or the +++// finite variants of math library calls. +++void SVMLVariantsEmitter::emitSVMLVariants(raw_ostream &OS) { +++ +++ const unsigned MinSinglePrecVL = 4; +++ const unsigned MaxSinglePrecVL = 16; +++ const unsigned MinDoublePrecVL = 2; +++ const unsigned MaxDoublePrecVL = 8; +++ +++ OS << "#ifdef GET_SVML_VARIANTS\n"; +++ +++ for (const auto &D : Records.getAllDerivedDefinitions("SvmlVariant")) { +++ StringRef SvmlVariantNameStr = D->getName(); +++ // Single Precision SVML +++ for (unsigned VL = MinSinglePrecVL; VL <= MaxSinglePrecVL; VL *= 2) { +++ // Emit the scalar math library function to svml function entry. +++ OS << "{\"" << SvmlVariantNameStr << "f" << "\", "; +++ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", " +++ << "ElementCount::getFixed(" << VL << ")},\n"; +++ +++ // Emit the scalar intrinsic to svml function entry. +++ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f32" << "\", "; +++ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", " +++ << "ElementCount::getFixed(" << VL << ")},\n"; +++ +++ // Emit the finite math library function to svml function entry. +++ OS << "{\"__" << SvmlVariantNameStr << "f_finite" << "\", "; +++ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", " +++ << "ElementCount::getFixed(" << VL << ")},\n"; +++ } +++ +++ // Double Precision SVML +++ for (unsigned VL = MinDoublePrecVL; VL <= MaxDoublePrecVL; VL *= 2) { +++ // Emit the scalar math library function to svml function entry. +++ OS << "{\"" << SvmlVariantNameStr << "\", "; +++ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << "ElementCount::getFixed(" << VL +++ << ")},\n"; +++ +++ // Emit the scalar intrinsic to svml function entry. +++ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f64" << "\", "; +++ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << "ElementCount::getFixed(" << VL +++ << ")},\n"; +++ +++ // Emit the finite math library function to svml function entry. 
+++ OS << "{\"__" << SvmlVariantNameStr << "_finite" << "\", "; +++ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " +++ << "ElementCount::getFixed(" << VL << ")},\n"; +++ } +++ } +++ +++ OS << "#endif // GET_SVML_VARIANTS\n\n"; +++} +++ +++void SVMLVariantsEmitter::run(raw_ostream &OS) { +++ emitSVMLVariants(OS); +++} +++ +++namespace llvm { +++ +++void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS) { +++ SVMLVariantsEmitter(RK).run(OS); +++} +++ +++} // End llvm namespace ++diff --git a/llvm-14.0.6.src/utils/TableGen/TableGen.cpp b/llvm-14.0.6.src/utils/TableGen/TableGen.cpp ++index 2d4a45f889be6..603d0c223b33a 100644 ++--- a/llvm-14.0.6.src/utils/TableGen/TableGen.cpp +++++ b/llvm-14.0.6.src/utils/TableGen/TableGen.cpp ++@@ -57,6 +57,7 @@ enum ActionType { ++ GenAutomata, ++ GenDirectivesEnumDecl, ++ GenDirectivesEnumImpl, +++ GenSVMLVariants, ++ }; ++ ++ namespace llvm { ++@@ -138,7 +139,9 @@ cl::opt Action( ++ clEnumValN(GenDirectivesEnumDecl, "gen-directive-decl", ++ "Generate directive related declaration code (header file)"), ++ clEnumValN(GenDirectivesEnumImpl, "gen-directive-impl", ++- "Generate directive related implementation code"))); +++ "Generate directive related implementation code"), +++ clEnumValN(GenSVMLVariants, "gen-svml", +++ "Generate SVML variant function names"))); ++ ++ cl::OptionCategory PrintEnumsCat("Options for -print-enums"); ++ cl::opt Class("class", cl::desc("Print Enum list for this class"), ++@@ -272,6 +275,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { ++ case GenDirectivesEnumImpl: ++ EmitDirectivesImpl(Records, OS); ++ break; +++ case GenSVMLVariants: +++ EmitSVMLVariants(Records, OS); +++ break; ++ } ++ ++ return false; ++diff --git a/llvm-14.0.6.src/utils/TableGen/TableGenBackends.h b/llvm-14.0.6.src/utils/TableGen/TableGenBackends.h ++index 71db8dc77b052..86c3a3068c2dc 100644 ++--- a/llvm-14.0.6.src/utils/TableGen/TableGenBackends.h +++++ b/llvm-14.0.6.src/utils/TableGen/TableGenBackends.h ++@@ -93,6 +93,7 @@ void EmitExegesis(RecordKeeper &RK, raw_ostream &OS); ++ void EmitAutomata(RecordKeeper &RK, raw_ostream &OS); ++ void EmitDirectivesDecl(RecordKeeper &RK, raw_ostream &OS); ++ void EmitDirectivesImpl(RecordKeeper &RK, raw_ostream &OS); +++void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS); ++ ++ } // End llvm namespace ++ ++diff --git a/llvm-14.0.6.src/utils/vim/syntax/llvm.vim b/llvm-14.0.6.src/utils/vim/syntax/llvm.vim ++index 205db16b7d8cd..2572ab5a59e1b 100644 ++--- a/llvm-14.0.6.src/utils/vim/syntax/llvm.vim +++++ b/llvm-14.0.6.src/utils/vim/syntax/llvm.vim ++@@ -104,6 +104,7 @@ syn keyword llvmKeyword ++ \ inreg ++ \ intel_ocl_bicc ++ \ inteldialect +++ \ intel_svmlcc ++ \ internal ++ \ jumptable ++ \ linkonce +diff --git a/conda-recipes/llvmdev/bld.bat b/conda-recipes/llvmdev/bld.bat +index 1ce228c80..0cba1e937 100644 +--- a/conda-recipes/llvmdev/bld.bat ++++ b/conda-recipes/llvmdev/bld.bat +@@ -1,3 +1,13 @@ ++setlocal EnableDelayedExpansion ++FOR /D %%d IN (llvm-*.src) DO (MKLINK /J llvm %%d ++if !errorlevel! neq 0 exit /b %errorlevel%) ++FOR /D %%d IN (lld-*.src) DO (MKLINK /J lld %%d ++if !errorlevel! neq 0 exit /b %errorlevel%) ++FOR /D %%d IN (unwind\libunwind-*.src) DO (MKLINK /J libunwind %%d ++if !errorlevel! neq 0 exit /b %errorlevel%) ++ ++DIR ++ + mkdir build + cd build + +@@ -24,31 +34,18 @@ REM the 64bit linker anyway. This must be passed in to certain generators as + REM '-Thost x64'. 
+ set PreferredToolArchitecture=x64 + +-set MAX_INDEX_CMAKE_GENERATOR=2 +- +-REM On older generators we can squeete the architecture into the generator +-REM name. In newer generators, we must use the -A flag for cmake to hand in the +-REM correct architecture. Also, using Visual Studio 16 2019 we use toolset +-REM v141, which basically means use a Visual Studio 15 2017 type compiler from +-REM Visual Studio 16 2019. See also: +-REM https://stackoverflow.com/questions/55708600/whats-the-cmake-generator-for-visual-studio-2019 ++set MAX_INDEX_CMAKE_GENERATOR=0 + +-set "CMAKE_GENERATOR[0]=Visual Studio 14 2015%ARCH_POSTFIX%" +-set "CMAKE_GENERATOR[1]=Visual Studio 15 2017%ARCH_POSTFIX%" +-set "CMAKE_GENERATOR[2]=Visual Studio 16 2019" ++set "CMAKE_GENERATOR[0]=Visual Studio 16 2019" + +-set "CMAKE_GENERATOR_ARCHITECTURE[0]=" +-set "CMAKE_GENERATOR_ARCHITECTURE[1]=" +-set "CMAKE_GENERATOR_ARCHITECTURE[2]=%GEN_ARCH%" ++set "CMAKE_GENERATOR_ARCHITECTURE[0]=%GEN_ARCH%" + +-set "CMAKE_GENERATOR_TOOLSET[0]=host %PreferredToolArchitecture%" +-set "CMAKE_GENERATOR_TOOLSET[1]=host %PreferredToolArchitecture%" +-set "CMAKE_GENERATOR_TOOLSET[2]=v141" ++set "CMAKE_GENERATOR_TOOLSET[0]=v142" + + REM Reduce build times and package size by removing unused stuff + REM BENCHMARKS (new for llvm8) don't build under Visual Studio 14 2015 + set CMAKE_CUSTOM=-DLLVM_TARGETS_TO_BUILD="%LLVM_TARGETS_TO_BUILD%" ^ +- -DLLVM_INCLUDE_TESTS=OFF ^ ++ -DLLVM_ENABLE_PROJECTS:STRING=lld ^ + -DLLVM_INCLUDE_UTILS=ON ^ + -DLLVM_INCLUDE_DOCS=OFF ^ + -DLLVM_INCLUDE_EXAMPLES=OFF ^ +@@ -67,7 +64,7 @@ for /l %%n in (0,1,%MAX_INDEX_CMAKE_GENERATOR%) do ( + -DCMAKE_BUILD_TYPE="%BUILD_CONFIG%" ^ + -DCMAKE_PREFIX_PATH="%LIBRARY_PREFIX%" ^ + -DCMAKE_INSTALL_PREFIX:PATH="%LIBRARY_PREFIX%" ^ +- %CMAKE_CUSTOM% "%SRC_DIR%" ++ %CMAKE_CUSTOM% "%SRC_DIR%\llvm" + if not errorlevel 1 goto configuration_successful + del CMakeCache.txt + ) +@@ -85,13 +82,3 @@ if errorlevel 1 exit 1 + REM === Install step === + cmake --build . --config "%BUILD_CONFIG%" --target install + if errorlevel 1 exit 1 +- +-REM From: https://github.com/conda-forge/llvmdev-feedstock/pull/53 +-"%BUILD_CONFIG%\bin\opt" -S -vector-library=SVML -mcpu=haswell -O3 "%RECIPE_DIR%\numba-3016.ll" | "%BUILD_CONFIG%\bin\FileCheck" "%RECIPE_DIR%\numba-3016.ll" +-if errorlevel 1 exit 1 +- +-REM This is technically how to run the suite, but it will only run in an +-REM enhanced unix-like shell which has functions like `grep` available. 
+-REM cd ..\test +-REM "%PYTHON%" "..\build\%BUILD_CONFIG%\bin\llvm-lit.py" -vv Transforms ExecutionEngine Analysis CodeGen/X86 +-REM if errorlevel 1 exit 1 +diff --git a/conda-recipes/llvmdev/build.sh b/conda-recipes/llvmdev/build.sh +index fd99eee90..dc0af4074 100644 +--- a/conda-recipes/llvmdev/build.sh ++++ b/conda-recipes/llvmdev/build.sh +@@ -15,10 +15,14 @@ else + DARWIN_TARGET=x86_64-apple-darwin13.4.0 + fi + ++mv llvm-*.src llvm ++mv lld-*.src lld ++mv unwind/libunwind-*.src libunwind + + declare -a _cmake_config + _cmake_config+=(-DCMAKE_INSTALL_PREFIX:PATH=${PREFIX}) + _cmake_config+=(-DCMAKE_BUILD_TYPE:STRING=Release) ++_cmake_config+=(-DLLVM_ENABLE_PROJECTS:STRING="lld") + # The bootstrap clang I use was built with a static libLLVMObject.a and I trying to get the same here + # _cmake_config+=(-DBUILD_SHARED_LIBS:BOOL=ON) + _cmake_config+=(-DLLVM_ENABLE_ASSERTIONS:BOOL=ON) +@@ -27,6 +31,7 @@ _cmake_config+=(-DLINK_POLLY_INTO_TOOLS:BOOL=ON) + _cmake_config+=(-DLLVM_ENABLE_LIBXML2:BOOL=OFF) + # Urgh, llvm *really* wants to link to ncurses / terminfo and we *really* do not want it to. + _cmake_config+=(-DHAVE_TERMINFO_CURSES=OFF) ++_cmake_config+=(-DLLVM_ENABLE_TERMINFO=OFF) + # Sometimes these are reported as unused. Whatever. + _cmake_config+=(-DHAVE_TERMINFO_NCURSES=OFF) + _cmake_config+=(-DHAVE_TERMINFO_NCURSESW=OFF) +@@ -39,10 +44,10 @@ _cmake_config+=(-DLLVM_ENABLE_RTTI=OFF) + _cmake_config+=(-DLLVM_TARGETS_TO_BUILD=${LLVM_TARGETS_TO_BUILD}) + _cmake_config+=(-DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=WebAssembly) + _cmake_config+=(-DLLVM_INCLUDE_UTILS=ON) # for llvm-lit ++_cmake_config+=(-DLLVM_INCLUDE_BENCHMARKS:BOOL=OFF) # doesn't build without the rest of LLVM project + # TODO :: It would be nice if we had a cross-ecosystem 'BUILD_TIME_LIMITED' env var we could use to + # disable these unnecessary but useful things. + if [[ ${CONDA_FORGE} == yes ]]; then +- _cmake_config+=(-DLLVM_INCLUDE_TESTS=OFF) + _cmake_config+=(-DLLVM_INCLUDE_DOCS=OFF) + _cmake_config+=(-DLLVM_INCLUDE_EXAMPLES=OFF) + fi +@@ -76,7 +81,7 @@ cd build + + cmake -G'Unix Makefiles' \ + "${_cmake_config[@]}" \ +- .. ++ ../llvm + + ARCH=`uname -m` + if [ $ARCH == 'armv7l' ]; then # RPi need thread count throttling +@@ -85,18 +90,7 @@ else + make -j${CPU_COUNT} VERBOSE=1 + fi + ++make check-llvm-unit || exit $? ++ + # From: https://github.com/conda-forge/llvmdev-feedstock/pull/53 + make install || exit $? +- +-# SVML tests on x86_64 arch only +-if [[ $ARCH == 'x86_64' ]]; then +- bin/opt -S -vector-library=SVML -mcpu=haswell -O3 $RECIPE_DIR/numba-3016.ll | bin/FileCheck $RECIPE_DIR/numba-3016.ll || exit $? 
+diff --git a/conda-recipes/llvmdev/meta.yaml b/conda-recipes/llvmdev/meta.yaml
+index 27b596ffc..e2df508e9 100644
+--- a/conda-recipes/llvmdev/meta.yaml
++++ b/conda-recipes/llvmdev/meta.yaml
+@@ -1,8 +1,9 @@
+-{% set shortversion = "11.1" %}
+-{% set version = "11.1.0" %}
+-{% set sha256_llvm = "ce8508e318a01a63d4e8b3090ab2ded3c598a50258cc49e2625b9120d4c03ea5" %}
+-{% set sha256_lld = "017a788cbe1ecc4a949abf10755870519086d058a2e99f438829aef24f0c66ce" %}
+-{% set build_number = "5" %}
++{% set shortversion = "14.0" %}
++{% set version = "14.0.6" %}
++{% set sha256_llvm = "050922ecaaca5781fdf6631ea92bc715183f202f9d2f15147226f023414f619a" %}
++{% set sha256_lld = "0c28ce0496934d37d20fec96591032dd66af8d10178a45762e0e75e85cf95ad3" %}
++{% set sha256_libunwind = "3bbe9c23c73259fe39c045dc87d0b283236ba6e00750a226b2c2aeac4a51d86b" %}
++{% set build_number = "0" %}
+
+ package:
+   name: llvmdev
+@@ -13,20 +14,16 @@ source:
+     fn: llvm-{{ version }}.src.tar.xz
+     sha256: {{ sha256_llvm }}
+     patches:
+-      - ../partial-testing.patch
+-      # Intel SVML optimizations (two patches)
+-      - ../intel-D47188-svml-VF.patch
+-      # Second patch from https://github.com/conda-forge/llvmdev-feedstock/blob/c706309/recipe/patches/expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch
+-      - ../expect-fastmath-entrypoints-in-add-TLI-mappings.ll.patch
+-      # Reverts a patch limiting non-GlobalValue name length
+-      - ../0001-Revert-Limit-size-of-non-GlobalValue-name.patch
+-      # Fixes for aarch64 on LLVM 11 from https://reviews.llvm.org/D104123
+-      - ../llvm_11_consecutive_registers.patch
+-
++      - ../llvm14-remove-use-of-clonefile.patch
++      - ../llvm14-svml.patch
+   - url: https://github.com/llvm/llvm-project/releases/download/llvmorg-{{ version }}/lld-{{ version }}.src.tar.xz
+     fn: lld-{{ version }}.src.tar.xz
+     sha256: {{ sha256_lld }}
+-    folder: tools/lld
++
++  - url: https://github.com/llvm/llvm-project/releases/download/llvmorg-{{ version }}/libunwind-{{ version }}.src.tar.xz
++    fn: libunwind-{{ version }}.src.tar.xz
++    sha256: {{ sha256_libunwind }}
++    folder: unwind
+
+ build:
+   number: {{ build_number }}
+@@ -59,8 +56,6 @@ requirements:
+     - python # [not (armv6l or armv7l or aarch64 or win)]
+
+ test:
+-  files:
+-    - numba-3016.ll
+   commands:
+     - $PREFIX/bin/llvm-config --libs # [not win]
+     - $PREFIX/bin/llc -version # [not win]
+@@ -81,5 +76,5 @@ about:
+   home: http://llvm.org/
+   dev_url: https://github.com/llvm-mirror/llvm
+   license: NCSA
+-  license_file: LICENSE.TXT
++  license_file: llvm/LICENSE.TXT
+   summary: Development headers and libraries for LLVM
+diff --git a/conda-recipes/llvmdev/numba-3016.ll b/conda-recipes/llvmdev/numba-3016.ll
+deleted file mode 100644
+index 1a9b3ecf8..000000000
+--- a/conda-recipes/llvmdev/numba-3016.ll
++++ /dev/null
+@@ -1,80 +0,0 @@
+-; Regression test for llvmdev-feedstock#52 and numba#3016
+-
+-; Generated from C code: int a[1<<10],b[1<<10]; void foo() { int i=0; for(i=0; i<1<<10; i++) { b[i]=sin(a[i]); }}
+-; compiled: -fvectorize -fveclib=SVML -O -S -mavx -mllvm -disable-llvm-optzns -emit-llvm
+-
+-; RUN: opt -vector-library=SVML -mcpu=haswell -O3 -S < %s | FileCheck %s
+-; CHECK: call {{.*}}__svml_sin4_ha(
+-; CHECK-NOT: call {{.*}}__svml_sin4(
+-; CHECK-NOT: call {{.*}}__svml_sin8
+-
+-source_filename = "svml-3016.c"
+-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+-target triple = "x86_64-pc-linux-gnu"
+-
+-@a = common dso_local global [1024 x i32] zeroinitializer, align 16
+-@b = common dso_local global [1024 x i32] zeroinitializer, align 16
+-
+-; Function Attrs: nounwind uwtable
+-define dso_local void @foo() #0 {
+-  %1 = alloca i32, align 4
+-  %2 = bitcast i32* %1 to i8*
+-  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3
+-  store i32 0, i32* %1, align 4, !tbaa !2
+-  store i32 0, i32* %1, align 4, !tbaa !2
+-  br label %3
+-
+-;