From dbe4ebac2a21366f986808b175f4145499ba9856 Mon Sep 17 00:00:00 2001
From: Siu Kwan Lam <[email protected]>
Date: Mon, 8 Apr 2024 11:02:09 -0500
Subject: [PATCH] llvm15-svml
---
.../include/llvm/Analysis/TargetLibraryInfo.h | 22 +-
llvm/include/llvm/AsmParser/LLToken.h | 3 +
llvm/include/llvm/IR/CMakeLists.txt | 4 +
llvm/include/llvm/IR/CallingConv.h | 5 +
llvm/include/llvm/IR/SVML.td | 62 +++
llvm/lib/Analysis/CMakeLists.txt | 1 +
llvm/lib/Analysis/TargetLibraryInfo.cpp | 55 +-
llvm/lib/AsmParser/LLLexer.cpp | 3 +
llvm/lib/AsmParser/LLParser.cpp | 6 +
llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 2 +-
llvm/lib/IR/AsmWriter.cpp | 3 +
llvm/lib/IR/Verifier.cpp | 3 +
llvm/lib/Target/X86/X86CallingConv.td | 70 +++
llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +-
llvm/lib/Target/X86/X86RegisterInfo.cpp | 46 ++
llvm/lib/Target/X86/X86Subtarget.h | 3 +
.../Transforms/Utils/InjectTLIMappings.cpp | 3 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 270 +++++++++
.../Transforms/Vectorize/SLPVectorizer.cpp | 18 +-
.../Generic/replace-intrinsics-with-veclib.ll | 4 +-
.../LoopVectorize/X86/svml-calls-finite.ll | 24 +-
.../LoopVectorize/X86/svml-calls.ll | 108 ++--
.../LoopVectorize/X86/svml-legal-calls.ll | 513 ++++++++++++++++++
.../LoopVectorize/X86/svml-legal-codegen.ll | 61 +++
llvm/test/Transforms/Util/add-TLI-mappings.ll | 18 +-
llvm/utils/TableGen/CMakeLists.txt | 1 +
llvm/utils/TableGen/SVMLEmitter.cpp | 110 ++++
llvm/utils/TableGen/TableGen.cpp | 6 +
llvm/utils/TableGen/TableGenBackends.h | 1 +
llvm/utils/vim/syntax/llvm.vim | 1 +
30 files changed, 1359 insertions(+), 70 deletions(-)
create mode 100644 llvm/include/llvm/IR/SVML.td
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
create mode 100644 llvm/utils/TableGen/SVMLEmitter.cpp
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index 7bfda0124..a2ce0d0f2 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -40,6 +40,12 @@ struct VecDesc {
NotLibFunc
};
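+
+/// Accuracy variants provided by SVML: the default variant, a high-accuracy
+/// variant ("_ha" suffix) and an enhanced-performance variant ("_ep" suffix,
+/// trading accuracy for speed).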
+enum SVMLAccuracy {
+ SVML_DEFAULT,
+ SVML_HA,
+ SVML_EP
+};
+
/// Implementation of the target library information.
///
/// This class constructs tables that hold the target library information and
@@ -158,7 +164,7 @@ public:
/// Return true if the function F has a vector equivalent with vectorization
/// factor VF.
bool isFunctionVectorizable(StringRef F, const ElementCount &VF) const {
- return !getVectorizedFunction(F, VF).empty();
+ return !getVectorizedFunction(F, VF, false).empty();
}
/// Return true if the function F has a vector equivalent with any
@@ -167,7 +173,10 @@ public:
/// Return the name of the equivalent of F, vectorized with factor VF. If no
/// such mapping exists, return the empty string.
- StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const;
+  std::string getVectorizedFunction(StringRef F, const ElementCount &VF,
+                                    bool IsFast) const;
+
+ Optional<CallingConv::ID> getVectorizedFunctionCallingConv(
+ StringRef F, const FunctionType &FTy, const DataLayout &DL) const;
/// Set to true iff i32 parameters to library functions should have signext
/// or zeroext attributes if they correspond to C-level int or unsigned int,
@@ -334,8 +343,13 @@ public:
bool isFunctionVectorizable(StringRef F) const {
return Impl->isFunctionVectorizable(F);
}
- StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const {
- return Impl->getVectorizedFunction(F, VF);
+  std::string getVectorizedFunction(StringRef F, const ElementCount &VF,
+                                    bool IsFast) const {
+ return Impl->getVectorizedFunction(F, VF, IsFast);
+ }
+
+ Optional<CallingConv::ID> getVectorizedFunctionCallingConv(
+ StringRef F, const FunctionType &FTy, const DataLayout &DL) const {
+ return Impl->getVectorizedFunctionCallingConv(F, FTy, DL);
}
/// Tests if the function is both available and a candidate for optimized code
diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index 04235f0fd..ca552efcd 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -130,6 +130,9 @@ enum Kind {
kw_fastcc,
kw_coldcc,
kw_intel_ocl_bicc,
+ kw_intel_svmlcc128,
+ kw_intel_svmlcc256,
+ kw_intel_svmlcc512,
kw_cfguard_checkcc,
kw_x86_stdcallcc,
kw_x86_fastcallcc,
diff --git a/llvm/include/llvm/IR/CMakeLists.txt b/llvm/include/llvm/IR/CMakeLists.txt
index 5151f9125..3c263a5d3 100644
--- a/llvm/include/llvm/IR/CMakeLists.txt
+++ b/llvm/include/llvm/IR/CMakeLists.txt
@@ -22,3 +22,7 @@ tablegen(LLVM IntrinsicsX86.h -gen-intrinsic-enums -intrinsic-prefix=x86)
tablegen(LLVM IntrinsicsXCore.h -gen-intrinsic-enums -intrinsic-prefix=xcore)
tablegen(LLVM IntrinsicsVE.h -gen-intrinsic-enums -intrinsic-prefix=ve)
add_public_tablegen_target(intrinsics_gen)
+
+set(LLVM_TARGET_DEFINITIONS SVML.td)
+tablegen(LLVM SVML.inc -gen-svml)
+add_public_tablegen_target(svml_gen)
diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h
index fd2854246..096eea1a8 100644
--- a/llvm/include/llvm/IR/CallingConv.h
+++ b/llvm/include/llvm/IR/CallingConv.h
@@ -252,6 +252,11 @@ namespace CallingConv {
/// M68k_INTR - Calling convention used for M68k interrupt routines.
M68k_INTR = 101,
+ /// Intel_SVML - Calling conventions for Intel Short Math Vector Library
+ Intel_SVML128 = 102,
+ Intel_SVML256 = 103,
+ Intel_SVML512 = 104,
+
/// The highest possible calling convention ID. Must be some 2^k - 1.
MaxID = 1023
};
diff --git a/llvm/include/llvm/IR/SVML.td b/llvm/include/llvm/IR/SVML.td
new file mode 100644
index 000000000..5af710404
--- /dev/null
+++ b/llvm/include/llvm/IR/SVML.td
@@ -0,0 +1,62 @@
+//===-- SVML.td - Defines SVML call variants ----------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is used by TableGen to define the different types of SVML function
+// variants used with -fveclib=SVML.
+//
+//===----------------------------------------------------------------------===//
+
+class SvmlVariant;
+
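+// Each SvmlVariant def below is expanded by the -gen-svml TableGen backend
+// into VecDesc entries in SVML.inc that map a scalar libm function to its
+// SVML vector variants, e.g. sin -> __svml_sin2/__svml_sin4/__svml_sin8 and
+// sinf -> __svml_sinf4/__svml_sinf8/__svml_sinf16.
+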
+def sin : SvmlVariant;
+def cos : SvmlVariant;
+def pow : SvmlVariant;
+def exp : SvmlVariant;
+def log : SvmlVariant;
+def acos : SvmlVariant;
+def acosh : SvmlVariant;
+def asin : SvmlVariant;
+def asinh : SvmlVariant;
+def atan2 : SvmlVariant;
+def atan : SvmlVariant;
+def atanh : SvmlVariant;
+def cbrt : SvmlVariant;
+def cdfnorm : SvmlVariant;
+def cdfnorminv : SvmlVariant;
+def cosd : SvmlVariant;
+def cosh : SvmlVariant;
+def erf : SvmlVariant;
+def erfc : SvmlVariant;
+def erfcinv : SvmlVariant;
+def erfinv : SvmlVariant;
+def exp10 : SvmlVariant;
+def exp2 : SvmlVariant;
+def expm1 : SvmlVariant;
+def hypot : SvmlVariant;
+def invsqrt : SvmlVariant;
+def log10 : SvmlVariant;
+def log1p : SvmlVariant;
+def log2 : SvmlVariant;
+def sind : SvmlVariant;
+def sinh : SvmlVariant;
+def sqrt : SvmlVariant;
+def tan : SvmlVariant;
+def tanh : SvmlVariant;
+
+// TODO: SVML does not currently provide _ha and _ep variants of these functions.
+// We should call the default variant of these functions in all cases instead.
+
+// def nearbyint : SvmlVariant;
+// def logb : SvmlVariant;
+// def floor : SvmlVariant;
+// def fmod : SvmlVariant;
+// def ceil : SvmlVariant;
+// def trunc : SvmlVariant;
+// def rint : SvmlVariant;
+// def round : SvmlVariant;
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index e59725c99..89af7f5d9 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -149,6 +149,7 @@ add_llvm_component_library(LLVMAnalysis
DEPENDS
intrinsics_gen
${MLDeps}
+ svml_gen
LINK_LIBS
${MLLinkDeps}
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 8ebdb65e8..eb3009593 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -110,6 +110,11 @@ bool TargetLibraryInfoImpl::isCallingConvCCompatible(Function *F) {
F->getFunctionType());
}
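+
+// Mangle an SVML function name for the requested accuracy: fast-math calls
+// use the default variant; all other calls use the high-accuracy ("_ha")
+// variant.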
+static std::string svmlMangle(StringRef FnName, const bool IsFast) {
+ std::string FullName = FnName.str();
+ return IsFast ? FullName : FullName + "_ha";
+}
+
/// Initialize the set of available library functions based on the specified
/// target triple. This should be carefully written so that a missing target
/// triple gets a sane set of defaults.
@@ -1878,8 +1883,9 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
}
case SVML: {
const VecDesc VecFuncs[] = {
- #define TLI_DEFINE_SVML_VECFUNCS
- #include "llvm/Analysis/VecFuncs.def"
+ #define GET_SVML_VARIANTS
+ #include "llvm/IR/SVML.inc"
+ #undef GET_SVML_VARIANTS
};
addVectorizableFunctions(VecFuncs);
break;
@@ -1899,20 +1905,51 @@ bool TargetLibraryInfoImpl::isFunctionVectorizable(StringRef funcName) const {
return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName;
}
-StringRef
-TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
- const ElementCount &VF) const {
+std::string TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
+ const ElementCount &VF,
+ bool IsFast) const {
+ bool FromSVML = ClVectorLibrary == SVML;
F = sanitizeFunctionName(F);
if (F.empty())
- return F;
+ return F.str();
std::vector<VecDesc>::const_iterator I =
llvm::lower_bound(VectorDescs, F, compareWithScalarFnName);
while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) {
- if (I->VectorizationFactor == VF)
- return I->VectorFnName;
+ if (I->VectorizationFactor == VF) {
+ if (FromSVML) {
+ return svmlMangle(I->VectorFnName, IsFast);
+ }
+ return I->VectorFnName.str();
+ }
++I;
}
- return StringRef();
+ return std::string();
+}
+
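+// Select the SVML calling convention from the bit width of the call's vector
+// return type: 128-bit -> Intel_SVML128, 256-bit -> Intel_SVML256, 512-bit
+// -> Intel_SVML512.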
+static CallingConv::ID getSVMLCallingConv(const DataLayout &DL,
+                                          const FunctionType &FType) {
+  assert(isa<VectorType>(FType.getReturnType()));
+  auto *VecCallRetType = cast<VectorType>(FType.getReturnType());
+  auto TypeBitWidth = DL.getTypeSizeInBits(VecCallRetType);
+  if (TypeBitWidth == 128)
+    return CallingConv::Intel_SVML128;
+  if (TypeBitWidth == 256)
+    return CallingConv::Intel_SVML256;
+  if (TypeBitWidth == 512)
+    return CallingConv::Intel_SVML512;
+  llvm_unreachable("Invalid vector width");
+}
+
+Optional<CallingConv::ID>
+TargetLibraryInfoImpl::getVectorizedFunctionCallingConv(
+ StringRef F, const FunctionType &FTy, const DataLayout &DL) const {
+ if (F.startswith("__svml")) {
+ return getSVMLCallingConv(DL, FTy);
+ }
+ return {};
}
TargetLibraryInfo TargetLibraryAnalysis::run(const Function &F,
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index c9a982693..40e89fe57 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -605,6 +605,9 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(spir_kernel);
KEYWORD(spir_func);
KEYWORD(intel_ocl_bicc);
+ KEYWORD(intel_svmlcc128);
+ KEYWORD(intel_svmlcc256);
+ KEYWORD(intel_svmlcc512);
KEYWORD(x86_64_sysvcc);
KEYWORD(win64cc);
KEYWORD(x86_regcallcc);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index fd502eded..8bf9c50be 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -1864,6 +1864,9 @@ void LLParser::parseOptionalDLLStorageClass(unsigned &Res) {
/// ::= 'ccc'
/// ::= 'fastcc'
/// ::= 'intel_ocl_bicc'
+/// ::= 'intel_svmlcc128'
+/// ::= 'intel_svmlcc256'
+/// ::= 'intel_svmlcc512'
/// ::= 'coldcc'
/// ::= 'cfguard_checkcc'
/// ::= 'x86_stdcallcc'
@@ -1933,6 +1936,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break;
case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break;
case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
+  case lltok::kw_intel_svmlcc128: CC = CallingConv::Intel_SVML128; break;
+  case lltok::kw_intel_svmlcc256: CC = CallingConv::Intel_SVML256; break;
+  case lltok::kw_intel_svmlcc512: CC = CallingConv::Intel_SVML512; break;
case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break;
case lltok::kw_win64cc: CC = CallingConv::Win64; break;
case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break;
diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
index 87b8ac59b..5c02e237c 100644
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -156,7 +156,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
// and the exact vector width of the call operands in the
// TargetLibraryInfo.
const std::string TLIName =
- std::string(TLI.getVectorizedFunction(ScalarName, VF));
+        std::string(TLI.getVectorizedFunction(
+            ScalarName, VF, CI.getFastMathFlags().isFast()));
LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `"
<< ScalarName << "` and vector width " << VF << ".\n");
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index a29040b8c..d7a7b4e3f 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -304,6 +304,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
case CallingConv::X86_RegCall: Out << "x86_regcallcc"; break;
case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break;
case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break;
+ case CallingConv::Intel_SVML128: Out << "intel_svmlcc128"; break;
+ case CallingConv::Intel_SVML256: Out << "intel_svmlcc256"; break;
+ case CallingConv::Intel_SVML512: Out << "intel_svmlcc512"; break;
case CallingConv::ARM_APCS: Out << "arm_apcscc"; break;
case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break;
case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index e3ea256af..1a3c50111 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2527,6 +2527,9 @@ void Verifier::visitFunction(const Function &F) {
case CallingConv::Fast:
case CallingConv::Cold:
case CallingConv::Intel_OCL_BI:
+ case CallingConv::Intel_SVML128:
+ case CallingConv::Intel_SVML256:
+ case CallingConv::Intel_SVML512:
case CallingConv::PTX_Kernel:
case CallingConv::PTX_Device:
Check(!F.isVarArg(),
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 4dd8a6cdd..12e655212 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -498,6 +498,21 @@ def RetCC_X86_64 : CallingConv<[
CCDelegateTo<RetCC_X86_64_C>
]>;
+// Intel_SVML return-value convention.
+def RetCC_Intel_SVML : CallingConv<[
+ // Vector types are returned in XMM0,XMM1
+ CCIfType<[v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1]>>,
+
+ // 256-bit FP vectors
+ CCIfType<[v8f32, v4f64],
+ CCAssignToReg<[YMM0,YMM1]>>,
+
+ // 512-bit FP vectors
+ CCIfType<[v16f32, v8f64],
+ CCAssignToReg<[ZMM0,ZMM1]>>
+]>;
+
// This is the return-value convention used for the entire X86 backend.
let Entry = 1 in
def RetCC_X86 : CallingConv<[
@@ -505,6 +520,10 @@ def RetCC_X86 : CallingConv<[
// Check if this is the Intel OpenCL built-ins calling convention
CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+ CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo<RetCC_Intel_SVML>>,
+ CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo<RetCC_Intel_SVML>>,
+ CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo<RetCC_Intel_SVML>>,
+
CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
CCDelegateTo<RetCC_X86_32>
]>;
@@ -1064,6 +1083,30 @@ def CC_Intel_OCL_BI : CallingConv<[
CCDelegateTo<CC_X86_32_C>
]>;
+// X86-64 Intel Short Vector Math Library calling convention.
+def CC_Intel_SVML : CallingConv<[
+
+ // The SSE vector arguments are passed in XMM registers.
+ CCIfType<[v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2]>>,
+
+ // The 256-bit vector arguments are passed in YMM registers.
+ CCIfType<[v8f32, v4f64],
+ CCAssignToReg<[YMM0, YMM1, YMM2]>>,
+
+ // The 512-bit vector arguments are passed in ZMM registers.
+ CCIfType<[v16f32, v8f64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>
+]>;
+
+def CC_X86_32_Intr : CallingConv<[
+ CCAssignToStack<4, 4>
+]>;
+
+def CC_X86_64_Intr : CallingConv<[
+ CCAssignToStack<8, 8>
+]>;
+
//===----------------------------------------------------------------------===//
// X86 Root Argument Calling Conventions
//===----------------------------------------------------------------------===//
@@ -1115,6 +1158,9 @@ def CC_X86_64 : CallingConv<[
let Entry = 1 in
def CC_X86 : CallingConv<[
CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
+ CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo<CC_Intel_SVML>>,
+ CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo<CC_Intel_SVML>>,
+ CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo<CC_Intel_SVML>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
CCDelegateTo<CC_X86_32>
]>;
@@ -1227,3 +1273,27 @@ def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
(sequence "R%u", 12, 15))>;
def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,
(sequence "XMM%u", 8, 15))>;
+
+// SVML calling convention
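+// SVML routines preserve most registers: in addition to the general-purpose
+// callee-saved registers, the save lists below preserve the vector registers
+// above those used for parameter passing (e.g. XMM8-XMM15 for the 64-bit
+// SysV ABI).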
+def CSR_32_Intel_SVML : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE)>;
+def CSR_32_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_32_Intel_SVML,
+ K4, K5, K6, K7)>;
+
+def CSR_64_Intel_SVML_NoSSE : CalleeSavedRegs<(add RBX, RSI, RDI, RBP, RSP, R12, R13, R14, R15)>;
+
+def CSR_64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+ (sequence "XMM%u", 8, 15))>;
+def CSR_Win64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+ (sequence "XMM%u", 6, 15))>;
+
+def CSR_64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+ (sequence "YMM%u", 8, 15))>;
+def CSR_Win64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+ (sequence "YMM%u", 6, 15))>;
+
+def CSR_64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+ (sequence "ZMM%u", 16, 31),
+ K4, K5, K6, K7)>;
+def CSR_Win64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+ (sequence "ZMM%u", 6, 21),
+ K4, K5, K6, K7)>;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd45c4825..0ad88eac1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3966,7 +3966,8 @@ void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
// FIXME: Only some x86_32 calling conventions support AVX512.
if (Subtarget.useAVX512Regs() &&
(is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
- CallConv == CallingConv::Intel_OCL_BI)))
+ CallConv == CallingConv::Intel_OCL_BI ||
+ CallConv == CallingConv::Intel_SVML512)))
VecVT = MVT::v16f32;
else if (Subtarget.hasAVX())
VecVT = MVT::v8f32;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index f2658f704..b2f4bb2dd 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -274,6 +274,42 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
}
+namespace {
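+// Return the (call-preserved register mask, callee-saved register list) pair
+// for an SVML calling convention, selected by vector width and target ABI
+// (32-bit, 64-bit SysV, or Win64).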
+std::pair<const uint32_t *, const MCPhysReg *> getSVMLRegMaskAndSaveList(
+ bool Is64Bit, bool IsWin64, CallingConv::ID CC) {
+ assert(CC >= CallingConv::Intel_SVML128 && CC <= CallingConv::Intel_SVML512);
+  unsigned Abi = CC - CallingConv::Intel_SVML128; // 0 = 128-bit, 1 = 256-bit, 2 = 512-bit
+
+ const std::pair<const uint32_t *, const MCPhysReg *> Abi64[] = {
+ std::make_pair(CSR_64_Intel_SVML_RegMask, CSR_64_Intel_SVML_SaveList),
+ std::make_pair(CSR_64_Intel_SVML_AVX_RegMask, CSR_64_Intel_SVML_AVX_SaveList),
+ std::make_pair(CSR_64_Intel_SVML_AVX512_RegMask, CSR_64_Intel_SVML_AVX512_SaveList),
+ };
+
+ const std::pair<const uint32_t *, const MCPhysReg *> AbiWin64[] = {
+ std::make_pair(CSR_Win64_Intel_SVML_RegMask, CSR_Win64_Intel_SVML_SaveList),
+ std::make_pair(CSR_Win64_Intel_SVML_AVX_RegMask, CSR_Win64_Intel_SVML_AVX_SaveList),
+ std::make_pair(CSR_Win64_Intel_SVML_AVX512_RegMask, CSR_Win64_Intel_SVML_AVX512_SaveList),
+ };
+
+ const std::pair<const uint32_t *, const MCPhysReg *> Abi32[] = {
+ std::make_pair(CSR_32_Intel_SVML_RegMask, CSR_32_Intel_SVML_SaveList),
+ std::make_pair(CSR_32_Intel_SVML_RegMask, CSR_32_Intel_SVML_SaveList),
+ std::make_pair(CSR_32_Intel_SVML_AVX512_RegMask, CSR_32_Intel_SVML_AVX512_SaveList),
+ };
+
+ if (Is64Bit) {
+ if (IsWin64) {
+ return AbiWin64[Abi];
+ } else {
+ return Abi64[Abi];
+ }
+ } else {
+ return Abi32[Abi];
+ }
+}
+} // namespace
+
const MCPhysReg *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "MachineFunction required");
@@ -329,6 +365,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_64_Intel_OCL_BI_SaveList;
break;
}
+ case CallingConv::Intel_SVML128:
+ case CallingConv::Intel_SVML256:
+ case CallingConv::Intel_SVML512: {
+ return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).second;
+ }
case CallingConv::HHVM:
return CSR_64_HHVM_SaveList;
case CallingConv::X86_RegCall:
@@ -451,6 +492,11 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_64_Intel_OCL_BI_RegMask;
break;
}
+ case CallingConv::Intel_SVML128:
+ case CallingConv::Intel_SVML256:
+ case CallingConv::Intel_SVML512: {
+ return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).first;
+ }
case CallingConv::HHVM:
return CSR_64_HHVM_RegMask;
case CallingConv::X86_RegCall:
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 09a8b1f1a..6863cf8b6 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -337,6 +337,9 @@ public:
case CallingConv::X86_ThisCall:
case CallingConv::X86_VectorCall:
case CallingConv::Intel_OCL_BI:
+ case CallingConv::Intel_SVML128:
+ case CallingConv::Intel_SVML256:
+ case CallingConv::Intel_SVML512:
return isTargetWin64();
// This convention allows using the Win64 convention on other targets.
case CallingConv::Win64:
diff --git a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
index 55bcb6f3b..230b3c01a 100644
--- a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
+++ b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/FMF.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -91,7 +92,7 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
auto AddVariantDecl = [&](const ElementCount &VF) {
const std::string TLIName =
- std::string(TLI.getVectorizedFunction(ScalarName, VF));
+        std::string(TLI.getVectorizedFunction(
+            ScalarName, VF, CI.getFastMathFlags().isFast()));
if (!TLIName.empty()) {
std::string MangledName =
VFABI::mangleTLIVectorName(TLIName, ScalarName, CI.arg_size(), VF);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5fd4e45d8..8b8c127d5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -629,6 +629,27 @@ protected:
virtual void printDebugTracesAtStart(){};
virtual void printDebugTracesAtEnd(){};
+  /// Check the legality of the SVML call instruction \p VecCall generated for
+  /// the scalar call \p Call. If it is illegal, the equivalent legal
+  /// instruction is returned.
+ Value *legalizeSVMLCall(CallInst *VecCall, CallInst *Call);
+
+ /// Returns the legal VF for a call instruction \p CI using TTI information
+ /// and vector type.
+ ElementCount getLegalVFForCall(CallInst *CI);
+
+  /// Partially vectorize a given call \p Call by breaking it down into
+  /// multiple clones of \p LegalCall, as determined by the legal VF \p LegalVF.
+ Value *partialVectorizeCall(CallInst *Call, CallInst *LegalCall,
+ unsigned LegalVF);
+
+  /// Generate a shufflevector instruction for a vector value \p V based on
+  /// the current \p Part and a smaller VF \p LegalVF.
+ Value *generateShuffleValue(Value *V, unsigned LegalVF, unsigned Part);
+
+ /// Combine partially vectorized calls stored in \p CallResults.
+ Value *combinePartialVecCalls(SmallVectorImpl<Value *> &CallResults);
+
/// The original loop.
Loop *OrigLoop;
@@ -4170,6 +4191,17 @@ bool InnerLoopVectorizer::useOrderedReductions(
return Cost->useOrderedReductions(RdxDesc);
}
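+
+// If TLI recognizes the callee as an SVML routine, tag the vectorized call
+// with the matching intel_svmlcc* calling convention so that the backend
+// uses the SVML register-preserving ABI.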
+static void setVectorFunctionCallingConv(CallInst &CI, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
+ Function *VectorF = CI.getCalledFunction();
+ FunctionType *FTy = VectorF->getFunctionType();
+ StringRef VFName = VectorF->getName();
+ auto CC = TLI.getVectorizedFunctionCallingConv(VFName, *FTy, DL);
+ if (CC) {
+ CI.setCallingConv(*CC);
+ }
+}
+
void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
VPUser &ArgOperands,
VPTransformState &State) {
@@ -4237,11 +4269,249 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
if (isa<FPMathOperator>(V))
V->copyFastMathFlags(&CI);
+ const DataLayout &DL = V->getModule()->getDataLayout();
+ setVectorFunctionCallingConv(*V, DL, *TLI);
+
+      // Perform legalization of the SVML call instruction only if the
+      // original call was not an intrinsic.
+      if (!UseVectorIntrinsic &&
+          V->getCalledFunction()->getName().startswith("__svml")) {
+ LLVM_DEBUG(dbgs() << "LV(SVML): Vector call inst:"; V->dump());
+ auto *LegalV = cast<Instruction>(legalizeSVMLCall(V, &CI));
+ LLVM_DEBUG(dbgs() << "LV: Completed SVML legalization.\n LegalV: ";
+ LegalV->dump());
+ State.set(Def, LegalV, Part);
+ State.addMetadata(LegalV, &CI);
+ } else {
State.set(Def, V, Part);
State.addMetadata(V, &CI);
+ }
}
}
+
+//===----------------------------------------------------------------------===//
+// Implementation of functions for SVML vector call legalization.
+//===----------------------------------------------------------------------===//
+//
+// Unlike other VECLIBs, SVML needs to be used with target-legal
+// vector types. Otherwise, link failures and/or runtime failures
+// will occur. A motivating example could be -
+//
+// double *a;
+// float *b;
+// #pragma clang loop vectorize_width(8)
+// for(i = 0; i < N; ++i) {
+// a[i] = sin(i); // Legal SVML VF must be 4 or below on AVX
+// b[i] = cosf(i); // VF can be 8 on AVX since 8 floats can fit in YMM
+// }
+//
+// The current implementation of vector code generation in LV is
+// driven by a single VF (in InnerLoopVectorizer::VF), which
+// inhibits the flexibility of adjusting/choosing a different VF
+// for different instructions.
+//
+// Due to this limitation, it is much more straightforward to
+// first generate the illegal sin8 call (__svml_sin8 for the SVML
+// vector library) and then legalize it than to try to avoid
+// generating illegal code from the beginning.
+//
+// A solution for this problem is to check the legality of the
+// call instruction right after generating it in the vectorizer;
+// if it is illegal, we split the call arguments and issue multiple
+// calls to match the legal VF. This is currently demonstrated for
+// the SVML vector library calls (non-intrinsic version only).
+//
+// Future directions and extensions:
+// 1) This legalization example shows that a good direction
+// for the VPlan framework would be to model the vector call
+// instructions in a way that the legal VF for each call is chosen
+// correctly within the vectorizer and illegal code generation is
+// avoided.
+// 2) This logic can also be extended to general vector functions,
+// i.e. the legalization of OpenMP declare simd functions. The
+// requirements needed for this will be documented soon.
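+//
+// For example, with 256-bit AVX registers, a generated
+//   call <8 x double> @__svml_sin8(<8 x double> %x)
+// is legalized into two intel_svmlcc256 <4 x double> @__svml_sin4_ha calls
+// on the low and high halves of %x, whose results are recombined with a
+// shufflevector (see svml-legal-calls.ll).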
+
+Value *InnerLoopVectorizer::legalizeSVMLCall(CallInst *VecCall,
+ CallInst *Call) {
+ ElementCount LegalVF = getLegalVFForCall(VecCall);
+
+ assert(LegalVF.getKnownMinValue() > 1 &&
+ "Legal VF for SVML call must be greater than 1 to vectorize");
+
+ if (LegalVF == VF)
+ return VecCall;
+ else if (LegalVF.getKnownMinValue() > VF.getKnownMinValue())
+ // TODO: handle case when we are underfilling vectors
+ return VecCall;
+
+ // Legal VF for this SVML call is smaller than chosen VF, break it down into
+ // smaller call instructions
+
+ // Convert args, types and return type to match legal VF
+ SmallVector<Type *, 4> NewTys;
+ SmallVector<Value *, 4> NewArgs;
+
+ for (Value *ArgOperand : Call->args()) {
+ Type *Ty = ToVectorTy(ArgOperand->getType(), LegalVF);
+ NewTys.push_back(Ty);
+ NewArgs.push_back(UndefValue::get(Ty));
+ }
+
+ // Construct legal vector function
+ const VFShape Shape =
+ VFShape::get(*Call, LegalVF /*EC*/, false /*HasGlobalPred*/);
+ Function *LegalVectorF = VFDatabase(*Call).getVectorizedFunction(Shape);
+ assert(LegalVectorF != nullptr && "Can't create legal vector function.");
+
+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalVectorF: "; LegalVectorF->dump());
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ Call->getOperandBundlesAsDefs(OpBundles);
+  auto LegalV = std::unique_ptr<CallInst>(
+      CallInst::Create(LegalVectorF, NewArgs, OpBundles));
+
+ if (isa<FPMathOperator>(LegalV))
+ LegalV->copyFastMathFlags(Call);
+
+ const DataLayout &DL = VecCall->getModule()->getDataLayout();
+ // Set SVML calling conventions
+ setVectorFunctionCallingConv(*LegalV, DL, *TLI);
+
+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalV: "; LegalV->dump());
+
+  Value *LegalizedCall = partialVectorizeCall(
+      VecCall, LegalV.get(), LegalVF.getKnownMinValue());
+
+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalizedCall: "; LegalizedCall->dump());
+
+  // Remove the now-dead illegal call from the IR.
+  VecCall->eraseFromParent();
+
+ return LegalizedCall;
+}
+
+ElementCount InnerLoopVectorizer::getLegalVFForCall(CallInst *CI) {
+  const DataLayout &DL = CI->getModule()->getDataLayout();
+ FunctionType *CallFT = CI->getFunctionType();
+ // All functions that need legalization should have a vector return type.
+ // This is true for all SVML functions that are currently supported.
+ assert(isa<VectorType>(CallFT->getReturnType()) &&
+ "Return type of call that needs legalization is not a vector.");
+ auto *VecCallRetType = cast<VectorType>(CallFT->getReturnType());
+ Type *ElemType = VecCallRetType->getElementType();
+
+ unsigned TypeBitWidth = DL.getTypeSizeInBits(ElemType);
+  unsigned VectorBitWidth =
+      TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+ unsigned LegalVF = VectorBitWidth / TypeBitWidth;
+
+ LLVM_DEBUG(dbgs() << "LV(SVML): Type Bit Width: " << TypeBitWidth << "\n");
+ LLVM_DEBUG(dbgs() << "LV(SVML): Current VL: " << VF << "\n");
+ LLVM_DEBUG(dbgs() << "LV(SVML): Vector Bit Width: " << VectorBitWidth
+ << "\n");
+ LLVM_DEBUG(dbgs() << "LV(SVML): Legal Target VL: " << LegalVF << "\n");
+
+ return ElementCount::getFixed(LegalVF);
+}
+
+// Partial vectorization of a call instruction is achieved by making clones of
+// \p LegalCall and overwriting their argument operands with the shufflevector
+// equivalents, chosen based on \p LegalVF and the current Part being filled.
+Value *InnerLoopVectorizer::partialVectorizeCall(CallInst *Call,
+ CallInst *LegalCall,
+ unsigned LegalVF) {
+ unsigned NumParts = VF.getKnownMinValue() / LegalVF;
+ LLVM_DEBUG(dbgs() << "LV(SVML): NumParts: " << NumParts << "\n");
+ SmallVector<Value *, 8> CallResults;
+
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ auto *ClonedCall = cast<CallInst>(LegalCall->clone());
+
+ // Update the arg operand of cloned call to shufflevector
+ for (unsigned i = 0, ie = Call->arg_size(); i != ie; ++i) {
+ auto *NewOp = generateShuffleValue(Call->getArgOperand(i), LegalVF, Part);
+ ClonedCall->setArgOperand(i, NewOp);
+ }
+
+ LLVM_DEBUG(dbgs() << "LV(SVML): ClonedCall: "; ClonedCall->dump());
+
+ auto *PartialVecCall = Builder.Insert(ClonedCall);
+ CallResults.push_back(PartialVecCall);
+ }
+
+ return combinePartialVecCalls(CallResults);
+}
+
+Value *InnerLoopVectorizer::generateShuffleValue(Value *V, unsigned LegalVF,
+ unsigned Part) {
+ // Example:
+ // Consider the following vector code -
+ // %1 = sitofp <4 x i32> %0 to <4 x double>
+ // %2 = call <4 x double> @__svml_sin4(<4 x double> %1)
+ //
+ // If the LegalVF is 2, we partially vectorize the sin4 call by invoking
+ // generateShuffleValue on the operand %1
+  // If Part = 0, the output value is -
+  // %shuffle = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+  // and if Part = 1, the output is -
+  // %shuffle7 = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+
+ assert(isa<VectorType>(V->getType()) &&
+ "Cannot generate shuffles for non-vector values.");
+ SmallVector<int, 4> ShuffleMask;
+ Value *Undef = UndefValue::get(V->getType());
+
+ unsigned ElemIdx = Part * LegalVF;
+
+ for (unsigned K = 0; K < LegalVF; K++)
+ ShuffleMask.push_back(static_cast<int>(ElemIdx + K));
+
+ auto *ShuffleInst =
+ Builder.CreateShuffleVector(V, Undef, ShuffleMask, "shuffle");
+
+ return ShuffleInst;
+}
+
+// Results of the calls executed by smaller legal call instructions must be
+// combined to match the original VF for later use. This is done by constructing
+// shufflevector instructions in a cumulative fashion.
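+// For example, four <2 x double> results are combined pairwise into two
+// <4 x double> values, and those into a single <8 x double> value.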
+Value *InnerLoopVectorizer::combinePartialVecCalls(
+ SmallVectorImpl<Value *> &CallResults) {
+ assert(isa<VectorType>(CallResults[0]->getType()) &&
+ "Cannot combine calls with non-vector results.");
+ auto *CallType = cast<VectorType>(CallResults[0]->getType());
+
+  Value *CombinedShuffle = nullptr;
+ unsigned NumElems = CallType->getElementCount().getKnownMinValue() * 2;
+ unsigned NumRegs = CallResults.size();
+
+ assert(NumRegs >= 2 && isPowerOf2_32(NumRegs) &&
+ "Number of partial vector calls to combine must be a power of 2 "
+ "(atleast 2^1)");
+
+ while (NumRegs > 1) {
+ for (unsigned I = 0; I < NumRegs; I += 2) {
+ SmallVector<int, 4> ShuffleMask;
+ for (unsigned J = 0; J < NumElems; J++)
+ ShuffleMask.push_back(static_cast<int>(J));
+
+ CombinedShuffle = Builder.CreateShuffleVector(
+ CallResults[I], CallResults[I + 1], ShuffleMask, "combined");
+ LLVM_DEBUG(dbgs() << "LV(SVML): CombinedShuffle:";
+ CombinedShuffle->dump());
+ CallResults.push_back(CombinedShuffle);
+ }
+
+ SmallVector<Value *, 2>::iterator Start = CallResults.begin();
+ SmallVector<Value *, 2>::iterator End = Start + NumRegs;
+ CallResults.erase(Start, End);
+
+ NumElems *= 2;
+ NumRegs /= 2;
+ }
+
+ return CombinedShuffle;
+}
+
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 53c11c58f..5074bf21c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7823,6 +7823,17 @@ Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) {
return Vec;
}
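+
+// If TLI recognizes the callee as an SVML routine, tag the vectorized call
+// with the matching intel_svmlcc* calling convention (this mirrors the
+// helper of the same name in LoopVectorize.cpp).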
+static void setVectorFunctionCallingConv(CallInst &CI, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
+ Function *VectorF = CI.getCalledFunction();
+ FunctionType *FTy = VectorF->getFunctionType();
+ StringRef VFName = VectorF->getName();
+ auto CC = TLI.getVectorizedFunctionCallingConv(VFName, *FTy, DL);
+ if (CC) {
+ CI.setCallingConv(*CC);
+ }
+}
+
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IRBuilder<>::InsertPointGuard Guard(Builder);
@@ -8309,7 +8320,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<OperandBundleDef, 1> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
- Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
+
+ CallInst *NewCall = Builder.CreateCall(CF, OpVecs, OpBundles);
+ const DataLayout &DL = NewCall->getModule()->getDataLayout();
+ setVectorFunctionCallingConv(*NewCall, DL, *TLI);
+
+ Value *V = NewCall;
// The scalar argument uses an in-tree scalar so we add the new vectorized
// call to ExternalUses list to make sure that an extract will be
diff --git a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
index df8b7c498..63a36549f 100644
--- a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
+++ b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
define <4 x double> @exp_v4(<4 x double> %in) {
; SVML-LABEL: define {{[^@]+}}@exp_v4
; SVML-SAME: (<4 x double> [[IN:%.*]]) {
-; SVML-NEXT: [[TMP1:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[IN]])
+; SVML-NEXT: [[TMP1:%.*]] = call <4 x double> @__svml_exp4_ha(<4 x double> [[IN]])
; SVML-NEXT: ret <4 x double> [[TMP1]]
;
; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_v4
@@ -37,7 +37,7 @@ declare <4 x double> @llvm.exp.v4f64(<4 x double>) #0
define <4 x float> @exp_f32(<4 x float> %in) {
; SVML-LABEL: define {{[^@]+}}@exp_f32
; SVML-SAME: (<4 x float> [[IN:%.*]]) {
-; SVML-NEXT: [[TMP1:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[IN]])
+; SVML-NEXT: [[TMP1:%.*]] = call <4 x float> @__svml_expf4_ha(<4 x float> [[IN]])
; SVML-NEXT: ret <4 x float> [[TMP1]]
;
; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_f32
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
index a6e191c3d..d6e2e1110 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
@@ -39,7 +39,8 @@ for.end: ; preds = %for.body
declare double @__exp_finite(double) #0
; CHECK-LABEL: @exp_f64
-; CHECK: <4 x double> @__svml_exp4
+; CHECK: <2 x double> @__svml_exp2
+; CHECK: <2 x double> @__svml_exp2
; CHECK: ret
define void @exp_f64(double* nocapture %varray) {
entry:
@@ -99,7 +100,8 @@ for.end: ; preds = %for.body
declare double @__log_finite(double) #0
; CHECK-LABEL: @log_f64
-; CHECK: <4 x double> @__svml_log4
+; CHECK: <2 x double> @__svml_log2
+; CHECK: <2 x double> @__svml_log2
; CHECK: ret
define void @log_f64(double* nocapture %varray) {
entry:
@@ -159,7 +161,8 @@ for.end: ; preds = %for.body
declare double @__pow_finite(double, double) #0
; CHECK-LABEL: @pow_f64
-; CHECK: <4 x double> @__svml_pow4
+; CHECK: <2 x double> @__svml_pow2
+; CHECK: <2 x double> @__svml_pow2
; CHECK: ret
define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
entry:
@@ -190,7 +193,8 @@ declare float @__exp2f_finite(float) #0
define void @exp2f_finite(float* nocapture %varray) {
; CHECK-LABEL: @exp2f_finite(
-; CHECK: call <4 x float> @__svml_exp2f4(<4 x float> %{{.*}})
+; CHECK: call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> %{{.*}})
+; CHECK: call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> %{{.*}})
; CHECK: ret void
;
entry:
@@ -219,7 +223,8 @@ declare double @__exp2_finite(double) #0
define void @exp2_finite(double* nocapture %varray) {
; CHECK-LABEL: @exp2_finite(
-; CHECK: call <4 x double> @__svml_exp24(<4 x double> {{.*}})
+; CHECK: call intel_svmlcc128 <2 x double> @__svml_exp22_ha(<2 x double> {{.*}})
+; CHECK: call intel_svmlcc128 <2 x double> @__svml_exp22_ha(<2 x double> {{.*}})
; CHECK: ret void
;
entry:
@@ -276,7 +281,8 @@ for.end: ; preds = %for.body
declare double @__log2_finite(double) #0
; CHECK-LABEL: @log2_f64
-; CHECK: <4 x double> @__svml_log24
+; CHECK: <2 x double> @__svml_log22
+; CHECK: <2 x double> @__svml_log22
; CHECK: ret
define void @log2_f64(double* nocapture %varray) {
entry:
@@ -333,7 +339,8 @@ for.end: ; preds = %for.body
declare double @__log10_finite(double) #0
; CHECK-LABEL: @log10_f64
-; CHECK: <4 x double> @__svml_log104
+; CHECK: <2 x double> @__svml_log102
+; CHECK: <2 x double> @__svml_log102
; CHECK: ret
define void @log10_f64(double* nocapture %varray) {
entry:
@@ -390,7 +397,8 @@ for.end: ; preds = %for.body
declare double @__sqrt_finite(double) #0
; CHECK-LABEL: @sqrt_f64
-; CHECK: <4 x double> @__svml_sqrt4
+; CHECK: <2 x double> @__svml_sqrt2
+; CHECK: <2 x double> @__svml_sqrt2
; CHECK: ret
define void @sqrt_f64(double* nocapture %varray) {
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
index 42c280df6..088bbdcf1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
@@ -48,7 +48,7 @@ declare float @llvm.exp2.f32(float) #0
define void @sin_f64(double* nocapture %varray) {
; CHECK-LABEL: @sin_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -71,7 +71,7 @@ for.end:
define void @sin_f32(float* nocapture %varray) {
; CHECK-LABEL: @sin_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -94,7 +94,7 @@ for.end:
define void @sin_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @sin_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -117,7 +117,7 @@ for.end:
define void @sin_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @sin_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -140,7 +140,7 @@ for.end:
define void @cos_f64(double* nocapture %varray) {
; CHECK-LABEL: @cos_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -163,7 +163,7 @@ for.end:
define void @cos_f32(float* nocapture %varray) {
; CHECK-LABEL: @cos_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -186,7 +186,7 @@ for.end:
define void @cos_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @cos_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -209,7 +209,7 @@ for.end:
define void @cos_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @cos_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -232,7 +232,7 @@ for.end:
define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
; CHECK-LABEL: @pow_f64(
-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
; CHECK: ret void
;
entry:
@@ -257,7 +257,7 @@ for.end:
define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
; CHECK-LABEL: @pow_f64_intrinsic(
-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
; CHECK: ret void
;
entry:
@@ -282,7 +282,7 @@ for.end:
define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
; CHECK-LABEL: @pow_f32(
-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
; CHECK: ret void
;
entry:
@@ -307,7 +307,7 @@ for.end:
define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
; CHECK-LABEL: @pow_f32_intrinsic(
-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
; CHECK: ret void
;
entry:
@@ -332,7 +332,7 @@ for.end:
define void @exp_f64(double* nocapture %varray) {
; CHECK-LABEL: @exp_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -355,7 +355,7 @@ for.end:
define void @exp_f32(float* nocapture %varray) {
; CHECK-LABEL: @exp_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -378,7 +378,7 @@ for.end:
define void @exp_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @exp_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -401,7 +401,7 @@ for.end:
define void @exp_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @exp_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -424,7 +424,7 @@ for.end:
define void @log_f64(double* nocapture %varray) {
; CHECK-LABEL: @log_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -447,7 +447,7 @@ for.end:
define void @log_f32(float* nocapture %varray) {
; CHECK-LABEL: @log_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -470,7 +470,7 @@ for.end:
define void @log_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @log_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -493,7 +493,7 @@ for.end:
define void @log_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @log_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -516,7 +516,7 @@ for.end:
define void @log2_f64(double* nocapture %varray) {
; CHECK-LABEL: @log2_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log24_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -539,7 +539,7 @@ for.end:
define void @log2_f32(float* nocapture %varray) {
; CHECK-LABEL: @log2_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log2f4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -562,7 +562,7 @@ for.end:
define void @log2_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @log2_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log24_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -585,7 +585,7 @@ for.end:
define void @log2_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @log2_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log2f4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -608,7 +608,7 @@ for.end:
define void @log10_f64(double* nocapture %varray) {
; CHECK-LABEL: @log10_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log104_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -631,7 +631,7 @@ for.end:
define void @log10_f32(float* nocapture %varray) {
; CHECK-LABEL: @log10_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log10f4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -654,7 +654,7 @@ for.end:
define void @log10_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @log10_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log104_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -677,7 +677,7 @@ for.end:
define void @log10_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @log10_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log10f4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -700,7 +700,7 @@ for.end:
define void @sqrt_f64(double* nocapture %varray) {
; CHECK-LABEL: @sqrt_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sqrt4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sqrt4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -723,7 +723,7 @@ for.end:
define void @sqrt_f32(float* nocapture %varray) {
; CHECK-LABEL: @sqrt_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sqrtf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sqrtf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -746,7 +746,7 @@ for.end:
define void @exp2_f64(double* nocapture %varray) {
; CHECK-LABEL: @exp2_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -769,7 +769,7 @@ for.end:
define void @exp2_f32(float* nocapture %varray) {
; CHECK-LABEL: @exp2_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -792,7 +792,7 @@ for.end:
define void @exp2_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @exp2_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -815,7 +815,7 @@ for.end:
define void @exp2_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @exp2_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -836,4 +836,44 @@ for.end:
ret void
}
+; CHECK-LABEL: @atan2_finite
+; CHECK: intel_svmlcc256 <4 x double> @__svml_atan24(
+; CHECK: intel_svmlcc256 <4 x double> @__svml_atan24(
+; CHECK: ret
+
+declare double @__atan2_finite(double, double) local_unnamed_addr #0
+
+define void @atan2_finite([100 x double]* nocapture %varray) local_unnamed_addr #0 {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc7, %entry
+ %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc7 ]
+ %0 = trunc i64 %indvars.iv19 to i32
+ %conv = sitofp i32 %0 to double
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %1 = trunc i64 %indvars.iv.next to i32
+ %conv4 = sitofp i32 %1 to double
+ %call = tail call fast double @__atan2_finite(double %conv, double %conv4)
+ %arrayidx6 = getelementptr inbounds [100 x double], [100 x double]* %varray, i64 %indvars.iv19, i64 %indvars.iv
+ store double %call, double* %arrayidx6, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 100
+ br i1 %exitcond, label %for.inc7, label %for.body3, !llvm.loop !5
+
+for.inc7: ; preds = %for.body3
+ %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+ %exitcond21 = icmp eq i64 %indvars.iv.next20, 100
+ br i1 %exitcond21, label %for.end9, label %for.cond1.preheader
+
+for.end9: ; preds = %for.inc7
+ ret void
+}
+
attributes #0 = { nounwind readnone }
+!5 = distinct !{!5, !6, !7}
+!6 = !{!"llvm.loop.vectorize.width", i32 8}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
new file mode 100644
index 000000000..326c76399
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
@@ -0,0 +1,513 @@
+; Check legalization of SVML calls, including intrinsic versions (like @llvm.<fn_name>.<type>).
+
+; RUN: opt -vector-library=SVML -inject-tli-mappings -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare double @sin(double) #0
+declare float @sinf(float) #0
+declare double @llvm.sin.f64(double) #0
+declare float @llvm.sin.f32(float) #0
+
+declare double @cos(double) #0
+declare float @cosf(float) #0
+declare double @llvm.cos.f64(double) #0
+declare float @llvm.cos.f32(float) #0
+
+declare double @pow(double, double) #0
+declare float @powf(float, float) #0
+declare double @llvm.pow.f64(double, double) #0
+declare float @llvm.pow.f32(float, float) #0
+
+declare double @exp(double) #0
+declare float @expf(float) #0
+declare double @llvm.exp.f64(double) #0
+declare float @llvm.exp.f32(float) #0
+
+declare double @log(double) #0
+declare float @logf(float) #0
+declare double @llvm.log.f64(double) #0
+declare float @llvm.log.f32(float) #0
+
+
+define void @sin_f64(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @sin(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @sin_f32(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @sinf(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @sin_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.sin.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @sin_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.sin.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cos_f64(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @cos(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cos_f32(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @cosf(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cos_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.cos.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cos_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.cos.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]])
+; CHECK: [[TMP4:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv
+ %tmp1 = load double, double* %arrayidx, align 4
+ %tmp2 = tail call double @pow(double %conv, double %tmp1)
+ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %tmp2, double* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]])
+; CHECK: [[TMP4:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv
+ %tmp1 = load double, double* %arrayidx, align 4
+ %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1)
+ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %tmp2, double* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[WIDE_LOAD:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv
+ %tmp1 = load float, float* %arrayidx, align 4
+ %tmp2 = tail call float @powf(float %conv, float %tmp1)
+ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %tmp2, float* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[TMP3:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv
+ %tmp1 = load float, float* %arrayidx, align 4
+ %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1)
+ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %tmp2, float* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp_f64(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @exp(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp_f32(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @expf(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.exp.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.exp.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @log(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @logf(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.log.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.log.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
new file mode 100644
index 000000000..942265344
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
@@ -0,0 +1,63 @@
+; Check that vector codegen splits an illegal sin8 call into two sin4 calls on AVX for the double data type.
+; The C code used to generate this test:
+
+; #include <math.h>
+;
+; void foo(double *a, int N){
+; int i;
+; #pragma clang loop vectorize_width(8)
+; for (i=0;i<N;i++){
+; a[i] = sin(i);
+; }
+; }
+
+; RUN: opt -vector-library=SVML -inject-tli-mappings -loop-vectorize -force-vector-width=8 -mattr=avx -S < %s | FileCheck %s
+
+; CHECK: [[I1:%.*]] = sitofp <8 x i32> [[I0:%.*]] to <8 x double>
+; CHECK-NEXT: [[S1:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[I2:%.*]] = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[S1]])
+; CHECK-NEXT: [[S2:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[I3:%.*]] = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[S2]])
+; CHECK-NEXT: [[comb:%combined.*]] = shufflevector <4 x double> [[I2]], <4 x double> [[I3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: store <8 x double> [[comb]], <8 x double>* [[TMP:%.*]], align 8
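+; The two 4-wide results are recombined into a single 8-wide vector by the
+; final shufflevector before the store.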
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define dso_local void @foo(double* nocapture %a, i32 %N) local_unnamed_addr #0 {
+entry:
+ %cmp5 = icmp sgt i32 %N, 0
+ br i1 %cmp5, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %0 = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %0 to double
+ %call = tail call fast double @sin(double %conv) #2
+ %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+ store double %call, double* %arrayidx, align 8, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Function Attrs: nounwind
+declare dso_local double @sin(double) local_unnamed_addr #1
+
+!2 = !{!3, !3, i64 0}
+!3 = !{!"double", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
index 8e04c22bf..a7e6978c1 100644
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
@@ -12,12 +12,12 @@ target triple = "x86_64-unknown-linux-gnu"
; COMMON-LABEL: @llvm.compiler.used = appending global
; SVML-SAME: [6 x ptr] [
-; SVML-SAME: ptr @__svml_sin2,
-; SVML-SAME: ptr @__svml_sin4,
-; SVML-SAME: ptr @__svml_sin8,
-; SVML-SAME: ptr @__svml_log10f4,
-; SVML-SAME: ptr @__svml_log10f8,
-; SVML-SAME: ptr @__svml_log10f16
+; SVML-SAME: ptr @__svml_sin2_ha,
+; SVML-SAME: ptr @__svml_sin4_ha,
+; SVML-SAME: ptr @__svml_sin8_ha,
+; SVML-SAME: ptr @__svml_log10f4_ha,
+; SVML-SAME: ptr @__svml_log10f8_ha,
+; SVML-SAME: ptr @__svml_log10f16_ha
; MASSV-SAME: [2 x ptr] [
; MASSV-SAME: ptr @__sind2,
; MASSV-SAME: ptr @__log10f4
@@ -59,9 +59,9 @@ declare float @llvm.log10.f32(float) #0
attributes #0 = { nounwind readnone }
; SVML: attributes #[[SIN]] = { "vector-function-abi-variant"=
-; SVML-SAME: "_ZGV_LLVM_N2v_sin(__svml_sin2),
-; SVML-SAME: _ZGV_LLVM_N4v_sin(__svml_sin4),
-; SVML-SAME: _ZGV_LLVM_N8v_sin(__svml_sin8)" }
+; SVML-SAME: "_ZGV_LLVM_N2v_sin(__svml_sin2_ha),
+; SVML-SAME: _ZGV_LLVM_N4v_sin(__svml_sin4_ha),
+; SVML-SAME: _ZGV_LLVM_N8v_sin(__svml_sin8_ha)" }
; MASSV: attributes #[[SIN]] = { "vector-function-abi-variant"=
; MASSV-SAME: "_ZGV_LLVM_N2v_sin(__sind2)" }
diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt
index 725c99b8e..58e2194e1 100644
--- a/llvm/utils/TableGen/CMakeLists.txt
+++ b/llvm/utils/TableGen/CMakeLists.txt
@@ -47,6 +47,7 @@ add_tablegen(llvm-tblgen LLVM
SearchableTableEmitter.cpp
SubtargetEmitter.cpp
SubtargetFeatureInfo.cpp
+ SVMLEmitter.cpp
TableGen.cpp
Types.cpp
VarLenCodeEmitterGen.cpp
diff --git a/llvm/utils/TableGen/SVMLEmitter.cpp b/llvm/utils/TableGen/SVMLEmitter.cpp
new file mode 100644
index 000000000..a5aeea48d
--- /dev/null
+++ b/llvm/utils/TableGen/SVMLEmitter.cpp
@@ -0,0 +1,127 @@
+//===-- SVMLEmitter.cpp - Generate SVML function variants ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This TableGen backend emits the scalar-to-SVML function mappings for TLI.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenTarget.h"
+#include "llvm/Support/Format.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include <map>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "SVMLVariants"
+#include "llvm/Support/Debug.h"
+
+namespace {
+
+class SVMLVariantsEmitter {
+  RecordKeeper &Records;
+
+  void emitSVMLVariants(raw_ostream &OS);
+
+public:
+  SVMLVariantsEmitter(RecordKeeper &R) : Records(R) {}
+
+  void run(raw_ostream &OS);
+};
+} // End anonymous namespace
+
+/// Emit the set of SVML variant function names.
+// The default is to emit the high-accuracy SVML variants until a mechanism
+// is introduced that allows selecting among variants through user-specified
+// precision requirements. This backend generates mappings to SVML from the
+// scalar forms of LLVM intrinsics, math library calls, and the finite
+// variants of math library calls.
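+//
+// For illustration, a SvmlVariant record named "sin" yields entries such as:
+//
+//   {"sinf", "__svml_sinf4", ElementCount::getFixed(4)},
+//   {"llvm.sin.f32", "__svml_sinf4", ElementCount::getFixed(4)},
+//   {"__sinf_finite", "__svml_sinf4", ElementCount::getFixed(4)},
+//   {"sin", "__svml_sin2", ElementCount::getFixed(2)},
+//   {"llvm.sin.f64", "__svml_sin2", ElementCount::getFixed(2)},
+//   {"__sin_finite", "__svml_sin2", ElementCount::getFixed(2)},
+//
+// and so on for each power-of-two vector length in the ranges below.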
+void SVMLVariantsEmitter::emitSVMLVariants(raw_ostream &OS) {
+
+ const unsigned MinSinglePrecVL = 4;
+ const unsigned MaxSinglePrecVL = 16;
+ const unsigned MinDoublePrecVL = 2;
+ const unsigned MaxDoublePrecVL = 8;
+
+ OS << "#ifdef GET_SVML_VARIANTS\n";
+
+ for (const auto &D : Records.getAllDerivedDefinitions("SvmlVariant")) {
+ StringRef SvmlVariantNameStr = D->getName();
+ // Single Precision SVML
+ for (unsigned VL = MinSinglePrecVL; VL <= MaxSinglePrecVL; VL *= 2) {
+ // Emit the scalar math library function to svml function entry.
+ OS << "{\"" << SvmlVariantNameStr << "f" << "\", ";
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+ << "ElementCount::getFixed(" << VL << ")},\n";
+
+ // Emit the scalar intrinsic to svml function entry.
+ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f32" << "\", ";
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+ << "ElementCount::getFixed(" << VL << ")},\n";
+
+ // Emit the finite math library function to svml function entry.
+ OS << "{\"__" << SvmlVariantNameStr << "f_finite" << "\", ";
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+ << "ElementCount::getFixed(" << VL << ")},\n";
+ }
+
+ // Double Precision SVML
+ for (unsigned VL = MinDoublePrecVL; VL <= MaxDoublePrecVL; VL *= 2) {
+ // Emit the scalar math library function to svml function entry.
+ OS << "{\"" << SvmlVariantNameStr << "\", ";
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << "ElementCount::getFixed(" << VL
+ << ")},\n";
+
+ // Emit the scalar intrinsic to svml function entry.
+ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f64" << "\", ";
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << "ElementCount::getFixed(" << VL
+ << ")},\n";
+
+ // Emit the finite math library function to svml function entry.
+ OS << "{\"__" << SvmlVariantNameStr << "_finite" << "\", ";
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", "
+ << "ElementCount::getFixed(" << VL << ")},\n";
+ }
+ }
+
+ OS << "#endif // GET_SVML_VARIANTS\n\n";
+}
+
+void SVMLVariantsEmitter::run(raw_ostream &OS) {
+ emitSVMLVariants(OS);
+}
+
+namespace llvm {
+
+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS) {
+ SVMLVariantsEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
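+
+// Consumers are expected to materialize the generated table by defining
+// GET_SVML_VARIANTS before including the TableGen output of the new -gen-svml
+// action, along these lines (the include path here is illustrative):
+//
+//   #define GET_SVML_VARIANTS
+//   #include "llvm/IR/SVML.inc"
+//   #undef GET_SVML_VARIANTS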
diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp
index efd641887..d31e144ff 100644
--- a/llvm/utils/TableGen/TableGen.cpp
+++ b/llvm/utils/TableGen/TableGen.cpp
@@ -58,6 +58,7 @@ enum ActionType {
GenDirectivesEnumDecl,
GenDirectivesEnumImpl,
GenDXILOperation,
+ GenSVMLVariants,
};
namespace llvm {
@@ -140,6 +141,8 @@ cl::opt<ActionType> Action(
"Generate directive related declaration code (header file)"),
clEnumValN(GenDirectivesEnumImpl, "gen-directive-impl",
"Generate directive related implementation code"),
+ clEnumValN(GenSVMLVariants, "gen-svml",
+ "Generate SVML variant function names"),
clEnumValN(GenDXILOperation, "gen-dxil-operation",
"Generate DXIL operation information")));
@@ -278,6 +281,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
case GenDXILOperation:
EmitDXILOperation(Records, OS);
break;
+ case GenSVMLVariants:
+ EmitSVMLVariants(Records, OS);
+ break;
}
return false;
diff --git a/llvm/utils/TableGen/TableGenBackends.h b/llvm/utils/TableGen/TableGenBackends.h
index 4dff13095..5d58000e7 100644
--- a/llvm/utils/TableGen/TableGenBackends.h
+++ b/llvm/utils/TableGen/TableGenBackends.h
@@ -94,6 +94,7 @@ void EmitAutomata(RecordKeeper &RK, raw_ostream &OS);
void EmitDirectivesDecl(RecordKeeper &RK, raw_ostream &OS);
void EmitDirectivesImpl(RecordKeeper &RK, raw_ostream &OS);
void EmitDXILOperation(RecordKeeper &RK, raw_ostream &OS);
+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS);
} // End llvm namespace
diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim
index 9185a029a..cae895ada 100644
--- a/llvm/utils/vim/syntax/llvm.vim
+++ b/llvm/utils/vim/syntax/llvm.vim
@@ -104,6 +104,7 @@ syn keyword llvmKeyword
\ inreg
\ intel_ocl_bicc
\ inteldialect
+ \ intel_svmlcc
\ internal
\ jumptable
\ linkonce
--
2.41.0