Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
numba
GitHub Repository: numba/llvmlite
Path: blob/main/conda-recipes/llvm15-svml.patch
1154 views
1
From dbe4ebac2a21366f986808b175f4145499ba9856 Mon Sep 17 00:00:00 2001
2
From: Siu Kwan Lam <[email protected]>
3
Date: Mon, 8 Apr 2024 11:02:09 -0500
4
Subject: [PATCH] llvm15-svml
5
6
---
7
.../include/llvm/Analysis/TargetLibraryInfo.h | 22 +-
8
llvm/include/llvm/AsmParser/LLToken.h | 3 +
9
llvm/include/llvm/IR/CMakeLists.txt | 4 +
10
llvm/include/llvm/IR/CallingConv.h | 5 +
11
llvm/include/llvm/IR/SVML.td | 62 +++
12
llvm/lib/Analysis/CMakeLists.txt | 1 +
13
llvm/lib/Analysis/TargetLibraryInfo.cpp | 55 +-
14
llvm/lib/AsmParser/LLLexer.cpp | 3 +
15
llvm/lib/AsmParser/LLParser.cpp | 6 +
16
llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 2 +-
17
llvm/lib/IR/AsmWriter.cpp | 3 +
18
llvm/lib/IR/Verifier.cpp | 3 +
19
llvm/lib/Target/X86/X86CallingConv.td | 70 +++
20
llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +-
21
llvm/lib/Target/X86/X86RegisterInfo.cpp | 46 ++
22
llvm/lib/Target/X86/X86Subtarget.h | 3 +
23
.../Transforms/Utils/InjectTLIMappings.cpp | 3 +-
24
.../Transforms/Vectorize/LoopVectorize.cpp | 270 +++++++++
25
.../Transforms/Vectorize/SLPVectorizer.cpp | 18 +-
26
.../Generic/replace-intrinsics-with-veclib.ll | 4 +-
27
.../LoopVectorize/X86/svml-calls-finite.ll | 24 +-
28
.../LoopVectorize/X86/svml-calls.ll | 108 ++--
29
.../LoopVectorize/X86/svml-legal-calls.ll | 513 ++++++++++++++++++
30
.../LoopVectorize/X86/svml-legal-codegen.ll | 61 +++
31
llvm/test/Transforms/Util/add-TLI-mappings.ll | 18 +-
32
llvm/utils/TableGen/CMakeLists.txt | 1 +
33
llvm/utils/TableGen/SVMLEmitter.cpp | 110 ++++
34
llvm/utils/TableGen/TableGen.cpp | 6 +
35
llvm/utils/TableGen/TableGenBackends.h | 1 +
36
llvm/utils/vim/syntax/llvm.vim | 1 +
37
30 files changed, 1359 insertions(+), 70 deletions(-)
38
create mode 100644 llvm/include/llvm/IR/SVML.td
39
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
40
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
41
create mode 100644 llvm/utils/TableGen/SVMLEmitter.cpp
42
43
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
44
index 7bfda0124..a2ce0d0f2 100644
45
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
46
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
47
@@ -40,6 +40,12 @@ struct VecDesc {
48
NotLibFunc
49
};
50
51
+enum SVMLAccuracy {
52
+ SVML_DEFAULT,
53
+ SVML_HA,
54
+ SVML_EP
55
+};
56
+
57
/// Implementation of the target library information.
58
///
59
/// This class constructs tables that hold the target library information and
60
@@ -158,7 +164,7 @@ public:
61
/// Return true if the function F has a vector equivalent with vectorization
62
/// factor VF.
63
bool isFunctionVectorizable(StringRef F, const ElementCount &VF) const {
64
- return !getVectorizedFunction(F, VF).empty();
65
+ return !getVectorizedFunction(F, VF, false).empty();
66
}
67
68
/// Return true if the function F has a vector equivalent with any
69
@@ -167,7 +173,10 @@ public:
70
71
/// Return the name of the equivalent of F, vectorized with factor VF. If no
72
/// such mapping exists, return the empty string.
73
- StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const;
74
+ std::string getVectorizedFunction(StringRef F, const ElementCount &VF, bool IsFast) const;
75
+
76
+ Optional<CallingConv::ID> getVectorizedFunctionCallingConv(
77
+ StringRef F, const FunctionType &FTy, const DataLayout &DL) const;
78
79
/// Set to true iff i32 parameters to library functions should have signext
80
/// or zeroext attributes if they correspond to C-level int or unsigned int,
81
@@ -334,8 +343,13 @@ public:
82
bool isFunctionVectorizable(StringRef F) const {
83
return Impl->isFunctionVectorizable(F);
84
}
85
- StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const {
86
- return Impl->getVectorizedFunction(F, VF);
87
+ std::string getVectorizedFunction(StringRef F, const ElementCount &VF, bool IsFast) const {
88
+ return Impl->getVectorizedFunction(F, VF, IsFast);
89
+ }
90
+
91
+ Optional<CallingConv::ID> getVectorizedFunctionCallingConv(
92
+ StringRef F, const FunctionType &FTy, const DataLayout &DL) const {
93
+ return Impl->getVectorizedFunctionCallingConv(F, FTy, DL);
94
}
95
96
/// Tests if the function is both available and a candidate for optimized code
97
diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
98
index 04235f0fd..ca552efcd 100644
99
--- a/llvm/include/llvm/AsmParser/LLToken.h
100
+++ b/llvm/include/llvm/AsmParser/LLToken.h
101
@@ -130,6 +130,9 @@ enum Kind {
102
kw_fastcc,
103
kw_coldcc,
104
kw_intel_ocl_bicc,
105
+ kw_intel_svmlcc128,
106
+ kw_intel_svmlcc256,
107
+ kw_intel_svmlcc512,
108
kw_cfguard_checkcc,
109
kw_x86_stdcallcc,
110
kw_x86_fastcallcc,
111
diff --git a/llvm/include/llvm/IR/CMakeLists.txt b/llvm/include/llvm/IR/CMakeLists.txt
112
index 5151f9125..3c263a5d3 100644
113
--- a/llvm/include/llvm/IR/CMakeLists.txt
114
+++ b/llvm/include/llvm/IR/CMakeLists.txt
115
@@ -22,3 +22,7 @@ tablegen(LLVM IntrinsicsX86.h -gen-intrinsic-enums -intrinsic-prefix=x86)
116
tablegen(LLVM IntrinsicsXCore.h -gen-intrinsic-enums -intrinsic-prefix=xcore)
117
tablegen(LLVM IntrinsicsVE.h -gen-intrinsic-enums -intrinsic-prefix=ve)
118
add_public_tablegen_target(intrinsics_gen)
119
+
120
+set(LLVM_TARGET_DEFINITIONS SVML.td)
121
+tablegen(LLVM SVML.inc -gen-svml)
122
+add_public_tablegen_target(svml_gen)
123
diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h
124
index fd2854246..096eea1a8 100644
125
--- a/llvm/include/llvm/IR/CallingConv.h
126
+++ b/llvm/include/llvm/IR/CallingConv.h
127
@@ -252,6 +252,11 @@ namespace CallingConv {
128
/// M68k_INTR - Calling convention used for M68k interrupt routines.
129
M68k_INTR = 101,
130
131
+ /// Intel_SVML - Calling conventions for Intel Short Vector Math Library
132
+ Intel_SVML128 = 102,
133
+ Intel_SVML256 = 103,
134
+ Intel_SVML512 = 104,
135
+
136
/// The highest possible calling convention ID. Must be some 2^k - 1.
137
MaxID = 1023
138
};
139
diff --git a/llvm/include/llvm/IR/SVML.td b/llvm/include/llvm/IR/SVML.td
140
new file mode 100644
141
index 000000000..5af710404
142
--- /dev/null
143
+++ b/llvm/include/llvm/IR/SVML.td
144
@@ -0,0 +1,62 @@
145
+//===-- SVML.td - Defines SVML call variants ---------------*- tablegen -*-===//
146
+//
147
+// The LLVM Compiler Infrastructure
148
+//
149
+// This file is distributed under the University of Illinois Open Source
150
+// License. See LICENSE.TXT for details.
151
+//
152
+//===----------------------------------------------------------------------===//
153
+//
154
+// This file is used by TableGen to define the different types of SVML function
155
+// variants used with -fveclib=SVML.
156
+//
157
+//===----------------------------------------------------------------------===//
158
+
159
+class SvmlVariant;
160
+
161
+def sin : SvmlVariant;
162
+def cos : SvmlVariant;
163
+def pow : SvmlVariant;
164
+def exp : SvmlVariant;
165
+def log : SvmlVariant;
166
+def acos : SvmlVariant;
167
+def acosh : SvmlVariant;
168
+def asin : SvmlVariant;
169
+def asinh : SvmlVariant;
170
+def atan2 : SvmlVariant;
171
+def atan : SvmlVariant;
172
+def atanh : SvmlVariant;
173
+def cbrt : SvmlVariant;
174
+def cdfnorm : SvmlVariant;
175
+def cdfnorminv : SvmlVariant;
176
+def cosd : SvmlVariant;
177
+def cosh : SvmlVariant;
178
+def erf : SvmlVariant;
179
+def erfc : SvmlVariant;
180
+def erfcinv : SvmlVariant;
181
+def erfinv : SvmlVariant;
182
+def exp10 : SvmlVariant;
183
+def exp2 : SvmlVariant;
184
+def expm1 : SvmlVariant;
185
+def hypot : SvmlVariant;
186
+def invsqrt : SvmlVariant;
187
+def log10 : SvmlVariant;
188
+def log1p : SvmlVariant;
189
+def log2 : SvmlVariant;
190
+def sind : SvmlVariant;
191
+def sinh : SvmlVariant;
192
+def sqrt : SvmlVariant;
193
+def tan : SvmlVariant;
194
+def tanh : SvmlVariant;
195
+
196
+// TODO: SVML does not currently provide _ha and _ep variants of these functions.
197
+// We should call the default variant of these functions in all cases instead.
198
+
199
+// def nearbyint : SvmlVariant;
200
+// def logb : SvmlVariant;
201
+// def floor : SvmlVariant;
202
+// def fmod : SvmlVariant;
203
+// def ceil : SvmlVariant;
204
+// def trunc : SvmlVariant;
205
+// def rint : SvmlVariant;
206
+// def round : SvmlVariant;
207
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
208
index e59725c99..89af7f5d9 100644
209
--- a/llvm/lib/Analysis/CMakeLists.txt
210
+++ b/llvm/lib/Analysis/CMakeLists.txt
211
@@ -149,6 +149,7 @@ add_llvm_component_library(LLVMAnalysis
212
DEPENDS
213
intrinsics_gen
214
${MLDeps}
215
+ svml_gen
216
217
LINK_LIBS
218
${MLLinkDeps}
219
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
220
index 8ebdb65e8..eb3009593 100644
221
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
222
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
223
@@ -110,6 +110,11 @@ bool TargetLibraryInfoImpl::isCallingConvCCompatible(Function *F) {
224
F->getFunctionType());
225
}
226
227
+static std::string svmlMangle(StringRef FnName, const bool IsFast) {
228
+ std::string FullName = FnName.str();
229
+ return IsFast ? FullName : FullName + "_ha";
230
+}
231
+
232
/// Initialize the set of available library functions based on the specified
233
/// target triple. This should be carefully written so that a missing target
234
/// triple gets a sane set of defaults.
235
@@ -1878,8 +1883,9 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
236
}
237
case SVML: {
238
const VecDesc VecFuncs[] = {
239
- #define TLI_DEFINE_SVML_VECFUNCS
240
- #include "llvm/Analysis/VecFuncs.def"
241
+ #define GET_SVML_VARIANTS
242
+ #include "llvm/IR/SVML.inc"
243
+ #undef GET_SVML_VARIANTS
244
};
245
addVectorizableFunctions(VecFuncs);
246
break;
247
@@ -1899,20 +1905,51 @@ bool TargetLibraryInfoImpl::isFunctionVectorizable(StringRef funcName) const {
248
return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName;
249
}
250
251
-StringRef
252
-TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
253
- const ElementCount &VF) const {
254
+std::string TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
255
+ const ElementCount &VF,
256
+ bool IsFast) const {
257
+ bool FromSVML = ClVectorLibrary == SVML;
258
F = sanitizeFunctionName(F);
259
if (F.empty())
260
- return F;
261
+ return F.str();
262
std::vector<VecDesc>::const_iterator I =
263
llvm::lower_bound(VectorDescs, F, compareWithScalarFnName);
264
while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) {
265
- if (I->VectorizationFactor == VF)
266
- return I->VectorFnName;
267
+ if (I->VectorizationFactor == VF) {
268
+ if (FromSVML) {
269
+ return svmlMangle(I->VectorFnName, IsFast);
270
+ }
271
+ return I->VectorFnName.str();
272
+ }
273
++I;
274
}
275
- return StringRef();
276
+ return std::string();
277
+}
278
+
279
+static CallingConv::ID getSVMLCallingConv(const DataLayout &DL, const FunctionType &FType)
280
+{
281
+ assert(isa<VectorType>(FType.getReturnType()));
282
+ auto *VecCallRetType = cast<VectorType>(FType.getReturnType());
283
+ auto TypeBitWidth = DL.getTypeSizeInBits(VecCallRetType);
284
+ if (TypeBitWidth == 128) {
285
+ return CallingConv::Intel_SVML128;
286
+ } else if (TypeBitWidth == 256) {
287
+ return CallingConv::Intel_SVML256;
288
+ } else if (TypeBitWidth == 512) {
289
+ return CallingConv::Intel_SVML512;
290
+ } else {
291
+ llvm_unreachable("Invalid vector width");
292
+ }
293
+ return 0; // not reachable
294
+}
295
+
296
+Optional<CallingConv::ID>
297
+TargetLibraryInfoImpl::getVectorizedFunctionCallingConv(
298
+ StringRef F, const FunctionType &FTy, const DataLayout &DL) const {
299
+ if (F.startswith("__svml")) {
300
+ return getSVMLCallingConv(DL, FTy);
301
+ }
302
+ return {};
303
}
304
305
TargetLibraryInfo TargetLibraryAnalysis::run(const Function &F,
306
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
307
index c9a982693..40e89fe57 100644
308
--- a/llvm/lib/AsmParser/LLLexer.cpp
309
+++ b/llvm/lib/AsmParser/LLLexer.cpp
310
@@ -605,6 +605,9 @@ lltok::Kind LLLexer::LexIdentifier() {
311
KEYWORD(spir_kernel);
312
KEYWORD(spir_func);
313
KEYWORD(intel_ocl_bicc);
314
+ KEYWORD(intel_svmlcc128);
315
+ KEYWORD(intel_svmlcc256);
316
+ KEYWORD(intel_svmlcc512);
317
KEYWORD(x86_64_sysvcc);
318
KEYWORD(win64cc);
319
KEYWORD(x86_regcallcc);
320
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
321
index fd502eded..8bf9c50be 100644
322
--- a/llvm/lib/AsmParser/LLParser.cpp
323
+++ b/llvm/lib/AsmParser/LLParser.cpp
324
@@ -1864,6 +1864,9 @@ void LLParser::parseOptionalDLLStorageClass(unsigned &Res) {
325
/// ::= 'ccc'
326
/// ::= 'fastcc'
327
/// ::= 'intel_ocl_bicc'
328
+/// ::= 'intel_svmlcc128'
329
+/// ::= 'intel_svmlcc256'
330
+/// ::= 'intel_svmlcc512'
331
/// ::= 'coldcc'
332
/// ::= 'cfguard_checkcc'
333
/// ::= 'x86_stdcallcc'
334
@@ -1933,6 +1936,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
335
case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break;
336
case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break;
337
case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
338
+ case lltok::kw_intel_svmlcc128:CC = CallingConv::Intel_SVML128; break;
339
+ case lltok::kw_intel_svmlcc256:CC = CallingConv::Intel_SVML256; break;
340
+ case lltok::kw_intel_svmlcc512:CC = CallingConv::Intel_SVML512; break;
341
case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break;
342
case lltok::kw_win64cc: CC = CallingConv::Win64; break;
343
case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break;
344
diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
345
index 87b8ac59b..5c02e237c 100644
346
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
347
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
348
@@ -156,7 +156,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
349
// and the exact vector width of the call operands in the
350
// TargetLibraryInfo.
351
const std::string TLIName =
352
- std::string(TLI.getVectorizedFunction(ScalarName, VF));
353
+ std::string(TLI.getVectorizedFunction(ScalarName, VF, CI.getFastMathFlags().isFast()));
354
355
LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `"
356
<< ScalarName << "` and vector width " << VF << ".\n");
357
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
358
index a29040b8c..d7a7b4e3f 100644
359
--- a/llvm/lib/IR/AsmWriter.cpp
360
+++ b/llvm/lib/IR/AsmWriter.cpp
361
@@ -304,6 +304,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
362
case CallingConv::X86_RegCall: Out << "x86_regcallcc"; break;
363
case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break;
364
case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break;
365
+ case CallingConv::Intel_SVML128: Out << "intel_svmlcc128"; break;
366
+ case CallingConv::Intel_SVML256: Out << "intel_svmlcc256"; break;
367
+ case CallingConv::Intel_SVML512: Out << "intel_svmlcc512"; break;
368
case CallingConv::ARM_APCS: Out << "arm_apcscc"; break;
369
case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break;
370
case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break;
371
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
372
index e3ea256af..1a3c50111 100644
373
--- a/llvm/lib/IR/Verifier.cpp
374
+++ b/llvm/lib/IR/Verifier.cpp
375
@@ -2527,6 +2527,9 @@ void Verifier::visitFunction(const Function &F) {
376
case CallingConv::Fast:
377
case CallingConv::Cold:
378
case CallingConv::Intel_OCL_BI:
379
+ case CallingConv::Intel_SVML128:
380
+ case CallingConv::Intel_SVML256:
381
+ case CallingConv::Intel_SVML512:
382
case CallingConv::PTX_Kernel:
383
case CallingConv::PTX_Device:
384
Check(!F.isVarArg(),
385
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
386
index 4dd8a6cdd..12e655212 100644
387
--- a/llvm/lib/Target/X86/X86CallingConv.td
388
+++ b/llvm/lib/Target/X86/X86CallingConv.td
389
@@ -498,6 +498,21 @@ def RetCC_X86_64 : CallingConv<[
390
CCDelegateTo<RetCC_X86_64_C>
391
]>;
392
393
+// Intel_SVML return-value convention.
394
+def RetCC_Intel_SVML : CallingConv<[
395
+ // Vector types are returned in XMM0,XMM1
396
+ CCIfType<[v4f32, v2f64],
397
+ CCAssignToReg<[XMM0,XMM1]>>,
398
+
399
+ // 256-bit FP vectors
400
+ CCIfType<[v8f32, v4f64],
401
+ CCAssignToReg<[YMM0,YMM1]>>,
402
+
403
+ // 512-bit FP vectors
404
+ CCIfType<[v16f32, v8f64],
405
+ CCAssignToReg<[ZMM0,ZMM1]>>
406
+]>;
407
+
408
// This is the return-value convention used for the entire X86 backend.
409
let Entry = 1 in
410
def RetCC_X86 : CallingConv<[
411
@@ -505,6 +520,10 @@ def RetCC_X86 : CallingConv<[
412
// Check if this is the Intel OpenCL built-ins calling convention
413
CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
414
415
+ CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo<RetCC_Intel_SVML>>,
416
+ CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo<RetCC_Intel_SVML>>,
417
+ CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo<RetCC_Intel_SVML>>,
418
+
419
CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
420
CCDelegateTo<RetCC_X86_32>
421
]>;
422
@@ -1064,6 +1083,30 @@ def CC_Intel_OCL_BI : CallingConv<[
423
CCDelegateTo<CC_X86_32_C>
424
]>;
425
426
+// X86-64 Intel Short Vector Math Library calling convention.
427
+def CC_Intel_SVML : CallingConv<[
428
+
429
+ // The SSE vector arguments are passed in XMM registers.
430
+ CCIfType<[v4f32, v2f64],
431
+ CCAssignToReg<[XMM0, XMM1, XMM2]>>,
432
+
433
+ // The 256-bit vector arguments are passed in YMM registers.
434
+ CCIfType<[v8f32, v4f64],
435
+ CCAssignToReg<[YMM0, YMM1, YMM2]>>,
436
+
437
+ // The 512-bit vector arguments are passed in ZMM registers.
438
+ CCIfType<[v16f32, v8f64],
439
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>
440
+]>;
441
+
442
+def CC_X86_32_Intr : CallingConv<[
443
+ CCAssignToStack<4, 4>
444
+]>;
445
+
446
+def CC_X86_64_Intr : CallingConv<[
447
+ CCAssignToStack<8, 8>
448
+]>;
449
+
450
//===----------------------------------------------------------------------===//
451
// X86 Root Argument Calling Conventions
452
//===----------------------------------------------------------------------===//
453
@@ -1115,6 +1158,9 @@ def CC_X86_64 : CallingConv<[
454
let Entry = 1 in
455
def CC_X86 : CallingConv<[
456
CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
457
+ CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo<CC_Intel_SVML>>,
458
+ CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo<CC_Intel_SVML>>,
459
+ CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo<CC_Intel_SVML>>,
460
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
461
CCDelegateTo<CC_X86_32>
462
]>;
463
@@ -1227,3 +1273,27 @@ def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
464
(sequence "R%u", 12, 15))>;
465
def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,
466
(sequence "XMM%u", 8, 15))>;
467
+
468
+// SVML calling convention
469
+def CSR_32_Intel_SVML : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE)>;
470
+def CSR_32_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_32_Intel_SVML,
471
+ K4, K5, K6, K7)>;
472
+
473
+def CSR_64_Intel_SVML_NoSSE : CalleeSavedRegs<(add RBX, RSI, RDI, RBP, RSP, R12, R13, R14, R15)>;
474
+
475
+def CSR_64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
476
+ (sequence "XMM%u", 8, 15))>;
477
+def CSR_Win64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
478
+ (sequence "XMM%u", 6, 15))>;
479
+
480
+def CSR_64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
481
+ (sequence "YMM%u", 8, 15))>;
482
+def CSR_Win64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
483
+ (sequence "YMM%u", 6, 15))>;
484
+
485
+def CSR_64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
486
+ (sequence "ZMM%u", 16, 31),
487
+ K4, K5, K6, K7)>;
488
+def CSR_Win64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
489
+ (sequence "ZMM%u", 6, 21),
490
+ K4, K5, K6, K7)>;
491
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
492
index cd45c4825..0ad88eac1 100644
493
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
494
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
495
@@ -3966,7 +3966,8 @@ void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
496
// FIXME: Only some x86_32 calling conventions support AVX512.
497
if (Subtarget.useAVX512Regs() &&
498
(is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
499
- CallConv == CallingConv::Intel_OCL_BI)))
500
+ CallConv == CallingConv::Intel_OCL_BI ||
501
+ CallConv == CallingConv::Intel_SVML512)))
502
VecVT = MVT::v16f32;
503
else if (Subtarget.hasAVX())
504
VecVT = MVT::v8f32;
505
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
506
index f2658f704..b2f4bb2dd 100644
507
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
508
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
509
@@ -274,6 +274,42 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
510
}
511
}
512
513
+namespace {
514
+std::pair<const uint32_t *, const MCPhysReg *> getSVMLRegMaskAndSaveList(
515
+ bool Is64Bit, bool IsWin64, CallingConv::ID CC) {
516
+ assert(CC >= CallingConv::Intel_SVML128 && CC <= CallingConv::Intel_SVML512);
517
+ unsigned Abi = CC - CallingConv::Intel_SVML128 ; // 0 - 128, 1 - 256, 2 - 512
518
+
519
+ const std::pair<const uint32_t *, const MCPhysReg *> Abi64[] = {
520
+ std::make_pair(CSR_64_Intel_SVML_RegMask, CSR_64_Intel_SVML_SaveList),
521
+ std::make_pair(CSR_64_Intel_SVML_AVX_RegMask, CSR_64_Intel_SVML_AVX_SaveList),
522
+ std::make_pair(CSR_64_Intel_SVML_AVX512_RegMask, CSR_64_Intel_SVML_AVX512_SaveList),
523
+ };
524
+
525
+ const std::pair<const uint32_t *, const MCPhysReg *> AbiWin64[] = {
526
+ std::make_pair(CSR_Win64_Intel_SVML_RegMask, CSR_Win64_Intel_SVML_SaveList),
527
+ std::make_pair(CSR_Win64_Intel_SVML_AVX_RegMask, CSR_Win64_Intel_SVML_AVX_SaveList),
528
+ std::make_pair(CSR_Win64_Intel_SVML_AVX512_RegMask, CSR_Win64_Intel_SVML_AVX512_SaveList),
529
+ };
530
+
531
+ const std::pair<const uint32_t *, const MCPhysReg *> Abi32[] = {
532
+ std::make_pair(CSR_32_Intel_SVML_RegMask, CSR_32_Intel_SVML_SaveList),
533
+ std::make_pair(CSR_32_Intel_SVML_RegMask, CSR_32_Intel_SVML_SaveList),
534
+ std::make_pair(CSR_32_Intel_SVML_AVX512_RegMask, CSR_32_Intel_SVML_AVX512_SaveList),
535
+ };
536
+
537
+ if (Is64Bit) {
538
+ if (IsWin64) {
539
+ return AbiWin64[Abi];
540
+ } else {
541
+ return Abi64[Abi];
542
+ }
543
+ } else {
544
+ return Abi32[Abi];
545
+ }
546
+}
547
+}
548
+
549
const MCPhysReg *
550
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
551
assert(MF && "MachineFunction required");
552
@@ -329,6 +365,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
553
return CSR_64_Intel_OCL_BI_SaveList;
554
break;
555
}
556
+ case CallingConv::Intel_SVML128:
557
+ case CallingConv::Intel_SVML256:
558
+ case CallingConv::Intel_SVML512: {
559
+ return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).second;
560
+ }
561
case CallingConv::HHVM:
562
return CSR_64_HHVM_SaveList;
563
case CallingConv::X86_RegCall:
564
@@ -451,6 +492,11 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
565
return CSR_64_Intel_OCL_BI_RegMask;
566
break;
567
}
568
+ case CallingConv::Intel_SVML128:
569
+ case CallingConv::Intel_SVML256:
570
+ case CallingConv::Intel_SVML512: {
571
+ return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).first;
572
+ }
573
case CallingConv::HHVM:
574
return CSR_64_HHVM_RegMask;
575
case CallingConv::X86_RegCall:
576
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
577
index 09a8b1f1a..6863cf8b6 100644
578
--- a/llvm/lib/Target/X86/X86Subtarget.h
579
+++ b/llvm/lib/Target/X86/X86Subtarget.h
580
@@ -337,6 +337,9 @@ public:
581
case CallingConv::X86_ThisCall:
582
case CallingConv::X86_VectorCall:
583
case CallingConv::Intel_OCL_BI:
584
+ case CallingConv::Intel_SVML128:
585
+ case CallingConv::Intel_SVML256:
586
+ case CallingConv::Intel_SVML512:
587
return isTargetWin64();
588
// This convention allows using the Win64 convention on other targets.
589
case CallingConv::Win64:
590
diff --git a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
591
index 55bcb6f3b..230b3c01a 100644
592
--- a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
593
+++ b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
594
@@ -19,6 +19,7 @@
595
#include "llvm/Analysis/TargetLibraryInfo.h"
596
#include "llvm/Analysis/VectorUtils.h"
597
#include "llvm/IR/InstIterator.h"
598
+#include "llvm/IR/FMF.h"
599
#include "llvm/Transforms/Utils.h"
600
#include "llvm/Transforms/Utils/ModuleUtils.h"
601
602
@@ -91,7 +92,7 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
603
604
auto AddVariantDecl = [&](const ElementCount &VF) {
605
const std::string TLIName =
606
- std::string(TLI.getVectorizedFunction(ScalarName, VF));
607
+ std::string(TLI.getVectorizedFunction(ScalarName, VF, CI.getFastMathFlags().isFast()));
608
if (!TLIName.empty()) {
609
std::string MangledName =
610
VFABI::mangleTLIVectorName(TLIName, ScalarName, CI.arg_size(), VF);
611
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
612
index 5fd4e45d8..8b8c127d5 100644
613
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
614
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
615
@@ -629,6 +629,27 @@ protected:
616
virtual void printDebugTracesAtStart(){};
617
virtual void printDebugTracesAtEnd(){};
618
619
+ /// Check legality of given SVML call instruction \p VecCall generated for
620
+ /// scalar call \p Call. If illegal then the appropriate legal instruction
621
+ /// is returned.
622
+ Value *legalizeSVMLCall(CallInst *VecCall, CallInst *Call);
623
+
624
+ /// Returns the legal VF for a call instruction \p CI using TTI information
625
+ /// and vector type.
626
+ ElementCount getLegalVFForCall(CallInst *CI);
627
+
628
+ /// Partially vectorize a given call \p Call by breaking it down into multiple
629
+ /// calls of \p LegalCall, decided by the variant VF \p LegalVF.
630
+ Value *partialVectorizeCall(CallInst *Call, CallInst *LegalCall,
631
+ unsigned LegalVF);
632
+
633
+ /// Generate shufflevector instruction for a vector value \p V based on the
634
+ /// current \p Part and a smaller VF \p LegalVF.
635
+ Value *generateShuffleValue(Value *V, unsigned LegalVF, unsigned Part);
636
+
637
+ /// Combine partially vectorized calls stored in \p CallResults.
638
+ Value *combinePartialVecCalls(SmallVectorImpl<Value *> &CallResults);
639
+
640
/// The original loop.
641
Loop *OrigLoop;
642
643
@@ -4170,6 +4191,17 @@ bool InnerLoopVectorizer::useOrderedReductions(
644
return Cost->useOrderedReductions(RdxDesc);
645
}
646
647
+static void setVectorFunctionCallingConv(CallInst &CI, const DataLayout &DL,
648
+ const TargetLibraryInfo &TLI) {
649
+ Function *VectorF = CI.getCalledFunction();
650
+ FunctionType *FTy = VectorF->getFunctionType();
651
+ StringRef VFName = VectorF->getName();
652
+ auto CC = TLI.getVectorizedFunctionCallingConv(VFName, *FTy, DL);
653
+ if (CC) {
654
+ CI.setCallingConv(*CC);
655
+ }
656
+}
657
+
658
void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
659
VPUser &ArgOperands,
660
VPTransformState &State) {
661
@@ -4237,11 +4269,249 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
662
if (isa<FPMathOperator>(V))
663
V->copyFastMathFlags(&CI);
664
665
+ const DataLayout &DL = V->getModule()->getDataLayout();
666
+ setVectorFunctionCallingConv(*V, DL, *TLI);
667
+
668
+ // Perform legalization of SVML call instruction only if original call
669
+ // was not Intrinsic
670
+ if (!UseVectorIntrinsic &&
671
+ (V->getCalledFunction()->getName()).startswith("__svml")) {
672
+ // assert((V->getCalledFunction()->getName()).startswith("__svml"));
673
+ LLVM_DEBUG(dbgs() << "LV(SVML): Vector call inst:"; V->dump());
674
+ auto *LegalV = cast<Instruction>(legalizeSVMLCall(V, &CI));
675
+ LLVM_DEBUG(dbgs() << "LV: Completed SVML legalization.\n LegalV: ";
676
+ LegalV->dump());
677
+ State.set(Def, LegalV, Part);
678
+ State.addMetadata(LegalV, &CI);
679
+ } else {
680
State.set(Def, V, Part);
681
State.addMetadata(V, &CI);
682
+ }
683
}
684
}
685
686
+
687
+//===----------------------------------------------------------------------===//
688
+// Implementation of functions for SVML vector call legalization.
689
+//===----------------------------------------------------------------------===//
690
+//
691
+// Unlike other VECLIBs, SVML needs to be used with target-legal
692
+// vector types. Otherwise, link failures and/or runtime failures
693
+// will occur. A motivating example could be -
694
+//
695
+// double *a;
696
+// float *b;
697
+// #pragma clang loop vectorize_width(8)
698
+// for(i = 0; i < N; ++i) {
699
+// a[i] = sin(i); // Legal SVML VF must be 4 or below on AVX
700
+// b[i] = cosf(i); // VF can be 8 on AVX since 8 floats can fit in YMM
701
+// }
702
+//
703
+// Current implementation of vector code generation in LV is
704
+// driven based on a single VF (in InnerLoopVectorizer::VF). This
705
+// inhibits the flexibility of adjusting/choosing different VF
706
+// for different instructions.
707
+//
708
+// Due to this limitation it is much more straightforward to
709
+// first generate the illegal sin8 (svml_sin8 for SVML vector
710
+// library) call and then legalize it than trying to avoid
711
+// generating illegal code from the beginning.
712
+//
713
+// A solution for this problem is to check legality of the
714
+// call instruction right after generating it in vectorizer and
715
+// if it is illegal we split the call arguments and issue multiple
716
+// calls to match the legal VF. This is demonstrated currently for
717
+// the SVML vector library calls (non-intrinsic version only).
718
+//
719
+// Future directions and extensions:
720
+// 1) This legalization example shows us that a good direction
721
+// for the VPlan framework would be to model the vector call
722
+// instructions in a way that legal VF for each call is chosen
723
+// correctly within vectorizer and illegal code generation is
724
+// avoided.
725
+// 2) This logic can also be extended to general vector functions
726
+// i.e. legalization of OpenMP declare simd functions. The
727
+// requirements needed for this will be documented soon.
728
+
729
+Value *InnerLoopVectorizer::legalizeSVMLCall(CallInst *VecCall,
730
+ CallInst *Call) {
731
+ ElementCount LegalVF = getLegalVFForCall(VecCall);
732
+
733
+ assert(LegalVF.getKnownMinValue() > 1 &&
734
+ "Legal VF for SVML call must be greater than 1 to vectorize");
735
+
736
+ if (LegalVF == VF)
737
+ return VecCall;
738
+ else if (LegalVF.getKnownMinValue() > VF.getKnownMinValue())
739
+ // TODO: handle case when we are underfilling vectors
740
+ return VecCall;
741
+
742
+ // Legal VF for this SVML call is smaller than chosen VF, break it down into
743
+ // smaller call instructions
744
+
745
+ // Convert args, types and return type to match legal VF
746
+ SmallVector<Type *, 4> NewTys;
747
+ SmallVector<Value *, 4> NewArgs;
748
+
749
+ for (Value *ArgOperand : Call->args()) {
750
+ Type *Ty = ToVectorTy(ArgOperand->getType(), LegalVF);
751
+ NewTys.push_back(Ty);
752
+ NewArgs.push_back(UndefValue::get(Ty));
753
+ }
754
+
755
+ // Construct legal vector function
756
+ const VFShape Shape =
757
+ VFShape::get(*Call, LegalVF /*EC*/, false /*HasGlobalPred*/);
758
+ Function *LegalVectorF = VFDatabase(*Call).getVectorizedFunction(Shape);
759
+ assert(LegalVectorF != nullptr && "Can't create legal vector function.");
760
+
761
+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalVectorF: "; LegalVectorF->dump());
762
+
763
+ SmallVector<OperandBundleDef, 1> OpBundles;
764
+ Call->getOperandBundlesAsDefs(OpBundles);
765
+ auto LegalV = std::unique_ptr<CallInst>(CallInst::Create(LegalVectorF, NewArgs, OpBundles));
766
+
767
+ if (isa<FPMathOperator>(LegalV))
768
+ LegalV->copyFastMathFlags(Call);
769
+
770
+ const DataLayout &DL = VecCall->getModule()->getDataLayout();
771
+ // Set SVML calling conventions
772
+ setVectorFunctionCallingConv(*LegalV, DL, *TLI);
773
+
774
+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalV: "; LegalV->dump());
775
+
776
+ Value *LegalizedCall = partialVectorizeCall(VecCall, LegalV.get(), LegalVF.getKnownMinValue());
777
+
778
+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalizedCall: "; LegalizedCall->dump());
779
+
780
+ // Remove the illegal call from Builder
781
+ VecCall->eraseFromParent();
782
+
783
+ return LegalizedCall;
784
+}
785
+
786
+ElementCount InnerLoopVectorizer::getLegalVFForCall(CallInst *CI) {
787
+ const DataLayout DL = CI->getModule()->getDataLayout();
788
+ FunctionType *CallFT = CI->getFunctionType();
789
+ // All functions that need legalization should have a vector return type.
790
+ // This is true for all SVML functions that are currently supported.
791
+ assert(isa<VectorType>(CallFT->getReturnType()) &&
792
+ "Return type of call that needs legalization is not a vector.");
793
+ auto *VecCallRetType = cast<VectorType>(CallFT->getReturnType());
794
+ Type *ElemType = VecCallRetType->getElementType();
795
+
796
+ unsigned TypeBitWidth = DL.getTypeSizeInBits(ElemType);
797
+ unsigned VectorBitWidth = TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
798
+ unsigned LegalVF = VectorBitWidth / TypeBitWidth;
799
+
800
+ LLVM_DEBUG(dbgs() << "LV(SVML): Type Bit Width: " << TypeBitWidth << "\n");
801
+ LLVM_DEBUG(dbgs() << "LV(SVML): Current VL: " << VF << "\n");
802
+ LLVM_DEBUG(dbgs() << "LV(SVML): Vector Bit Width: " << VectorBitWidth
803
+ << "\n");
804
+ LLVM_DEBUG(dbgs() << "LV(SVML): Legal Target VL: " << LegalVF << "\n");
805
+
806
+ return ElementCount::getFixed(LegalVF);
807
+}
808
+
809
+// Partial vectorization of a call instruction is achieved by making clones of
810
+// \p LegalCall and overwriting its argument operands with shufflevector
811
+// equivalent decided based on \p LegalVF and current Part being filled.
812
+Value *InnerLoopVectorizer::partialVectorizeCall(CallInst *Call,
813
+ CallInst *LegalCall,
814
+ unsigned LegalVF) {
815
+ unsigned NumParts = VF.getKnownMinValue() / LegalVF;
816
+ LLVM_DEBUG(dbgs() << "LV(SVML): NumParts: " << NumParts << "\n");
817
+ SmallVector<Value *, 8> CallResults;
818
+
819
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
820
+ auto *ClonedCall = cast<CallInst>(LegalCall->clone());
821
+
822
+ // Update the arg operand of cloned call to shufflevector
823
+ for (unsigned i = 0, ie = Call->arg_size(); i != ie; ++i) {
824
+ auto *NewOp = generateShuffleValue(Call->getArgOperand(i), LegalVF, Part);
825
+ ClonedCall->setArgOperand(i, NewOp);
826
+ }
827
+
828
+ LLVM_DEBUG(dbgs() << "LV(SVML): ClonedCall: "; ClonedCall->dump());
829
+
830
+ auto *PartialVecCall = Builder.Insert(ClonedCall);
831
+ CallResults.push_back(PartialVecCall);
832
+ }
833
+
834
+ return combinePartialVecCalls(CallResults);
835
+}
836
+
837
+Value *InnerLoopVectorizer::generateShuffleValue(Value *V, unsigned LegalVF,
838
+ unsigned Part) {
839
+ // Example:
840
+ // Consider the following vector code -
841
+ // %1 = sitofp <4 x i32> %0 to <4 x double>
842
+ // %2 = call <4 x double> @__svml_sin4(<4 x double> %1)
843
+ //
844
+ // If the LegalVF is 2, we partially vectorize the sin4 call by invoking
845
+ // generateShuffleValue on the operand %1
846
+ // If Part = 0, output value is -
846
+ // %shuffle = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32><i32 0, i32 1>
847
+ // and if Part = 1, output is -
848
+ // %shuffle7 = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32><i32 2, i32 3>
850
+
851
+ assert(isa<VectorType>(V->getType()) &&
852
+ "Cannot generate shuffles for non-vector values.");
853
+ SmallVector<int, 4> ShuffleMask;
854
+ Value *Undef = UndefValue::get(V->getType());
855
+
856
+ unsigned ElemIdx = Part * LegalVF;
857
+
858
+ for (unsigned K = 0; K < LegalVF; K++)
859
+ ShuffleMask.push_back(static_cast<int>(ElemIdx + K));
860
+
861
+ auto *ShuffleInst =
862
+ Builder.CreateShuffleVector(V, Undef, ShuffleMask, "shuffle");
863
+
864
+ return ShuffleInst;
865
+}
866
+
867
+// Results of the calls executed by smaller legal call instructions must be
868
+// combined to match the original VF for later use. This is done by constructing
869
+// shufflevector instructions in a cumulative fashion.
870
+Value *InnerLoopVectorizer::combinePartialVecCalls(
871
+ SmallVectorImpl<Value *> &CallResults) {
872
+ assert(isa<VectorType>(CallResults[0]->getType()) &&
873
+ "Cannot combine calls with non-vector results.");
874
+ auto *CallType = cast<VectorType>(CallResults[0]->getType());
875
+
876
+ Value *CombinedShuffle;
877
+ unsigned NumElems = CallType->getElementCount().getKnownMinValue() * 2;
878
+ unsigned NumRegs = CallResults.size();
879
+
880
+ assert(NumRegs >= 2 && isPowerOf2_32(NumRegs) &&
881
+ "Number of partial vector calls to combine must be a power of 2 "
882
+ "(atleast 2^1)");
883
+
884
+ while (NumRegs > 1) {
885
+ for (unsigned I = 0; I < NumRegs; I += 2) {
886
+ SmallVector<int, 4> ShuffleMask;
887
+ for (unsigned J = 0; J < NumElems; J++)
888
+ ShuffleMask.push_back(static_cast<int>(J));
889
+
890
+ CombinedShuffle = Builder.CreateShuffleVector(
891
+ CallResults[I], CallResults[I + 1], ShuffleMask, "combined");
892
+ LLVM_DEBUG(dbgs() << "LV(SVML): CombinedShuffle:";
893
+ CombinedShuffle->dump());
894
+ CallResults.push_back(CombinedShuffle);
895
+ }
896
+
897
+ SmallVector<Value *, 2>::iterator Start = CallResults.begin();
898
+ SmallVector<Value *, 2>::iterator End = Start + NumRegs;
899
+ CallResults.erase(Start, End);
900
+
901
+ NumElems *= 2;
902
+ NumRegs /= 2;
903
+ }
904
+
905
+ return CombinedShuffle;
906
+}
907
+
908
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
909
// We should not collect Scalars more than once per VF. Right now, this
910
// function is called from collectUniformsAndScalars(), which already does
911
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
912
index 53c11c58f..5074bf21c 100644
913
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
914
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
915
@@ -7823,6 +7823,17 @@ Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) {
916
return Vec;
917
}
918
919
+static void setVectorFunctionCallingConv(CallInst &CI, const DataLayout &DL,
920
+ const TargetLibraryInfo &TLI) {
921
+ Function *VectorF = CI.getCalledFunction();
922
+ FunctionType *FTy = VectorF->getFunctionType();
923
+ StringRef VFName = VectorF->getName();
924
+ auto CC = TLI.getVectorizedFunctionCallingConv(VFName, *FTy, DL);
925
+ if (CC) {
926
+ CI.setCallingConv(*CC);
927
+ }
928
+}
929
+
930
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
931
IRBuilder<>::InsertPointGuard Guard(Builder);
932
933
@@ -8309,7 +8320,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
934
935
SmallVector<OperandBundleDef, 1> OpBundles;
936
CI->getOperandBundlesAsDefs(OpBundles);
937
- Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
938
+
939
+ CallInst *NewCall = Builder.CreateCall(CF, OpVecs, OpBundles);
940
+ const DataLayout &DL = NewCall->getModule()->getDataLayout();
941
+ setVectorFunctionCallingConv(*NewCall, DL, *TLI);
942
+
943
+ Value *V = NewCall;
944
945
// The scalar argument uses an in-tree scalar so we add the new vectorized
946
// call to ExternalUses list to make sure that an extract will be
947
diff --git a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
948
index df8b7c498..63a36549f 100644
949
--- a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
950
+++ b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
951
@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
952
define <4 x double> @exp_v4(<4 x double> %in) {
953
; SVML-LABEL: define {{[^@]+}}@exp_v4
954
; SVML-SAME: (<4 x double> [[IN:%.*]]) {
955
-; SVML-NEXT: [[TMP1:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[IN]])
956
+; SVML-NEXT: [[TMP1:%.*]] = call <4 x double> @__svml_exp4_ha(<4 x double> [[IN]])
957
; SVML-NEXT: ret <4 x double> [[TMP1]]
958
;
959
; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_v4
960
@@ -37,7 +37,7 @@ declare <4 x double> @llvm.exp.v4f64(<4 x double>) #0
961
define <4 x float> @exp_f32(<4 x float> %in) {
962
; SVML-LABEL: define {{[^@]+}}@exp_f32
963
; SVML-SAME: (<4 x float> [[IN:%.*]]) {
964
-; SVML-NEXT: [[TMP1:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[IN]])
965
+; SVML-NEXT: [[TMP1:%.*]] = call <4 x float> @__svml_expf4_ha(<4 x float> [[IN]])
966
; SVML-NEXT: ret <4 x float> [[TMP1]]
967
;
968
; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_f32
969
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
970
index a6e191c3d..d6e2e1110 100644
971
--- a/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
972
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
973
@@ -39,7 +39,8 @@ for.end: ; preds = %for.body
974
declare double @__exp_finite(double) #0
975
976
; CHECK-LABEL: @exp_f64
977
-; CHECK: <4 x double> @__svml_exp4
978
+; CHECK: <2 x double> @__svml_exp2
979
+; CHECK: <2 x double> @__svml_exp2
980
; CHECK: ret
981
define void @exp_f64(double* nocapture %varray) {
982
entry:
983
@@ -99,7 +100,8 @@ for.end: ; preds = %for.body
984
declare double @__log_finite(double) #0
985
986
; CHECK-LABEL: @log_f64
987
-; CHECK: <4 x double> @__svml_log4
988
+; CHECK: <2 x double> @__svml_log2
989
+; CHECK: <2 x double> @__svml_log2
990
; CHECK: ret
991
define void @log_f64(double* nocapture %varray) {
992
entry:
993
@@ -159,7 +161,8 @@ for.end: ; preds = %for.body
994
declare double @__pow_finite(double, double) #0
995
996
; CHECK-LABEL: @pow_f64
997
-; CHECK: <4 x double> @__svml_pow4
998
+; CHECK: <2 x double> @__svml_pow2
999
+; CHECK: <2 x double> @__svml_pow2
1000
; CHECK: ret
1001
define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
1002
entry:
1003
@@ -190,7 +193,8 @@ declare float @__exp2f_finite(float) #0
1004
1005
define void @exp2f_finite(float* nocapture %varray) {
1006
; CHECK-LABEL: @exp2f_finite(
1007
-; CHECK: call <4 x float> @__svml_exp2f4(<4 x float> %{{.*}})
1008
+; CHECK: call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> %{{.*}})
1009
+; CHECK: call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> %{{.*}})
1010
; CHECK: ret void
1011
;
1012
entry:
1013
@@ -219,7 +223,8 @@ declare double @__exp2_finite(double) #0
1014
1015
define void @exp2_finite(double* nocapture %varray) {
1016
; CHECK-LABEL: @exp2_finite(
1017
-; CHECK: call <4 x double> @__svml_exp24(<4 x double> {{.*}})
1018
+; CHECK: call intel_svmlcc128 <2 x double> @__svml_exp22_ha(<2 x double> {{.*}})
1019
+; CHECK: call intel_svmlcc128 <2 x double> @__svml_exp22_ha(<2 x double> {{.*}})
1020
; CHECK: ret void
1021
;
1022
entry:
1023
@@ -276,7 +281,8 @@ for.end: ; preds = %for.body
1024
declare double @__log2_finite(double) #0
1025
1026
; CHECK-LABEL: @log2_f64
1027
-; CHECK: <4 x double> @__svml_log24
1028
+; CHECK: <2 x double> @__svml_log22
1029
+; CHECK: <2 x double> @__svml_log22
1030
; CHECK: ret
1031
define void @log2_f64(double* nocapture %varray) {
1032
entry:
1033
@@ -333,7 +339,8 @@ for.end: ; preds = %for.body
1034
declare double @__log10_finite(double) #0
1035
1036
; CHECK-LABEL: @log10_f64
1037
-; CHECK: <4 x double> @__svml_log104
1038
+; CHECK: <2 x double> @__svml_log102
1039
+; CHECK: <2 x double> @__svml_log102
1040
; CHECK: ret
1041
define void @log10_f64(double* nocapture %varray) {
1042
entry:
1043
@@ -390,7 +397,8 @@ for.end: ; preds = %for.body
1044
declare double @__sqrt_finite(double) #0
1045
1046
; CHECK-LABEL: @sqrt_f64
1047
-; CHECK: <4 x double> @__svml_sqrt4
1048
+; CHECK: <2 x double> @__svml_sqrt2
1049
+; CHECK: <2 x double> @__svml_sqrt2
1050
; CHECK: ret
1051
define void @sqrt_f64(double* nocapture %varray) {
1052
entry:
1053
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
1054
index 42c280df6..088bbdcf1 100644
1055
--- a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
1056
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
1057
@@ -48,7 +48,7 @@ declare float @llvm.exp2.f32(float) #0
1058
1059
define void @sin_f64(double* nocapture %varray) {
1060
; CHECK-LABEL: @sin_f64(
1061
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
1062
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
1063
; CHECK: ret void
1064
;
1065
entry:
1066
@@ -71,7 +71,7 @@ for.end:
1067
1068
define void @sin_f32(float* nocapture %varray) {
1069
; CHECK-LABEL: @sin_f32(
1070
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
1071
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]])
1072
; CHECK: ret void
1073
;
1074
entry:
1075
@@ -94,7 +94,7 @@ for.end:
1076
1077
define void @sin_f64_intrinsic(double* nocapture %varray) {
1078
; CHECK-LABEL: @sin_f64_intrinsic(
1079
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
1080
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
1081
; CHECK: ret void
1082
;
1083
entry:
1084
@@ -117,7 +117,7 @@ for.end:
1085
1086
define void @sin_f32_intrinsic(float* nocapture %varray) {
1087
; CHECK-LABEL: @sin_f32_intrinsic(
1088
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
1089
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]])
1090
; CHECK: ret void
1091
;
1092
entry:
1093
@@ -140,7 +140,7 @@ for.end:
1094
1095
define void @cos_f64(double* nocapture %varray) {
1096
; CHECK-LABEL: @cos_f64(
1097
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
1098
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
1099
; CHECK: ret void
1100
;
1101
entry:
1102
@@ -163,7 +163,7 @@ for.end:
1103
1104
define void @cos_f32(float* nocapture %varray) {
1105
; CHECK-LABEL: @cos_f32(
1106
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
1107
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]])
1108
; CHECK: ret void
1109
;
1110
entry:
1111
@@ -186,7 +186,7 @@ for.end:
1112
1113
define void @cos_f64_intrinsic(double* nocapture %varray) {
1114
; CHECK-LABEL: @cos_f64_intrinsic(
1115
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
1116
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
1117
; CHECK: ret void
1118
;
1119
entry:
1120
@@ -209,7 +209,7 @@ for.end:
1121
1122
define void @cos_f32_intrinsic(float* nocapture %varray) {
1123
; CHECK-LABEL: @cos_f32_intrinsic(
1124
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
1125
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]])
1126
; CHECK: ret void
1127
;
1128
entry:
1129
@@ -232,7 +232,7 @@ for.end:
1130
1131
define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
1132
; CHECK-LABEL: @pow_f64(
1133
-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
1134
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
1135
; CHECK: ret void
1136
;
1137
entry:
1138
@@ -257,7 +257,7 @@ for.end:
1139
1140
define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
1141
; CHECK-LABEL: @pow_f64_intrinsic(
1142
-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
1143
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
1144
; CHECK: ret void
1145
;
1146
entry:
1147
@@ -282,7 +282,7 @@ for.end:
1148
1149
define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
1150
; CHECK-LABEL: @pow_f32(
1151
-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
1152
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
1153
; CHECK: ret void
1154
;
1155
entry:
1156
@@ -307,7 +307,7 @@ for.end:
1157
1158
define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
1159
; CHECK-LABEL: @pow_f32_intrinsic(
1160
-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
1161
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
1162
; CHECK: ret void
1163
;
1164
entry:
1165
@@ -332,7 +332,7 @@ for.end:
1166
1167
define void @exp_f64(double* nocapture %varray) {
1168
; CHECK-LABEL: @exp_f64(
1169
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
1170
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
1171
; CHECK: ret void
1172
;
1173
entry:
1174
@@ -355,7 +355,7 @@ for.end:
1175
1176
define void @exp_f32(float* nocapture %varray) {
1177
; CHECK-LABEL: @exp_f32(
1178
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
1179
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]])
1180
; CHECK: ret void
1181
;
1182
entry:
1183
@@ -378,7 +378,7 @@ for.end:
1184
1185
define void @exp_f64_intrinsic(double* nocapture %varray) {
1186
; CHECK-LABEL: @exp_f64_intrinsic(
1187
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
1188
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
1189
; CHECK: ret void
1190
;
1191
entry:
1192
@@ -401,7 +401,7 @@ for.end:
1193
1194
define void @exp_f32_intrinsic(float* nocapture %varray) {
1195
; CHECK-LABEL: @exp_f32_intrinsic(
1196
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
1197
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]])
1198
; CHECK: ret void
1199
;
1200
entry:
1201
@@ -424,7 +424,7 @@ for.end:
1202
1203
define void @log_f64(double* nocapture %varray) {
1204
; CHECK-LABEL: @log_f64(
1205
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
1206
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
1207
; CHECK: ret void
1208
;
1209
entry:
1210
@@ -447,7 +447,7 @@ for.end:
1211
1212
define void @log_f32(float* nocapture %varray) {
1213
; CHECK-LABEL: @log_f32(
1214
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
1215
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]])
1216
; CHECK: ret void
1217
;
1218
entry:
1219
@@ -470,7 +470,7 @@ for.end:
1220
1221
define void @log_f64_intrinsic(double* nocapture %varray) {
1222
; CHECK-LABEL: @log_f64_intrinsic(
1223
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
1224
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
1225
; CHECK: ret void
1226
;
1227
entry:
1228
@@ -493,7 +493,7 @@ for.end:
1229
1230
define void @log_f32_intrinsic(float* nocapture %varray) {
1231
; CHECK-LABEL: @log_f32_intrinsic(
1232
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
1233
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]])
1234
; CHECK: ret void
1235
;
1236
entry:
1237
@@ -516,7 +516,7 @@ for.end:
1238
1239
define void @log2_f64(double* nocapture %varray) {
1240
; CHECK-LABEL: @log2_f64(
1241
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
1242
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log24_ha(<4 x double> [[TMP4:%.*]])
1243
; CHECK: ret void
1244
;
1245
entry:
1246
@@ -539,7 +539,7 @@ for.end:
1247
1248
define void @log2_f32(float* nocapture %varray) {
1249
; CHECK-LABEL: @log2_f32(
1250
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
1251
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log2f4_ha(<4 x float> [[TMP4:%.*]])
1252
; CHECK: ret void
1253
;
1254
entry:
1255
@@ -562,7 +562,7 @@ for.end:
1256
1257
define void @log2_f64_intrinsic(double* nocapture %varray) {
1258
; CHECK-LABEL: @log2_f64_intrinsic(
1259
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
1260
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log24_ha(<4 x double> [[TMP4:%.*]])
1261
; CHECK: ret void
1262
;
1263
entry:
1264
@@ -585,7 +585,7 @@ for.end:
1265
1266
define void @log2_f32_intrinsic(float* nocapture %varray) {
1267
; CHECK-LABEL: @log2_f32_intrinsic(
1268
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
1269
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log2f4_ha(<4 x float> [[TMP4:%.*]])
1270
; CHECK: ret void
1271
;
1272
entry:
1273
@@ -608,7 +608,7 @@ for.end:
1274
1275
define void @log10_f64(double* nocapture %varray) {
1276
; CHECK-LABEL: @log10_f64(
1277
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
1278
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log104_ha(<4 x double> [[TMP4:%.*]])
1279
; CHECK: ret void
1280
;
1281
entry:
1282
@@ -631,7 +631,7 @@ for.end:
1283
1284
define void @log10_f32(float* nocapture %varray) {
1285
; CHECK-LABEL: @log10_f32(
1286
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
1287
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log10f4_ha(<4 x float> [[TMP4:%.*]])
1288
; CHECK: ret void
1289
;
1290
entry:
1291
@@ -654,7 +654,7 @@ for.end:
1292
1293
define void @log10_f64_intrinsic(double* nocapture %varray) {
1294
; CHECK-LABEL: @log10_f64_intrinsic(
1295
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
1296
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log104_ha(<4 x double> [[TMP4:%.*]])
1297
; CHECK: ret void
1298
;
1299
entry:
1300
@@ -677,7 +677,7 @@ for.end:
1301
1302
define void @log10_f32_intrinsic(float* nocapture %varray) {
1303
; CHECK-LABEL: @log10_f32_intrinsic(
1304
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
1305
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log10f4_ha(<4 x float> [[TMP4:%.*]])
1306
; CHECK: ret void
1307
;
1308
entry:
1309
@@ -700,7 +700,7 @@ for.end:
1310
1311
define void @sqrt_f64(double* nocapture %varray) {
1312
; CHECK-LABEL: @sqrt_f64(
1313
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sqrt4(<4 x double> [[TMP4:%.*]])
1314
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sqrt4_ha(<4 x double> [[TMP4:%.*]])
1315
; CHECK: ret void
1316
;
1317
entry:
1318
@@ -723,7 +723,7 @@ for.end:
1319
1320
define void @sqrt_f32(float* nocapture %varray) {
1321
; CHECK-LABEL: @sqrt_f32(
1322
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sqrtf4(<4 x float> [[TMP4:%.*]])
1323
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sqrtf4_ha(<4 x float> [[TMP4:%.*]])
1324
; CHECK: ret void
1325
;
1326
entry:
1327
@@ -746,7 +746,7 @@ for.end:
1328
1329
define void @exp2_f64(double* nocapture %varray) {
1330
; CHECK-LABEL: @exp2_f64(
1331
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
1332
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24_ha(<4 x double> [[TMP4:%.*]])
1333
; CHECK: ret void
1334
;
1335
entry:
1336
@@ -769,7 +769,7 @@ for.end:
1337
1338
define void @exp2_f32(float* nocapture %varray) {
1339
; CHECK-LABEL: @exp2_f32(
1340
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
1341
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> [[TMP4:%.*]])
1342
; CHECK: ret void
1343
;
1344
entry:
1345
@@ -792,7 +792,7 @@ for.end:
1346
1347
define void @exp2_f64_intrinsic(double* nocapture %varray) {
1348
; CHECK-LABEL: @exp2_f64_intrinsic(
1349
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
1350
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24_ha(<4 x double> [[TMP4:%.*]])
1351
; CHECK: ret void
1352
;
1353
entry:
1354
@@ -815,7 +815,7 @@ for.end:
1355
1356
define void @exp2_f32_intrinsic(float* nocapture %varray) {
1357
; CHECK-LABEL: @exp2_f32_intrinsic(
1358
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
1359
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> [[TMP4:%.*]])
1360
; CHECK: ret void
1361
;
1362
entry:
1363
@@ -836,4 +836,44 @@ for.end:
1364
ret void
1365
}
1366
1367
+; CHECK-LABEL: @atan2_finite
1368
+; CHECK: intel_svmlcc256 <4 x double> @__svml_atan24(
1369
+; CHECK: intel_svmlcc256 <4 x double> @__svml_atan24(
1370
+; CHECK: ret
1371
+
1372
+declare double @__atan2_finite(double, double) local_unnamed_addr #0
1373
+
1374
+define void @atan2_finite([100 x double]* nocapture %varray) local_unnamed_addr #0 {
1375
+entry:
1376
+ br label %for.cond1.preheader
1377
+
1378
+for.cond1.preheader: ; preds = %for.inc7, %entry
1379
+ %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc7 ]
1380
+ %0 = trunc i64 %indvars.iv19 to i32
1381
+ %conv = sitofp i32 %0 to double
1382
+ br label %for.body3
1383
+
1384
+for.body3: ; preds = %for.body3, %for.cond1.preheader
1385
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
1386
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
1387
+ %1 = trunc i64 %indvars.iv.next to i32
1388
+ %conv4 = sitofp i32 %1 to double
1389
+ %call = tail call fast double @__atan2_finite(double %conv, double %conv4)
1390
+ %arrayidx6 = getelementptr inbounds [100 x double], [100 x double]* %varray, i64 %indvars.iv19, i64 %indvars.iv
1391
+ store double %call, double* %arrayidx6, align 8
1392
+ %exitcond = icmp eq i64 %indvars.iv.next, 100
1393
+ br i1 %exitcond, label %for.inc7, label %for.body3, !llvm.loop !5
1394
+
1395
+for.inc7: ; preds = %for.body3
1396
+ %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
1397
+ %exitcond21 = icmp eq i64 %indvars.iv.next20, 100
1398
+ br i1 %exitcond21, label %for.end9, label %for.cond1.preheader
1399
+
1400
+for.end9: ; preds = %for.inc7
1401
+ ret void
1402
+}
1403
+
1404
attributes #0 = { nounwind readnone }
1405
+!5 = distinct !{!5, !6, !7}
1406
+!6 = !{!"llvm.loop.vectorize.width", i32 8}
1407
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
1408
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
1409
new file mode 100644
1410
index 000000000..326c76399
1411
--- /dev/null
1412
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
1413
@@ -0,0 +1,513 @@
1414
+; Check legalization of SVML calls, including intrinsic versions (like @llvm.<fn_name>.<type>).
1415
+
1416
+; RUN: opt -vector-library=SVML -inject-tli-mappings -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s
1417
+
1418
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
1419
+target triple = "x86_64-unknown-linux-gnu"
1420
+
1421
+declare double @sin(double) #0
1422
+declare float @sinf(float) #0
1423
+declare double @llvm.sin.f64(double) #0
1424
+declare float @llvm.sin.f32(float) #0
1425
+
1426
+declare double @cos(double) #0
1427
+declare float @cosf(float) #0
1428
+declare double @llvm.cos.f64(double) #0
1429
+declare float @llvm.cos.f32(float) #0
1430
+
1431
+declare double @pow(double, double) #0
1432
+declare float @powf(float, float) #0
1433
+declare double @llvm.pow.f64(double, double) #0
1434
+declare float @llvm.pow.f32(float, float) #0
1435
+
1436
+declare double @exp(double) #0
1437
+declare float @expf(float) #0
1438
+declare double @llvm.exp.f64(double) #0
1439
+declare float @llvm.exp.f32(float) #0
1440
+
1441
+declare double @log(double) #0
1442
+declare float @logf(float) #0
1443
+declare double @llvm.log.f64(double) #0
1444
+declare float @llvm.log.f32(float) #0
1445
+
1446
+
1447
+define void @sin_f64(double* nocapture %varray) {
1448
+; CHECK-LABEL: @sin_f64(
1449
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]])
1450
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
1451
+; CHECK: ret void
1452
+;
1453
+entry:
1454
+ br label %for.body
1455
+
1456
+for.body:
1457
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1458
+ %tmp = trunc i64 %iv to i32
1459
+ %conv = sitofp i32 %tmp to double
1460
+ %call = tail call double @sin(double %conv)
1461
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
1462
+ store double %call, double* %arrayidx, align 4
1463
+ %iv.next = add nuw nsw i64 %iv, 1
1464
+ %exitcond = icmp eq i64 %iv.next, 1000
1465
+ br i1 %exitcond, label %for.end, label %for.body
1466
+
1467
+for.end:
1468
+ ret void
1469
+}
1470
+
1471
+define void @sin_f32(float* nocapture %varray) {
1472
+; CHECK-LABEL: @sin_f32(
1473
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]])
1474
+; CHECK: ret void
1475
+;
1476
+entry:
1477
+ br label %for.body
1478
+
1479
+for.body:
1480
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1481
+ %tmp = trunc i64 %iv to i32
1482
+ %conv = sitofp i32 %tmp to float
1483
+ %call = tail call float @sinf(float %conv)
1484
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
1485
+ store float %call, float* %arrayidx, align 4
1486
+ %iv.next = add nuw nsw i64 %iv, 1
1487
+ %exitcond = icmp eq i64 %iv.next, 1000
1488
+ br i1 %exitcond, label %for.end, label %for.body
1489
+
1490
+for.end:
1491
+ ret void
1492
+}
1493
+
1494
+define void @sin_f64_intrinsic(double* nocapture %varray) {
1495
+; CHECK-LABEL: @sin_f64_intrinsic(
1496
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]])
1497
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
1498
+; CHECK: ret void
1499
+;
1500
+entry:
1501
+ br label %for.body
1502
+
1503
+for.body:
1504
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1505
+ %tmp = trunc i64 %iv to i32
1506
+ %conv = sitofp i32 %tmp to double
1507
+ %call = tail call double @llvm.sin.f64(double %conv)
1508
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
1509
+ store double %call, double* %arrayidx, align 4
1510
+ %iv.next = add nuw nsw i64 %iv, 1
1511
+ %exitcond = icmp eq i64 %iv.next, 1000
1512
+ br i1 %exitcond, label %for.end, label %for.body
1513
+
1514
+for.end:
1515
+ ret void
1516
+}
1517
+
1518
+define void @sin_f32_intrinsic(float* nocapture %varray) {
1519
+; CHECK-LABEL: @sin_f32_intrinsic(
1520
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]])
1521
+; CHECK: ret void
1522
+;
1523
+entry:
1524
+ br label %for.body
1525
+
1526
+for.body:
1527
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1528
+ %tmp = trunc i64 %iv to i32
1529
+ %conv = sitofp i32 %tmp to float
1530
+ %call = tail call float @llvm.sin.f32(float %conv)
1531
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
1532
+ store float %call, float* %arrayidx, align 4
1533
+ %iv.next = add nuw nsw i64 %iv, 1
1534
+ %exitcond = icmp eq i64 %iv.next, 1000
1535
+ br i1 %exitcond, label %for.end, label %for.body
1536
+
1537
+for.end:
1538
+ ret void
1539
+}
1540
+
1541
+define void @cos_f64(double* nocapture %varray) {
1542
+; CHECK-LABEL: @cos_f64(
1543
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]])
1544
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
1545
+; CHECK: ret void
1546
+;
1547
+entry:
1548
+ br label %for.body
1549
+
1550
+for.body:
1551
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1552
+ %tmp = trunc i64 %iv to i32
1553
+ %conv = sitofp i32 %tmp to double
1554
+ %call = tail call double @cos(double %conv)
1555
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
1556
+ store double %call, double* %arrayidx, align 4
1557
+ %iv.next = add nuw nsw i64 %iv, 1
1558
+ %exitcond = icmp eq i64 %iv.next, 1000
1559
+ br i1 %exitcond, label %for.end, label %for.body
1560
+
1561
+for.end:
1562
+ ret void
1563
+}
1564
+
1565
+define void @cos_f32(float* nocapture %varray) {
1566
+; CHECK-LABEL: @cos_f32(
1567
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]])
1568
+; CHECK: ret void
1569
+;
1570
+entry:
1571
+ br label %for.body
1572
+
1573
+for.body:
1574
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1575
+ %tmp = trunc i64 %iv to i32
1576
+ %conv = sitofp i32 %tmp to float
1577
+ %call = tail call float @cosf(float %conv)
1578
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
1579
+ store float %call, float* %arrayidx, align 4
1580
+ %iv.next = add nuw nsw i64 %iv, 1
1581
+ %exitcond = icmp eq i64 %iv.next, 1000
1582
+ br i1 %exitcond, label %for.end, label %for.body
1583
+
1584
+for.end:
1585
+ ret void
1586
+}
1587
+
1588
+define void @cos_f64_intrinsic(double* nocapture %varray) {
1589
+; CHECK-LABEL: @cos_f64_intrinsic(
1590
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]])
1591
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
1592
+; CHECK: ret void
1593
+;
1594
+entry:
1595
+ br label %for.body
1596
+
1597
+for.body:
1598
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1599
+ %tmp = trunc i64 %iv to i32
1600
+ %conv = sitofp i32 %tmp to double
1601
+ %call = tail call double @llvm.cos.f64(double %conv)
1602
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
1603
+ store double %call, double* %arrayidx, align 4
1604
+ %iv.next = add nuw nsw i64 %iv, 1
1605
+ %exitcond = icmp eq i64 %iv.next, 1000
1606
+ br i1 %exitcond, label %for.end, label %for.body
1607
+
1608
+for.end:
1609
+ ret void
1610
+}
1611
+
1612
+define void @cos_f32_intrinsic(float* nocapture %varray) {
1613
+; CHECK-LABEL: @cos_f32_intrinsic(
1614
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]])
1615
+; CHECK: ret void
1616
+;
1617
+entry:
1618
+ br label %for.body
1619
+
1620
+for.body:
1621
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1622
+ %tmp = trunc i64 %iv to i32
1623
+ %conv = sitofp i32 %tmp to float
1624
+ %call = tail call float @llvm.cos.f32(float %conv)
1625
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
1626
+ store float %call, float* %arrayidx, align 4
1627
+ %iv.next = add nuw nsw i64 %iv, 1
1628
+ %exitcond = icmp eq i64 %iv.next, 1000
1629
+ br i1 %exitcond, label %for.end, label %for.body
1630
+
1631
+for.end:
1632
+ ret void
1633
+}
1634
+
1635
+define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
1636
+; CHECK-LABEL: @pow_f64(
1637
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]])
1638
+; CHECK: [[TMP4:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]])
1639
+; CHECK: ret void
1640
+;
1641
+entry:
1642
+ br label %for.body
1643
+
1644
+for.body:
1645
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1646
+ %tmp = trunc i64 %iv to i32
1647
+ %conv = sitofp i32 %tmp to double
1648
+ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv
1649
+ %tmp1 = load double, double* %arrayidx, align 4
1650
+ %tmp2 = tail call double @pow(double %conv, double %tmp1)
1651
+ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv
1652
+ store double %tmp2, double* %arrayidx2, align 4
1653
+ %iv.next = add nuw nsw i64 %iv, 1
1654
+ %exitcond = icmp eq i64 %iv.next, 1000
1655
+ br i1 %exitcond, label %for.end, label %for.body
1656
+
1657
+for.end:
1658
+ ret void
1659
+}
1660
+
1661
+define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
1662
+; CHECK-LABEL: @pow_f64_intrinsic(
1663
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]])
1664
+; CHECK: [[TMP4:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]])
1665
+; CHECK: ret void
1666
+;
1667
+entry:
1668
+ br label %for.body
1669
+
1670
+for.body:
1671
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1672
+ %tmp = trunc i64 %iv to i32
1673
+ %conv = sitofp i32 %tmp to double
1674
+ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv
1675
+ %tmp1 = load double, double* %arrayidx, align 4
1676
+ %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1)
1677
+ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv
1678
+ store double %tmp2, double* %arrayidx2, align 4
1679
+ %iv.next = add nuw nsw i64 %iv, 1
1680
+ %exitcond = icmp eq i64 %iv.next, 1000
1681
+ br i1 %exitcond, label %for.end, label %for.body
1682
+
1683
+for.end:
1684
+ ret void
1685
+}
1686
+
1687
+define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
1688
+; CHECK-LABEL: @pow_f32(
1689
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[WIDE_LOAD:%.*]])
1690
+; CHECK: ret void
1691
+;
1692
+entry:
1693
+ br label %for.body
1694
+
1695
+for.body:
1696
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1697
+ %tmp = trunc i64 %iv to i32
1698
+ %conv = sitofp i32 %tmp to float
1699
+ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv
1700
+ %tmp1 = load float, float* %arrayidx, align 4
1701
+ %tmp2 = tail call float @powf(float %conv, float %tmp1)
1702
+ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv
1703
+ store float %tmp2, float* %arrayidx2, align 4
1704
+ %iv.next = add nuw nsw i64 %iv, 1
1705
+ %exitcond = icmp eq i64 %iv.next, 1000
1706
+ br i1 %exitcond, label %for.end, label %for.body
1707
+
1708
+for.end:
1709
+ ret void
1710
+}
1711
+
1712
+define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
1713
+; CHECK-LABEL: @pow_f32_intrinsic(
1714
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[TMP3:%.*]])
1715
+; CHECK: ret void
1716
+;
1717
+entry:
1718
+ br label %for.body
1719
+
1720
+for.body:
1721
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1722
+ %tmp = trunc i64 %iv to i32
1723
+ %conv = sitofp i32 %tmp to float
1724
+ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv
1725
+ %tmp1 = load float, float* %arrayidx, align 4
1726
+ %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1)
1727
+ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv
1728
+ store float %tmp2, float* %arrayidx2, align 4
1729
+ %iv.next = add nuw nsw i64 %iv, 1
1730
+ %exitcond = icmp eq i64 %iv.next, 1000
1731
+ br i1 %exitcond, label %for.end, label %for.body
1732
+
1733
+for.end:
1734
+ ret void
1735
+}
1736
+
1737
+define void @exp_f64(double* nocapture %varray) {
1738
+; CHECK-LABEL: @exp_f64(
1739
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]])
1740
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
1741
+; CHECK: ret void
1742
+;
1743
+entry:
1744
+ br label %for.body
1745
+
1746
+for.body:
1747
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1748
+ %tmp = trunc i64 %iv to i32
1749
+ %conv = sitofp i32 %tmp to double
1750
+ %call = tail call double @exp(double %conv)
1751
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
1752
+ store double %call, double* %arrayidx, align 4
1753
+ %iv.next = add nuw nsw i64 %iv, 1
1754
+ %exitcond = icmp eq i64 %iv.next, 1000
1755
+ br i1 %exitcond, label %for.end, label %for.body
1756
+
1757
+for.end:
1758
+ ret void
1759
+}
1760
+
1761
+define void @exp_f32(float* nocapture %varray) {
1762
+; CHECK-LABEL: @exp_f32(
1763
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]])
1764
+; CHECK: ret void
1765
+;
1766
+entry:
1767
+ br label %for.body
1768
+
1769
+for.body:
1770
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1771
+ %tmp = trunc i64 %iv to i32
1772
+ %conv = sitofp i32 %tmp to float
1773
+ %call = tail call float @expf(float %conv)
1774
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
1775
+ store float %call, float* %arrayidx, align 4
1776
+ %iv.next = add nuw nsw i64 %iv, 1
1777
+ %exitcond = icmp eq i64 %iv.next, 1000
1778
+ br i1 %exitcond, label %for.end, label %for.body
1779
+
1780
+for.end:
1781
+ ret void
1782
+}
1783
+
1784
+define void @exp_f64_intrinsic(double* nocapture %varray) {
1785
+; CHECK-LABEL: @exp_f64_intrinsic(
1786
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]])
1787
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
1788
+; CHECK: ret void
1789
+;
1790
+entry:
1791
+ br label %for.body
1792
+
1793
+for.body:
1794
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1795
+ %tmp = trunc i64 %iv to i32
1796
+ %conv = sitofp i32 %tmp to double
1797
+ %call = tail call double @llvm.exp.f64(double %conv)
1798
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
1799
+ store double %call, double* %arrayidx, align 4
1800
+ %iv.next = add nuw nsw i64 %iv, 1
1801
+ %exitcond = icmp eq i64 %iv.next, 1000
1802
+ br i1 %exitcond, label %for.end, label %for.body
1803
+
1804
+for.end:
1805
+ ret void
1806
+}
1807
+
1808
+define void @exp_f32_intrinsic(float* nocapture %varray) {
1809
+; CHECK-LABEL: @exp_f32_intrinsic(
1810
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]])
1811
+; CHECK: ret void
1812
+;
1813
+entry:
1814
+ br label %for.body
1815
+
1816
+for.body:
1817
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1818
+ %tmp = trunc i64 %iv to i32
1819
+ %conv = sitofp i32 %tmp to float
1820
+ %call = tail call float @llvm.exp.f32(float %conv)
1821
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
1822
+ store float %call, float* %arrayidx, align 4
1823
+ %iv.next = add nuw nsw i64 %iv, 1
1824
+ %exitcond = icmp eq i64 %iv.next, 1000
1825
+ br i1 %exitcond, label %for.end, label %for.body
1826
+
1827
+for.end:
1828
+ ret void
1829
+}
1830
+
1831
+define void @log_f64(double* nocapture %varray) {
1832
+; CHECK-LABEL: @log_f64(
1833
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]])
1834
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
1835
+; CHECK: ret void
1836
+;
1837
+entry:
1838
+ br label %for.body
1839
+
1840
+for.body:
1841
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1842
+ %tmp = trunc i64 %iv to i32
1843
+ %conv = sitofp i32 %tmp to double
1844
+ %call = tail call double @log(double %conv)
1845
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
1846
+ store double %call, double* %arrayidx, align 4
1847
+ %iv.next = add nuw nsw i64 %iv, 1
1848
+ %exitcond = icmp eq i64 %iv.next, 1000
1849
+ br i1 %exitcond, label %for.end, label %for.body
1850
+
1851
+for.end:
1852
+ ret void
1853
+}
1854
+
1855
+define void @log_f32(float* nocapture %varray) {
1856
+; CHECK-LABEL: @log_f32(
1857
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]])
1858
+; CHECK: ret void
1859
+;
1860
+entry:
1861
+ br label %for.body
1862
+
1863
+for.body:
1864
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1865
+ %tmp = trunc i64 %iv to i32
1866
+ %conv = sitofp i32 %tmp to float
1867
+ %call = tail call float @logf(float %conv)
1868
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
1869
+ store float %call, float* %arrayidx, align 4
1870
+ %iv.next = add nuw nsw i64 %iv, 1
1871
+ %exitcond = icmp eq i64 %iv.next, 1000
1872
+ br i1 %exitcond, label %for.end, label %for.body
1873
+
1874
+for.end:
1875
+ ret void
1876
+}
1877
+
1878
+define void @log_f64_intrinsic(double* nocapture %varray) {
1879
+; CHECK-LABEL: @log_f64_intrinsic(
1880
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]])
1881
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
1882
+; CHECK: ret void
1883
+;
1884
+entry:
1885
+ br label %for.body
1886
+
1887
+for.body:
1888
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1889
+ %tmp = trunc i64 %iv to i32
1890
+ %conv = sitofp i32 %tmp to double
1891
+ %call = tail call double @llvm.log.f64(double %conv)
1892
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
1893
+ store double %call, double* %arrayidx, align 4
1894
+ %iv.next = add nuw nsw i64 %iv, 1
1895
+ %exitcond = icmp eq i64 %iv.next, 1000
1896
+ br i1 %exitcond, label %for.end, label %for.body
1897
+
1898
+for.end:
1899
+ ret void
1900
+}
1901
+
1902
+define void @log_f32_intrinsic(float* nocapture %varray) {
1903
+; CHECK-LABEL: @log_f32_intrinsic(
1904
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]])
1905
+; CHECK: ret void
1906
+;
1907
+entry:
1908
+ br label %for.body
1909
+
1910
+for.body:
1911
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
1912
+ %tmp = trunc i64 %iv to i32
1913
+ %conv = sitofp i32 %tmp to float
1914
+ %call = tail call float @llvm.log.f32(float %conv)
1915
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
1916
+ store float %call, float* %arrayidx, align 4
1917
+ %iv.next = add nuw nsw i64 %iv, 1
1918
+ %exitcond = icmp eq i64 %iv.next, 1000
1919
+ br i1 %exitcond, label %for.end, label %for.body
1920
+
1921
+for.end:
1922
+ ret void
1923
+}
1924
+
1925
+attributes #0 = { nounwind readnone }
1926
+
1927
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
1928
new file mode 100644
1929
index 000000000..942265344
1930
--- /dev/null
1931
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
1932
@@ -0,0 +1,61 @@
1933
+; Check that vector codegen splits the illegal sin8 call into two sin4 calls on AVX for the double data type.
1934
+; The C code used to generate this test:
1935
+
1936
+; #include <math.h>
1937
+;
1938
+; void foo(double *a, int N){
1939
+; int i;
1940
+; #pragma clang loop vectorize_width(8)
1941
+; for (i=0;i<N;i++){
1942
+; a[i] = sin(i);
1943
+; }
1944
+; }
1945
+
1946
+; RUN: opt -vector-library=SVML -inject-tli-mappings -loop-vectorize -force-vector-width=8 -mattr=avx -S < %s | FileCheck %s
1947
+
1948
+; CHECK: [[I1:%.*]] = sitofp <8 x i32> [[I0:%.*]] to <8 x double>
1949
+; CHECK-NEXT: [[S1:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1950
+; CHECK-NEXT: [[I2:%.*]] = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[S1]])
1951
+; CHECK-NEXT: [[S2:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1952
+; CHECK-NEXT: [[I3:%.*]] = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[S2]])
1953
+; CHECK-NEXT: [[comb:%combined.*]] = shufflevector <4 x double> [[I2]], <4 x double> [[I3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1954
+; CHECK: store <8 x double> [[comb]], <8 x double>* [[TMP:%.*]], align 8
1955
+
1956
+
1957
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
1958
+target triple = "x86_64-unknown-linux-gnu"
1959
+
1960
+; Function Attrs: nounwind uwtable
1961
+define dso_local void @foo(double* nocapture %a, i32 %N) local_unnamed_addr #0 {
1962
+entry:
1963
+ %cmp5 = icmp sgt i32 %N, 0
1964
+ br i1 %cmp5, label %for.body.preheader, label %for.end
1965
+
1966
+for.body.preheader: ; preds = %entry
1967
+ %wide.trip.count = zext i32 %N to i64
1968
+ br label %for.body
1969
+
1970
+for.body: ; preds = %for.body, %for.body.preheader
1971
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
1972
+ %0 = trunc i64 %indvars.iv to i32
1973
+ %conv = sitofp i32 %0 to double
1974
+ %call = tail call fast double @sin(double %conv) #2
1975
+ %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
1976
+ store double %call, double* %arrayidx, align 8, !tbaa !2
1977
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
1978
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
1979
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6
1980
+
1981
+for.end: ; preds = %for.body, %entry
1982
+ ret void
1983
+}
1984
+
1985
+; Function Attrs: nounwind
1986
+declare dso_local double @sin(double) local_unnamed_addr #1
1987
+
1988
+!2 = !{!3, !3, i64 0}
1989
+!3 = !{!"double", !4, i64 0}
1990
+!4 = !{!"omnipotent char", !5, i64 0}
1991
+!5 = !{!"Simple C/C++ TBAA"}
1992
+!6 = distinct !{!6, !7}
1993
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
1994
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
1995
index 8e04c22bf..a7e6978c1 100644
1996
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
1997
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
1998
@@ -12,12 +12,12 @@ target triple = "x86_64-unknown-linux-gnu"
1999
2000
; COMMON-LABEL: @llvm.compiler.used = appending global
2001
; SVML-SAME: [6 x ptr] [
2002
-; SVML-SAME: ptr @__svml_sin2,
2003
-; SVML-SAME: ptr @__svml_sin4,
2004
-; SVML-SAME: ptr @__svml_sin8,
2005
-; SVML-SAME: ptr @__svml_log10f4,
2006
-; SVML-SAME: ptr @__svml_log10f8,
2007
-; SVML-SAME: ptr @__svml_log10f16
2008
+; SVML-SAME: ptr @__svml_sin2_ha,
2009
+; SVML-SAME: ptr @__svml_sin4_ha,
2010
+; SVML-SAME: ptr @__svml_sin8_ha,
2011
+; SVML-SAME: ptr @__svml_log10f4_ha,
2012
+; SVML-SAME: ptr @__svml_log10f8_ha,
2013
+; SVML-SAME: ptr @__svml_log10f16_ha
2014
; MASSV-SAME: [2 x ptr] [
2015
; MASSV-SAME: ptr @__sind2,
2016
; MASSV-SAME: ptr @__log10f4
2017
@@ -59,9 +59,9 @@ declare float @llvm.log10.f32(float) #0
2018
attributes #0 = { nounwind readnone }
2019
2020
; SVML: attributes #[[SIN]] = { "vector-function-abi-variant"=
2021
-; SVML-SAME: "_ZGV_LLVM_N2v_sin(__svml_sin2),
2022
-; SVML-SAME: _ZGV_LLVM_N4v_sin(__svml_sin4),
2023
-; SVML-SAME: _ZGV_LLVM_N8v_sin(__svml_sin8)" }
2024
+; SVML-SAME: "_ZGV_LLVM_N2v_sin(__svml_sin2_ha),
2025
+; SVML-SAME: _ZGV_LLVM_N4v_sin(__svml_sin4_ha),
2026
+; SVML-SAME: _ZGV_LLVM_N8v_sin(__svml_sin8_ha)" }
2027
2028
; MASSV: attributes #[[SIN]] = { "vector-function-abi-variant"=
2029
; MASSV-SAME: "_ZGV_LLVM_N2v_sin(__sind2)" }
2030
diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt
2031
index 725c99b8e..58e2194e1 100644
2032
--- a/llvm/utils/TableGen/CMakeLists.txt
2033
+++ b/llvm/utils/TableGen/CMakeLists.txt
2034
@@ -47,6 +47,7 @@ add_tablegen(llvm-tblgen LLVM
2035
SearchableTableEmitter.cpp
2036
SubtargetEmitter.cpp
2037
SubtargetFeatureInfo.cpp
2038
+ SVMLEmitter.cpp
2039
TableGen.cpp
2040
Types.cpp
2041
VarLenCodeEmitterGen.cpp
2042
diff --git a/llvm/utils/TableGen/SVMLEmitter.cpp b/llvm/utils/TableGen/SVMLEmitter.cpp
2043
new file mode 100644
2044
index 000000000..a5aeea48d
2045
--- /dev/null
2046
+++ b/llvm/utils/TableGen/SVMLEmitter.cpp
2047
@@ -0,0 +1,110 @@
2048
+//===------ SVMLEmitter.cpp - Generate SVML function variants -------------===//
2049
+//
2050
+// The LLVM Compiler Infrastructure
2051
+//
2052
+// This file is distributed under the University of Illinois Open Source
2053
+// License. See LICENSE.TXT for details.
2054
+//
2055
+//===----------------------------------------------------------------------===//
2056
+//
2057
+// This tablegen backend emits the scalar to svml function map for TLI.
2058
+//
2059
+//===----------------------------------------------------------------------===//
2060
+
2061
+#include "CodeGenTarget.h"
2062
+#include "llvm/Support/Format.h"
2063
+#include "llvm/TableGen/Error.h"
2064
+#include "llvm/TableGen/Record.h"
2065
+#include "llvm/TableGen/TableGenBackend.h"
2066
+#include <map>
2067
+#include <vector>
2068
+
2069
+using namespace llvm;
2070
+
2071
+#define DEBUG_TYPE "SVMLVariants"
2072
+#include "llvm/Support/Debug.h"
2073
+
2074
+namespace {
2075
+
2076
+/// TableGen backend that prints the scalar-to-SVML function table.
2077
+class SVMLVariantsEmitter {
2078
+  RecordKeeper &Records; // Source of the SvmlVariant definitions.
2079
+
2080
+  /// Writes the table, wrapped in an #ifdef GET_SVML_VARIANTS guard, to OS.
2081
+  void emitSVMLVariants(raw_ostream &OS);
2082
+
2083
+public:
2084
+  explicit SVMLVariantsEmitter(RecordKeeper &R) : Records(R) {}
2085
+  /// Backend entry point; forwards to emitSVMLVariants.
2086
+  void run(raw_ostream &OS);
2087
+};
2088
+} // End anonymous namespace
2089
+
2090
+/// Emit the mapping from scalar math functions to SVML vector functions.
2090
+/// For every SvmlVariant record, rows are printed for the scalar math library
2091
+/// call, the llvm intrinsic, and the finite variant of the math library call,
2092
+/// at each vector length covered below. The stated default is the high accuracy
2093
+/// SVML variants, until users can select variants via precision requirements.
2094
+/// NOTE(review): no accuracy suffix (e.g. "_ha") is emitted here - confirm.
2095
+void SVMLVariantsEmitter::emitSVMLVariants(raw_ostream &OS) {
2096
+ // Inclusive vector length (VL) bounds per precision; VL doubles each step.
2097
+ const unsigned MinSinglePrecVL = 4;
2098
+ const unsigned MaxSinglePrecVL = 16;
2099
+ const unsigned MinDoublePrecVL = 2;
2100
+ const unsigned MaxDoublePrecVL = 8;
2101
+
2102
+ OS << "#ifdef GET_SVML_VARIANTS\n";
2103
+ // Each row: {"<scalar name>", "<svml name>", ElementCount::getFixed(<VL>)},
2104
+ for (const auto &D : Records.getAllDerivedDefinitions("SvmlVariant")) {
2105
+ StringRef SvmlVariantNameStr = D->getName(); // Record name is the base function name.
2106
+ // Single Precision SVML: VL is 4, 8, 16; names carry an "f" tag before VL.
2107
+ for (unsigned VL = MinSinglePrecVL; VL <= MaxSinglePrecVL; VL *= 2) {
2108
+ // Emit the scalar math library function (<name>f) to svml function entry.
2109
+ OS << "{\"" << SvmlVariantNameStr << "f" << "\", ";
2110
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
2111
+ << "ElementCount::getFixed(" << VL << ")},\n";
2112
+
2113
+ // Emit the scalar intrinsic (llvm.<name>.f32) to svml function entry.
2114
+ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f32" << "\", ";
2115
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
2116
+ << "ElementCount::getFixed(" << VL << ")},\n";
2117
+
2118
+ // Emit the finite math library function (__<name>f_finite) to svml function entry.
2119
+ OS << "{\"__" << SvmlVariantNameStr << "f_finite" << "\", ";
2120
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
2121
+ << "ElementCount::getFixed(" << VL << ")},\n";
2122
+ }
2123
+
2124
+ // Double Precision SVML: VL is 2, 4, 8; names carry no precision tag.
2125
+ for (unsigned VL = MinDoublePrecVL; VL <= MaxDoublePrecVL; VL *= 2) {
2126
+ // Emit the scalar math library function (<name>) to svml function entry.
2127
+ OS << "{\"" << SvmlVariantNameStr << "\", ";
2128
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << "ElementCount::getFixed(" << VL
2129
+ << ")},\n";
2130
+
2131
+ // Emit the scalar intrinsic (llvm.<name>.f64) to svml function entry.
2132
+ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f64" << "\", ";
2133
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << "ElementCount::getFixed(" << VL
2134
+ << ")},\n";
2135
+
2136
+ // Emit the finite math library function (__<name>_finite) to svml function entry.
2137
+ OS << "{\"__" << SvmlVariantNameStr << "_finite" << "\", ";
2138
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", "
2139
+ << "ElementCount::getFixed(" << VL << ")},\n";
2140
+ }
2141
+ }
2142
+
2143
+ OS << "#endif // GET_SVML_VARIANTS\n\n";
2144
+}
2146
+
2147
+void SVMLVariantsEmitter::run(raw_ostream &OS) {
2148
+ emitSVMLVariants(OS); // The SVML variants table is this backend's only output.
2149
+}
2150
+
2151
+namespace llvm {
2152
+/// Entry point for the gen-svml TableGen action: emits the SVML table to OS.
2153
+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS) {
2154
+ SVMLVariantsEmitter(RK).run(OS);
2155
+}
2156
+
2157
+} // End llvm namespace
2158
diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp
2159
index efd641887..d31e144ff 100644
2160
--- a/llvm/utils/TableGen/TableGen.cpp
2161
+++ b/llvm/utils/TableGen/TableGen.cpp
2162
@@ -58,6 +58,7 @@ enum ActionType {
2163
GenDirectivesEnumDecl,
2164
GenDirectivesEnumImpl,
2165
GenDXILOperation,
2166
+ GenSVMLVariants,
2167
};
2168
2169
namespace llvm {
2170
@@ -140,6 +141,8 @@ cl::opt<ActionType> Action(
2171
"Generate directive related declaration code (header file)"),
2172
clEnumValN(GenDirectivesEnumImpl, "gen-directive-impl",
2173
"Generate directive related implementation code"),
2174
+ clEnumValN(GenSVMLVariants, "gen-svml",
2175
+ "Generate SVML variant function names"),
2176
clEnumValN(GenDXILOperation, "gen-dxil-operation",
2177
"Generate DXIL operation information")));
2178
2179
@@ -278,6 +281,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
2180
case GenDXILOperation:
2181
EmitDXILOperation(Records, OS);
2182
break;
2183
+ case GenSVMLVariants:
2184
+ EmitSVMLVariants(Records, OS);
2185
+ break;
2186
}
2187
2188
return false;
2189
diff --git a/llvm/utils/TableGen/TableGenBackends.h b/llvm/utils/TableGen/TableGenBackends.h
2190
index 4dff13095..5d58000e7 100644
2191
--- a/llvm/utils/TableGen/TableGenBackends.h
2192
+++ b/llvm/utils/TableGen/TableGenBackends.h
2193
@@ -94,6 +94,7 @@ void EmitAutomata(RecordKeeper &RK, raw_ostream &OS);
2194
void EmitDirectivesDecl(RecordKeeper &RK, raw_ostream &OS);
2195
void EmitDirectivesImpl(RecordKeeper &RK, raw_ostream &OS);
2196
void EmitDXILOperation(RecordKeeper &RK, raw_ostream &OS);
2197
+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS);
2198
2199
} // End llvm namespace
2200
2201
diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim
2202
index 9185a029a..cae895ada 100644
2203
--- a/llvm/utils/vim/syntax/llvm.vim
2204
+++ b/llvm/utils/vim/syntax/llvm.vim
2205
@@ -104,6 +104,7 @@ syn keyword llvmKeyword
2206
\ inreg
2207
\ intel_ocl_bicc
2208
\ inteldialect
2209
+ \ intel_svmlcc
2210
\ internal
2211
\ jumptable
2212
\ linkonce
2213
--
2214
2.41.0
2215
2216
2217