From dbe4ebac2a21366f986808b175f4145499ba9856 Mon Sep 17 00:00:00 2001
From: Siu Kwan Lam <[email protected]>
Date: Mon, 8 Apr 2024 11:02:09 -0500
Subject: [PATCH] llvm15-svml
---
.../include/llvm/Analysis/TargetLibraryInfo.h | 22 +-
llvm/include/llvm/AsmParser/LLToken.h | 3 +
llvm/include/llvm/IR/CMakeLists.txt | 4 +
llvm/include/llvm/IR/CallingConv.h | 5 +
llvm/include/llvm/IR/SVML.td | 62 +++
llvm/lib/Analysis/CMakeLists.txt | 1 +
llvm/lib/Analysis/TargetLibraryInfo.cpp | 55 +-
llvm/lib/AsmParser/LLLexer.cpp | 3 +
llvm/lib/AsmParser/LLParser.cpp | 6 +
llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 2 +-
llvm/lib/IR/AsmWriter.cpp | 3 +
llvm/lib/IR/Verifier.cpp | 3 +
llvm/lib/Target/X86/X86CallingConv.td | 70 +++
llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +-
llvm/lib/Target/X86/X86RegisterInfo.cpp | 46 ++
llvm/lib/Target/X86/X86Subtarget.h | 3 +
.../Transforms/Utils/InjectTLIMappings.cpp | 3 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 270 +++++++++
.../Transforms/Vectorize/SLPVectorizer.cpp | 18 +-
.../Generic/replace-intrinsics-with-veclib.ll | 4 +-
.../LoopVectorize/X86/svml-calls-finite.ll | 24 +-
.../LoopVectorize/X86/svml-calls.ll | 108 ++--
.../LoopVectorize/X86/svml-legal-calls.ll | 513 ++++++++++++++++++
.../LoopVectorize/X86/svml-legal-codegen.ll | 61 +++
llvm/test/Transforms/Util/add-TLI-mappings.ll | 18 +-
llvm/utils/TableGen/CMakeLists.txt | 1 +
llvm/utils/TableGen/SVMLEmitter.cpp | 110 ++++
llvm/utils/TableGen/TableGen.cpp | 6 +
llvm/utils/TableGen/TableGenBackends.h | 1 +
llvm/utils/vim/syntax/llvm.vim | 1 +
30 files changed, 1359 insertions(+), 70 deletions(-)
create mode 100644 llvm/include/llvm/IR/SVML.td
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
create mode 100644 llvm/utils/TableGen/SVMLEmitter.cpp
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index 7bfda0124..a2ce0d0f2 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -40,6 +40,12 @@ struct VecDesc {
NotLibFunc
};
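+
+/// Accuracy variants provided by SVML: the default variant, a high-accuracy
+/// variant ("_ha" suffix) and an enhanced-performance variant ("_ep" suffix,
+/// trading accuracy for speed).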
+enum SVMLAccuracy {
+ SVML_DEFAULT,
+ SVML_HA,
+ SVML_EP
+};
+
/// Implementation of the target library information.
///
/// This class constructs tables that hold the target library information and
@@ -158,7 +164,7 @@ public:
/// Return true if the function F has a vector equivalent with vectorization
/// factor VF.
bool isFunctionVectorizable(StringRef F, const ElementCount &VF) const {
- return !getVectorizedFunction(F, VF).empty();
+ return !getVectorizedFunction(F, VF, false).empty();
}
/// Return true if the function F has a vector equivalent with any
@@ -167,7 +173,10 @@ public:
/// Return the name of the equivalent of F, vectorized with factor VF. If no
/// such mapping exists, return the empty string.
- StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const;
+  std::string getVectorizedFunction(StringRef F, const ElementCount &VF,
+                                    bool IsFast) const;
+
+ Optional<CallingConv::ID> getVectorizedFunctionCallingConv(
+ StringRef F, const FunctionType &FTy, const DataLayout &DL) const;
/// Set to true iff i32 parameters to library functions should have signext
/// or zeroext attributes if they correspond to C-level int or unsigned int,
@@ -334,8 +343,13 @@ public:
bool isFunctionVectorizable(StringRef F) const {
return Impl->isFunctionVectorizable(F);
}
- StringRef getVectorizedFunction(StringRef F, const ElementCount &VF) const {
- return Impl->getVectorizedFunction(F, VF);
+  std::string getVectorizedFunction(StringRef F, const ElementCount &VF,
+                                    bool IsFast) const {
+ return Impl->getVectorizedFunction(F, VF, IsFast);
+ }
+
+ Optional<CallingConv::ID> getVectorizedFunctionCallingConv(
+ StringRef F, const FunctionType &FTy, const DataLayout &DL) const {
+ return Impl->getVectorizedFunctionCallingConv(F, FTy, DL);
}
/// Tests if the function is both available and a candidate for optimized code
diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index 04235f0fd..ca552efcd 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -130,6 +130,9 @@ enum Kind {
kw_fastcc,
kw_coldcc,
kw_intel_ocl_bicc,
+ kw_intel_svmlcc128,
+ kw_intel_svmlcc256,
+ kw_intel_svmlcc512,
kw_cfguard_checkcc,
kw_x86_stdcallcc,
kw_x86_fastcallcc,
diff --git a/llvm/include/llvm/IR/CMakeLists.txt b/llvm/include/llvm/IR/CMakeLists.txt
index 5151f9125..3c263a5d3 100644
--- a/llvm/include/llvm/IR/CMakeLists.txt
+++ b/llvm/include/llvm/IR/CMakeLists.txt
@@ -22,3 +22,7 @@ tablegen(LLVM IntrinsicsX86.h -gen-intrinsic-enums -intrinsic-prefix=x86)
tablegen(LLVM IntrinsicsXCore.h -gen-intrinsic-enums -intrinsic-prefix=xcore)
tablegen(LLVM IntrinsicsVE.h -gen-intrinsic-enums -intrinsic-prefix=ve)
add_public_tablegen_target(intrinsics_gen)
+
+set(LLVM_TARGET_DEFINITIONS SVML.td)
+tablegen(LLVM SVML.inc -gen-svml)
+add_public_tablegen_target(svml_gen)
diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h
index fd2854246..096eea1a8 100644
--- a/llvm/include/llvm/IR/CallingConv.h
+++ b/llvm/include/llvm/IR/CallingConv.h
@@ -252,6 +252,11 @@ namespace CallingConv {
/// M68k_INTR - Calling convention used for M68k interrupt routines.
M68k_INTR = 101,
+ /// Intel_SVML - Calling conventions for Intel Short Math Vector Library
+ Intel_SVML128 = 102,
+ Intel_SVML256 = 103,
+ Intel_SVML512 = 104,
+
/// The highest possible calling convention ID. Must be some 2^k - 1.
MaxID = 1023
};
diff --git a/llvm/include/llvm/IR/SVML.td b/llvm/include/llvm/IR/SVML.td
new file mode 100644
index 000000000..5af710404
--- /dev/null
+++ b/llvm/include/llvm/IR/SVML.td
@@ -0,0 +1,62 @@
+//===-- SVML.td - Defines SVML call variants ----------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is used by TableGen to define the different types of SVML function
+// variants used with -fveclib=SVML.
+//
+//===----------------------------------------------------------------------===//
+
+class SvmlVariant;
+
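+// Each SvmlVariant def below is expanded by the -gen-svml TableGen backend
+// into VecDesc entries in SVML.inc that map a scalar libm function to its
+// SVML vector variants, e.g. sin -> __svml_sin2/__svml_sin4/__svml_sin8 and
+// sinf -> __svml_sinf4/__svml_sinf8/__svml_sinf16.
+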
+def sin : SvmlVariant;
+def cos : SvmlVariant;
+def pow : SvmlVariant;
+def exp : SvmlVariant;
+def log : SvmlVariant;
+def acos : SvmlVariant;
+def acosh : SvmlVariant;
+def asin : SvmlVariant;
+def asinh : SvmlVariant;
+def atan2 : SvmlVariant;
+def atan : SvmlVariant;
+def atanh : SvmlVariant;
+def cbrt : SvmlVariant;
+def cdfnorm : SvmlVariant;
+def cdfnorminv : SvmlVariant;
+def cosd : SvmlVariant;
+def cosh : SvmlVariant;
+def erf : SvmlVariant;
+def erfc : SvmlVariant;
+def erfcinv : SvmlVariant;
+def erfinv : SvmlVariant;
+def exp10 : SvmlVariant;
+def exp2 : SvmlVariant;
+def expm1 : SvmlVariant;
+def hypot : SvmlVariant;
+def invsqrt : SvmlVariant;
+def log10 : SvmlVariant;
+def log1p : SvmlVariant;
+def log2 : SvmlVariant;
+def sind : SvmlVariant;
+def sinh : SvmlVariant;
+def sqrt : SvmlVariant;
+def tan : SvmlVariant;
+def tanh : SvmlVariant;
+
+// TODO: SVML does not currently provide _ha and _ep variants of these functions.
+// We should call the default variant of these functions in all cases instead.
+
+// def nearbyint : SvmlVariant;
+// def logb : SvmlVariant;
+// def floor : SvmlVariant;
+// def fmod : SvmlVariant;
+// def ceil : SvmlVariant;
+// def trunc : SvmlVariant;
+// def rint : SvmlVariant;
+// def round : SvmlVariant;
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index e59725c99..89af7f5d9 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -149,6 +149,7 @@ add_llvm_component_library(LLVMAnalysis
DEPENDS
intrinsics_gen
${MLDeps}
+ svml_gen
LINK_LIBS
${MLLinkDeps}
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 8ebdb65e8..eb3009593 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -110,6 +110,11 @@ bool TargetLibraryInfoImpl::isCallingConvCCompatible(Function *F) {
F->getFunctionType());
}
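+
+// Mangle an SVML function name for the requested accuracy: fast-math calls
+// use the default variant; all other calls use the high-accuracy ("_ha")
+// variant.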
+static std::string svmlMangle(StringRef FnName, const bool IsFast) {
+ std::string FullName = FnName.str();
+ return IsFast ? FullName : FullName + "_ha";
+}
+
/// Initialize the set of available library functions based on the specified
/// target triple. This should be carefully written so that a missing target
/// triple gets a sane set of defaults.
@@ -1878,8 +1883,9 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
}
case SVML: {
const VecDesc VecFuncs[] = {
- #define TLI_DEFINE_SVML_VECFUNCS
- #include "llvm/Analysis/VecFuncs.def"
+ #define GET_SVML_VARIANTS
+ #include "llvm/IR/SVML.inc"
+ #undef GET_SVML_VARIANTS
};
addVectorizableFunctions(VecFuncs);
break;
@@ -1899,20 +1905,51 @@ bool TargetLibraryInfoImpl::isFunctionVectorizable(StringRef funcName) const {
return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName;
}
-StringRef
-TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
- const ElementCount &VF) const {
+std::string TargetLibraryInfoImpl::getVectorizedFunction(StringRef F,
+ const ElementCount &VF,
+ bool IsFast) const {
+ bool FromSVML = ClVectorLibrary == SVML;
F = sanitizeFunctionName(F);
if (F.empty())
- return F;
+ return F.str();
std::vector<VecDesc>::const_iterator I =
llvm::lower_bound(VectorDescs, F, compareWithScalarFnName);
while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) {
- if (I->VectorizationFactor == VF)
- return I->VectorFnName;
+ if (I->VectorizationFactor == VF) {
+ if (FromSVML) {
+ return svmlMangle(I->VectorFnName, IsFast);
+ }
+ return I->VectorFnName.str();
+ }
++I;
}
- return StringRef();
+ return std::string();
+}
+
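+// Select the SVML calling convention from the bit width of the call's vector
+// return type: 128-bit -> Intel_SVML128, 256-bit -> Intel_SVML256, 512-bit
+// -> Intel_SVML512.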
+static CallingConv::ID getSVMLCallingConv(const DataLayout &DL,
+                                          const FunctionType &FType) {
+  assert(isa<VectorType>(FType.getReturnType()));
+  auto *VecCallRetType = cast<VectorType>(FType.getReturnType());
+  auto TypeBitWidth = DL.getTypeSizeInBits(VecCallRetType);
+  if (TypeBitWidth == 128)
+    return CallingConv::Intel_SVML128;
+  if (TypeBitWidth == 256)
+    return CallingConv::Intel_SVML256;
+  if (TypeBitWidth == 512)
+    return CallingConv::Intel_SVML512;
+  llvm_unreachable("Invalid vector width");
+}
+
+Optional<CallingConv::ID>
+TargetLibraryInfoImpl::getVectorizedFunctionCallingConv(
+ StringRef F, const FunctionType &FTy, const DataLayout &DL) const {
+ if (F.startswith("__svml")) {
+ return getSVMLCallingConv(DL, FTy);
+ }
+ return {};
}
TargetLibraryInfo TargetLibraryAnalysis::run(const Function &F,
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index c9a982693..40e89fe57 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -605,6 +605,9 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(spir_kernel);
KEYWORD(spir_func);
KEYWORD(intel_ocl_bicc);
+ KEYWORD(intel_svmlcc128);
+ KEYWORD(intel_svmlcc256);
+ KEYWORD(intel_svmlcc512);
KEYWORD(x86_64_sysvcc);
KEYWORD(win64cc);
KEYWORD(x86_regcallcc);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index fd502eded..8bf9c50be 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -1864,6 +1864,9 @@ void LLParser::parseOptionalDLLStorageClass(unsigned &Res) {
/// ::= 'ccc'
/// ::= 'fastcc'
/// ::= 'intel_ocl_bicc'
+/// ::= 'intel_svmlcc128'
+/// ::= 'intel_svmlcc256'
+/// ::= 'intel_svmlcc512'
/// ::= 'coldcc'
/// ::= 'cfguard_checkcc'
/// ::= 'x86_stdcallcc'
@@ -1933,6 +1936,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break;
case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break;
case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
+  case lltok::kw_intel_svmlcc128: CC = CallingConv::Intel_SVML128; break;
+  case lltok::kw_intel_svmlcc256: CC = CallingConv::Intel_SVML256; break;
+  case lltok::kw_intel_svmlcc512: CC = CallingConv::Intel_SVML512; break;
case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break;
case lltok::kw_win64cc: CC = CallingConv::Win64; break;
case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break;
diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
index 87b8ac59b..5c02e237c 100644
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -156,7 +156,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
// and the exact vector width of the call operands in the
// TargetLibraryInfo.
const std::string TLIName =
- std::string(TLI.getVectorizedFunction(ScalarName, VF));
+        std::string(TLI.getVectorizedFunction(
+            ScalarName, VF, CI.getFastMathFlags().isFast()));
LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `"
<< ScalarName << "` and vector width " << VF << ".\n");
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index a29040b8c..d7a7b4e3f 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -304,6 +304,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
case CallingConv::X86_RegCall: Out << "x86_regcallcc"; break;
case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break;
case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break;
+ case CallingConv::Intel_SVML128: Out << "intel_svmlcc128"; break;
+ case CallingConv::Intel_SVML256: Out << "intel_svmlcc256"; break;
+ case CallingConv::Intel_SVML512: Out << "intel_svmlcc512"; break;
case CallingConv::ARM_APCS: Out << "arm_apcscc"; break;
case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break;
case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index e3ea256af..1a3c50111 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2527,6 +2527,9 @@ void Verifier::visitFunction(const Function &F) {
case CallingConv::Fast:
case CallingConv::Cold:
case CallingConv::Intel_OCL_BI:
+ case CallingConv::Intel_SVML128:
+ case CallingConv::Intel_SVML256:
+ case CallingConv::Intel_SVML512:
case CallingConv::PTX_Kernel:
case CallingConv::PTX_Device:
Check(!F.isVarArg(),
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 4dd8a6cdd..12e655212 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -498,6 +498,21 @@ def RetCC_X86_64 : CallingConv<[
CCDelegateTo<RetCC_X86_64_C>
]>;
+// Intel_SVML return-value convention.
+def RetCC_Intel_SVML : CallingConv<[
+ // Vector types are returned in XMM0,XMM1
+ CCIfType<[v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1]>>,
+
+ // 256-bit FP vectors
+ CCIfType<[v8f32, v4f64],
+ CCAssignToReg<[YMM0,YMM1]>>,
+
+ // 512-bit FP vectors
+ CCIfType<[v16f32, v8f64],
+ CCAssignToReg<[ZMM0,ZMM1]>>
+]>;
+
// This is the return-value convention used for the entire X86 backend.
let Entry = 1 in
def RetCC_X86 : CallingConv<[
@@ -505,6 +520,10 @@ def RetCC_X86 : CallingConv<[
// Check if this is the Intel OpenCL built-ins calling convention
CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+ CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo<RetCC_Intel_SVML>>,
+ CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo<RetCC_Intel_SVML>>,
+ CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo<RetCC_Intel_SVML>>,
+
CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
CCDelegateTo<RetCC_X86_32>
]>;
@@ -1064,6 +1083,30 @@ def CC_Intel_OCL_BI : CallingConv<[
CCDelegateTo<CC_X86_32_C>
]>;
+// X86-64 Intel Short Vector Math Library calling convention.
+def CC_Intel_SVML : CallingConv<[
+
+ // The SSE vector arguments are passed in XMM registers.
+ CCIfType<[v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2]>>,
+
+ // The 256-bit vector arguments are passed in YMM registers.
+ CCIfType<[v8f32, v4f64],
+ CCAssignToReg<[YMM0, YMM1, YMM2]>>,
+
+ // The 512-bit vector arguments are passed in ZMM registers.
+ CCIfType<[v16f32, v8f64],
+ CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>
+]>;
+
+def CC_X86_32_Intr : CallingConv<[
+ CCAssignToStack<4, 4>
+]>;
+
+def CC_X86_64_Intr : CallingConv<[
+ CCAssignToStack<8, 8>
+]>;
+
//===----------------------------------------------------------------------===//
// X86 Root Argument Calling Conventions
//===----------------------------------------------------------------------===//
@@ -1115,6 +1158,9 @@ def CC_X86_64 : CallingConv<[
let Entry = 1 in
def CC_X86 : CallingConv<[
CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
+ CCIfCC<"CallingConv::Intel_SVML128", CCDelegateTo<CC_Intel_SVML>>,
+ CCIfCC<"CallingConv::Intel_SVML256", CCDelegateTo<CC_Intel_SVML>>,
+ CCIfCC<"CallingConv::Intel_SVML512", CCDelegateTo<CC_Intel_SVML>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
CCDelegateTo<CC_X86_32>
]>;
@@ -1227,3 +1273,27 @@ def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
(sequence "R%u", 12, 15))>;
def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,
(sequence "XMM%u", 8, 15))>;
+
+// SVML calling convention
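+// SVML routines preserve most registers: in addition to the general-purpose
+// callee-saved registers, the save lists below preserve the vector registers
+// above those used for parameter passing (e.g. XMM8-XMM15 for the 64-bit
+// SysV ABI).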
+def CSR_32_Intel_SVML : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE)>;
+def CSR_32_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_32_Intel_SVML,
+ K4, K5, K6, K7)>;
+
+def CSR_64_Intel_SVML_NoSSE : CalleeSavedRegs<(add RBX, RSI, RDI, RBP, RSP, R12, R13, R14, R15)>;
+
+def CSR_64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+ (sequence "XMM%u", 8, 15))>;
+def CSR_Win64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+ (sequence "XMM%u", 6, 15))>;
+
+def CSR_64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+ (sequence "YMM%u", 8, 15))>;
+def CSR_Win64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+ (sequence "YMM%u", 6, 15))>;
+
+def CSR_64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+ (sequence "ZMM%u", 16, 31),
+ K4, K5, K6, K7)>;
+def CSR_Win64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE,
+ (sequence "ZMM%u", 6, 21),
+ K4, K5, K6, K7)>;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd45c4825..0ad88eac1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3966,7 +3966,8 @@ void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
// FIXME: Only some x86_32 calling conventions support AVX512.
if (Subtarget.useAVX512Regs() &&
(is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
- CallConv == CallingConv::Intel_OCL_BI)))
+ CallConv == CallingConv::Intel_OCL_BI ||
+ CallConv == CallingConv::Intel_SVML512)))
VecVT = MVT::v16f32;
else if (Subtarget.hasAVX())
VecVT = MVT::v8f32;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index f2658f704..b2f4bb2dd 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -274,6 +274,42 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
}
+namespace {
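+// Return the (call-preserved register mask, callee-saved register list) pair
+// for an SVML calling convention, selected by vector width and target ABI
+// (32-bit, 64-bit SysV, or Win64).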
+std::pair<const uint32_t *, const MCPhysReg *> getSVMLRegMaskAndSaveList(
+ bool Is64Bit, bool IsWin64, CallingConv::ID CC) {
+ assert(CC >= CallingConv::Intel_SVML128 && CC <= CallingConv::Intel_SVML512);
+  unsigned Abi = CC - CallingConv::Intel_SVML128; // 0 = 128-bit, 1 = 256-bit, 2 = 512-bit
+
+ const std::pair<const uint32_t *, const MCPhysReg *> Abi64[] = {
+ std::make_pair(CSR_64_Intel_SVML_RegMask, CSR_64_Intel_SVML_SaveList),
+ std::make_pair(CSR_64_Intel_SVML_AVX_RegMask, CSR_64_Intel_SVML_AVX_SaveList),
+ std::make_pair(CSR_64_Intel_SVML_AVX512_RegMask, CSR_64_Intel_SVML_AVX512_SaveList),
+ };
+
+ const std::pair<const uint32_t *, const MCPhysReg *> AbiWin64[] = {
+ std::make_pair(CSR_Win64_Intel_SVML_RegMask, CSR_Win64_Intel_SVML_SaveList),
+ std::make_pair(CSR_Win64_Intel_SVML_AVX_RegMask, CSR_Win64_Intel_SVML_AVX_SaveList),
+ std::make_pair(CSR_Win64_Intel_SVML_AVX512_RegMask, CSR_Win64_Intel_SVML_AVX512_SaveList),
+ };
+
+ const std::pair<const uint32_t *, const MCPhysReg *> Abi32[] = {
+ std::make_pair(CSR_32_Intel_SVML_RegMask, CSR_32_Intel_SVML_SaveList),
+ std::make_pair(CSR_32_Intel_SVML_RegMask, CSR_32_Intel_SVML_SaveList),
+ std::make_pair(CSR_32_Intel_SVML_AVX512_RegMask, CSR_32_Intel_SVML_AVX512_SaveList),
+ };
+
+ if (Is64Bit) {
+ if (IsWin64) {
+ return AbiWin64[Abi];
+ } else {
+ return Abi64[Abi];
+ }
+ } else {
+ return Abi32[Abi];
+ }
+}
+} // namespace
+
const MCPhysReg *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "MachineFunction required");
@@ -329,6 +365,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_64_Intel_OCL_BI_SaveList;
break;
}
+ case CallingConv::Intel_SVML128:
+ case CallingConv::Intel_SVML256:
+ case CallingConv::Intel_SVML512: {
+ return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).second;
+ }
case CallingConv::HHVM:
return CSR_64_HHVM_SaveList;
case CallingConv::X86_RegCall:
@@ -451,6 +492,11 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_64_Intel_OCL_BI_RegMask;
break;
}
+ case CallingConv::Intel_SVML128:
+ case CallingConv::Intel_SVML256:
+ case CallingConv::Intel_SVML512: {
+ return getSVMLRegMaskAndSaveList(Is64Bit, IsWin64, CC).first;
+ }
case CallingConv::HHVM:
return CSR_64_HHVM_RegMask;
case CallingConv::X86_RegCall:
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 09a8b1f1a..6863cf8b6 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -337,6 +337,9 @@ public:
case CallingConv::X86_ThisCall:
case CallingConv::X86_VectorCall:
case CallingConv::Intel_OCL_BI:
+ case CallingConv::Intel_SVML128:
+ case CallingConv::Intel_SVML256:
+ case CallingConv::Intel_SVML512:
return isTargetWin64();
// This convention allows using the Win64 convention on other targets.
case CallingConv::Win64:
diff --git a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
index 55bcb6f3b..230b3c01a 100644
--- a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
+++ b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/FMF.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -91,7 +92,7 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
auto AddVariantDecl = [&](const ElementCount &VF) {
const std::string TLIName =
- std::string(TLI.getVectorizedFunction(ScalarName, VF));
+        std::string(TLI.getVectorizedFunction(
+            ScalarName, VF, CI.getFastMathFlags().isFast()));
if (!TLIName.empty()) {
std::string MangledName =
VFABI::mangleTLIVectorName(TLIName, ScalarName, CI.arg_size(), VF);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5fd4e45d8..8b8c127d5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -629,6 +629,27 @@ protected:
virtual void printDebugTracesAtStart(){};
virtual void printDebugTracesAtEnd(){};
+  /// Check the legality of the SVML call instruction \p VecCall generated for
+  /// the scalar call \p Call. If it is illegal, the equivalent legal
+  /// instruction is returned.
+ Value *legalizeSVMLCall(CallInst *VecCall, CallInst *Call);
+
+ /// Returns the legal VF for a call instruction \p CI using TTI information
+ /// and vector type.
+ ElementCount getLegalVFForCall(CallInst *CI);
+
+  /// Partially vectorize a given call \p Call by breaking it down into
+  /// multiple clones of \p LegalCall, as determined by the legal VF \p LegalVF.
+ Value *partialVectorizeCall(CallInst *Call, CallInst *LegalCall,
+ unsigned LegalVF);
+
+  /// Generate a shufflevector instruction for a vector value \p V based on
+  /// the current \p Part and a smaller VF \p LegalVF.
+ Value *generateShuffleValue(Value *V, unsigned LegalVF, unsigned Part);
+
+ /// Combine partially vectorized calls stored in \p CallResults.
+ Value *combinePartialVecCalls(SmallVectorImpl<Value *> &CallResults);
+
/// The original loop.
Loop *OrigLoop;
@@ -4170,6 +4191,17 @@ bool InnerLoopVectorizer::useOrderedReductions(
return Cost->useOrderedReductions(RdxDesc);
}
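+
+// If TLI recognizes the callee as an SVML routine, tag the vectorized call
+// with the matching intel_svmlcc* calling convention so that the backend
+// uses the SVML register-preserving ABI.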
+static void setVectorFunctionCallingConv(CallInst &CI, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
+ Function *VectorF = CI.getCalledFunction();
+ FunctionType *FTy = VectorF->getFunctionType();
+ StringRef VFName = VectorF->getName();
+ auto CC = TLI.getVectorizedFunctionCallingConv(VFName, *FTy, DL);
+ if (CC) {
+ CI.setCallingConv(*CC);
+ }
+}
+
void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
VPUser &ArgOperands,
VPTransformState &State) {
@@ -4237,11 +4269,249 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
if (isa<FPMathOperator>(V))
V->copyFastMathFlags(&CI);
+ const DataLayout &DL = V->getModule()->getDataLayout();
+ setVectorFunctionCallingConv(*V, DL, *TLI);
+
+      // Perform legalization of the SVML call instruction only if the
+      // original call was not an intrinsic.
+      if (!UseVectorIntrinsic &&
+          V->getCalledFunction()->getName().startswith("__svml")) {
+ LLVM_DEBUG(dbgs() << "LV(SVML): Vector call inst:"; V->dump());
+ auto *LegalV = cast<Instruction>(legalizeSVMLCall(V, &CI));
+ LLVM_DEBUG(dbgs() << "LV: Completed SVML legalization.\n LegalV: ";
+ LegalV->dump());
+ State.set(Def, LegalV, Part);
+ State.addMetadata(LegalV, &CI);
+ } else {
State.set(Def, V, Part);
State.addMetadata(V, &CI);
+ }
}
}
+
+//===----------------------------------------------------------------------===//
+// Implementation of functions for SVML vector call legalization.
+//===----------------------------------------------------------------------===//
+//
+// Unlike other VECLIBs, SVML needs to be used with target-legal
+// vector types. Otherwise, link failures and/or runtime failures
+// will occur. A motivating example could be -
+//
+// double *a;
+// float *b;
+// #pragma clang loop vectorize_width(8)
+// for(i = 0; i < N; ++i) {
+// a[i] = sin(i); // Legal SVML VF must be 4 or below on AVX
+// b[i] = cosf(i); // VF can be 8 on AVX since 8 floats can fit in YMM
+// }
+//
+// The current implementation of vector code generation in LV is
+// driven by a single VF (in InnerLoopVectorizer::VF), which
+// inhibits the flexibility of adjusting/choosing a different VF
+// for different instructions.
+//
+// Due to this limitation, it is much more straightforward to
+// first generate the illegal sin8 call (__svml_sin8 for the SVML
+// vector library) and then legalize it than to try to avoid
+// generating illegal code from the beginning.
+//
+// A solution for this problem is to check the legality of the
+// call instruction right after generating it in the vectorizer;
+// if it is illegal, we split the call arguments and issue multiple
+// calls to match the legal VF. This is currently demonstrated for
+// the SVML vector library calls (non-intrinsic version only).
+//
+// Future directions and extensions:
+// 1) This legalization example shows that a good direction
+// for the VPlan framework would be to model the vector call
+// instructions in a way that the legal VF for each call is chosen
+// correctly within the vectorizer and illegal code generation is
+// avoided.
+// 2) This logic can also be extended to general vector functions,
+// i.e. the legalization of OpenMP declare simd functions. The
+// requirements needed for this will be documented soon.
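+//
+// For example, with 256-bit AVX registers, a generated
+//   call <8 x double> @__svml_sin8(<8 x double> %x)
+// is legalized into two intel_svmlcc256 <4 x double> @__svml_sin4_ha calls
+// on the low and high halves of %x, whose results are recombined with a
+// shufflevector (see svml-legal-calls.ll).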
+
+Value *InnerLoopVectorizer::legalizeSVMLCall(CallInst *VecCall,
+ CallInst *Call) {
+ ElementCount LegalVF = getLegalVFForCall(VecCall);
+
+ assert(LegalVF.getKnownMinValue() > 1 &&
+ "Legal VF for SVML call must be greater than 1 to vectorize");
+
+ if (LegalVF == VF)
+ return VecCall;
+ else if (LegalVF.getKnownMinValue() > VF.getKnownMinValue())
+ // TODO: handle case when we are underfilling vectors
+ return VecCall;
+
+ // Legal VF for this SVML call is smaller than chosen VF, break it down into
+ // smaller call instructions
+
+ // Convert args, types and return type to match legal VF
+ SmallVector<Type *, 4> NewTys;
+ SmallVector<Value *, 4> NewArgs;
+
+ for (Value *ArgOperand : Call->args()) {
+ Type *Ty = ToVectorTy(ArgOperand->getType(), LegalVF);
+ NewTys.push_back(Ty);
+ NewArgs.push_back(UndefValue::get(Ty));
+ }
+
+ // Construct legal vector function
+ const VFShape Shape =
+ VFShape::get(*Call, LegalVF /*EC*/, false /*HasGlobalPred*/);
+ Function *LegalVectorF = VFDatabase(*Call).getVectorizedFunction(Shape);
+ assert(LegalVectorF != nullptr && "Can't create legal vector function.");
+
+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalVectorF: "; LegalVectorF->dump());
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ Call->getOperandBundlesAsDefs(OpBundles);
+  auto LegalV = std::unique_ptr<CallInst>(
+      CallInst::Create(LegalVectorF, NewArgs, OpBundles));
+
+ if (isa<FPMathOperator>(LegalV))
+ LegalV->copyFastMathFlags(Call);
+
+ const DataLayout &DL = VecCall->getModule()->getDataLayout();
+ // Set SVML calling conventions
+ setVectorFunctionCallingConv(*LegalV, DL, *TLI);
+
+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalV: "; LegalV->dump());
+
+  Value *LegalizedCall = partialVectorizeCall(
+      VecCall, LegalV.get(), LegalVF.getKnownMinValue());
+
+ LLVM_DEBUG(dbgs() << "LV(SVML): LegalizedCall: "; LegalizedCall->dump());
+
+  // Remove the now-dead illegal call from the IR.
+  VecCall->eraseFromParent();
+
+ return LegalizedCall;
+}
+
+ElementCount InnerLoopVectorizer::getLegalVFForCall(CallInst *CI) {
+  const DataLayout &DL = CI->getModule()->getDataLayout();
+ FunctionType *CallFT = CI->getFunctionType();
+ // All functions that need legalization should have a vector return type.
+ // This is true for all SVML functions that are currently supported.
+ assert(isa<VectorType>(CallFT->getReturnType()) &&
+ "Return type of call that needs legalization is not a vector.");
+ auto *VecCallRetType = cast<VectorType>(CallFT->getReturnType());
+ Type *ElemType = VecCallRetType->getElementType();
+
+ unsigned TypeBitWidth = DL.getTypeSizeInBits(ElemType);
+  unsigned VectorBitWidth =
+      TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+ unsigned LegalVF = VectorBitWidth / TypeBitWidth;
+
+ LLVM_DEBUG(dbgs() << "LV(SVML): Type Bit Width: " << TypeBitWidth << "\n");
+ LLVM_DEBUG(dbgs() << "LV(SVML): Current VL: " << VF << "\n");
+ LLVM_DEBUG(dbgs() << "LV(SVML): Vector Bit Width: " << VectorBitWidth
+ << "\n");
+ LLVM_DEBUG(dbgs() << "LV(SVML): Legal Target VL: " << LegalVF << "\n");
+
+ return ElementCount::getFixed(LegalVF);
+}
+
+// Partial vectorization of a call instruction is achieved by making clones of
+// \p LegalCall and overwriting their argument operands with the shufflevector
+// equivalents, chosen based on \p LegalVF and the current Part being filled.
+Value *InnerLoopVectorizer::partialVectorizeCall(CallInst *Call,
+ CallInst *LegalCall,
+ unsigned LegalVF) {
+ unsigned NumParts = VF.getKnownMinValue() / LegalVF;
+ LLVM_DEBUG(dbgs() << "LV(SVML): NumParts: " << NumParts << "\n");
+ SmallVector<Value *, 8> CallResults;
+
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ auto *ClonedCall = cast<CallInst>(LegalCall->clone());
+
+ // Update the arg operand of cloned call to shufflevector
+ for (unsigned i = 0, ie = Call->arg_size(); i != ie; ++i) {
+ auto *NewOp = generateShuffleValue(Call->getArgOperand(i), LegalVF, Part);
+ ClonedCall->setArgOperand(i, NewOp);
+ }
+
+ LLVM_DEBUG(dbgs() << "LV(SVML): ClonedCall: "; ClonedCall->dump());
+
+ auto *PartialVecCall = Builder.Insert(ClonedCall);
+ CallResults.push_back(PartialVecCall);
+ }
+
+ return combinePartialVecCalls(CallResults);
+}
+
+Value *InnerLoopVectorizer::generateShuffleValue(Value *V, unsigned LegalVF,
+ unsigned Part) {
+ // Example:
+ // Consider the following vector code -
+ // %1 = sitofp <4 x i32> %0 to <4 x double>
+ // %2 = call <4 x double> @__svml_sin4(<4 x double> %1)
+ //
+ // If the LegalVF is 2, we partially vectorize the sin4 call by invoking
+ // generateShuffleValue on the operand %1
+  // If Part = 0, the output value is -
+  // %shuffle = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+  // and if Part = 1, the output is -
+  // %shuffle7 = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+
+ assert(isa<VectorType>(V->getType()) &&
+ "Cannot generate shuffles for non-vector values.");
+ SmallVector<int, 4> ShuffleMask;
+ Value *Undef = UndefValue::get(V->getType());
+
+ unsigned ElemIdx = Part * LegalVF;
+
+ for (unsigned K = 0; K < LegalVF; K++)
+ ShuffleMask.push_back(static_cast<int>(ElemIdx + K));
+
+ auto *ShuffleInst =
+ Builder.CreateShuffleVector(V, Undef, ShuffleMask, "shuffle");
+
+ return ShuffleInst;
+}
+
+// Results of the calls executed by smaller legal call instructions must be
+// combined to match the original VF for later use. This is done by constructing
+// shufflevector instructions in a cumulative fashion.
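+// For example, four <2 x double> results are combined pairwise into two
+// <4 x double> values, and those into a single <8 x double> value.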
+Value *InnerLoopVectorizer::combinePartialVecCalls(
+ SmallVectorImpl<Value *> &CallResults) {
+ assert(isa<VectorType>(CallResults[0]->getType()) &&
+ "Cannot combine calls with non-vector results.");
+ auto *CallType = cast<VectorType>(CallResults[0]->getType());
+
+  Value *CombinedShuffle = nullptr;
+ unsigned NumElems = CallType->getElementCount().getKnownMinValue() * 2;
+ unsigned NumRegs = CallResults.size();
+
+ assert(NumRegs >= 2 && isPowerOf2_32(NumRegs) &&
+ "Number of partial vector calls to combine must be a power of 2 "
+ "(atleast 2^1)");
+
+ while (NumRegs > 1) {
+ for (unsigned I = 0; I < NumRegs; I += 2) {
+ SmallVector<int, 4> ShuffleMask;
+ for (unsigned J = 0; J < NumElems; J++)
+ ShuffleMask.push_back(static_cast<int>(J));
+
+ CombinedShuffle = Builder.CreateShuffleVector(
+ CallResults[I], CallResults[I + 1], ShuffleMask, "combined");
+ LLVM_DEBUG(dbgs() << "LV(SVML): CombinedShuffle:";
+ CombinedShuffle->dump());
+ CallResults.push_back(CombinedShuffle);
+ }
+
+ SmallVector<Value *, 2>::iterator Start = CallResults.begin();
+ SmallVector<Value *, 2>::iterator End = Start + NumRegs;
+ CallResults.erase(Start, End);
+
+ NumElems *= 2;
+ NumRegs /= 2;
+ }
+
+ return CombinedShuffle;
+}
+
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 53c11c58f..5074bf21c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7823,6 +7823,17 @@ Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) {
return Vec;
}
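+
+// If TLI recognizes the callee as an SVML routine, tag the vectorized call
+// with the matching intel_svmlcc* calling convention (this mirrors the
+// helper of the same name in LoopVectorize.cpp).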
+static void setVectorFunctionCallingConv(CallInst &CI, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
+ Function *VectorF = CI.getCalledFunction();
+ FunctionType *FTy = VectorF->getFunctionType();
+ StringRef VFName = VectorF->getName();
+ auto CC = TLI.getVectorizedFunctionCallingConv(VFName, *FTy, DL);
+ if (CC) {
+ CI.setCallingConv(*CC);
+ }
+}
+
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IRBuilder<>::InsertPointGuard Guard(Builder);
@@ -8309,7 +8320,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<OperandBundleDef, 1> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
- Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
+
+ CallInst *NewCall = Builder.CreateCall(CF, OpVecs, OpBundles);
+ const DataLayout &DL = NewCall->getModule()->getDataLayout();
+ setVectorFunctionCallingConv(*NewCall, DL, *TLI);
+
+ Value *V = NewCall;
// The scalar argument uses an in-tree scalar so we add the new vectorized
// call to ExternalUses list to make sure that an extract will be
diff --git a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
index df8b7c498..63a36549f 100644
--- a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
+++ b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
define <4 x double> @exp_v4(<4 x double> %in) {
; SVML-LABEL: define {{[^@]+}}@exp_v4
; SVML-SAME: (<4 x double> [[IN:%.*]]) {
-; SVML-NEXT: [[TMP1:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[IN]])
+; SVML-NEXT: [[TMP1:%.*]] = call <4 x double> @__svml_exp4_ha(<4 x double> [[IN]])
; SVML-NEXT: ret <4 x double> [[TMP1]]
;
; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_v4
@@ -37,7 +37,7 @@ declare <4 x double> @llvm.exp.v4f64(<4 x double>) #0
define <4 x float> @exp_f32(<4 x float> %in) {
; SVML-LABEL: define {{[^@]+}}@exp_f32
; SVML-SAME: (<4 x float> [[IN:%.*]]) {
-; SVML-NEXT: [[TMP1:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[IN]])
+; SVML-NEXT: [[TMP1:%.*]] = call <4 x float> @__svml_expf4_ha(<4 x float> [[IN]])
; SVML-NEXT: ret <4 x float> [[TMP1]]
;
; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_f32
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
index a6e191c3d..d6e2e1110 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
@@ -39,7 +39,8 @@ for.end: ; preds = %for.body
declare double @__exp_finite(double) #0
; CHECK-LABEL: @exp_f64
-; CHECK: <4 x double> @__svml_exp4
+; CHECK: <2 x double> @__svml_exp2
+; CHECK: <2 x double> @__svml_exp2
; CHECK: ret
define void @exp_f64(double* nocapture %varray) {
entry:
@@ -99,7 +100,8 @@ for.end: ; preds = %for.body
declare double @__log_finite(double) #0
; CHECK-LABEL: @log_f64
-; CHECK: <4 x double> @__svml_log4
+; CHECK: <2 x double> @__svml_log2
+; CHECK: <2 x double> @__svml_log2
; CHECK: ret
define void @log_f64(double* nocapture %varray) {
entry:
@@ -159,7 +161,8 @@ for.end: ; preds = %for.body
declare double @__pow_finite(double, double) #0
; CHECK-LABEL: @pow_f64
-; CHECK: <4 x double> @__svml_pow4
+; CHECK: <2 x double> @__svml_pow2
+; CHECK: <2 x double> @__svml_pow2
; CHECK: ret
define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
entry:
@@ -190,7 +193,8 @@ declare float @__exp2f_finite(float) #0
define void @exp2f_finite(float* nocapture %varray) {
; CHECK-LABEL: @exp2f_finite(
-; CHECK: call <4 x float> @__svml_exp2f4(<4 x float> %{{.*}})
+; CHECK: call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> %{{.*}})
+; CHECK: call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> %{{.*}})
; CHECK: ret void
;
entry:
@@ -219,7 +223,8 @@ declare double @__exp2_finite(double) #0
define void @exp2_finite(double* nocapture %varray) {
; CHECK-LABEL: @exp2_finite(
-; CHECK: call <4 x double> @__svml_exp24(<4 x double> {{.*}})
+; CHECK: call intel_svmlcc128 <2 x double> @__svml_exp22_ha(<2 x double> {{.*}})
+; CHECK: call intel_svmlcc128 <2 x double> @__svml_exp22_ha(<2 x double> {{.*}})
; CHECK: ret void
;
entry:
@@ -276,7 +281,8 @@ for.end: ; preds = %for.body
declare double @__log2_finite(double) #0
; CHECK-LABEL: @log2_f64
-; CHECK: <4 x double> @__svml_log24
+; CHECK: <2 x double> @__svml_log22
+; CHECK: <2 x double> @__svml_log22
; CHECK: ret
define void @log2_f64(double* nocapture %varray) {
entry:
@@ -333,7 +339,8 @@ for.end: ; preds = %for.body
declare double @__log10_finite(double) #0
; CHECK-LABEL: @log10_f64
-; CHECK: <4 x double> @__svml_log104
+; CHECK: <2 x double> @__svml_log102
+; CHECK: <2 x double> @__svml_log102
; CHECK: ret
define void @log10_f64(double* nocapture %varray) {
entry:
@@ -390,7 +397,8 @@ for.end: ; preds = %for.body
declare double @__sqrt_finite(double) #0
; CHECK-LABEL: @sqrt_f64
-; CHECK: <4 x double> @__svml_sqrt4
+; CHECK: <2 x double> @__svml_sqrt2
+; CHECK: <2 x double> @__svml_sqrt2
; CHECK: ret
define void @sqrt_f64(double* nocapture %varray) {
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
index 42c280df6..088bbdcf1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
@@ -48,7 +48,7 @@ declare float @llvm.exp2.f32(float) #0
define void @sin_f64(double* nocapture %varray) {
; CHECK-LABEL: @sin_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -71,7 +71,7 @@ for.end:
define void @sin_f32(float* nocapture %varray) {
; CHECK-LABEL: @sin_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -94,7 +94,7 @@ for.end:
define void @sin_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @sin_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -117,7 +117,7 @@ for.end:
define void @sin_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @sin_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -140,7 +140,7 @@ for.end:
define void @cos_f64(double* nocapture %varray) {
; CHECK-LABEL: @cos_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -163,7 +163,7 @@ for.end:
define void @cos_f32(float* nocapture %varray) {
; CHECK-LABEL: @cos_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -186,7 +186,7 @@ for.end:
define void @cos_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @cos_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -209,7 +209,7 @@ for.end:
define void @cos_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @cos_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -232,7 +232,7 @@ for.end:
define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
; CHECK-LABEL: @pow_f64(
-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
; CHECK: ret void
;
entry:
@@ -257,7 +257,7 @@ for.end:
define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
; CHECK-LABEL: @pow_f64_intrinsic(
-; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
; CHECK: ret void
;
entry:
@@ -282,7 +282,7 @@ for.end:
define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
; CHECK-LABEL: @pow_f32(
-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
; CHECK: ret void
;
entry:
@@ -307,7 +307,7 @@ for.end:
define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
; CHECK-LABEL: @pow_f32_intrinsic(
-; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK: [[TMP8:%.*]] = call intel_svmlcc128 <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
; CHECK: ret void
;
entry:
@@ -332,7 +332,7 @@ for.end:
define void @exp_f64(double* nocapture %varray) {
; CHECK-LABEL: @exp_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -355,7 +355,7 @@ for.end:
define void @exp_f32(float* nocapture %varray) {
; CHECK-LABEL: @exp_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -378,7 +378,7 @@ for.end:
define void @exp_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @exp_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -401,7 +401,7 @@ for.end:
define void @exp_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @exp_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -424,7 +424,7 @@ for.end:
define void @log_f64(double* nocapture %varray) {
; CHECK-LABEL: @log_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -447,7 +447,7 @@ for.end:
define void @log_f32(float* nocapture %varray) {
; CHECK-LABEL: @log_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -470,7 +470,7 @@ for.end:
define void @log_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @log_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -493,7 +493,7 @@ for.end:
define void @log_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @log_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -516,7 +516,7 @@ for.end:
define void @log2_f64(double* nocapture %varray) {
; CHECK-LABEL: @log2_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log24_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -539,7 +539,7 @@ for.end:
define void @log2_f32(float* nocapture %varray) {
; CHECK-LABEL: @log2_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log2f4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -562,7 +562,7 @@ for.end:
define void @log2_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @log2_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log24_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -585,7 +585,7 @@ for.end:
define void @log2_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @log2_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log2f4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -608,7 +608,7 @@ for.end:
define void @log10_f64(double* nocapture %varray) {
; CHECK-LABEL: @log10_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log104_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -631,7 +631,7 @@ for.end:
define void @log10_f32(float* nocapture %varray) {
; CHECK-LABEL: @log10_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log10f4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -654,7 +654,7 @@ for.end:
define void @log10_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @log10_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log104_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -677,7 +677,7 @@ for.end:
define void @log10_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @log10_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_log10f4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -700,7 +700,7 @@ for.end:
define void @sqrt_f64(double* nocapture %varray) {
; CHECK-LABEL: @sqrt_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sqrt4(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sqrt4_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -723,7 +723,7 @@ for.end:
define void @sqrt_f32(float* nocapture %varray) {
; CHECK-LABEL: @sqrt_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sqrtf4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_sqrtf4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -746,7 +746,7 @@ for.end:
define void @exp2_f64(double* nocapture %varray) {
; CHECK-LABEL: @exp2_f64(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -769,7 +769,7 @@ for.end:
define void @exp2_f32(float* nocapture %varray) {
; CHECK-LABEL: @exp2_f32(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -792,7 +792,7 @@ for.end:
define void @exp2_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @exp2_f64_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp24_ha(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -815,7 +815,7 @@ for.end:
define void @exp2_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @exp2_f32_intrinsic(
-; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: [[TMP5:%.*]] = call intel_svmlcc128 <4 x float> @__svml_exp2f4_ha(<4 x float> [[TMP4:%.*]])
; CHECK: ret void
;
entry:
@@ -836,4 +836,44 @@ for.end:
ret void
}
+; CHECK-LABEL: @atan2_finite
+; CHECK: intel_svmlcc256 <4 x double> @__svml_atan24(
+; CHECK: intel_svmlcc256 <4 x double> @__svml_atan24(
+; CHECK: ret
+
+declare double @__atan2_finite(double, double) local_unnamed_addr #0
+
+define void @atan2_finite([100 x double]* nocapture %varray) local_unnamed_addr #0 {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc7, %entry
+ %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc7 ]
+ %0 = trunc i64 %indvars.iv19 to i32
+ %conv = sitofp i32 %0 to double
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %1 = trunc i64 %indvars.iv.next to i32
+ %conv4 = sitofp i32 %1 to double
+ %call = tail call fast double @__atan2_finite(double %conv, double %conv4)
+ %arrayidx6 = getelementptr inbounds [100 x double], [100 x double]* %varray, i64 %indvars.iv19, i64 %indvars.iv
+ store double %call, double* %arrayidx6, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 100
+ br i1 %exitcond, label %for.inc7, label %for.body3, !llvm.loop !5
+
+for.inc7: ; preds = %for.body3
+ %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+ %exitcond21 = icmp eq i64 %indvars.iv.next20, 100
+ br i1 %exitcond21, label %for.end9, label %for.cond1.preheader
+
+for.end9: ; preds = %for.inc7
+ ret void
+}
+
attributes #0 = { nounwind readnone }
+!5 = distinct !{!5, !6, !7}
+!6 = !{!"llvm.loop.vectorize.width", i32 8}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
new file mode 100644
index 000000000..326c76399
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
@@ -0,0 +1,513 @@
+; Check legalization of SVML calls, including intrinsic versions (like @llvm.<fn_name>.<type>).
+
+; RUN: opt -vector-library=SVML -inject-tli-mappings -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare double @sin(double) #0
+declare float @sinf(float) #0
+declare double @llvm.sin.f64(double) #0
+declare float @llvm.sin.f32(float) #0
+
+declare double @cos(double) #0
+declare float @cosf(float) #0
+declare double @llvm.cos.f64(double) #0
+declare float @llvm.cos.f32(float) #0
+
+declare double @pow(double, double) #0
+declare float @powf(float, float) #0
+declare double @llvm.pow.f64(double, double) #0
+declare float @llvm.pow.f32(float, float) #0
+
+declare double @exp(double) #0
+declare float @expf(float) #0
+declare double @llvm.exp.f64(double) #0
+declare float @llvm.exp.f32(float) #0
+
+declare double @log(double) #0
+declare float @logf(float) #0
+declare double @llvm.log.f64(double) #0
+declare float @llvm.log.f32(float) #0
+
+
+define void @sin_f64(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @sin(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @sin_f32(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @sinf(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @sin_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.sin.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @sin_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.sin.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cos_f64(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @cos(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cos_f32(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @cosf(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cos_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.cos.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cos_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.cos.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]])
+; CHECK: [[TMP4:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv
+ %tmp1 = load double, double* %arrayidx, align 4
+ %tmp2 = tail call double @pow(double %conv, double %tmp1)
+ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %tmp2, double* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]])
+; CHECK: [[TMP4:%.*]] = call intel_svmlcc256 <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv
+ %tmp1 = load double, double* %arrayidx, align 4
+ %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1)
+ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %tmp2, double* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[WIDE_LOAD:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv
+ %tmp1 = load float, float* %arrayidx, align 4
+ %tmp2 = tail call float @powf(float %conv, float %tmp1)
+ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %tmp2, float* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[TMP3:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv
+ %tmp1 = load float, float* %arrayidx, align 4
+ %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1)
+ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %tmp2, float* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp_f64(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @exp(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp_f32(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @expf(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.exp.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.exp.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @log(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @logf(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]])
+; CHECK: [[TMP3:%.*]] = call intel_svmlcc256 <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.log.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32_intrinsic(
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc256 <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.log.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
new file mode 100644
index 000000000..942265344
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
@@ -0,0 +1,63 @@
+; Check that vector codegen splits an illegal sin8 call into two sin4 calls on AVX for the double data type.
+; The C code used to generate this test:
+
+; #include <math.h>
+;
+; void foo(double *a, int N){
+; int i;
+; #pragma clang loop vectorize_width(8)
+; for (i=0;i<N;i++){
+; a[i] = sin(i);
+; }
+; }
+
+; RUN: opt -vector-library=SVML -inject-tli-mappings -loop-vectorize -force-vector-width=8 -mattr=avx -S < %s | FileCheck %s
+
+; CHECK: [[I1:%.*]] = sitofp <8 x i32> [[I0:%.*]] to <8 x double>
+; CHECK-NEXT: [[S1:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[I2:%.*]] = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[S1]])
+; CHECK-NEXT: [[S2:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[I3:%.*]] = call fast intel_svmlcc256 <4 x double> @__svml_sin4(<4 x double> [[S2]])
+; CHECK-NEXT: [[comb:%combined.*]] = shufflevector <4 x double> [[I2]], <4 x double> [[I3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: store <8 x double> [[comb]], <8 x double>* [[TMP:%.*]], align 8
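+; The two 4-wide results are recombined into a single 8-wide vector by the
+; final shufflevector before the store.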
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define dso_local void @foo(double* nocapture %a, i32 %N) local_unnamed_addr #0 {
+entry:
+ %cmp5 = icmp sgt i32 %N, 0
+ br i1 %cmp5, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %0 = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %0 to double
+ %call = tail call fast double @sin(double %conv) #2
+ %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+ store double %call, double* %arrayidx, align 8, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Function Attrs: nounwind
+declare dso_local double @sin(double) local_unnamed_addr #1
+
+!2 = !{!3, !3, i64 0}
+!3 = !{!"double", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
index 8e04c22bf..a7e6978c1 100644
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
@@ -12,12 +12,12 @@ target triple = "x86_64-unknown-linux-gnu"
; COMMON-LABEL: @llvm.compiler.used = appending global
; SVML-SAME: [6 x ptr] [
-; SVML-SAME: ptr @__svml_sin2,
-; SVML-SAME: ptr @__svml_sin4,
-; SVML-SAME: ptr @__svml_sin8,
-; SVML-SAME: ptr @__svml_log10f4,
-; SVML-SAME: ptr @__svml_log10f8,
-; SVML-SAME: ptr @__svml_log10f16
+; SVML-SAME: ptr @__svml_sin2_ha,
+; SVML-SAME: ptr @__svml_sin4_ha,
+; SVML-SAME: ptr @__svml_sin8_ha,
+; SVML-SAME: ptr @__svml_log10f4_ha,
+; SVML-SAME: ptr @__svml_log10f8_ha,
+; SVML-SAME: ptr @__svml_log10f16_ha
; MASSV-SAME: [2 x ptr] [
; MASSV-SAME: ptr @__sind2,
; MASSV-SAME: ptr @__log10f4
@@ -59,9 +59,9 @@ declare float @llvm.log10.f32(float) #0
attributes #0 = { nounwind readnone }
; SVML: attributes #[[SIN]] = { "vector-function-abi-variant"=
-; SVML-SAME: "_ZGV_LLVM_N2v_sin(__svml_sin2),
-; SVML-SAME: _ZGV_LLVM_N4v_sin(__svml_sin4),
-; SVML-SAME: _ZGV_LLVM_N8v_sin(__svml_sin8)" }
+; SVML-SAME: "_ZGV_LLVM_N2v_sin(__svml_sin2_ha),
+; SVML-SAME: _ZGV_LLVM_N4v_sin(__svml_sin4_ha),
+; SVML-SAME: _ZGV_LLVM_N8v_sin(__svml_sin8_ha)" }
; MASSV: attributes #[[SIN]] = { "vector-function-abi-variant"=
; MASSV-SAME: "_ZGV_LLVM_N2v_sin(__sind2)" }
diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt
index 725c99b8e..58e2194e1 100644
--- a/llvm/utils/TableGen/CMakeLists.txt
+++ b/llvm/utils/TableGen/CMakeLists.txt
@@ -47,6 +47,7 @@ add_tablegen(llvm-tblgen LLVM
SearchableTableEmitter.cpp
SubtargetEmitter.cpp
SubtargetFeatureInfo.cpp
+ SVMLEmitter.cpp
TableGen.cpp
Types.cpp
VarLenCodeEmitterGen.cpp
diff --git a/llvm/utils/TableGen/SVMLEmitter.cpp b/llvm/utils/TableGen/SVMLEmitter.cpp
new file mode 100644
index 000000000..a5aeea48d
--- /dev/null
+++ b/llvm/utils/TableGen/SVMLEmitter.cpp
@@ -0,0 +1,127 @@
+//===-- SVMLEmitter.cpp - Generate SVML function variants ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This TableGen backend emits the scalar-to-SVML function mappings for TLI.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenTarget.h"
+#include "llvm/Support/Format.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include <map>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "SVMLVariants"
+#include "llvm/Support/Debug.h"
+
+namespace {
+
+class SVMLVariantsEmitter {
+  RecordKeeper &Records;
+
+  void emitSVMLVariants(raw_ostream &OS);
+
+public:
+  SVMLVariantsEmitter(RecordKeeper &R) : Records(R) {}
+
+  void run(raw_ostream &OS);
+};
+} // End anonymous namespace
+
+/// Emit the set of SVML variant function names.
+// The default is to emit the high-accuracy SVML variants until a mechanism
+// is introduced that allows selecting among variants through user-specified
+// precision requirements. This backend generates mappings to SVML from the
+// scalar forms of LLVM intrinsics, math library calls, and the finite
+// variants of math library calls.
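+//
+// For illustration, a SvmlVariant record named "sin" yields entries such as:
+//
+//   {"sinf", "__svml_sinf4", ElementCount::getFixed(4)},
+//   {"llvm.sin.f32", "__svml_sinf4", ElementCount::getFixed(4)},
+//   {"__sinf_finite", "__svml_sinf4", ElementCount::getFixed(4)},
+//   {"sin", "__svml_sin2", ElementCount::getFixed(2)},
+//   {"llvm.sin.f64", "__svml_sin2", ElementCount::getFixed(2)},
+//   {"__sin_finite", "__svml_sin2", ElementCount::getFixed(2)},
+//
+// and so on for each power-of-two vector length in the ranges below.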
+void SVMLVariantsEmitter::emitSVMLVariants(raw_ostream &OS) {
+
+ const unsigned MinSinglePrecVL = 4;
+ const unsigned MaxSinglePrecVL = 16;
+ const unsigned MinDoublePrecVL = 2;
+ const unsigned MaxDoublePrecVL = 8;
+
+ OS << "#ifdef GET_SVML_VARIANTS\n";
+
+ for (const auto &D : Records.getAllDerivedDefinitions("SvmlVariant")) {
+ StringRef SvmlVariantNameStr = D->getName();
+ // Single Precision SVML
+ for (unsigned VL = MinSinglePrecVL; VL <= MaxSinglePrecVL; VL *= 2) {
+ // Emit the scalar math library function to svml function entry.
+ OS << "{\"" << SvmlVariantNameStr << "f" << "\", ";
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+ << "ElementCount::getFixed(" << VL << ")},\n";
+
+ // Emit the scalar intrinsic to svml function entry.
+ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f32" << "\", ";
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+ << "ElementCount::getFixed(" << VL << ")},\n";
+
+ // Emit the finite math library function to svml function entry.
+ OS << "{\"__" << SvmlVariantNameStr << "f_finite" << "\", ";
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+ << "ElementCount::getFixed(" << VL << ")},\n";
+ }
+
+ // Double Precision SVML
+ for (unsigned VL = MinDoublePrecVL; VL <= MaxDoublePrecVL; VL *= 2) {
+ // Emit the scalar math library function to svml function entry.
+ OS << "{\"" << SvmlVariantNameStr << "\", ";
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << "ElementCount::getFixed(" << VL
+ << ")},\n";
+
+ // Emit the scalar intrinsic to svml function entry.
+ OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f64" << "\", ";
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << "ElementCount::getFixed(" << VL
+ << ")},\n";
+
+ // Emit the finite math library function to svml function entry.
+ OS << "{\"__" << SvmlVariantNameStr << "_finite" << "\", ";
+ OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", "
+ << "ElementCount::getFixed(" << VL << ")},\n";
+ }
+ }
+
+ OS << "#endif // GET_SVML_VARIANTS\n\n";
+}
+
+void SVMLVariantsEmitter::run(raw_ostream &OS) {
+ emitSVMLVariants(OS);
+}
+
+namespace llvm {
+
+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS) {
+ SVMLVariantsEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
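+
+// Consumers are expected to materialize the generated table by defining
+// GET_SVML_VARIANTS before including the TableGen output of the new -gen-svml
+// action, along these lines (the include path here is illustrative):
+//
+//   #define GET_SVML_VARIANTS
+//   #include "llvm/IR/SVML.inc"
+//   #undef GET_SVML_VARIANTS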
diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp
index efd641887..d31e144ff 100644
--- a/llvm/utils/TableGen/TableGen.cpp
+++ b/llvm/utils/TableGen/TableGen.cpp
@@ -58,6 +58,7 @@ enum ActionType {
GenDirectivesEnumDecl,
GenDirectivesEnumImpl,
GenDXILOperation,
+ GenSVMLVariants,
};
namespace llvm {
@@ -140,6 +141,8 @@ cl::opt<ActionType> Action(
"Generate directive related declaration code (header file)"),
clEnumValN(GenDirectivesEnumImpl, "gen-directive-impl",
"Generate directive related implementation code"),
+ clEnumValN(GenSVMLVariants, "gen-svml",
+ "Generate SVML variant function names"),
clEnumValN(GenDXILOperation, "gen-dxil-operation",
"Generate DXIL operation information")));
@@ -278,6 +281,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
case GenDXILOperation:
EmitDXILOperation(Records, OS);
break;
+ case GenSVMLVariants:
+ EmitSVMLVariants(Records, OS);
+ break;
}
return false;
diff --git a/llvm/utils/TableGen/TableGenBackends.h b/llvm/utils/TableGen/TableGenBackends.h
index 4dff13095..5d58000e7 100644
--- a/llvm/utils/TableGen/TableGenBackends.h
+++ b/llvm/utils/TableGen/TableGenBackends.h
@@ -94,6 +94,7 @@ void EmitAutomata(RecordKeeper &RK, raw_ostream &OS);
void EmitDirectivesDecl(RecordKeeper &RK, raw_ostream &OS);
void EmitDirectivesImpl(RecordKeeper &RK, raw_ostream &OS);
void EmitDXILOperation(RecordKeeper &RK, raw_ostream &OS);
+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS);
} // End llvm namespace
diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim
index 9185a029a..cae895ada 100644
--- a/llvm/utils/vim/syntax/llvm.vim
+++ b/llvm/utils/vim/syntax/llvm.vim
@@ -104,6 +104,7 @@ syn keyword llvmKeyword
\ inreg
\ intel_ocl_bicc
\ inteldialect
+ \ intel_svmlcc
\ internal
\ jumptable
\ linkonce
--
2.41.0