/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file lower_x86.cpp
 *
 * @brief llvm pass to lower meta code to x86
 *
 * Notes:
 *
 ******************************************************************************/

#include "jit_pch.hpp"
#include "passes.h"
#include "JitManager.h"

#include "common/simdlib.hpp"

#include <unordered_map>

extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);

namespace llvm
{
    // forward declare the initializer
    void initializeLowerX86Pass(PassRegistry&);
} // namespace llvm

namespace SwrJit
{
    using namespace llvm;

    enum TargetArch
    {
        AVX    = 0,
        AVX2   = 1,
        AVX512 = 2
    };

    enum TargetWidth
    {
        W256       = 0,
        W512       = 1,
        NUM_WIDTHS = 2
    };

    struct LowerX86;

    typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;

    struct X86Intrinsic
    {
        IntrinsicID intrin[NUM_WIDTHS];
        EmuFunc     emuFunc;
    };

    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
    // previous behavior of mapping directly to avx/avx2 intrinsics.
    using intrinsicMap_t = std::map<std::string, IntrinsicID>;
    static intrinsicMap_t& getIntrinsicMap() {
        static std::map<std::string, IntrinsicID> intrinsicMap = {
            {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
            {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
            {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
            {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
            {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
            {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
            {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
            {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc}
        };
        return intrinsicMap;
    }
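    // These legacy entries are handled by ProcessIntrinsic() below: the meta call is replaced
    // 1:1 with the mapped LLVM x86 intrinsic and its arguments are forwarded unchanged, e.g. a
    // call to "meta.intrinsic.VPSHUFB" simply becomes a call to llvm.x86.avx2.pshuf.b.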

    // Forward decls
    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction* VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction* VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);

    Instruction* DOUBLE_EMU(LowerX86* pThis,
                            TargetArch arch,
                            TargetWidth width,
                            CallInst* pCallInst,
                            Intrinsic::ID intrin);

    static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
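    // DOUBLE is a sentinel width entry for the tables below: when a width slot holds DOUBLE,
    // ProcessIntrinsicAdvanced() double pumps the next-smaller width's intrinsic through
    // DOUBLE_EMU() instead of emitting a single native call.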

    using intrinsicMapAdvanced_t = std::vector<std::map<std::string, X86Intrinsic>>;

    static intrinsicMapAdvanced_t& getIntrinsicMapAdvanced()
    {
        // clang-format off
        static intrinsicMapAdvanced_t intrinsicMapAdvanced = {
            //                            256 wide                                512 wide
            {
                // AVX
                {"meta.intrinsic.VRCPPS",     {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
                {"meta.intrinsic.VPERMPS",    {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
                {"meta.intrinsic.VPERMD",     {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
                {"meta.intrinsic.VGATHERPD",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
                {"meta.intrinsic.VGATHERPS",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
                {"meta.intrinsic.VGATHERDD",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
                {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
                {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
                {"meta.intrinsic.VROUND",     {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
                {"meta.intrinsic.VHSUBPS",    {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
            },
            {
                // AVX2
                {"meta.intrinsic.VRCPPS",     {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
                {"meta.intrinsic.VPERMPS",    {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
                {"meta.intrinsic.VPERMD",     {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
                {"meta.intrinsic.VGATHERPD",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
                {"meta.intrinsic.VGATHERPS",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
                {"meta.intrinsic.VGATHERDD",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
                {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
                {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
                {"meta.intrinsic.VROUND",     {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
                {"meta.intrinsic.VHSUBPS",    {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
            },
            {
                // AVX512
                {"meta.intrinsic.VRCPPS",     {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
#if LLVM_VERSION_MAJOR < 7
                {"meta.intrinsic.VPERMPS",    {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
                {"meta.intrinsic.VPERMD",     {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
#else
                {"meta.intrinsic.VPERMPS",    {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
                {"meta.intrinsic.VPERMD",     {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
#endif
                {"meta.intrinsic.VGATHERPD",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
                {"meta.intrinsic.VGATHERPS",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
                {"meta.intrinsic.VGATHERDD",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
                {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
#if LLVM_VERSION_MAJOR < 7
                {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
#else
                {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
#endif
                {"meta.intrinsic.VROUND",     {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
                {"meta.intrinsic.VHSUBPS",    {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}}
            }};
        // clang-format on
        return intrinsicMapAdvanced;
    }
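    // Lookup is indexed first by the active TargetArch, then by meta intrinsic name; each
    // X86Intrinsic holds one intrinsic ID per TargetWidth plus an emulation callback. For
    // example, on AVX2 "meta.intrinsic.VPERMPS" maps to x86_avx2_permps at 256 bits, while on
    // AVX512 with LLVM >= 7 the same name is lowered through VPERM_EMU for both widths.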

    static uint32_t getBitWidth(VectorType* pVTy)
    {
#if LLVM_VERSION_MAJOR >= 12
        return cast<FixedVectorType>(pVTy)->getNumElements() * pVTy->getElementType()->getPrimitiveSizeInBits();
#elif LLVM_VERSION_MAJOR >= 11
        return pVTy->getNumElements() * pVTy->getElementType()->getPrimitiveSizeInBits();
#else
        return pVTy->getBitWidth();
#endif
    }

    struct LowerX86 : public FunctionPass
    {
        LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
        {
            initializeLowerX86Pass(*PassRegistry::getPassRegistry());

            // Determine target arch
            if (JM()->mArch.AVX512F())
            {
                mTarget = AVX512;
            }
            else if (JM()->mArch.AVX2())
            {
                mTarget = AVX2;
            }
            else if (JM()->mArch.AVX())
            {
                mTarget = AVX;
            }
            else
            {
                SWR_ASSERT(false, "Unsupported AVX architecture.");
                mTarget = AVX;
            }

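            // ScatterPS_256 is implemented as a host C function (declared extern "C" above);
            // registering it with DynamicLibrary::AddSymbol below lets the JIT resolve calls
            // to it by name at execution time.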
            // Setup scatter function for 256 wide
            uint32_t curWidth = B->mVWidth;
            B->SetTargetWidth(8);
            std::vector<Type*> args = {
                B->mInt8PtrTy,   // pBase
                B->mSimdInt32Ty, // vIndices
                B->mSimdFP32Ty,  // vSrc
                B->mInt8Ty,      // mask
                B->mInt32Ty      // scale
            };

            FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
            mPfnScatter256 = cast<Function>(
#if LLVM_VERSION_MAJOR >= 9
                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee());
#else
                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
#endif
            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
            }

            B->SetTargetWidth(curWidth);
        }

        // Try to decipher the vector type of the instruction. This does not work properly
        // across all intrinsics, and will have to be rethought. Probably need something
        // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
        // intrinsic.
        void GetRequestedWidthAndType(CallInst*       pCallInst,
                                      const StringRef intrinName,
                                      TargetWidth*    pWidth,
                                      Type**          pTy)
        {
            assert(pCallInst);
            Type* pVecTy = pCallInst->getType();

            // Check for intrinsic specific types
            // VCVTPD2PS type comes from src, not dst
            if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
            {
                Value* pOp = pCallInst->getOperand(0);
                assert(pOp);
                pVecTy = pOp->getType();
            }

            if (!pVecTy->isVectorTy())
            {
                for (auto& op : pCallInst->arg_operands())
                {
                    if (op.get()->getType()->isVectorTy())
                    {
                        pVecTy = op.get()->getType();
                        break;
                    }
                }
            }
            SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");

            uint32_t width = getBitWidth(cast<VectorType>(pVecTy));
            switch (width)
            {
            case 256:
                *pWidth = W256;
                break;
            case 512:
                *pWidth = W512;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width %d", width);
                *pWidth = W256;
            }

            *pTy = pVecTy->getScalarType();
        }

        Value* GetZeroVec(TargetWidth width, Type* pTy)
        {
            uint32_t numElem = 0;
            switch (width)
            {
            case W256:
                numElem = 8;
                break;
            case W512:
                numElem = 16;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }

            return ConstantVector::getNullValue(getVectorType(pTy, numElem));
        }

        Value* GetMask(TargetWidth width)
        {
            Value* mask;
            switch (width)
            {
            case W256:
                mask = B->C((uint8_t)-1);
                break;
            case W512:
                mask = B->C((uint16_t)-1);
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }
            return mask;
        }

        // Convert <N x i1> mask to <N x i32> x86 mask
        Value* VectorMask(Value* vi1Mask)
        {
#if LLVM_VERSION_MAJOR >= 12
            uint32_t numElem = cast<FixedVectorType>(vi1Mask->getType())->getNumElements();
#elif LLVM_VERSION_MAJOR >= 11
            uint32_t numElem = cast<VectorType>(vi1Mask->getType())->getNumElements();
#else
            uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
#endif
            return B->S_EXT(vi1Mask, getVectorType(B->mInt32Ty, numElem));
        }
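        // The sign extension turns each i1 lane into all-ones (-1) or all-zeros i32, which is
        // the "sign bit set per element" mask convention the AVX/AVX2 gather intrinsics expect.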

        Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();
            assert(pFunc);

            auto& intrinsic = getIntrinsicMapAdvanced()[mTarget][pFunc->getName().str()];
            TargetWidth vecWidth;
            Type* pElemTy;
            GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);

            // Check if there is a native intrinsic for this instruction
            IntrinsicID id = intrinsic.intrin[vecWidth];
            if (id == DOUBLE)
            {
                // Double pump the next smaller SIMD intrinsic
                SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
                Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
                SWR_ASSERT(id2 != Intrinsic::not_intrinsic, "Cannot find intrinsic to double pump.");
                return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
            }
            else if (id != Intrinsic::not_intrinsic)
            {
                Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
                SmallVector<Value*, 8> args;
                for (auto& arg : pCallInst->arg_operands())
                {
                    args.push_back(arg.get());
                }

                // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and
                // a full mask for now, assuming the intrinsics are consistent and place the src
                // operand and mask last in the argument list.
                if (mTarget == AVX512)
                {
                    if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
                    {
                        args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
                        args.push_back(GetMask(W256));
                        // for AVX512 VCVTPD2PS, we also have to add rounding mode
                        args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
                    }
                    else
                    {
                        args.push_back(GetZeroVec(vecWidth, pElemTy));
                        args.push_back(GetMask(vecWidth));
                    }
                }

                return B->CALLA(pIntrin, args);
            }
            else
            {
                // No native intrinsic, call emulation function
                return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
            }

            SWR_ASSERT(false);
            return nullptr;
        }

        Instruction* ProcessIntrinsic(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();
            assert(pFunc);

            // Forward to the advanced support if found
            if (getIntrinsicMapAdvanced()[mTarget].find(pFunc->getName().str()) != getIntrinsicMapAdvanced()[mTarget].end())
            {
                return ProcessIntrinsicAdvanced(pCallInst);
            }

            SWR_ASSERT(getIntrinsicMap().find(pFunc->getName().str()) != getIntrinsicMap().end(),
                       "Unimplemented intrinsic %s.",
                       pFunc->getName().str().c_str());

            Intrinsic::ID x86Intrinsic = getIntrinsicMap()[pFunc->getName().str()];
            Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);

            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                args.push_back(arg.get());
            }
            return B->CALLA(pX86IntrinFunc, args);
        }
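        // The advanced table takes precedence: a name present there is never looked up in the
        // legacy map, even if both tables contain an entry for it.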

        //////////////////////////////////////////////////////////////////////////
        /// @brief LLVM function pass run method.
        /// @param F - The function we're working on with this pass.
        virtual bool runOnFunction(Function& F)
        {
            std::vector<Instruction*> toRemove;
            std::vector<BasicBlock*>  bbs;

            // Make temp copy of the basic blocks and instructions, as the intrinsic
            // replacement code might invalidate the iterators
            for (auto& b : F.getBasicBlockList())
            {
                bbs.push_back(&b);
            }

            for (auto* BB : bbs)
            {
                std::vector<Instruction*> insts;
                for (auto& i : BB->getInstList())
                {
                    insts.push_back(&i);
                }

                for (auto* I : insts)
                {
                    if (CallInst* pCallInst = dyn_cast<CallInst>(I))
                    {
                        Function* pFunc = pCallInst->getCalledFunction();
                        if (pFunc)
                        {
                            if (pFunc->getName().startswith("meta.intrinsic"))
                            {
                                B->IRB()->SetInsertPoint(I);
                                Instruction* pReplace = ProcessIntrinsic(pCallInst);
                                toRemove.push_back(pCallInst);
                                if (pReplace)
                                {
                                    pCallInst->replaceAllUsesWith(pReplace);
                                }
                            }
                        }
                    }
                }
            }

            for (auto* pInst : toRemove)
            {
                pInst->eraseFromParent();
            }

            JitManager::DumpToFile(&F, "lowerx86");

            return true;
        }

        virtual void getAnalysisUsage(AnalysisUsage& AU) const {}

        JitManager* JM() { return B->JM(); }
        Builder*    B;
        TargetArch  mTarget;
        Function*   mPfnScatter256;

        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
    };

    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.

    FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }
    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
        return nullptr;
    }

    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        // Only need vperm emulation for AVX
        SWR_ASSERT(arch == AVX);

        Builder* B = pThis->B;
        auto v32A = pCallInst->getArgOperand(0);
        auto vi32Index = pCallInst->getArgOperand(1);

        Value* v32Result;
        if (isa<Constant>(vi32Index))
        {
            // Can use llvm shuffle vector directly with constant shuffle indices
            v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
        }
        else
        {
            v32Result = UndefValue::get(v32A->getType());
#if LLVM_VERSION_MAJOR >= 12
            uint32_t numElem = cast<FixedVectorType>(v32A->getType())->getNumElements();
#elif LLVM_VERSION_MAJOR >= 11
            uint32_t numElem = cast<VectorType>(v32A->getType())->getNumElements();
#else
            uint32_t numElem = v32A->getType()->getVectorNumElements();
#endif
            for (uint32_t l = 0; l < numElem; ++l)
            {
                auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
                auto val = B->VEXTRACT(v32A, i32Index);
                v32Result = B->VINSERT(v32Result, val, B->C(l));
            }
        }
        return cast<Instruction>(v32Result);
    }
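    // For non-constant indices the loop above builds the equivalent of result[l] = a[index[l]]
    // one lane at a time, since AVX (pre-AVX2) has no variable cross-lane permute instruction.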

    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B = pThis->B;
        auto vSrc = pCallInst->getArgOperand(0);
        auto pBase = pCallInst->getArgOperand(1);
        auto vi32Indices = pCallInst->getArgOperand(2);
        auto vi1Mask = pCallInst->getArgOperand(3);
        auto i8Scale = pCallInst->getArgOperand(4);

        pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
#if LLVM_VERSION_MAJOR >= 11
#if LLVM_VERSION_MAJOR >= 12
        FixedVectorType* pVectorType = cast<FixedVectorType>(vSrc->getType());
#else
        VectorType* pVectorType = cast<VectorType>(vSrc->getType());
#endif
        uint32_t numElem = pVectorType->getNumElements();
        auto srcTy = pVectorType->getElementType();
#else
        uint32_t numElem = vSrc->getType()->getVectorNumElements();
        auto srcTy = vSrc->getType()->getVectorElementType();
#endif
        auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);

        Value* v32Gather = nullptr;
        if (arch == AVX)
        {
            // Full emulation for AVX
            // Store source on stack to provide a valid address to load from inactive lanes
            auto pStack = B->STACKSAVE();
            auto pTmp = B->ALLOCA(vSrc->getType());
            B->STORE(vSrc, pTmp);

            v32Gather = UndefValue::get(vSrc->getType());
#if LLVM_VERSION_MAJOR <= 10
            auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
#elif LLVM_VERSION_MAJOR == 11
            auto vi32Scale = ConstantVector::getSplat(ElementCount(numElem, false), cast<ConstantInt>(i32Scale));
#else
            auto vi32Scale = ConstantVector::getSplat(ElementCount::get(numElem, false), cast<ConstantInt>(i32Scale));
#endif
            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);

            for (uint32_t i = 0; i < numElem; ++i)
            {
                auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
                auto pLoadAddress = B->GEP(pBase, i32Offset);
                pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
                auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
                auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
                auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
                auto val = B->LOAD(pValidAddress);
                v32Gather = B->VINSERT(v32Gather, val, B->C(i));
            }

            B->STACKRESTORE(pStack);
        }
        else if (arch == AVX2 || (arch == AVX512 && width == W256))
        {
            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_ps_256);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_d_256);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_q_256);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            if (width == W256)
            {
                auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
                v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
            }
            else if (width == W512)
            {
                // Double pump 4-wide for 64bit elements
#if LLVM_VERSION_MAJOR >= 12
                if (cast<FixedVectorType>(vSrc->getType())->getElementType() == B->mDoubleTy)
#elif LLVM_VERSION_MAJOR >= 11
                if (cast<VectorType>(vSrc->getType())->getElementType() == B->mDoubleTy)
#else
                if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
#endif
                {
                    auto v64Mask = pThis->VectorMask(vi1Mask);
#if LLVM_VERSION_MAJOR >= 12
                    uint32_t numElem = cast<FixedVectorType>(v64Mask->getType())->getNumElements();
#elif LLVM_VERSION_MAJOR >= 11
                    uint32_t numElem = cast<VectorType>(v64Mask->getType())->getNumElements();
#else
                    uint32_t numElem = v64Mask->getType()->getVectorNumElements();
#endif
                    v64Mask = B->S_EXT(v64Mask, getVectorType(B->mInt64Ty, numElem));
                    v64Mask = B->BITCAST(v64Mask, vSrc->getType());

                    Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
                    Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));

                    Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
                    Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));

                    Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
                    Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));

#if LLVM_VERSION_MAJOR >= 12
                    uint32_t numElemSrc0 = cast<FixedVectorType>(src0->getType())->getNumElements();
                    uint32_t numElemMask0 = cast<FixedVectorType>(mask0->getType())->getNumElements();
                    uint32_t numElemSrc1 = cast<FixedVectorType>(src1->getType())->getNumElements();
                    uint32_t numElemMask1 = cast<FixedVectorType>(mask1->getType())->getNumElements();
#elif LLVM_VERSION_MAJOR >= 11
                    uint32_t numElemSrc0 = cast<VectorType>(src0->getType())->getNumElements();
                    uint32_t numElemMask0 = cast<VectorType>(mask0->getType())->getNumElements();
                    uint32_t numElemSrc1 = cast<VectorType>(src1->getType())->getNumElements();
                    uint32_t numElemMask1 = cast<VectorType>(mask1->getType())->getNumElements();
#else
                    uint32_t numElemSrc0 = src0->getType()->getVectorNumElements();
                    uint32_t numElemMask0 = mask0->getType()->getVectorNumElements();
                    uint32_t numElemSrc1 = src1->getType()->getVectorNumElements();
                    uint32_t numElemMask1 = mask1->getType()->getVectorNumElements();
#endif
                    src0 = B->BITCAST(src0, getVectorType(B->mInt64Ty, numElemSrc0));
                    mask0 = B->BITCAST(mask0, getVectorType(B->mInt64Ty, numElemMask0));
                    Value* gather0 = B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    src1 = B->BITCAST(src1, getVectorType(B->mInt64Ty, numElemSrc1));
                    mask1 = B->BITCAST(mask1, getVectorType(B->mInt64Ty, numElemMask1));
                    Value* gather1 = B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
                    v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                    v32Gather = B->BITCAST(v32Gather, vSrc->getType());
                }
                else
                {
                    // Double pump 8-wide for 32bit elements
                    auto v32Mask = pThis->VectorMask(vi1Mask);
                    v32Mask = B->BITCAST(v32Mask, vSrc->getType());
                    Value* src0 = B->EXTRACT_16(vSrc, 0);
                    Value* src1 = B->EXTRACT_16(vSrc, 1);

                    Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
                    Value* indices1 = B->EXTRACT_16(vi32Indices, 1);

                    Value* mask0 = B->EXTRACT_16(v32Mask, 0);
                    Value* mask1 = B->EXTRACT_16(v32Mask, 1);

                    Value* gather0 = B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    Value* gather1 = B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                    v32Gather = B->JOIN_16(gather0, gather1);
                }
            }
        }
        else if (arch == AVX512)
        {
            Value* iMask = nullptr;
            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dps_512);
                iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpi_512);
                iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpd_512);
                iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
            v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
        }

        return cast<Instruction>(v32Gather);
    }
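    // Gather strategy summary: AVX falls back to a per-lane extract/load/insert loop (inactive
    // lanes read from a stack copy of the source so every address is valid); AVX2, and AVX512
    // at 256 bits, uses the 256-bit avx2 gather intrinsics, double pumping them for 512-bit
    // requests; AVX512 at 512 bits maps directly to the native 512-bit gathers with an i8/i16
    // bitmask.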

    Instruction*
    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B = pThis->B;
        auto pBase = pCallInst->getArgOperand(0);
        auto vi1Mask = pCallInst->getArgOperand(1);
        auto vi32Indices = pCallInst->getArgOperand(2);
        auto v32Src = pCallInst->getArgOperand(3);
        auto i32Scale = pCallInst->getArgOperand(4);

        if (arch != AVX512)
        {
            // Call into C function to do the scatter. This has significantly better compile perf
            // compared to jitting scatter loops for every scatter
            if (width == W256)
            {
                auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
            }
            else
            {
                // Need to break up 512 wide scatter to two 256 wide
                auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                auto indicesLo = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));

                auto mask = B->BITCAST(maskLo, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});

                auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
                auto indicesHi = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
                auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));

                mask = B->BITCAST(maskHi, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
            }
            return nullptr;
        }

        Value* iMask;
        Function* pX86IntrinFunc;
        if (width == W256)
        {
            // No direct intrinsic supported in llvm to scatter 8 elem with 32bit indices, but we
            // can use the scatter of 8 elements with 64bit indices
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_scatter_qps_512);

            auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
            iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
        }
        else if (width == W512)
        {
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_scatter_dps_512);
            iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
        }
        return nullptr;
    }
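    // The pre-AVX512 path calls the host helper ScatterPS_256 (registered in the LowerX86
    // constructor) rather than jitting a per-lane store loop; a 512-wide scatter is simply
    // split into its low and high 256-wide halves before the two calls.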

    // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
    // instructions
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);
        assert(vf32Src);
        auto i8Round = pCallInst->getOperand(1);
        assert(i8Round);
        auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);

        if (width == W256)
        {
            return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
        }
        else if (width == W512)
        {
            auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
            auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);

            auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
            auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);

            return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }

    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);

        if (width == W256)
        {
            auto vf32SrcRound = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                          Intrinsic::x86_avx_round_ps_256);
            return cast<Instruction>(B->FP_TRUNC(vf32SrcRound, B->mFP32Ty));
        }
        else if (width == W512)
        {
            // 512 can use intrinsic
            auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                     Intrinsic::x86_avx512_mask_cvtpd2ps_512);
            return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }

    // No support for hsub in AVX512
    Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B = pThis->B;
        auto src0 = pCallInst->getOperand(0);
        auto src1 = pCallInst->getOperand(1);

        // 256b hsub can just use avx intrinsic
        if (width == W256)
        {
            auto pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
            return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
        }
        else if (width == W512)
        {
            // 512b hsub can be accomplished with shuf/sub combo
            auto minuend = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
            auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
            return cast<Instruction>(B->SUB(minuend, subtrahend));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
            return nullptr;
        }
    }

    // Double pump the input through the given 256-wide intrinsic. This blindly extracts the
    // lower and upper 256 from each vector argument, calls the 256 wide intrinsic on each half,
    // then merges the results to 512 wide
    Instruction* DOUBLE_EMU(LowerX86* pThis,
                            TargetArch arch,
                            TargetWidth width,
                            CallInst* pCallInst,
                            Intrinsic::ID intrin)
    {
        auto B = pThis->B;
        SWR_ASSERT(width == W512);
        Value* result[2];
        Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
        for (uint32_t i = 0; i < 2; ++i)
        {
            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                auto argType = arg.get()->getType();
                if (argType->isVectorTy())
                {
#if LLVM_VERSION_MAJOR >= 12
                    uint32_t vecWidth = cast<FixedVectorType>(argType)->getNumElements();
                    auto elemTy = cast<FixedVectorType>(argType)->getElementType();
#elif LLVM_VERSION_MAJOR >= 11
                    uint32_t vecWidth = cast<VectorType>(argType)->getNumElements();
                    auto elemTy = cast<VectorType>(argType)->getElementType();
#else
                    uint32_t vecWidth = argType->getVectorNumElements();
                    auto elemTy = argType->getVectorElementType();
#endif
                    Value* lanes = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
                    Value* argToPush = B->VSHUFFLE(arg.get(), B->VUNDEF(elemTy, vecWidth), lanes);
                    args.push_back(argToPush);
                }
                else
                {
                    args.push_back(arg.get());
                }
            }
            result[i] = B->CALLA(pX86IntrinFunc, args);
        }
        uint32_t vecWidth;
        if (result[0]->getType()->isVectorTy())
        {
            assert(result[1]->getType()->isVectorTy());
#if LLVM_VERSION_MAJOR >= 12
            vecWidth = cast<FixedVectorType>(result[0]->getType())->getNumElements() +
                       cast<FixedVectorType>(result[1]->getType())->getNumElements();
#elif LLVM_VERSION_MAJOR >= 11
            vecWidth = cast<VectorType>(result[0]->getType())->getNumElements() +
                       cast<VectorType>(result[1]->getType())->getNumElements();
#else
            vecWidth = result[0]->getType()->getVectorNumElements() +
                       result[1]->getType()->getVectorNumElements();
#endif
        }
        else
        {
            vecWidth = 2;
        }
        Value* lanes = B->CInc<int>(0, vecWidth);
        return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
    }

} // namespace SwrJit

using namespace SwrJit;

INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)