GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file builder_mem.cpp
 *
 * @brief Implementation for memory-related builder functions
 *
 * Notes:
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"

#include <cstdarg>

namespace SwrJit
{
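
    //////////////////////////////////////////////////////////////////////////
    /// @brief Sanity-check the pointer passed to a memory helper. A 64-bit
    ///        integer "pointer" is assumed to be an untranslated GFX address,
    ///        which must be accessed through BuilderGfxMem instead. Note that
    ///        'usage' is accepted for interface symmetry but not inspected here.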
    void Builder::AssertMemoryUsageParams(Value* ptr, MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
    }
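
    //////////////////////////////////////////////////////////////////////////
    /// The GEP/GEPA helpers below are thin wrappers over LLVM's
    /// IRBuilder::CreateGEP; the initializer_list overloads collect the
    /// indices (wrapping integer constants via C()) and forward to GEPA. The
    /// unused Ty/isReadOnly parameters presumably keep the signatures uniform
    /// so an overriding builder such as BuilderGfxMem (see the assert above)
    /// can layer typed or translated addressing on top. A minimal usage
    /// sketch (pStruct is a hypothetical struct pointer):
    ///     Value* pField = GEP(pStruct, {0, 2}); // address of pStruct->member2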
    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, bool isReadOnly, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }

    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }

    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }
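
    //////////////////////////////////////////////////////////////////////////
    /// In-bounds variants: same index-collection pattern as GEP above, but
    /// forwarding to the IN_BOUNDS_GEP overload that emits an 'inbounds' GEP,
    /// which lets LLVM assume the computed address stays within the allocated
    /// object and optimize accordingly.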
    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }
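
    //////////////////////////////////////////////////////////////////////////
    /// LOAD overloads: each first asserts the address is not an untranslated
    /// GFX pointer, then forwards to IRBuilder::CreateLoad. The
    /// initializer_list overload GEPs to basePtr[idx0][idx1]... and loads the
    /// result; LOADV does the same with runtime Value* indices.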
    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }

    LoadInst*
    Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }

    LoadInst* Builder::LOAD(Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name,
                            Type* Ty,
                            MEM_CLIENT usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return Builder::LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst* Builder::LOADV(Value* basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }
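
    //////////////////////////////////////////////////////////////////////////
    /// STORE/STOREV: mirror images of the indexed LOAD/LOADV above; compute
    /// the destination address with GEPA, then store 'val' to it.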
    StoreInst* Builder::STORE(Value* val,
                              Value* basePtr,
                              const std::initializer_list<uint32_t>& indices,
                              Type* Ty,
                              MEM_CLIENT usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst*
    Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }
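
    //////////////////////////////////////////////////////////////////////////
    /// @brief Advance a base pointer by a byte offset (a plain GEP on an
    ///        Int8* base); used below to step from one vertex component to
    ///        the next during Gather4.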
    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }
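
    //////////////////////////////////////////////////////////////////////////
    /// @brief Read-modify-write helper: loads the 32-bit integer at
    ///        basePtr[indices...], adds i32Incr, and stores the sum back.
    /// Example (hypothetical field index, for illustration only):
    ///     MEM_ADD(C(1), pStats, {0, kNumPrimsField}, "numPrims"); // counter++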
    Value* Builder::MEM_ADD(Value* i32Incr,
                            Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name)
    {
        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    ///        supported on the underlying platform, emulate it with loads.
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPS(Value* vSrc,
                             Value* pBase,
                             Value* vIndices,
                             Value* vMask,
                             uint8_t scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }
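
    //////////////////////////////////////////////////////////////////////////
    /// Usage sketch (hypothetical operands, mirroring the GATHER4PS calls
    /// below): gather one 32-bit float per SIMD lane from pBase + byte
    /// offset, with masked-off lanes keeping their passthrough value:
    ///     Value* vDefaults = VIMMED1((float)0);
    ///     Value* vResult   = GATHERPS(vDefaults, pBase, vByteOffsets, vMask, 1, usage);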

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    ///        supported on the underlying platform, emulate it with loads.
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value* vSrc,
                             Value* pBase,
                             Value* vIndices,
                             Value* vMask,
                             uint8_t scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    ///        supported on the underlying platform, emulate it with loads.
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value*
    Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where source is a vector of pointers
    /// @param pVecSrcPtr - SIMD wide vector of pointers
    /// @param pVecMask - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, AlignType(4), pVecMask, pVecPassthru);
    }
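
    //////////////////////////////////////////////////////////////////////////
    /// @brief Masked scatter counterpart of GATHER_PTR: writes each active
    ///        lane of pVecSrc through the corresponding pointer in pVecDstPtr.
    /// @param pVecDstPtr - SIMD wide vector of destination pointers
    /// @param pVecSrc - SIMD wide vector of values to scatter
    /// @param pVecMask - SIMD active lanes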
    void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask)
    {
        MASKED_SCATTER(pVecSrc, pVecDstPtr, AlignType(4), pVecMask);
    }
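
    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather the four components of a formatted element, dispatching
    ///        on the format: 32-bit float formats use GATHER4PS, everything
    ///        else goes through the integer path GATHER4DD.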
    void Builder::Gather4(const SWR_FORMAT format,
                          Value* pSrcBase,
                          Value* byteOffsets,
                          Value* mask,
                          Value* vGatherComponents[],
                          bool bPackedOutput,
                          MEM_CLIENT usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }
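
    //////////////////////////////////////////////////////////////////////////
    /// Usage sketch (hypothetical format and operands): fetch all four
    /// channels of a 32-bit float RGBA element per lane into separate
    /// component registers:
    ///     Value* vComponents[4];
    ///     Gather4(R32G32B32A32_FLOAT, pBase, vByteOffsets, vMask,
    ///             vComponents, false /*bPackedOutput*/);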

    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value* pSrcBase,
                            Value* byteOffsets,
                            Value* vMask,
                            Value* vGatherComponents[],
                            bool bPackedOutput,
                            MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch
            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components (zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value* pSrcBase,
                            Value* byteOffsets,
                            Value* vMask,
                            Value* vGatherComponents[],
                            bool bPackedOutput,
                            MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch
            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components (zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }
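
    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle the results of the two 16bpc gathers above into
    ///        per-component rows. With bPackedOutput, components are packed
    ///        into 128-bit lanes via PSHUFB+VPERMD; otherwise each component
    ///        is zero-extended into the low word of its own 32-bit lane.
    /// @param vGatherInput - the two xyxy/zwzw gather results
    /// @param vGatherOutput - four SIMD registers, one per component
    /// @param bPackedOutput - emit packed 128-bit lanes instead of one
    ///        component per 32-bit lane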
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value* vGatherInput[2],
                                      Value* vGatherOutput[4],
                                      bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
                                            mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather result (xyxy), else the second (zwzw)
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }
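
    //////////////////////////////////////////////////////////////////////////
    /// @brief 8bpc counterpart of Shuffle16bpcGather4: distributes the single
    ///        xyzw-interleaved gather result into per-component rows, either
    ///        packed into 128-bit lanes or zero-extended to one byte per
    ///        32-bit lane.
    /// @param vGatherInput - xyzw-interleaved gather result
    /// @param vGatherOutput - four SIMD registers, one per component
    /// @param bPackedOutput - emit packed 128-bit lanes instead of one
    ///        component per 32-bit lane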
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value* vGatherInput,
                                     Value* vGatherOutput[],
                                     bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
                                         mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // sign extend all enabled components. If we have a fill vVertexElements, output to
            // current simdvertex
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // sign extend
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                assert(vConstMask && "Invalid info.numComps value");
                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(
        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pDst, usage);
#if LLVM_VERSION_MAJOR >= 11
        SWR_ASSERT(cast<VectorType>(vSrc->getType())->getElementType()->isFloatTy());
#else
        SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy());
#endif
        VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1));
        return;

        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem    = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask (mask &= ~(1 << Index))

        */

        /*
            // Scalar reference implementation, kept for reference

            BasicBlock* pCurBB = IRB()->GetInsertBlock();
            Function*   pFunc  = pCurBB->getParent();
            Type*       pSrcTy = vSrc->getType()->getVectorElementType();

            // Store vectors on stack
            if (pScatterStackSrc == nullptr)
            {
                // Save off stack allocations and reuse per scatter. Significantly reduces stack
                // requirements for shaders with a lot of scatters.
                pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
                pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
            }

            Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
            Value* pOffsetsArrayPtr = pScatterStackOffsets;
            STORE(vSrc, pSrcArrayPtr);
            STORE(vOffsets, pOffsetsArrayPtr);

            // Cast to pointers for random access
            pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
            pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

            Value* pMask = VMOVMSK(vMask);

            // Setup loop basic block
            BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

            // compute first set bit
            Value* pIndex = CTTZ(pMask, C(false));

            Value* pIsUndef = ICMP_EQ(pIndex, C(32));

            // Split current block or create new one if building inline
            BasicBlock* pPostLoop;
            if (pCurBB->getTerminator())
            {
                pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

                // Remove unconditional jump created by splitBasicBlock
                pCurBB->getTerminator()->eraseFromParent();

                // Add terminator to end of original block
                IRB()->SetInsertPoint(pCurBB);

                // Add conditional branch
                COND_BR(pIsUndef, pPostLoop, pLoop);
            }
            else
            {
                pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);

                // Add conditional branch
                COND_BR(pIsUndef, pPostLoop, pLoop);
            }

            // Add loop basic block contents
            IRB()->SetInsertPoint(pLoop);
            PHINode* pIndexPhi = PHI(mInt32Ty, 2);
            PHINode* pMaskPhi  = PHI(mInt32Ty, 2);

            pIndexPhi->addIncoming(pIndex, pCurBB);
            pMaskPhi->addIncoming(pMask, pCurBB);

            // Extract elements for this index
            Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
            Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

            // GEP to this offset in dst
            Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
            pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
            STORE(pSrcElem, pCurDst);

            // Update the mask
            Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

            // Terminator
            Value* pNewIndex = CTTZ(pNewMask, C(false));

            pIsUndef = ICMP_EQ(pNewIndex, C(32));
            COND_BR(pIsUndef, pPostLoop, pLoop);

            // Update phi edges
            pIndexPhi->addIncoming(pNewIndex, pLoop);
            pMaskPhi->addIncoming(pNewMask, pLoop);

            // Move builder to beginning of post loop
            IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
        */
    }
} // namespace SwrJit