Path: blob/21.2-virgl/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_mem.cpp
*
* @brief Implementation for memory builder functions
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"

#include <cstdarg>

namespace SwrJit
{
    void Builder::AssertMemoryUsageParams(Value* ptr, MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
    }

    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, bool isReadOnly, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }

    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }

    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

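    // Note (editorial, hedged): many of the GEP/LOAD/STORE wrappers in this file
    // accept a Type* and MEM_CLIENT parameter that the base Builder ignores. They
    // are presumably kept in the signatures so that derived builders (e.g.
    // BuilderGfxMem, referenced in the assert above) can override the same entry
    // points and use the type and usage information for translated GFX accesses.
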
    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }

    LoadInst*
        Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }

    LoadInst* Builder::LOAD(Value*                                 basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine&                     name,
                            Type*                                  Ty,
                            MEM_CLIENT                             usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return Builder::LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst* Builder::LOADV(Value*                               basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine&                   name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst*
        Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices, Type* Ty, MEM_CLIENT usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst*
        Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }

    Value* Builder::MEM_ADD(Value*                                 i32Incr,
                            Value*                                 basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine&                     name)
    {
        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that is returned for lanes where the mask is not set
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPS(Value*     vSrc,
                             Value*     pBase,
                             Value*     vIndices,
                             Value*     vMask,
                             uint8_t    scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that is returned for lanes where the mask is not set
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value*     vSrc,
                             Value*     pBase,
                             Value*     vIndices,
                             Value*     vMask,
                             uint8_t    scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }

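    // Per-lane behavior of the gathers above, as described by the parameter docs
    // (illustrative sketch, not part of the original source):
    //
    //     for each SIMD lane i:
    //         dst[i] = vMask[i] ? *(T*)(pBase + vIndices[i] * scale) : vSrc[i]
    //
    // where T is float for GATHERPS and a 32-bit integer for GATHERDD.
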
    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that is returned for lanes where the mask is not set
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value*
        Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where source is a vector of pointers
    /// @param pVecSrcPtr - SIMD wide vector of pointers
    /// @param pVecMask - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values used for inactive lanes
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, AlignType(4), pVecMask, pVecPassthru);
    }

    void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask)
    {
        MASKED_SCATTER(pVecSrc, pVecDstPtr, AlignType(4), pVecMask);
    }

    void Builder::Gather4(const SWR_FORMAT format,
                          Value*           pSrcBase,
                          Value*           byteOffsets,
                          Value*           mask,
                          Value*           vGatherComponents[],
                          bool             bPackedOutput,
                          MEM_CLIENT       usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }

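    // For example (format names illustrative): R32G32B32A32_FLOAT has
    // type[0] == SWR_TYPE_FLOAT and bpc[0] == 32, so it takes the GATHER4PS path
    // above, while an integer format such as R8G8B8A8_UINT falls through to
    // GATHER4DD.
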
    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            MEM_CLIENT             usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

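    // Note on the 16bpc case above: each 32-bit gather lane picks up two packed
    // 16-bit components, so one gather at pSrcBase fetches xy and, when
    // numComps > 2, a second gather at pSrcBase + 4 bytes fetches zw;
    // Shuffle16bpcGather4 then separates the components into per-component rows.
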
    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            MEM_CLIENT             usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

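    // In the 32bpc cases above, all four output rows are seeded with the format's
    // default values first, so components the format lacks (e.g. alpha in a
    // 3-component format) still hold defined data; the gather loop then overwrites
    // only the info.numComps components, placed via info.swizzle.
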
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value*                 vGatherInput[2],
                                      Value*                 vGatherOutput[4],
                                      bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
                                            mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather result, else use the second (zw)
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

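    // PSHUFB note for the shuffles above and below: a -1 control byte (high bit
    // set) zeroes the corresponding destination byte, which is what zero-extends
    // each selected 16-bit (above) or 8-bit (below) component to 32 bits in the
    // unpacked paths.
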
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value*                 vGatherInput,
                                     Value*                 vGatherOutput[],
                                     bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
                                         mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // sign extend all enabled components. If we have a full vVertexElements, output to
            // current simdvertex
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // sign extend
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                assert(vConstMask && "Invalid info.numComps value");
                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

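    // Per-lane behavior of the scatter below, as described by its parameter docs
    // (illustrative sketch, not part of the original source):
    //
    //     for each SIMD lane i:
    //         if (vMask[i]) *(float*)(pDst + vOffsets[i]) = vSrc[i]
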
    //////////////////////////////////////////////////////////////////////////
    /// @brief emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(
        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pDst, usage);
#if LLVM_VERSION_MAJOR >= 11
        SWR_ASSERT(cast<VectorType>(vSrc->getType())->getElementType()->isFloatTy());
#else
        SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy());
#endif
        VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1));
        return;

        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem    = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask (mask &= ~(1 << Index))
        */

        /*
            // Reference implementation kept around for reference

            BasicBlock* pCurBB = IRB()->GetInsertBlock();
            Function*   pFunc  = pCurBB->getParent();
            Type*       pSrcTy = vSrc->getType()->getVectorElementType();

            // Store vectors on stack
            if (pScatterStackSrc == nullptr)
            {
                // Save off stack allocations and reuse per scatter. Significantly reduces stack
                // requirements for shaders with a lot of scatters.
                pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
                pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
            }

            Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
            Value* pOffsetsArrayPtr = pScatterStackOffsets;
            STORE(vSrc, pSrcArrayPtr);
            STORE(vOffsets, pOffsetsArrayPtr);

            // Cast to pointers for random access
            pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
            pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

            Value* pMask = VMOVMSK(vMask);

            // Setup loop basic block
            BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

            // compute first set bit
            Value* pIndex = CTTZ(pMask, C(false));

            Value* pIsUndef = ICMP_EQ(pIndex, C(32));

            // Split current block or create new one if building inline
            BasicBlock* pPostLoop;
            if (pCurBB->getTerminator())
            {
                pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

                // Remove unconditional jump created by splitBasicBlock
                pCurBB->getTerminator()->eraseFromParent();

                // Add terminator to end of original block
                IRB()->SetInsertPoint(pCurBB);

                // Add conditional branch
                COND_BR(pIsUndef, pPostLoop, pLoop);
            }
            else
            {
                pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);

                // Add conditional branch
                COND_BR(pIsUndef, pPostLoop, pLoop);
            }

            // Add loop basic block contents
            IRB()->SetInsertPoint(pLoop);
            PHINode* pIndexPhi = PHI(mInt32Ty, 2);
            PHINode* pMaskPhi  = PHI(mInt32Ty, 2);

            pIndexPhi->addIncoming(pIndex, pCurBB);
            pMaskPhi->addIncoming(pMask, pCurBB);

            // Extract elements for this index
            Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
            Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

            // GEP to this offset in dst
            Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
            pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
            STORE(pSrcElem, pCurDst);

            // Update the mask
            Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

            // Terminator
            Value* pNewIndex = CTTZ(pNewMask, C(false));

            pIsUndef = ICMP_EQ(pNewIndex, C(32));
            COND_BR(pIsUndef, pPostLoop, pLoop);

            // Update phi edges
            pIndexPhi->addIncoming(pNewIndex, pLoop);
            pMaskPhi->addIncoming(pNewMask, pLoop);

            // Move builder to beginning of post loop
            IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());

        */
    }
} // namespace SwrJit