Path: blob/21.2-virgl/src/microsoft/compiler/dxil_nir.c
/*
 * Copyright © Microsoft Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "dxil_nir.h"

#include "nir_builder.h"
#include "nir_deref.h"
#include "nir_to_dxil.h"
#include "util/u_math.h"

static void
cl_type_size_align(const struct glsl_type *type, unsigned *size,
                   unsigned *align)
{
   *size = glsl_get_cl_size(type);
   *align = glsl_get_cl_alignment(type);
}

static void
extract_comps_from_vec32(nir_builder *b, nir_ssa_def *vec32,
                         unsigned dst_bit_size,
                         nir_ssa_def **dst_comps,
                         unsigned num_dst_comps)
{
   unsigned step = DIV_ROUND_UP(dst_bit_size, 32);
   unsigned comps_per32b = 32 / dst_bit_size;
   nir_ssa_def *tmp;

   for (unsigned i = 0; i < vec32->num_components; i += step) {
      switch (dst_bit_size) {
      case 64:
         tmp = nir_pack_64_2x32_split(b, nir_channel(b, vec32, i),
                                      nir_channel(b, vec32, i + 1));
         dst_comps[i / 2] = tmp;
         break;
      case 32:
         dst_comps[i] = nir_channel(b, vec32, i);
         break;
      case 16:
      case 8: {
         unsigned dst_offs = i * comps_per32b;

         tmp = nir_unpack_bits(b, nir_channel(b, vec32, i), dst_bit_size);
         for (unsigned j = 0; j < comps_per32b && dst_offs + j < num_dst_comps; j++)
            dst_comps[dst_offs + j] = nir_channel(b, tmp, j);
      }

         break;
      }
   }
}

static nir_ssa_def *
load_comps_to_vec32(nir_builder *b, unsigned src_bit_size,
                    nir_ssa_def **src_comps, unsigned num_src_comps)
{
   unsigned num_vec32comps = DIV_ROUND_UP(num_src_comps * src_bit_size, 32);
   unsigned step = DIV_ROUND_UP(src_bit_size, 32);
   unsigned comps_per32b = 32 / src_bit_size;
   nir_ssa_def *vec32comps[4];

   for (unsigned i = 0; i < num_vec32comps; i += step) {
      switch (src_bit_size) {
      case 64:
         vec32comps[i] = nir_unpack_64_2x32_split_x(b, src_comps[i / 2]);
         vec32comps[i + 1] = nir_unpack_64_2x32_split_y(b, src_comps[i / 2]);
         break;
      case 32:
         vec32comps[i] = src_comps[i];
         break;
      case 16:
      case 8: {
         unsigned src_offs = i * comps_per32b;

         vec32comps[i] = nir_u2u32(b, src_comps[src_offs]);
         for (unsigned j = 1; j < comps_per32b && src_offs + j < num_src_comps; j++) {
            nir_ssa_def *tmp = nir_ishl(b, nir_u2u32(b, src_comps[src_offs + j]),
                                        nir_imm_int(b, j * src_bit_size));
            vec32comps[i] = nir_ior(b, vec32comps[i], tmp);
         }
         break;
      }
      }
   }

   return nir_vec(b, vec32comps, num_vec32comps);
}

static nir_ssa_def *
build_load_ptr_dxil(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *idx)
{
   return nir_load_ptr_dxil(b, 1, 32, &deref->dest.ssa, idx);
}

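/* Shader-temp loads are rewritten into 32-bit load_ptr_dxil accesses of the
 * containing words. Illustrative example (not a verbatim dump of the pass
 * output): a 16-bit scalar load_deref at byte offset 6 becomes a load of
 * word 1 (offset 6 >> 2), a right shift by 16 bits ((6 & 3) * 8), and an
 * extraction of the low 16 bits.
 */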
static bool
lower_load_deref(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->dest.is_ssa);

   b->cursor = nir_before_instr(&intr->instr);

   nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
   if (!nir_deref_mode_is(deref, nir_var_shader_temp))
      return false;
   nir_ssa_def *ptr = nir_u2u32(b, nir_build_deref_offset(b, deref, cl_type_size_align));
   nir_ssa_def *offset = nir_iand(b, ptr, nir_inot(b, nir_imm_int(b, 3)));

   assert(intr->dest.is_ssa);
   unsigned num_components = nir_dest_num_components(intr->dest);
   unsigned bit_size = nir_dest_bit_size(intr->dest);
   unsigned load_size = MAX2(32, bit_size);
   unsigned num_bits = num_components * bit_size;
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   unsigned comp_idx = 0;

   nir_deref_path path;
   nir_deref_path_init(&path, deref, NULL);
   nir_ssa_def *base_idx = nir_ishr(b, offset, nir_imm_int(b, 2 /* log2(32 / 8) */));

   /* Split loads into 32-bit chunks */
   for (unsigned i = 0; i < num_bits; i += load_size) {
      unsigned subload_num_bits = MIN2(num_bits - i, load_size);
      nir_ssa_def *idx = nir_iadd(b, base_idx, nir_imm_int(b, i / 32));
      nir_ssa_def *vec32 = build_load_ptr_dxil(b, path.path[0], idx);

      if (load_size == 64) {
         idx = nir_iadd(b, idx, nir_imm_int(b, 1));
         vec32 = nir_vec2(b, vec32,
                          build_load_ptr_dxil(b, path.path[0], idx));
      }

      /* If we have 2 bytes or less to load we need to adjust the u32 value so
       * we can always extract the LSB.
       */
      if (subload_num_bits <= 16) {
         nir_ssa_def *shift = nir_imul(b, nir_iand(b, ptr, nir_imm_int(b, 3)),
                                       nir_imm_int(b, 8));
         vec32 = nir_ushr(b, vec32, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx],
                               subload_num_bits / bit_size);
      comp_idx += subload_num_bits / bit_size;
   }

   nir_deref_path_finish(&path);
   assert(comp_idx == num_components);
   nir_ssa_def *result = nir_vec(b, comps, num_components);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);
   return true;
}

static nir_ssa_def *
ubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32,
                          nir_ssa_def *offset, unsigned num_bytes)
{
   assert(num_bytes == 16 || num_bytes == 12 || num_bytes == 8 ||
          num_bytes == 4 || num_bytes == 3 || num_bytes == 2 ||
          num_bytes == 1);
   assert(vec32->num_components == 4);

   /* 16 and 12 byte types are always aligned on 16 bytes. */
   if (num_bytes > 8)
      return vec32;

   nir_ssa_def *comps[4];
   nir_ssa_def *cond;

   for (unsigned i = 0; i < 4; i++)
      comps[i] = nir_channel(b, vec32, i);

   /* If we have 8 bytes or less to load, select which half of the vec4
    * should be used.
    */
   cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x8)),
                  nir_imm_int(b, 0));

   comps[0] = nir_bcsel(b, cond, comps[2], comps[0]);
   comps[1] = nir_bcsel(b, cond, comps[3], comps[1]);

   /* Thanks to the CL alignment constraints, if we want 8 bytes we're done. */
   if (num_bytes == 8)
      return nir_vec(b, comps, 2);

   /* 4 bytes or less needed, select which of the 32bit components should be
    * used and return it. The sub-32bit split is handled in
    * extract_comps_from_vec32().
    */
   cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x4)),
                  nir_imm_int(b, 0));
   return nir_bcsel(b, cond, comps[1], comps[0]);
}

nir_ssa_def *
build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
                    nir_ssa_def *offset, unsigned num_components,
                    unsigned bit_size)
{
   nir_ssa_def *idx = nir_ushr(b, offset, nir_imm_int(b, 4));
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   unsigned num_bits = num_components * bit_size;
   unsigned comp_idx = 0;

   /* We need to split loads in 16byte chunks because that's the
    * granularity of cBufferLoadLegacy().
    */
   for (unsigned i = 0; i < num_bits; i += (16 * 8)) {
      /* For each 16byte chunk (or smaller) we generate a 32bit ubo vec
       * load.
       */
      unsigned subload_num_bits = MIN2(num_bits - i, 16 * 8);
      nir_ssa_def *vec32 =
         nir_load_ubo_dxil(b, 4, 32, buffer, nir_iadd(b, idx, nir_imm_int(b, i / (16 * 8))));

      /* First re-arrange the vec32 to account for intra 16-byte offset. */
      vec32 = ubo_load_select_32b_comps(b, vec32, offset, subload_num_bits / 8);

      /* If we have 2 bytes or less to load we need to adjust the u32 value so
       * we can always extract the LSB.
       */
      if (subload_num_bits <= 16) {
         nir_ssa_def *shift = nir_imul(b, nir_iand(b, offset,
                                                   nir_imm_int(b, 3)),
                                       nir_imm_int(b, 8));
         vec32 = nir_ushr(b, vec32, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx],
                               subload_num_bits / bit_size);
      comp_idx += subload_num_bits / bit_size;
   }

   assert(comp_idx == num_components);
   return nir_vec(b, comps, num_components);
}

static bool
lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->dest.is_ssa);
   assert(intr->src[0].is_ssa);
   assert(intr->src[1].is_ssa);

   b->cursor = nir_before_instr(&intr->instr);

   nir_ssa_def *buffer = intr->src[0].ssa;
   nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~3));
   enum gl_access_qualifier access = nir_intrinsic_access(intr);
   unsigned bit_size = nir_dest_bit_size(intr->dest);
   unsigned num_components = nir_dest_num_components(intr->dest);
   unsigned num_bits = num_components * bit_size;

   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   unsigned comp_idx = 0;

   /* We need to split loads in 16byte chunks because that's the optimal
    * granularity of bufferLoad(). Minimum alignment is 4byte, which saves
    * us from extra complexity to extract >= 32 bit components.
    */
   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
      /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec
       * load.
       */
      unsigned subload_num_bits = MIN2(num_bits - i, 4 * 32);

      /* The number of components to load depends on the number of bytes. */
      nir_ssa_def *vec32 =
         nir_load_ssbo(b, DIV_ROUND_UP(subload_num_bits, 32), 32,
                       buffer, nir_iadd(b, offset, nir_imm_int(b, i / 8)),
                       .align_mul = 4,
                       .align_offset = 0,
                       .access = access);

      /* If we have 2 bytes or less to load we need to adjust the u32 value so
       * we can always extract the LSB.
       */
      if (subload_num_bits <= 16) {
         nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, 3)),
                                       nir_imm_int(b, 8));
         vec32 = nir_ushr(b, vec32, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx],
                               subload_num_bits / bit_size);
      comp_idx += subload_num_bits / bit_size;
   }

   assert(comp_idx == num_components);
   nir_ssa_def *result = nir_vec(b, comps, num_components);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);
   return true;
}

static bool
lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr)
{
   b->cursor = nir_before_instr(&intr->instr);

   assert(intr->src[0].is_ssa);
   assert(intr->src[1].is_ssa);
   assert(intr->src[2].is_ssa);

   nir_ssa_def *val = intr->src[0].ssa;
   nir_ssa_def *buffer = intr->src[1].ssa;
   nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~3));

   unsigned bit_size = val->bit_size;
   unsigned num_components = val->num_components;
   unsigned num_bits = num_components * bit_size;

   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   unsigned comp_idx = 0;

   for (unsigned i = 0; i < num_components; i++)
      comps[i] = nir_channel(b, val, i);

   /* We split stores in 16byte chunks because that's the optimal granularity
    * of bufferStore(). Minimum alignment is 4byte, which saves us from
    * extra complexity to store >= 32 bit components.
    */
   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
      /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec
       * store.
       */
      unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32);
      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8));
      nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx],
                                               substore_num_bits / bit_size);
      nir_intrinsic_instr *store;

      if (substore_num_bits < 32) {
         nir_ssa_def *mask = nir_imm_int(b, (1 << substore_num_bits) - 1);

         /* If we have 16 bits or less to store we need to place them
          * correctly in the u32 component. Anything greater than 16 bits
          * (including uchar3) is naturally aligned on 32bits.
          */
         if (substore_num_bits <= 16) {
            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, 3));
            nir_ssa_def *shift = nir_imul_imm(b, pos, 8);

            vec32 = nir_ishl(b, vec32, shift);
            mask = nir_ishl(b, mask, shift);
         }

         store = nir_intrinsic_instr_create(b->shader,
                                            nir_intrinsic_store_ssbo_masked_dxil);
         store->src[0] = nir_src_for_ssa(vec32);
         store->src[1] = nir_src_for_ssa(nir_inot(b, mask));
         store->src[2] = nir_src_for_ssa(buffer);
         store->src[3] = nir_src_for_ssa(local_offset);
      } else {
         store = nir_intrinsic_instr_create(b->shader,
                                            nir_intrinsic_store_ssbo);
         store->src[0] = nir_src_for_ssa(vec32);
         store->src[1] = nir_src_for_ssa(buffer);
         store->src[2] = nir_src_for_ssa(local_offset);

         nir_intrinsic_set_align(store, 4, 0);
      }

      /* The number of components to store depends on the number of bits. */
      store->num_components = DIV_ROUND_UP(substore_num_bits, 32);
      nir_builder_instr_insert(b, &store->instr);
      comp_idx += substore_num_bits / bit_size;
   }

   nir_instr_remove(&intr->instr);
   return true;
}

static void
lower_load_vec32(nir_builder *b, nir_ssa_def *index, unsigned num_comps, nir_ssa_def **comps, nir_intrinsic_op op)
{
   for (unsigned i = 0; i < num_comps; i++) {
      nir_intrinsic_instr *load =
         nir_intrinsic_instr_create(b->shader, op);

      load->num_components = 1;
      load->src[0] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i)));
      nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
      nir_builder_instr_insert(b, &load->instr);
      comps[i] = &load->dest.ssa;
   }
}

static bool
lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->dest.is_ssa);
   unsigned bit_size = nir_dest_bit_size(intr->dest);
   unsigned num_components = nir_dest_num_components(intr->dest);
   unsigned num_bits = num_components * bit_size;

   b->cursor = nir_before_instr(&intr->instr);
   nir_intrinsic_op op = intr->intrinsic;

   assert(intr->src[0].is_ssa);
   nir_ssa_def *offset = intr->src[0].ssa;
   if (op == nir_intrinsic_load_shared) {
      offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr)));
      op = nir_intrinsic_load_shared_dxil;
   } else {
      offset = nir_u2u32(b, offset);
      op = nir_intrinsic_load_scratch_dxil;
   }
   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
   nir_ssa_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2];

   /* We need to split loads in 32-bit accesses because the buffer
    * is an i32 array and DXIL does not support type casts.
    */
   unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32);
   lower_load_vec32(b, index, num_32bit_comps, comps_32bit, op);
   unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4);

   for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) {
      unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4);
      unsigned num_dest_comps = num_vec32_comps * 32 / bit_size;
      nir_ssa_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps);

      /* If we have 16 bits or less to load we need to adjust the u32 value so
       * we can always extract the LSB.
       */
      if (num_bits <= 16) {
         nir_ssa_def *shift =
            nir_imul(b, nir_iand(b, offset, nir_imm_int(b, 3)),
                     nir_imm_int(b, 8));
         vec32 = nir_ushr(b, vec32, shift);
      }

      /* And now comes the pack/unpack step to match the original type. */
      unsigned dest_index = i * 32 / bit_size;
      extract_comps_from_vec32(b, vec32, bit_size, &comps[dest_index], num_dest_comps);
   }

   nir_ssa_def *result = nir_vec(b, comps, num_components);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);

   return true;
}

static void
lower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32, nir_intrinsic_op op)
{

   for (unsigned i = 0; i < vec32->num_components; i++) {
      nir_intrinsic_instr *store =
         nir_intrinsic_instr_create(b->shader, op);

      store->src[0] = nir_src_for_ssa(nir_channel(b, vec32, i));
      store->src[1] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i)));
      store->num_components = 1;
      nir_builder_instr_insert(b, &store->instr);
   }
}

static void
lower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index,
                         nir_ssa_def *vec32, unsigned num_bits, nir_intrinsic_op op)
{
   nir_ssa_def *mask = nir_imm_int(b, (1 << num_bits) - 1);

   /* If we have 16 bits or less to store we need to place them correctly in
    * the u32 component. Anything greater than 16 bits (including uchar3) is
    * naturally aligned on 32bits.
    */
   if (num_bits <= 16) {
      nir_ssa_def *shift =
         nir_imul_imm(b, nir_iand(b, offset, nir_imm_int(b, 3)), 8);

      vec32 = nir_ishl(b, vec32, shift);
      mask = nir_ishl(b, mask, shift);
   }

   if (op == nir_intrinsic_store_shared_dxil) {
      /* Use the dedicated masked intrinsic */
      nir_store_shared_masked_dxil(b, vec32, nir_inot(b, mask), index);
   } else {
      /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */
      nir_ssa_def *load = nir_load_scratch_dxil(b, 1, 32, index);

      nir_ssa_def *new_val = nir_ior(b, vec32,
                                     nir_iand(b,
                                              nir_inot(b, mask),
                                              load));

      lower_store_vec32(b, index, new_val, op);
   }
}

static bool
lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->src[0].is_ssa);
   unsigned num_components = nir_src_num_components(intr->src[0]);
   unsigned bit_size = nir_src_bit_size(intr->src[0]);
   unsigned num_bits = num_components * bit_size;

   b->cursor = nir_before_instr(&intr->instr);
   nir_intrinsic_op op = intr->intrinsic;

   nir_ssa_def *offset = intr->src[1].ssa;
   if (op == nir_intrinsic_store_shared) {
      offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr)));
      op = nir_intrinsic_store_shared_dxil;
   } else {
      offset = nir_u2u32(b, offset);
      op = nir_intrinsic_store_scratch_dxil;
   }
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];

   unsigned comp_idx = 0;
   for (unsigned i = 0; i < num_components; i++)
      comps[i] = nir_channel(b, intr->src[0].ssa, i);

   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
      /* For each 4byte chunk (or smaller) we generate a 32bit scalar store.
       */
      unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32);
      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8));
      nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx],
                                               substore_num_bits / bit_size);
      nir_ssa_def *index = nir_ushr(b, local_offset, nir_imm_int(b, 2));

      /* For anything less than 32bits we need to use the masked version of the
       * intrinsic to preserve data living in the same 32bit slot.
       */
      if (num_bits < 32) {
         lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, op);
      } else {
         lower_store_vec32(b, index, vec32, op);
      }

      comp_idx += substore_num_bits / bit_size;
   }

   nir_instr_remove(&intr->instr);

   return true;
}

static void
ubo_to_temp_patch_deref_mode(nir_deref_instr *deref)
{
   deref->modes = nir_var_shader_temp;
   nir_foreach_use(use_src, &deref->dest.ssa) {
      if (use_src->parent_instr->type != nir_instr_type_deref)
         continue;

      nir_deref_instr *parent = nir_instr_as_deref(use_src->parent_instr);
      ubo_to_temp_patch_deref_mode(parent);
   }
}

static void
ubo_to_temp_update_entry(nir_deref_instr *deref, struct hash_entry *he)
{
   assert(nir_deref_mode_is(deref, nir_var_mem_constant));
   assert(deref->dest.is_ssa);
   assert(he->data);

   nir_foreach_use(use_src, &deref->dest.ssa) {
      if (use_src->parent_instr->type == nir_instr_type_deref) {
         ubo_to_temp_update_entry(nir_instr_as_deref(use_src->parent_instr), he);
      } else if (use_src->parent_instr->type == nir_instr_type_intrinsic) {
         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(use_src->parent_instr);
         if (intr->intrinsic != nir_intrinsic_load_deref)
            he->data = NULL;
      } else {
         he->data = NULL;
      }

      if (!he->data)
         break;
   }
}

bool
dxil_nir_lower_ubo_to_temp(nir_shader *nir)
{
   struct hash_table *ubo_to_temp = _mesa_pointer_hash_table_create(NULL);
   bool progress = false;

   /* First pass: collect all UBO accesses that could be turned into
    * shader temp accesses.
    */
   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;

            nir_deref_instr *deref = nir_instr_as_deref(instr);
            if (!nir_deref_mode_is(deref, nir_var_mem_constant) ||
                deref->deref_type != nir_deref_type_var)
               continue;

            struct hash_entry *he =
               _mesa_hash_table_search(ubo_to_temp, deref->var);

            if (!he)
               he = _mesa_hash_table_insert(ubo_to_temp, deref->var, deref->var);

            if (!he->data)
               continue;

            ubo_to_temp_update_entry(deref, he);
         }
      }
   }

   hash_table_foreach(ubo_to_temp, he) {
      nir_variable *var = he->data;

      if (!var)
         continue;

      /* Change the variable mode. */
      var->data.mode = nir_var_shader_temp;

      /* Make sure the variable has a name.
       * DXIL variables must have names.
       */
      if (!var->name)
         var->name = ralloc_asprintf(nir, "global_%d", exec_list_length(&nir->variables));

      progress = true;
   }
   _mesa_hash_table_destroy(ubo_to_temp, NULL);

   /* Second pass: patch all derefs that were accessing the converted UBOs
    * variables.
    */
   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;

            nir_deref_instr *deref = nir_instr_as_deref(instr);
            if (nir_deref_mode_is(deref, nir_var_mem_constant) &&
                deref->deref_type == nir_deref_type_var &&
                deref->var->data.mode == nir_var_shader_temp)
               ubo_to_temp_patch_deref_mode(deref);
         }
      }
   }

   return progress;
}

static bool
lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->dest.is_ssa);
   assert(intr->src[0].is_ssa);
   assert(intr->src[1].is_ssa);

   b->cursor = nir_before_instr(&intr->instr);

   nir_ssa_def *result =
      build_load_ubo_dxil(b, intr->src[0].ssa, intr->src[1].ssa,
                          nir_dest_num_components(intr->dest),
                          nir_dest_bit_size(intr->dest));

   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);
   return true;
}

bool
dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            switch (intr->intrinsic) {
            case nir_intrinsic_load_deref:
               progress |= lower_load_deref(&b, intr);
               break;
            case nir_intrinsic_load_shared:
            case nir_intrinsic_load_scratch:
               progress |= lower_32b_offset_load(&b, intr);
               break;
            case nir_intrinsic_load_ssbo:
               progress |= lower_load_ssbo(&b, intr);
               break;
            case nir_intrinsic_load_ubo:
               progress |= lower_load_ubo(&b, intr);
               break;
            case nir_intrinsic_store_shared:
            case nir_intrinsic_store_scratch:
               progress |= lower_32b_offset_store(&b, intr);
               break;
            case nir_intrinsic_store_ssbo:
               progress |= lower_store_ssbo(&b, intr);
               break;
            default:
               break;
            }
         }
      }
   }

   return progress;
}

static bool
lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr,
                    nir_intrinsic_op dxil_op)
{
   b->cursor = nir_before_instr(&intr->instr);

   assert(intr->src[0].is_ssa);
   nir_ssa_def *offset =
      nir_iadd(b, intr->src[0].ssa, nir_imm_int(b, nir_intrinsic_base(intr)));
   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));

   nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, dxil_op);
   atomic->src[0] = nir_src_for_ssa(index);
   assert(intr->src[1].is_ssa);
   atomic->src[1] = nir_src_for_ssa(intr->src[1].ssa);
   if (dxil_op == nir_intrinsic_shared_atomic_comp_swap_dxil) {
      assert(intr->src[2].is_ssa);
      atomic->src[2] = nir_src_for_ssa(intr->src[2].ssa);
   }
   atomic->num_components = 0;
   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL);

   nir_builder_instr_insert(b, &atomic->instr);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, &atomic->dest.ssa);
   nir_instr_remove(&intr->instr);
   return true;
}

bool
dxil_nir_lower_atomics_to_dxil(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            switch (intr->intrinsic) {

#define ATOMIC(op)                                                   \
            case nir_intrinsic_shared_atomic_##op:                   \
               progress |= lower_shared_atomic(&b, intr,             \
                              nir_intrinsic_shared_atomic_##op##_dxil); \
               break

            ATOMIC(add);
            ATOMIC(imin);
            ATOMIC(umin);
            ATOMIC(imax);
            ATOMIC(umax);
            ATOMIC(and);
            ATOMIC(or);
            ATOMIC(xor);
            ATOMIC(exchange);
            ATOMIC(comp_swap);

#undef ATOMIC
            default:
               break;
            }
         }
      }
   }

   return progress;
}

static bool
lower_deref_ssbo(nir_builder *b, nir_deref_instr *deref)
{
   assert(nir_deref_mode_is(deref, nir_var_mem_ssbo));
   assert(deref->deref_type == nir_deref_type_var ||
          deref->deref_type == nir_deref_type_cast);
   nir_variable *var = deref->var;

   b->cursor = nir_before_instr(&deref->instr);

   if (deref->deref_type == nir_deref_type_var) {
      /* We turn all deref_var into deref_cast and build a pointer value based on
       * the var binding which encodes the UAV id.
       */
      nir_ssa_def *ptr = nir_imm_int64(b, (uint64_t)var->data.binding << 32);
      nir_deref_instr *deref_cast =
         nir_build_deref_cast(b, ptr, nir_var_mem_ssbo, deref->type,
                              glsl_get_explicit_stride(var->type));
      nir_ssa_def_rewrite_uses(&deref->dest.ssa,
                               &deref_cast->dest.ssa);
      nir_instr_remove(&deref->instr);

      deref = deref_cast;
      return true;
   }
   return false;
}

bool
dxil_nir_lower_deref_ssbo(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;

            nir_deref_instr *deref = nir_instr_as_deref(instr);

            if (!nir_deref_mode_is(deref, nir_var_mem_ssbo) ||
                (deref->deref_type != nir_deref_type_var &&
                 deref->deref_type != nir_deref_type_cast))
               continue;

            progress |= lower_deref_ssbo(&b, deref);
         }
      }
   }

   return progress;
}

static bool
lower_alu_deref_srcs(nir_builder *b, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   bool progress = false;

   b->cursor = nir_before_instr(&alu->instr);

   for (unsigned i = 0; i < info->num_inputs; i++) {
      nir_deref_instr *deref = nir_src_as_deref(alu->src[i].src);

      if (!deref)
         continue;

      nir_deref_path path;
      nir_deref_path_init(&path, deref, NULL);
      nir_deref_instr *root_deref = path.path[0];
      nir_deref_path_finish(&path);

      if (root_deref->deref_type != nir_deref_type_cast)
         continue;

      nir_ssa_def *ptr =
         nir_iadd(b, root_deref->parent.ssa,
                  nir_build_deref_offset(b, deref, cl_type_size_align));
      nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(ptr));
      progress = true;
   }

   return progress;
}

bool
dxil_nir_opt_alu_deref_srcs(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_alu)
               continue;

            nir_alu_instr *alu = nir_instr_as_alu(instr);
            progress |= lower_alu_deref_srcs(&b, alu);
         }
      }
   }

   return progress;
}

static nir_ssa_def *
memcpy_load_deref_elem(nir_builder *b, nir_deref_instr *parent,
                       nir_ssa_def *index)
{
   nir_deref_instr *deref;

   index = nir_i2i(b, index, nir_dest_bit_size(parent->dest));
   assert(parent->deref_type == nir_deref_type_cast);
   deref = nir_build_deref_ptr_as_array(b, parent, index);

   return nir_load_deref(b, deref);
}

static void
memcpy_store_deref_elem(nir_builder *b, nir_deref_instr *parent,
                        nir_ssa_def *index, nir_ssa_def *value)
{
   nir_deref_instr *deref;

   index = nir_i2i(b, index, nir_dest_bit_size(parent->dest));
   assert(parent->deref_type == nir_deref_type_cast);
   deref = nir_build_deref_ptr_as_array(b, parent, index);
   nir_store_deref(b, deref, value, 1);
}

static bool
lower_memcpy_deref(nir_builder *b, nir_intrinsic_instr *intr)
{
   nir_deref_instr *dst_deref = nir_src_as_deref(intr->src[0]);
   nir_deref_instr *src_deref = nir_src_as_deref(intr->src[1]);
   assert(intr->src[2].is_ssa);
   nir_ssa_def *num_bytes = intr->src[2].ssa;

   assert(dst_deref && src_deref);

   b->cursor = nir_after_instr(&intr->instr);

   dst_deref = nir_build_deref_cast(b, &dst_deref->dest.ssa, dst_deref->modes,
                                    glsl_uint8_t_type(), 1);
   src_deref = nir_build_deref_cast(b, &src_deref->dest.ssa, src_deref->modes,
                                    glsl_uint8_t_type(), 1);

   /*
    * We want to avoid 64b instructions, so let's assume we'll always be
    * passed a value that fits in a 32b type and truncate the 64b value.
    */
   num_bytes = nir_u2u32(b, num_bytes);

   nir_variable *loop_index_var =
      nir_local_variable_create(b->impl, glsl_uint_type(), "loop_index");
   nir_deref_instr *loop_index_deref = nir_build_deref_var(b, loop_index_var);
   nir_store_deref(b, loop_index_deref, nir_imm_int(b, 0), 1);

   nir_loop *loop = nir_push_loop(b);
   nir_ssa_def *loop_index = nir_load_deref(b, loop_index_deref);
   nir_ssa_def *cmp = nir_ige(b, loop_index, num_bytes);
   nir_if *loop_check = nir_push_if(b, cmp);
   nir_jump(b, nir_jump_break);
   nir_pop_if(b, loop_check);
   nir_ssa_def *val = memcpy_load_deref_elem(b, src_deref, loop_index);
   memcpy_store_deref_elem(b, dst_deref, loop_index, val);
   nir_store_deref(b, loop_index_deref, nir_iadd_imm(b, loop_index, 1), 1);
   nir_pop_loop(b, loop);
   nir_instr_remove(&intr->instr);
   return true;
}

bool
dxil_nir_lower_memcpy_deref(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            if (intr->intrinsic == nir_intrinsic_memcpy_deref)
               progress |= lower_memcpy_deref(&b, intr);
         }
      }
   }

   return progress;
}

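/* Illustrative example of what cast_phi() does (assuming min_bit_size == 32):
 * a 16-bit phi is replaced by a 32-bit phi whose sources are u2u32 casts
 * emitted right after each source's producing instruction, and the original
 * uses are rewritten to a u2u16 of the new phi inserted after the phi nodes
 * of the block.
 */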
static void
cast_phi(nir_builder *b, nir_phi_instr *phi, unsigned new_bit_size)
{
   nir_phi_instr *lowered = nir_phi_instr_create(b->shader);
   int num_components = 0;
   int old_bit_size = phi->dest.ssa.bit_size;

   nir_op upcast_op = nir_type_conversion_op(nir_type_uint | old_bit_size,
                                             nir_type_uint | new_bit_size,
                                             nir_rounding_mode_undef);
   nir_op downcast_op = nir_type_conversion_op(nir_type_uint | new_bit_size,
                                               nir_type_uint | old_bit_size,
                                               nir_rounding_mode_undef);

   nir_foreach_phi_src(src, phi) {
      assert(num_components == 0 || num_components == src->src.ssa->num_components);
      num_components = src->src.ssa->num_components;

      b->cursor = nir_after_instr_and_phis(src->src.ssa->parent_instr);

      nir_ssa_def *cast = nir_build_alu(b, upcast_op, src->src.ssa, NULL, NULL, NULL);

      nir_phi_src *new_src = rzalloc(lowered, nir_phi_src);
      new_src->pred = src->pred;
      new_src->src = nir_src_for_ssa(cast);
      exec_list_push_tail(&lowered->srcs, &new_src->node);
   }

   nir_ssa_dest_init(&lowered->instr, &lowered->dest,
                     num_components, new_bit_size, NULL);

   b->cursor = nir_before_instr(&phi->instr);
   nir_builder_instr_insert(b, &lowered->instr);

   b->cursor = nir_after_phis(nir_cursor_current_block(b->cursor));
   nir_ssa_def *result = nir_build_alu(b, downcast_op, &lowered->dest.ssa, NULL, NULL, NULL);

   nir_ssa_def_rewrite_uses(&phi->dest.ssa, result);
   nir_instr_remove(&phi->instr);
}

static bool
upcast_phi_impl(nir_function_impl *impl, unsigned min_bit_size)
{
   nir_builder b;
   nir_builder_init(&b, impl);
   bool progress = false;

   nir_foreach_block_reverse(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_phi)
            continue;

         nir_phi_instr *phi = nir_instr_as_phi(instr);
         assert(phi->dest.is_ssa);

         if (phi->dest.ssa.bit_size == 1 ||
             phi->dest.ssa.bit_size >= min_bit_size)
            continue;

         cast_phi(&b, phi, min_bit_size);
         progress = true;
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   }

   return progress;
}

bool
dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= upcast_phi_impl(function->impl, min_bit_size);
   }

   return progress;
}

/* In GLSL and SPIR-V, clip and cull distance are arrays of floats (with a limit of 8).
 * In DXIL, clip and cull distances are up to 2 float4s combined.
 * Coming from GLSL, we can request this 2 float4 format, but coming from SPIR-V,
 * we can't, and have to accept a "compact" array of scalar floats.
 *
 * To help emit a valid input signature for this case, split the variables so that they
 * match what we need to put in the signature (e.g. { float clip[4]; float clip1; float cull[3]; })
 */
bool
dxil_nir_split_clip_cull_distance(nir_shader *shader)
{
   nir_variable *new_var = NULL;
   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      bool progress = false;
      nir_builder b;
      nir_builder_init(&b, function->impl);
      nir_foreach_block(block, function->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;
            nir_deref_instr *deref = nir_instr_as_deref(instr);
            nir_variable *var = nir_deref_instr_get_variable(deref);
            if (!var ||
                var->data.location < VARYING_SLOT_CLIP_DIST0 ||
                var->data.location > VARYING_SLOT_CULL_DIST1 ||
                !var->data.compact)
               continue;

            /* The location should only be inside clip distance, because clip
             * and cull should've been merged by nir_lower_clip_cull_distance_arrays()
             */
            assert(var->data.location == VARYING_SLOT_CLIP_DIST0 ||
                   var->data.location == VARYING_SLOT_CLIP_DIST1);

            /* The deref chain to the clip/cull variables should be simple, just the
             * var and an array with a constant index, otherwise more lowering/optimization
             * might be needed before this pass, e.g. copy prop, lower_io_to_temporaries,
             * split_var_copies, and/or lower_var_copies
             */
            assert(deref->deref_type == nir_deref_type_var ||
                   deref->deref_type == nir_deref_type_array);

            b.cursor = nir_before_instr(instr);
            if (!new_var) {
               /* Update lengths for new and old vars */
               int old_length = glsl_array_size(var->type);
               int new_length = (old_length + var->data.location_frac) - 4;
               old_length -= new_length;

               /* The existing variable fits in the float4 */
               if (new_length <= 0)
                  continue;

               new_var = nir_variable_clone(var, shader);
               nir_shader_add_variable(shader, new_var);
               assert(glsl_get_base_type(glsl_get_array_element(var->type)) == GLSL_TYPE_FLOAT);
               var->type = glsl_array_type(glsl_float_type(), old_length, 0);
               new_var->type = glsl_array_type(glsl_float_type(), new_length, 0);
               new_var->data.location++;
               new_var->data.location_frac = 0;
            }

            /* Update the type for derefs of the old var */
            if (deref->deref_type == nir_deref_type_var) {
               deref->type = var->type;
               continue;
            }

            nir_const_value *index = nir_src_as_const_value(deref->arr.index);
            assert(index);

            /* Treat this array as a vector starting at the component index in location_frac,
             * so if location_frac is 1 and index is 0, then it's accessing the 'y' component
             * of the vector. If index + location_frac is >= 4, there's no component there,
             * so we need to add a new variable and adjust the index.
             */
            unsigned total_index = index->u32 + var->data.location_frac;
            if (total_index < 4)
               continue;

            nir_deref_instr *new_var_deref = nir_build_deref_var(&b, new_var);
            nir_deref_instr *new_array_deref = nir_build_deref_array(&b, new_var_deref, nir_imm_int(&b, total_index % 4));
            nir_ssa_def_rewrite_uses(&deref->dest.ssa, &new_array_deref->dest.ssa);
            progress = true;
         }
      }
      if (progress)
         nir_metadata_preserve(function->impl, nir_metadata_block_index |
                                               nir_metadata_dominance |
                                               nir_metadata_loop_analysis);
      else
         nir_metadata_preserve(function->impl, nir_metadata_all);
   }

   return new_var != NULL;
}

bool
dxil_nir_lower_double_math(nir_shader *shader)
{
   bool progress = false;
   nir_foreach_function(func, shader) {
      bool func_progress = false;
      if (!func->impl)
         continue;

      nir_builder b;
      nir_builder_init(&b, func->impl);
      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_alu)
               continue;

            nir_alu_instr *alu = nir_instr_as_alu(instr);

            /* TODO: See if we can apply this explicitly to packs/unpacks that are then
             * used as a double. As-is, if we had an app explicitly do a 64bit integer op,
             * then try to bitcast to double (not expressible in HLSL, but it is in other
             * source languages), this would unpack the integer and repack as a double, when
             * we probably want to just send the bitcast through to the backend.
             */

            b.cursor = nir_before_instr(&alu->instr);

            for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; ++i) {
               if (nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[i]) == nir_type_float &&
                   alu->src[i].src.ssa->bit_size == 64) {
                  nir_ssa_def *packed_double = nir_channel(&b, alu->src[i].src.ssa, alu->src[i].swizzle[0]);
                  nir_ssa_def *unpacked_double = nir_unpack_64_2x32(&b, packed_double);
                  nir_ssa_def *repacked_double = nir_pack_double_2x32_dxil(&b, unpacked_double);
                  nir_instr_rewrite_src_ssa(instr, &alu->src[i].src, repacked_double);
                  memset(alu->src[i].swizzle, 0, ARRAY_SIZE(alu->src[i].swizzle));
                  func_progress = true;
               }
            }

            if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_float &&
                alu->dest.dest.ssa.bit_size == 64) {
               b.cursor = nir_after_instr(&alu->instr);
               nir_ssa_def *packed_double = &alu->dest.dest.ssa;
               nir_ssa_def *unpacked_double = nir_unpack_double_2x32_dxil(&b, packed_double);
               nir_ssa_def *repacked_double = nir_pack_64_2x32(&b, unpacked_double);
               nir_ssa_def_rewrite_uses_after(packed_double, repacked_double, unpacked_double->parent_instr);
               func_progress = true;
            }
         }
      }

      if (func_progress)
         nir_metadata_preserve(func->impl, nir_metadata_block_index |
                                           nir_metadata_dominance |
                                           nir_metadata_loop_analysis);
      else
         nir_metadata_preserve(func->impl, nir_metadata_all);
      progress |= func_progress;
   }

   return progress;
}

typedef struct {
   gl_system_value *values;
   uint32_t count;
} zero_system_values_state;

static bool
lower_system_value_to_zero_filter(const nir_instr* instr, const void* cb_state)
{
   if (instr->type != nir_instr_type_intrinsic) {
      return false;
   }

   nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(instr);

   /* All the intrinsics we care about are loads */
   if (!nir_intrinsic_infos[intrin->intrinsic].has_dest)
      return false;

   assert(intrin->dest.is_ssa);

   zero_system_values_state* state = (zero_system_values_state*)cb_state;
   for (uint32_t i = 0; i < state->count; ++i) {
      gl_system_value value = state->values[i];
      nir_intrinsic_op value_op = nir_intrinsic_from_system_value(value);

      if (intrin->intrinsic == value_op) {
         return true;
      } else if (intrin->intrinsic == nir_intrinsic_load_deref) {
         nir_deref_instr* deref = nir_src_as_deref(intrin->src[0]);
         if (!nir_deref_mode_is(deref, nir_var_system_value))
            return false;

         nir_variable* var = deref->var;
         if (var->data.location == value) {
            return true;
         }
      }
   }

   return false;
}

static nir_ssa_def*
lower_system_value_to_zero_instr(nir_builder* b, nir_instr* instr, void* _state)
{
   return nir_imm_int(b, 0);
}

bool
dxil_nir_lower_system_values_to_zero(nir_shader* shader,
                                     gl_system_value* system_values,
                                     uint32_t count)
{
   zero_system_values_state state = { system_values, count };
   return nir_shader_lower_instructions(shader,
                                        lower_system_value_to_zero_filter,
                                        lower_system_value_to_zero_instr,
                                        &state);
}

static const struct glsl_type *
get_bare_samplers_for_type(const struct glsl_type *type)
{
   if (glsl_type_is_sampler(type)) {
      if (glsl_sampler_type_is_shadow(type))
         return glsl_bare_shadow_sampler_type();
      else
         return glsl_bare_sampler_type();
   } else if (glsl_type_is_array(type)) {
      return glsl_array_type(
         get_bare_samplers_for_type(glsl_get_array_element(type)),
         glsl_get_length(type),
         0 /*explicit size*/);
   }
   assert(!"Unexpected type");
   return NULL;
}

static bool
redirect_sampler_derefs(struct nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);
   if (!nir_tex_instr_need_sampler(tex))
      return false;

   int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
   if (sampler_idx == -1) {
      /* No derefs, must be using indices */
      struct hash_entry *hash_entry = _mesa_hash_table_u64_search(data, tex->sampler_index);

      /* Already have a bare sampler here */
      if (hash_entry)
         return false;

      nir_variable *typed_sampler = NULL;
      nir_foreach_variable_with_modes(var, b->shader, nir_var_uniform) {
         if (var->data.binding <= tex->sampler_index &&
             var->data.binding + glsl_type_get_sampler_count(var->type) > tex->sampler_index) {
            /* Already have a bare sampler for this binding, add it to the table */
            if (glsl_get_sampler_result_type(glsl_without_array(var->type)) == GLSL_TYPE_VOID) {
               _mesa_hash_table_u64_insert(data, tex->sampler_index, var);
               return false;
            }

            typed_sampler = var;
         }
      }

      /* Clone the typed sampler to a bare sampler and we're done */
      assert(typed_sampler);
      nir_variable *bare_sampler = nir_variable_clone(typed_sampler, b->shader);
      bare_sampler->type = get_bare_samplers_for_type(typed_sampler->type);
      nir_shader_add_variable(b->shader, bare_sampler);
      _mesa_hash_table_u64_insert(data, tex->sampler_index, bare_sampler);
      return true;
   }

   /* Using derefs means we have to rewrite the deref chain in addition to cloning */
   nir_deref_instr *final_deref = nir_src_as_deref(tex->src[sampler_idx].src);
   nir_deref_path path;
   nir_deref_path_init(&path, final_deref, NULL);

   nir_deref_instr *old_tail = path.path[0];
   assert(old_tail->deref_type == nir_deref_type_var);
   nir_variable *old_var = old_tail->var;
   if (glsl_get_sampler_result_type(glsl_without_array(old_var->type)) == GLSL_TYPE_VOID) {
      nir_deref_path_finish(&path);
      return false;
   }

   struct hash_entry *hash_entry = _mesa_hash_table_u64_search(data, old_var->data.binding);
   nir_variable *new_var;
   if (hash_entry) {
      new_var = hash_entry->data;
   } else {
      new_var = nir_variable_clone(old_var, b->shader);
      new_var->type = get_bare_samplers_for_type(old_var->type);
      nir_shader_add_variable(b->shader, new_var);
      _mesa_hash_table_u64_insert(data, old_var->data.binding, new_var);
   }

   b->cursor = nir_after_instr(&old_tail->instr);
   nir_deref_instr *new_tail = nir_build_deref_var(b, new_var);

   for (unsigned i = 1; path.path[i]; ++i) {
      b->cursor = nir_after_instr(&path.path[i]->instr);
      new_tail = nir_build_deref_follower(b, new_tail, path.path[i]);
   }

   nir_deref_path_finish(&path);
   nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[sampler_idx].src, &new_tail->dest.ssa);

   return true;
}

bool
dxil_nir_create_bare_samplers(nir_shader *nir)
{
   struct hash_table_u64 *sampler_to_bare = _mesa_hash_table_u64_create(NULL);

   bool progress = nir_shader_instructions_pass(nir, redirect_sampler_derefs,
      nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, sampler_to_bare);

   _mesa_hash_table_u64_destroy(sampler_to_bare);
   return progress;
}


/* Comparison function to sort io values so that first come normal varyings,
 * then system values, and then system generated values.
 */
static int
variable_location_cmp(const nir_variable* a, const nir_variable* b)
{
   // Sort by driver_location, location, then index
   return a->data.driver_location != b->data.driver_location ?
            a->data.driver_location - b->data.driver_location :
          a->data.location != b->data.location ?
            a->data.location - b->data.location :
          a->data.index - b->data.index;
}

/* Order varyings according to driver location */
uint64_t
dxil_sort_by_driver_location(nir_shader* s, nir_variable_mode modes)
{
   nir_sort_variables_with_modes(s, variable_location_cmp, modes);

   uint64_t result = 0;
   nir_foreach_variable_with_modes(var, s, modes) {
      result |= 1ull << var->data.location;
   }
   return result;
}

/* Sort PS outputs so that color outputs come first */
void
dxil_sort_ps_outputs(nir_shader* s)
{
   nir_foreach_variable_with_modes_safe(var, s, nir_var_shader_out) {
      /* We use the driver_location here to avoid introducing a new
       * struct or member variable here. The true, updated driver location
       * will be written below, after sorting */
      switch (var->data.location) {
      case FRAG_RESULT_DEPTH:
         var->data.driver_location = 1;
         break;
      case FRAG_RESULT_STENCIL:
         var->data.driver_location = 2;
         break;
      case FRAG_RESULT_SAMPLE_MASK:
         var->data.driver_location = 3;
         break;
      default:
         var->data.driver_location = 0;
      }
   }

   nir_sort_variables_with_modes(s, variable_location_cmp,
                                 nir_var_shader_out);

   unsigned driver_loc = 0;
   nir_foreach_variable_with_modes(var, s, nir_var_shader_out) {
      var->data.driver_location = driver_loc++;
   }
}

/* Order between stage values so that normal varyings come first,
 * then sysvalues and then system generated values.
 */
uint64_t
dxil_reassign_driver_locations(nir_shader* s, nir_variable_mode modes,
                               uint64_t other_stage_mask)
{
   nir_foreach_variable_with_modes_safe(var, s, modes) {
      /* We use the driver_location here to avoid introducing a new
       * struct or member variable here. The true, updated driver location
       * will be written below, after sorting */
      var->data.driver_location = nir_var_to_dxil_sysvalue_type(var, other_stage_mask);
   }

   nir_sort_variables_with_modes(s, variable_location_cmp, modes);

   uint64_t result = 0;
   unsigned driver_loc = 0;
   nir_foreach_variable_with_modes(var, s, modes) {
      result |= 1ull << var->data.location;
      var->data.driver_location = driver_loc++;
   }
   return result;
}