Path: blob/21.2-virgl/src/amd/common/ac_shader_util.c
/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "ac_shader_util.h"
#include "ac_gpu_info.h"

#include "sid.h"

#include <assert.h>
#include <stdlib.h>
#include <string.h>

unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask)
{
   if (writes_z) {
      /* Z needs 32 bits. */
      if (writes_samplemask)
         return V_028710_SPI_SHADER_32_ABGR;
      else if (writes_stencil)
         return V_028710_SPI_SHADER_32_GR;
      else
         return V_028710_SPI_SHADER_32_R;
   } else if (writes_stencil || writes_samplemask) {
      /* Both stencil and sample mask need only 16 bits. */
      return V_028710_SPI_SHADER_UINT16_ABGR;
   } else {
      return V_028710_SPI_SHADER_ZERO;
   }
}

unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format)
{
   unsigned i, cb_shader_mask = 0;

   for (i = 0; i < 8; i++) {
      switch ((spi_shader_col_format >> (i * 4)) & 0xf) {
      case V_028714_SPI_SHADER_ZERO:
         break;
      case V_028714_SPI_SHADER_32_R:
         cb_shader_mask |= 0x1 << (i * 4);
         break;
      case V_028714_SPI_SHADER_32_GR:
         cb_shader_mask |= 0x3 << (i * 4);
         break;
      case V_028714_SPI_SHADER_32_AR:
         cb_shader_mask |= 0x9u << (i * 4);
         break;
      case V_028714_SPI_SHADER_FP16_ABGR:
      case V_028714_SPI_SHADER_UNORM16_ABGR:
      case V_028714_SPI_SHADER_SNORM16_ABGR:
      case V_028714_SPI_SHADER_UINT16_ABGR:
      case V_028714_SPI_SHADER_SINT16_ABGR:
      case V_028714_SPI_SHADER_32_ABGR:
         cb_shader_mask |= 0xfu << (i * 4);
         break;
      default:
         assert(0);
      }
   }
   return cb_shader_mask;
}

/**
 * Calculate the appropriate setting of VGT_GS_MODE when \p shader is a
 * geometry shader.
 */
uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class)
{
   unsigned cut_mode;

   if (gs_max_vert_out <= 128) {
      cut_mode = V_028A40_GS_CUT_128;
   } else if (gs_max_vert_out <= 256) {
      cut_mode = V_028A40_GS_CUT_256;
   } else if (gs_max_vert_out <= 512) {
      cut_mode = V_028A40_GS_CUT_512;
   } else {
      assert(gs_max_vert_out <= 1024);
      cut_mode = V_028A40_GS_CUT_1024;
   }

   return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | S_028A40_CUT_MODE(cut_mode) |
          S_028A40_ES_WRITE_OPTIMIZE(chip_class <= GFX8) | S_028A40_GS_WRITE_OPTIMIZE(1) |
          S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0);
}
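
/* Worked example of the above: a geometry shader with gs_max_vert_out = 200
 * on GFX9 selects GS_CUT_256 and returns MODE = GS_SCENARIO_G with
 * ES_WRITE_OPTIMIZE = 0 (GFX9 is newer than GFX8), GS_WRITE_OPTIMIZE = 1 and
 * ONCHIP = 1.
 */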

/// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format
/// value for LLVM8+ tbuffer intrinsics.
unsigned ac_get_tbuffer_format(enum chip_class chip_class, unsigned dfmt, unsigned nfmt)
{
   // Some games try to access vertex buffers without a valid format.
   // This is a game bug, but we should still handle it gracefully.
   if (dfmt == V_008F0C_GFX10_FORMAT_INVALID)
      return V_008F0C_GFX10_FORMAT_INVALID;

   if (chip_class >= GFX10) {
      unsigned format;
      switch (dfmt) {
      default:
         unreachable("bad dfmt");
      case V_008F0C_BUF_DATA_FORMAT_INVALID:
         format = V_008F0C_GFX10_FORMAT_INVALID;
         break;
      case V_008F0C_BUF_DATA_FORMAT_8:
         format = V_008F0C_GFX10_FORMAT_8_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_8_8:
         format = V_008F0C_GFX10_FORMAT_8_8_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
         format = V_008F0C_GFX10_FORMAT_8_8_8_8_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_16:
         format = V_008F0C_GFX10_FORMAT_16_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_16_16:
         format = V_008F0C_GFX10_FORMAT_16_16_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
         format = V_008F0C_GFX10_FORMAT_16_16_16_16_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_32:
         format = V_008F0C_GFX10_FORMAT_32_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_32_32:
         format = V_008F0C_GFX10_FORMAT_32_32_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_32_32_32:
         format = V_008F0C_GFX10_FORMAT_32_32_32_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
         format = V_008F0C_GFX10_FORMAT_32_32_32_32_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
         format = V_008F0C_GFX10_FORMAT_2_10_10_10_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_10_11_11:
         format = V_008F0C_GFX10_FORMAT_10_11_11_UINT;
         break;
      }

      // Use the regularity properties of the combined format enum.
      //
      // Note: float is incompatible with 8-bit data formats,
      // [us]{norm,scaled} are incompatible with 32-bit data formats.
      // [us]scaled are not writable.
      switch (nfmt) {
      case V_008F0C_BUF_NUM_FORMAT_UNORM:
         format -= 4;
         break;
      case V_008F0C_BUF_NUM_FORMAT_SNORM:
         format -= 3;
         break;
      case V_008F0C_BUF_NUM_FORMAT_USCALED:
         format -= 2;
         break;
      case V_008F0C_BUF_NUM_FORMAT_SSCALED:
         format -= 1;
         break;
      default:
         unreachable("bad nfmt");
      case V_008F0C_BUF_NUM_FORMAT_UINT:
         break;
      case V_008F0C_BUF_NUM_FORMAT_SINT:
         format += 1;
         break;
      case V_008F0C_BUF_NUM_FORMAT_FLOAT:
         format += 2;
         break;
      }

      return format;
   } else {
      return dfmt | (nfmt << 4);
   }
}
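
/* Illustration of the enum regularity exploited above (derived from the
 * offsets in the nfmt switch): (BUF_DATA_FORMAT_16_16, BUF_NUM_FORMAT_SNORM)
 * starts at GFX10_FORMAT_16_16_UINT and subtracts 3, landing on
 * GFX10_FORMAT_16_16_SNORM. On GFX9 and older, the same pair is simply
 * packed as dfmt | (nfmt << 4).
 */

/* Columns of the table below (inferred from the initializers): total element
 * size in bytes, number of channels, per-channel size in bytes (0 for packed
 * formats), and the data format of a single channel.
 */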
static const struct ac_data_format_info data_format_table[] = {
   [V_008F0C_BUF_DATA_FORMAT_INVALID] = {0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID},
   [V_008F0C_BUF_DATA_FORMAT_8] = {1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8},
   [V_008F0C_BUF_DATA_FORMAT_16] = {2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16},
   [V_008F0C_BUF_DATA_FORMAT_8_8] = {2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8},
   [V_008F0C_BUF_DATA_FORMAT_32] = {4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32},
   [V_008F0C_BUF_DATA_FORMAT_16_16] = {4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16},
   [V_008F0C_BUF_DATA_FORMAT_10_11_11] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11},
   [V_008F0C_BUF_DATA_FORMAT_11_11_10] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10},
   [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2},
   [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10},
   [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = {4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8},
   [V_008F0C_BUF_DATA_FORMAT_32_32] = {8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32},
   [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16},
   [V_008F0C_BUF_DATA_FORMAT_32_32_32] = {12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32},
   [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = {16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32},
};

const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt)
{
   assert(dfmt < ARRAY_SIZE(data_format_table));
   return &data_format_table[dfmt];
}

enum ac_image_dim ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
                                     bool is_array)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_1D:
      if (chip_class == GFX9)
         return is_array ? ac_image_2darray : ac_image_2d;
      return is_array ? ac_image_1darray : ac_image_1d;
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_RECT:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      return is_array ? ac_image_2darray : ac_image_2d;
   case GLSL_SAMPLER_DIM_3D:
      return ac_image_3d;
   case GLSL_SAMPLER_DIM_CUBE:
      return ac_image_cube;
   case GLSL_SAMPLER_DIM_MS:
      return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
   case GLSL_SAMPLER_DIM_SUBPASS:
      return ac_image_2darray;
   case GLSL_SAMPLER_DIM_SUBPASS_MS:
      return ac_image_2darraymsaa;
   default:
      unreachable("bad sampler dim");
   }
}
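
/* Note on the GFX9 special case above: it exists presumably because GFX9
 * stores and addresses 1D textures as 2D, so 1D sampler dims must be
 * promoted to their 2D equivalents; GFX9 gets related special handling in
 * ac_get_image_dim() below.
 */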
enum ac_image_dim ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
                                   bool is_array)
{
   enum ac_image_dim dim = ac_get_sampler_dim(chip_class, sdim, is_array);

   /* Match the resource type set in the descriptor. */
   if (dim == ac_image_cube || (chip_class <= GFX8 && dim == ac_image_3d))
      dim = ac_image_2darray;
   else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && chip_class == GFX9) {
      /* When a single layer of a 3D texture is bound, the shader
       * will refer to a 2D target, but the descriptor has a 3D type.
       * Since the HW ignores BASE_ARRAY in this case, we need to
       * send 3 coordinates. This doesn't hurt when the underlying
       * texture is non-3D.
       */
      dim = ac_image_3d;
   }

   return dim;
}

unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
                                  signed char *face_vgpr_index_ptr,
                                  signed char *ancillary_vgpr_index_ptr)
{
   unsigned num_input_vgprs = 0;
   signed char face_vgpr_index = -1;
   signed char ancillary_vgpr_index = -1;

   if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 2;
   if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 2;
   if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 2;
   if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 3;
   if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 2;
   if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 2;
   if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 2;
   if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;
   if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;
   if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;
   if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;
   if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;
   if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) {
      face_vgpr_index = num_input_vgprs;
      num_input_vgprs += 1;
   }
   if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) {
      ancillary_vgpr_index = num_input_vgprs;
      num_input_vgprs += 1;
   }
   if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;
   if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;

   if (face_vgpr_index_ptr)
      *face_vgpr_index_ptr = face_vgpr_index;
   if (ancillary_vgpr_index_ptr)
      *ancillary_vgpr_index_ptr = ancillary_vgpr_index;

   return num_input_vgprs;
}
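
/* Worked example: a pixel shader with PERSP_CENTER, POS_X_FLOAT, POS_Y_FLOAT
 * and FRONT_FACE enabled receives 2 + 1 + 1 + 1 = 5 input VGPRs, and the
 * front-face data lands in VGPR 4 (so *face_vgpr_index_ptr is set to 4).
 */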
void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype,
                                 bool is_depth, bool use_rbplus,
                                 struct ac_spi_color_formats *formats)
{
   /* Alpha is needed for alpha-to-coverage.
    * Blending may be with or without alpha.
    */
   unsigned normal = 0;      /* most optimal, may not support blending or export alpha */
   unsigned alpha = 0;       /* exports alpha, but may not support blending */
   unsigned blend = 0;       /* supports blending, but may not export alpha */
   unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */

   /* Choose the SPI color formats. These are required values for RB+.
    * Other chips have multiple choices, though they are not necessarily better.
    */
   switch (format) {
   case V_028C70_COLOR_5_6_5:
   case V_028C70_COLOR_1_5_5_5:
   case V_028C70_COLOR_5_5_5_1:
   case V_028C70_COLOR_4_4_4_4:
   case V_028C70_COLOR_10_11_11:
   case V_028C70_COLOR_11_11_10:
   case V_028C70_COLOR_5_9_9_9:
   case V_028C70_COLOR_8:
   case V_028C70_COLOR_8_8:
   case V_028C70_COLOR_8_8_8_8:
   case V_028C70_COLOR_10_10_10_2:
   case V_028C70_COLOR_2_10_10_10:
      if (ntype == V_028C70_NUMBER_UINT)
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
      else if (ntype == V_028C70_NUMBER_SINT)
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
      else
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;

      if (!use_rbplus && format == V_028C70_COLOR_8 &&
          ntype != V_028C70_NUMBER_SRGB && swap == V_028C70_SWAP_STD) /* R */ {
         /* When RB+ is enabled, R8_UNORM should use FP16_ABGR for 2x
          * exporting performance. Otherwise, use 32_R to remove useless
          * instructions needed for 16-bit compressed exports.
          */
         blend = normal = V_028714_SPI_SHADER_32_R;
      }
      break;

   case V_028C70_COLOR_16:
   case V_028C70_COLOR_16_16:
   case V_028C70_COLOR_16_16_16_16:
      if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) {
         /* UNORM16 and SNORM16 don't support blending */
         if (ntype == V_028C70_NUMBER_UNORM)
            normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR;
         else
            normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR;

         /* Use 32 bits per channel for blending. */
         if (format == V_028C70_COLOR_16) {
            if (swap == V_028C70_SWAP_STD) { /* R */
               blend = V_028714_SPI_SHADER_32_R;
               blend_alpha = V_028714_SPI_SHADER_32_AR;
            } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
               blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
            else
               assert(0);
         } else if (format == V_028C70_COLOR_16_16) {
            if (swap == V_028C70_SWAP_STD) { /* RG */
               blend = V_028714_SPI_SHADER_32_GR;
               blend_alpha = V_028714_SPI_SHADER_32_ABGR;
            } else if (swap == V_028C70_SWAP_ALT) /* RA */
               blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
            else
               assert(0);
         } else /* 16_16_16_16 */
            blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
      } else if (ntype == V_028C70_NUMBER_UINT)
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
      else if (ntype == V_028C70_NUMBER_SINT)
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
      else if (ntype == V_028C70_NUMBER_FLOAT)
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
      else
         assert(0);
      break;

   case V_028C70_COLOR_32:
      if (swap == V_028C70_SWAP_STD) { /* R */
         blend = normal = V_028714_SPI_SHADER_32_R;
         alpha = blend_alpha = V_028714_SPI_SHADER_32_AR;
      } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
      else
         assert(0);
      break;

   case V_028C70_COLOR_32_32:
      if (swap == V_028C70_SWAP_STD) { /* RG */
         blend = normal = V_028714_SPI_SHADER_32_GR;
         alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
      } else if (swap == V_028C70_SWAP_ALT) /* RA */
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
      else
         assert(0);
      break;

   case V_028C70_COLOR_32_32_32_32:
   case V_028C70_COLOR_8_24:
   case V_028C70_COLOR_24_8:
   case V_028C70_COLOR_X24_8_32_FLOAT:
      alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
      break;

   default:
      assert(0);
      return;
   }

   /* The DB->CB copy needs 32_ABGR. */
   if (is_depth)
      alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;

   formats->normal = normal;
   formats->alpha = alpha;
   formats->blend = blend;
   formats->blend_alpha = blend_alpha;
}
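
/* Worked example: for R8_UNORM (COLOR_8, NUMBER_UNORM, SWAP_STD) with RB+
 * enabled, all four formats come out as FP16_ABGR; without RB+, normal and
 * blend drop to 32_R while alpha and blend_alpha stay FP16_ABGR.
 */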
void ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling,
                           bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask)
{
   *late_alloc_wave64 = 0; /* The limit is per SA. */
   *cu_mask = 0xffff;

   /* CU masking can decrease performance and cause a hang with <= 2 CUs per SA. */
   if (info->min_good_cu_per_sa <= 2)
      return;

   /* If scratch is used with late alloc, the GPU could deadlock if PS uses scratch too. A more
    * complicated computation is needed to enable late alloc with scratch (see PAL).
    */
   if (uses_scratch)
      return;

   /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
   if (ngg && info->family == CHIP_NAVI14)
      return;

   if (info->chip_class >= GFX10) {
      /* For Wave32, the hw will launch twice the number of late alloc waves, so 1 == 2x wave32.
       * These limits are estimated because they are all safe but they vary in performance.
       */
      if (ngg_culling)
         *late_alloc_wave64 = info->min_good_cu_per_sa * 10;
      else
         *late_alloc_wave64 = info->min_good_cu_per_sa * 4;

      /* Limit LATE_ALLOC_GS to prevent a hang (hw bug) on gfx10. */
      if (info->chip_class == GFX10 && ngg)
         *late_alloc_wave64 = MIN2(*late_alloc_wave64, 64);

      /* Gfx10: CU2 & CU3 must be disabled to prevent a hw deadlock.
       * Others: CU1 must be disabled to prevent a hw deadlock.
       *
       * The deadlock is caused by late alloc, which usually increases performance.
       */
      *cu_mask &= info->chip_class == GFX10 ? ~BITFIELD_RANGE(2, 2) :
                                              ~BITFIELD_RANGE(1, 1);
   } else {
      if (info->min_good_cu_per_sa <= 4) {
         /* Too few available compute units per SA. Disallowing VS to run on one CU could hurt us
          * more than late VS allocation would help.
          *
          * 2 is the highest safe number that allows us to keep all CUs enabled.
          */
         *late_alloc_wave64 = 2;
      } else {
         /* This is a good initial value, allowing 1 late_alloc wave per SIMD on num_cu - 2.
          */
         *late_alloc_wave64 = (info->min_good_cu_per_sa - 2) * 4;
      }

      /* VS can't execute on one CU if the limit is > 2. */
      if (*late_alloc_wave64 > 2)
         *cu_mask = 0xfffe; /* 1 CU disabled */
   }

   /* Max number that fits into the register field. */
   if (ngg) /* GS */
      *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(~0u));
   else /* VS */
      *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B11C_LIMIT(~0u));
}
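
/* Worked example: on a GFX10 part with min_good_cu_per_sa = 8, no NGG
 * culling and no scratch, late_alloc_wave64 becomes 8 * 4 = 32 and cu_mask
 * becomes 0xffff & ~BITFIELD_RANGE(2, 2) = 0xfff3 (CU2 and CU3 disabled),
 * before the final clamp to the register field.
 */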