GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/amd/common/ac_shader_util.c

/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "ac_shader_util.h"
#include "ac_gpu_info.h"

#include "sid.h"

#include <assert.h>
#include <stdlib.h>
#include <string.h>

unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask)
{
   if (writes_z) {
      /* Z needs 32 bits. */
      if (writes_samplemask)
         return V_028710_SPI_SHADER_32_ABGR;
      else if (writes_stencil)
         return V_028710_SPI_SHADER_32_GR;
      else
         return V_028710_SPI_SHADER_32_R;
   } else if (writes_stencil || writes_samplemask) {
      /* Both stencil and sample mask need only 16 bits. */
      return V_028710_SPI_SHADER_UINT16_ABGR;
   } else {
      return V_028710_SPI_SHADER_ZERO;
   }
}
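
/* Usage sketch (illustrative only, so it is compiled out): a depth-only
 * pixel shader needs just the 32-bit R channel for its Z export.
 */
#if 0
   unsigned z_format = ac_get_spi_shader_z_format(true, false, false);
   /* z_format == V_028710_SPI_SHADER_32_R */
#endif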

unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format)
{
   unsigned i, cb_shader_mask = 0;

   for (i = 0; i < 8; i++) {
      switch ((spi_shader_col_format >> (i * 4)) & 0xf) {
      case V_028714_SPI_SHADER_ZERO:
         break;
      case V_028714_SPI_SHADER_32_R:
         cb_shader_mask |= 0x1 << (i * 4);
         break;
      case V_028714_SPI_SHADER_32_GR:
         cb_shader_mask |= 0x3 << (i * 4);
         break;
      case V_028714_SPI_SHADER_32_AR:
         cb_shader_mask |= 0x9u << (i * 4);
         break;
      case V_028714_SPI_SHADER_FP16_ABGR:
      case V_028714_SPI_SHADER_UNORM16_ABGR:
      case V_028714_SPI_SHADER_SNORM16_ABGR:
      case V_028714_SPI_SHADER_UINT16_ABGR:
      case V_028714_SPI_SHADER_SINT16_ABGR:
      case V_028714_SPI_SHADER_32_ABGR:
         cb_shader_mask |= 0xfu << (i * 4);
         break;
      default:
         assert(0);
      }
   }
   return cb_shader_mask;
}
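
/* Usage sketch (compiled out): each MRT occupies a 4-bit nibble in both the
 * export format word and the resulting CB_SHADER_MASK. For example, MRT0
 * exporting FP16_ABGR and MRT1 exporting 32_R enable channels 0xf and 0x1:
 */
#if 0
   unsigned col_format = V_028714_SPI_SHADER_FP16_ABGR |
                         (V_028714_SPI_SHADER_32_R << 4);
   unsigned mask = ac_get_cb_shader_mask(col_format); /* mask == 0x1f */
#endif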

/**
 * Calculate the appropriate setting of VGT_GS_MODE for a geometry shader
 * with the given maximum number of output vertices (\p gs_max_vert_out).
 */
uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class)
{
   unsigned cut_mode;

   if (gs_max_vert_out <= 128) {
      cut_mode = V_028A40_GS_CUT_128;
   } else if (gs_max_vert_out <= 256) {
      cut_mode = V_028A40_GS_CUT_256;
   } else if (gs_max_vert_out <= 512) {
      cut_mode = V_028A40_GS_CUT_512;
   } else {
      assert(gs_max_vert_out <= 1024);
      cut_mode = V_028A40_GS_CUT_1024;
   }

   return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | S_028A40_CUT_MODE(cut_mode) |
          S_028A40_ES_WRITE_OPTIMIZE(chip_class <= GFX8) | S_028A40_GS_WRITE_OPTIMIZE(1) |
          S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0);
}
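
/* Usage sketch (compiled out): a GS emitting at most 256 vertices on GFX9
 * selects the 256-vertex cut mode and runs on-chip:
 */
#if 0
   uint32_t vgt_gs_mode = ac_vgt_gs_mode(256, GFX9);
   /* MODE == GS_SCENARIO_G, CUT_MODE == GS_CUT_256, ONCHIP == 1 */
#endif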

/// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format
/// value for LLVM8+ tbuffer intrinsics.
unsigned ac_get_tbuffer_format(enum chip_class chip_class, unsigned dfmt, unsigned nfmt)
{
   // Some games try to access vertex buffers without a valid format.
   // This is a game bug, but we should still handle it gracefully.
   if (dfmt == V_008F0C_GFX10_FORMAT_INVALID)
      return V_008F0C_GFX10_FORMAT_INVALID;

   if (chip_class >= GFX10) {
      unsigned format;
      switch (dfmt) {
      default:
         unreachable("bad dfmt");
      case V_008F0C_BUF_DATA_FORMAT_INVALID:
         format = V_008F0C_GFX10_FORMAT_INVALID;
         break;
      case V_008F0C_BUF_DATA_FORMAT_8:
         format = V_008F0C_GFX10_FORMAT_8_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_8_8:
         format = V_008F0C_GFX10_FORMAT_8_8_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
         format = V_008F0C_GFX10_FORMAT_8_8_8_8_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_16:
         format = V_008F0C_GFX10_FORMAT_16_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_16_16:
         format = V_008F0C_GFX10_FORMAT_16_16_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
         format = V_008F0C_GFX10_FORMAT_16_16_16_16_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_32:
         format = V_008F0C_GFX10_FORMAT_32_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_32_32:
         format = V_008F0C_GFX10_FORMAT_32_32_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_32_32_32:
         format = V_008F0C_GFX10_FORMAT_32_32_32_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
         format = V_008F0C_GFX10_FORMAT_32_32_32_32_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
         format = V_008F0C_GFX10_FORMAT_2_10_10_10_UINT;
         break;
      case V_008F0C_BUF_DATA_FORMAT_10_11_11:
         format = V_008F0C_GFX10_FORMAT_10_11_11_UINT;
         break;
      }

      // Use the regularity properties of the combined format enum.
      //
      // Note: float is incompatible with 8-bit data formats,
      // [us]{norm,scaled} are incompatible with 32-bit data formats.
      // [us]scaled are not writable.
      switch (nfmt) {
      case V_008F0C_BUF_NUM_FORMAT_UNORM:
         format -= 4;
         break;
      case V_008F0C_BUF_NUM_FORMAT_SNORM:
         format -= 3;
         break;
      case V_008F0C_BUF_NUM_FORMAT_USCALED:
         format -= 2;
         break;
      case V_008F0C_BUF_NUM_FORMAT_SSCALED:
         format -= 1;
         break;
      default:
         unreachable("bad nfmt");
      case V_008F0C_BUF_NUM_FORMAT_UINT:
         break;
      case V_008F0C_BUF_NUM_FORMAT_SINT:
         format += 1;
         break;
      case V_008F0C_BUF_NUM_FORMAT_FLOAT:
         format += 2;
         break;
      }

      return format;
   } else {
      return dfmt | (nfmt << 4);
   }
}
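
/* Usage sketch (compiled out): on GFX10+ the combined enum is laid out so
 * that each data format's variants sit at fixed offsets from its UINT entry,
 * e.g. FLOAT is two entries past UINT. Older chips simply pack the two
 * fields:
 */
#if 0
   unsigned f10 = ac_get_tbuffer_format(GFX10, V_008F0C_BUF_DATA_FORMAT_16_16,
                                        V_008F0C_BUF_NUM_FORMAT_FLOAT);
   /* f10 == V_008F0C_GFX10_FORMAT_16_16_UINT + 2, i.e. 16_16_FLOAT */
   unsigned f9 = ac_get_tbuffer_format(GFX9, V_008F0C_BUF_DATA_FORMAT_16_16,
                                       V_008F0C_BUF_NUM_FORMAT_FLOAT);
   /* f9 == dfmt | (nfmt << 4) */
#endif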

static const struct ac_data_format_info data_format_table[] = {
   /* Fields: element_size, num_channels, chan_byte_size, chan_format
    * (see struct ac_data_format_info in ac_shader_util.h).
    */
   [V_008F0C_BUF_DATA_FORMAT_INVALID] = {0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID},
   [V_008F0C_BUF_DATA_FORMAT_8] = {1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8},
   [V_008F0C_BUF_DATA_FORMAT_16] = {2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16},
   [V_008F0C_BUF_DATA_FORMAT_8_8] = {2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8},
   [V_008F0C_BUF_DATA_FORMAT_32] = {4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32},
   [V_008F0C_BUF_DATA_FORMAT_16_16] = {4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16},
   [V_008F0C_BUF_DATA_FORMAT_10_11_11] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11},
   [V_008F0C_BUF_DATA_FORMAT_11_11_10] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10},
   [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2},
   [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10},
   [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = {4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8},
   [V_008F0C_BUF_DATA_FORMAT_32_32] = {8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32},
   [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16},
   [V_008F0C_BUF_DATA_FORMAT_32_32_32] = {12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32},
   [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = {16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32},
};

const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt)
{
   assert(dfmt < ARRAY_SIZE(data_format_table));
   return &data_format_table[dfmt];
}
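
/* Usage sketch (compiled out): a two-channel 32-bit vertex format is 8 bytes
 * per element with 4 bytes per channel:
 */
#if 0
   const struct ac_data_format_info *vtx =
      ac_get_data_format_info(V_008F0C_BUF_DATA_FORMAT_32_32);
   /* vtx->element_size == 8, vtx->num_channels == 2, vtx->chan_byte_size == 4 */
#endif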

enum ac_image_dim ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
                                     bool is_array)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_1D:
      if (chip_class == GFX9)
         return is_array ? ac_image_2darray : ac_image_2d;
      return is_array ? ac_image_1darray : ac_image_1d;
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_RECT:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      return is_array ? ac_image_2darray : ac_image_2d;
   case GLSL_SAMPLER_DIM_3D:
      return ac_image_3d;
   case GLSL_SAMPLER_DIM_CUBE:
      return ac_image_cube;
   case GLSL_SAMPLER_DIM_MS:
      return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
   case GLSL_SAMPLER_DIM_SUBPASS:
      return ac_image_2darray;
   case GLSL_SAMPLER_DIM_SUBPASS_MS:
      return ac_image_2darraymsaa;
   default:
      unreachable("bad sampler dim");
   }
}

enum ac_image_dim ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
                                   bool is_array)
{
   enum ac_image_dim dim = ac_get_sampler_dim(chip_class, sdim, is_array);

   /* Match the resource type set in the descriptor. */
   if (dim == ac_image_cube || (chip_class <= GFX8 && dim == ac_image_3d))
      dim = ac_image_2darray;
   else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && chip_class == GFX9) {
      /* When a single layer of a 3D texture is bound, the shader
       * will refer to a 2D target, but the descriptor has a 3D type.
       * Since the HW ignores BASE_ARRAY in this case, we need to
       * send 3 coordinates. This doesn't hurt when the underlying
       * texture is non-3D.
       */
      dim = ac_image_3d;
   }

   return dim;
}
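
/* Usage sketch (compiled out): the two helpers can disagree on purpose. On
 * GFX9 a non-array 2D image is sampled as 2D, but image loads/stores use the
 * 3D dim so the third coordinate reaches the 3D-typed descriptor:
 */
#if 0
   enum ac_image_dim s = ac_get_sampler_dim(GFX9, GLSL_SAMPLER_DIM_2D, false);
   /* s == ac_image_2d */
   enum ac_image_dim i = ac_get_image_dim(GFX9, GLSL_SAMPLER_DIM_2D, false);
   /* i == ac_image_3d */
#endif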

unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
                                  signed char *face_vgpr_index_ptr,
                                  signed char *ancillary_vgpr_index_ptr)
{
   unsigned num_input_vgprs = 0;
   signed char face_vgpr_index = -1;
   signed char ancillary_vgpr_index = -1;

   if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 2;
   if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 2;
   if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 2;
   if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 3;
   if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 2;
   if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 2;
   if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 2;
   if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;
   if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;
   if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;
   if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;
   if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;
   if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) {
      face_vgpr_index = num_input_vgprs;
      num_input_vgprs += 1;
   }
   if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) {
      ancillary_vgpr_index = num_input_vgprs;
      num_input_vgprs += 1;
   }
   if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;
   if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
      num_input_vgprs += 1;

   if (face_vgpr_index_ptr)
      *face_vgpr_index_ptr = face_vgpr_index;
   if (ancillary_vgpr_index_ptr)
      *ancillary_vgpr_index_ptr = ancillary_vgpr_index;

   return num_input_vgprs;
}
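
/* Usage sketch (compiled out): for a shader whose SPI_PS_INPUT_ADDR enables
 * only PERSP_CENTER (2 VGPRs) and FRONT_FACE (1 VGPR), the count is 3 and
 * the front-face VGPR sits at index 2. 'cfg' is a hypothetical config built
 * here only for illustration, using the S_0286CC_* setters from sid.h:
 */
#if 0
   struct ac_shader_config cfg = {0}; /* hypothetical, illustration only */
   cfg.spi_ps_input_addr = S_0286CC_PERSP_CENTER_ENA(1) |
                           S_0286CC_FRONT_FACE_ENA(1);
   signed char face_idx, anc_idx;
   unsigned n = ac_get_fs_input_vgpr_cnt(&cfg, &face_idx, &anc_idx);
   /* n == 3, face_idx == 2, anc_idx == -1 */
#endif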

void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype,
                                 bool is_depth, bool use_rbplus,
                                 struct ac_spi_color_formats *formats)
{
   /* Alpha is needed for alpha-to-coverage.
    * Blending may be with or without alpha.
    */
   unsigned normal = 0;      /* most optimal, may not support blending or export alpha */
   unsigned alpha = 0;       /* exports alpha, but may not support blending */
   unsigned blend = 0;       /* supports blending, but may not export alpha */
   unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */

   /* Choose the SPI color formats. These are required values for RB+.
    * Other chips have multiple choices, though they are not necessarily better.
    */
   switch (format) {
   case V_028C70_COLOR_5_6_5:
   case V_028C70_COLOR_1_5_5_5:
   case V_028C70_COLOR_5_5_5_1:
   case V_028C70_COLOR_4_4_4_4:
   case V_028C70_COLOR_10_11_11:
   case V_028C70_COLOR_11_11_10:
   case V_028C70_COLOR_5_9_9_9:
   case V_028C70_COLOR_8:
   case V_028C70_COLOR_8_8:
   case V_028C70_COLOR_8_8_8_8:
   case V_028C70_COLOR_10_10_10_2:
   case V_028C70_COLOR_2_10_10_10:
      if (ntype == V_028C70_NUMBER_UINT)
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
      else if (ntype == V_028C70_NUMBER_SINT)
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
      else
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;

      if (!use_rbplus && format == V_028C70_COLOR_8 &&
          ntype != V_028C70_NUMBER_SRGB && swap == V_028C70_SWAP_STD) /* R */ {
         /* When RB+ is enabled, R8_UNORM should use FP16_ABGR for 2x
          * exporting performance. Otherwise, use 32_R to remove useless
          * instructions needed for 16-bit compressed exports.
          */
         blend = normal = V_028714_SPI_SHADER_32_R;
      }
      break;

   case V_028C70_COLOR_16:
   case V_028C70_COLOR_16_16:
   case V_028C70_COLOR_16_16_16_16:
      if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) {
         /* UNORM16 and SNORM16 don't support blending */
         if (ntype == V_028C70_NUMBER_UNORM)
            normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR;
         else
            normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR;

         /* Use 32 bits per channel for blending. */
         if (format == V_028C70_COLOR_16) {
            if (swap == V_028C70_SWAP_STD) { /* R */
               blend = V_028714_SPI_SHADER_32_R;
               blend_alpha = V_028714_SPI_SHADER_32_AR;
            } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
               blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
            else
               assert(0);
         } else if (format == V_028C70_COLOR_16_16) {
            if (swap == V_028C70_SWAP_STD) { /* RG */
               blend = V_028714_SPI_SHADER_32_GR;
               blend_alpha = V_028714_SPI_SHADER_32_ABGR;
            } else if (swap == V_028C70_SWAP_ALT) /* RA */
               blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
            else
               assert(0);
         } else /* 16_16_16_16 */
            blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
      } else if (ntype == V_028C70_NUMBER_UINT)
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
      else if (ntype == V_028C70_NUMBER_SINT)
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
      else if (ntype == V_028C70_NUMBER_FLOAT)
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
      else
         assert(0);
      break;

   case V_028C70_COLOR_32:
      if (swap == V_028C70_SWAP_STD) { /* R */
         blend = normal = V_028714_SPI_SHADER_32_R;
         alpha = blend_alpha = V_028714_SPI_SHADER_32_AR;
      } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
      else
         assert(0);
      break;

   case V_028C70_COLOR_32_32:
      if (swap == V_028C70_SWAP_STD) { /* RG */
         blend = normal = V_028714_SPI_SHADER_32_GR;
         alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
      } else if (swap == V_028C70_SWAP_ALT) /* RA */
         alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
      else
         assert(0);
      break;

   case V_028C70_COLOR_32_32_32_32:
   case V_028C70_COLOR_8_24:
   case V_028C70_COLOR_24_8:
   case V_028C70_COLOR_X24_8_32_FLOAT:
      alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
      break;

   default:
      assert(0);
      return;
   }

   /* The DB->CB copy needs 32_ABGR. */
   if (is_depth)
      alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;

   formats->normal = normal;
   formats->alpha = alpha;
   formats->blend = blend;
   formats->blend_alpha = blend_alpha;
}
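
/* Usage sketch (compiled out): a plain RGBA8 UNORM render target with RB+
 * can use FP16_ABGR for every variant, so blending and alpha export need no
 * fallback format:
 */
#if 0
   struct ac_spi_color_formats fmts;
   ac_choose_spi_color_formats(V_028C70_COLOR_8_8_8_8, V_028C70_SWAP_STD,
                               V_028C70_NUMBER_UNORM, false, true, &fmts);
   /* fmts.normal == fmts.alpha == fmts.blend == fmts.blend_alpha
    *             == V_028714_SPI_SHADER_FP16_ABGR */
#endif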

void ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling,
                           bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask)
{
   *late_alloc_wave64 = 0; /* The limit is per SA. */
   *cu_mask = 0xffff;

   /* CU masking can decrease performance and cause a hang with <= 2 CUs per SA. */
   if (info->min_good_cu_per_sa <= 2)
      return;

   /* If scratch is used with late alloc, the GPU could deadlock if PS uses scratch too. A more
    * complicated computation is needed to enable late alloc with scratch (see PAL).
    */
   if (uses_scratch)
      return;

   /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
   if (ngg && info->family == CHIP_NAVI14)
      return;

   if (info->chip_class >= GFX10) {
      /* For Wave32, the hw will launch twice the number of late alloc waves, so 1 == 2x wave32.
       * These limits are only estimates: all of them are safe, but they vary in performance.
       */
      if (ngg_culling)
         *late_alloc_wave64 = info->min_good_cu_per_sa * 10;
      else
         *late_alloc_wave64 = info->min_good_cu_per_sa * 4;

      /* Limit LATE_ALLOC_GS to prevent a hang (hw bug) on gfx10. */
      if (info->chip_class == GFX10 && ngg)
         *late_alloc_wave64 = MIN2(*late_alloc_wave64, 64);

      /* Gfx10: CU2 & CU3 must be disabled to prevent a hw deadlock.
       * Others: CU1 must be disabled to prevent a hw deadlock.
       *
       * The deadlock is caused by late alloc, which usually increases performance.
       */
      *cu_mask &= info->chip_class == GFX10 ? ~BITFIELD_RANGE(2, 2) :
                                              ~BITFIELD_RANGE(1, 1);
   } else {
      if (info->min_good_cu_per_sa <= 4) {
         /* Too few available compute units per SA. Disallowing VS from running on one CU could
          * hurt us more than late VS allocation would help.
          *
          * 2 is the highest safe number that allows us to keep all CUs enabled.
          */
         *late_alloc_wave64 = 2;
      } else {
         /* This is a good initial value, allowing 1 late_alloc wave per SIMD on num_cu - 2. */
         *late_alloc_wave64 = (info->min_good_cu_per_sa - 2) * 4;
      }

      /* VS can't execute on one CU if the limit is > 2. */
      if (*late_alloc_wave64 > 2)
         *cu_mask = 0xfffe; /* 1 CU disabled */
   }

   /* Max number that fits into the register field. */
   if (ngg) /* GS */
      *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(~0u));
   else /* VS */
      *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B11C_LIMIT(~0u));
}
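
/* Usage sketch (compiled out): 'gpu' is a hypothetical radeon_info with
 * chip_class == GFX10_3, family != CHIP_NAVI14 and min_good_cu_per_sa == 8.
 * With NGG enabled, no culling and no scratch, and assuming the register
 * field max does not clamp the value:
 */
#if 0
   unsigned late_alloc_wave64, cu_mask;
   ac_compute_late_alloc(&gpu, true /* ngg */, false /* ngg_culling */,
                         false /* uses_scratch */, &late_alloc_wave64, &cu_mask);
   /* late_alloc_wave64 == 8 * 4 == 32,
    * cu_mask == 0xfffd (CU1 disabled on non-GFX10 chips) */
#endif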