GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/panfrost/lib/pan_indirect_draw.c

/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <stdio.h>
#include "pan_bo.h"
#include "pan_shader.h"
#include "pan_scoreboard.h"
#include "pan_encoder.h"
#include "pan_indirect_draw.h"
#include "pan_pool.h"
#include "pan_util.h"
#include "panfrost-quirks.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_memory.h"
#include "util/macros.h"

#define WORD(x) ((x) * 4)

#define LOOP \
        for (nir_loop *l = nir_push_loop(b); l != NULL; \
             nir_pop_loop(b, l), l = NULL)
#define BREAK nir_jump(b, nir_jump_break)
#define CONTINUE nir_jump(b, nir_jump_continue)

#define IF(cond) nir_push_if(b, cond);
#define ELSE nir_push_else(b, NULL);
#define ENDIF nir_pop_if(b, NULL);

#define MIN_MAX_JOBS 128
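
/* A note on the helpers above: WORD(x) converts a 32-bit word index into a
 * byte offset within a descriptor. LOOP/BREAK/CONTINUE and IF/ELSE/ENDIF wrap
 * the nir_builder control-flow helpers (nir_push_loop()/nir_push_if()/
 * nir_pop_*()) so the shader-building code below reads like structured code;
 * they all expect a nir_builder named "b" in scope. MIN_MAX_JOBS is the
 * number of threads dispatched for the index min/max search (see
 * panfrost_emit_index_min_max_search()).
 */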

struct draw_data {
        nir_ssa_def *draw_buf;
        nir_ssa_def *draw_buf_stride;
        nir_ssa_def *index_buf;
        nir_ssa_def *restart_index;
        nir_ssa_def *vertex_count;
        nir_ssa_def *start_instance;
        nir_ssa_def *instance_count;
        nir_ssa_def *vertex_start;
        nir_ssa_def *index_bias;
        nir_ssa_def *draw_ctx;
        nir_ssa_def *min_max_ctx;
};

struct instance_size {
        nir_ssa_def *raw;
        nir_ssa_def *padded;
        nir_ssa_def *packed;
};

struct jobs_data {
        nir_ssa_def *vertex_job;
        nir_ssa_def *tiler_job;
        nir_ssa_def *base_vertex_offset;
        nir_ssa_def *first_vertex_sysval;
        nir_ssa_def *base_vertex_sysval;
        nir_ssa_def *base_instance_sysval;
        nir_ssa_def *offset_start;
        nir_ssa_def *invocation;
};

struct varyings_data {
        nir_ssa_def *varying_bufs;
        nir_ssa_def *pos_ptr;
        nir_ssa_def *psiz_ptr;
        nir_variable *mem_ptr;
};

struct attribs_data {
        nir_ssa_def *attrib_count;
        nir_ssa_def *attrib_bufs;
        nir_ssa_def *attribs;
};

struct indirect_draw_shader_builder {
        nir_builder b;
        const struct panfrost_device *dev;
        unsigned flags;
        bool index_min_max_search;
        unsigned index_size;
        struct draw_data draw;
        struct instance_size instance_size;
        struct jobs_data jobs;
        struct varyings_data varyings;
        struct attribs_data attribs;
};

/* Describes an indirect draw (see glDrawArraysIndirect()) */

struct indirect_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        uint32_t start_instance;
};

struct indirect_indexed_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        int32_t index_bias;
        uint32_t start_instance;
};
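
/* Both layouts follow the GL indirect draw command structs
 * (DrawArraysIndirectCommand/DrawElementsIndirectCommand): count, instance
 * count, first vertex/index, an optional base vertex (index_bias) for indexed
 * draws, and the base instance. The patching shader reads these fields
 * straight from the application-provided draw buffer via get_draw_field()/
 * get_indexed_draw_field().
 */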

/* Store the min/max index in a separate context. This is not supported yet, but
 * the DDK seems to put all min/max search jobs at the beginning of the job chain
 * when multiple indirect draws are issued to avoid the serialization caused by
 * the draw patching jobs which have the suppress_prefetch flag set. Merging the
 * min/max and draw contexts would prevent such optimizations (draw contexts are
 * shared by all indirect draws in a batch).
 */

struct min_max_context {
        uint32_t min;
        uint32_t max;
};

/* Per-batch context shared by all indirect draws queued to a given batch. */

struct indirect_draw_context {
        /* Pointer to the top of the varying heap. */
        mali_ptr varying_mem;
};

/* Indirect draw shader inputs. These are stored in a UBO. */

struct indirect_draw_inputs {
        /* indirect_draw_context pointer */
        mali_ptr draw_ctx;

        /* min_max_context pointer */
        mali_ptr min_max_ctx;

        /* Pointer to an array of indirect_draw_info objects */
        mali_ptr draw_buf;

        /* Pointer to a uint32_t containing the number of draws to issue */
        mali_ptr draw_count_ptr;

        /* index buffer */
        mali_ptr index_buf;

        /* {base,first}_{vertex,instance} sysvals */
        mali_ptr first_vertex_sysval;
        mali_ptr base_vertex_sysval;
        mali_ptr base_instance_sysval;

        /* Pointers to various cmdstream structs that need to be patched */
        mali_ptr vertex_job;
        mali_ptr tiler_job;
        mali_ptr attrib_bufs;
        mali_ptr attribs;
        mali_ptr varying_bufs;
        uint32_t draw_count;
        uint32_t draw_buf_stride;
        uint32_t restart_index;
        uint32_t attrib_count;
};

static nir_ssa_def *
get_input_data(nir_builder *b, unsigned offset, unsigned size)
{
        assert(!(offset & 0x3));
        assert(size && !(size & 0x3));

        return nir_load_ubo(b, 1, size,
                            nir_imm_int(b, 0),
                            nir_imm_int(b, offset),
                            .align_mul = 4,
                            .align_offset = 0,
                            .range_base = 0,
                            .range = ~0);
}

#define get_input_field(b, name) \
        get_input_data(b, offsetof(struct indirect_draw_inputs, name), \
                       sizeof(((struct indirect_draw_inputs *)0)->name) * 8)

static nir_ssa_def *
get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset)
{
        return nir_iadd(b, base, nir_u2u64(b, offset));
}

static nir_ssa_def *
get_address_imm(nir_builder *b, nir_ssa_def *base, unsigned offset)
{
        return get_address(b, base, nir_imm_int(b, offset));
}

static nir_ssa_def *
load_global(nir_builder *b, nir_ssa_def *addr, unsigned ncomps, unsigned bit_size)
{
        return nir_load_global(b, addr, 4, ncomps, bit_size);
}

static void
store_global(nir_builder *b, nir_ssa_def *addr,
             nir_ssa_def *value, unsigned ncomps)
{
        nir_store_global(b, addr, 4, value, (1 << ncomps) - 1);
}

static nir_ssa_def *
get_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.draw_ctx, offset),
                           1, size);
}

static void
set_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, nir_ssa_def *value, unsigned size)
{
        nir_builder *b = &builder->b;
        store_global(b,
                     get_address_imm(b, builder->draw.draw_ctx, offset),
                     value, 1);
}

#define get_draw_ctx_field(builder, name) \
        get_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

#define set_draw_ctx_field(builder, name, val) \
        set_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          val, \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

static nir_ssa_def *
get_min_max_ctx_data(struct indirect_draw_shader_builder *builder,
                     unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.min_max_ctx, offset),
                           1, size);
}

#define get_min_max_ctx_field(builder, name) \
        get_min_max_ctx_data(builder, \
                             offsetof(struct min_max_context, name), \
                             sizeof(((struct min_max_context *)0)->name) * 8)

static void
update_min(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, min));
        nir_global_atomic_umin(b, 32, addr, val);
}

static void
update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, max));
        nir_global_atomic_umax(b, 32, addr, val);
}

#define get_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_draw_info, field)), \
                    1, sizeof(((struct indirect_draw_info *)0)->field) * 8)

#define get_indexed_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_indexed_draw_info, field)), \
                    1, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8)

static void
extract_inputs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        builder->draw.draw_ctx = get_input_field(b, draw_ctx);
        builder->draw.draw_buf = get_input_field(b, draw_buf);
        builder->draw.draw_buf_stride = get_input_field(b, draw_buf_stride);

        if (builder->index_size) {
                builder->draw.index_buf = get_input_field(b, index_buf);
                builder->draw.min_max_ctx = get_input_field(b, min_max_ctx);
                if (builder->flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) {
                        builder->draw.restart_index =
                                get_input_field(b, restart_index);
                }
        }

        if (builder->index_min_max_search)
                return;

        builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval);
        builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval);
        builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval);
        builder->jobs.vertex_job = get_input_field(b, vertex_job);
        builder->jobs.tiler_job = get_input_field(b, tiler_job);
        builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs);
        builder->attribs.attribs = get_input_field(b, attribs);
        builder->attribs.attrib_count = get_input_field(b, attrib_count);
        builder->varyings.varying_bufs = get_input_field(b, varying_bufs);
        builder->varyings.mem_ptr =
                nir_local_variable_create(b->impl,
                                          glsl_uint64_t_type(),
                                          "var_mem_ptr");
        nir_store_var(b, builder->varyings.mem_ptr,
                      get_draw_ctx_field(builder, varying_mem), 3);
}

static void
init_shader_builder(struct indirect_draw_shader_builder *builder,
                    const struct panfrost_device *dev,
                    unsigned flags, unsigned index_size,
                    bool index_min_max_search)
{
        memset(builder, 0, sizeof(*builder));
        builder->dev = dev;
        builder->flags = flags;
        builder->index_size = index_size;

        builder->index_min_max_search = index_min_max_search;

        if (index_min_max_search) {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       pan_shader_get_compiler_options(dev),
                                                       "indirect_draw_min_max_index(index_size=%d)",
                                                       builder->index_size);
        } else {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       pan_shader_get_compiler_options(dev),
                                                       "indirect_draw(index_size=%d%s%s%s)",
                                                       builder->index_size,
                                                       flags & PAN_INDIRECT_DRAW_HAS_PSIZ ?
                                                       ",psiz" : "",
                                                       flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ?
                                                       ",primitive_restart" : "",
                                                       flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ?
                                                       ",update_primitive_size" : "");
        }

        nir_builder *b = &builder->b;
        b->shader->info.internal = true;
        nir_variable_create(b->shader, nir_var_mem_ubo,
                            glsl_uint_type(), "inputs");
        b->shader->info.num_ubos++;

        extract_inputs(builder);
}
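
/* update_job() patches a previously emitted vertex or tiler job in place: it
 * rewrites the INVOCATION words with the computed vertex/instance counts and,
 * for tiler jobs, also fixes up the PRIMITIVE, PRIMITIVE_SIZE and DRAW
 * sections (base vertex offset, index count, index buffer pointer, point-size
 * and position varying pointers, packed instance size and offset_start).
 */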

static void
update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *job_ptr =
                type == MALI_JOB_TYPE_VERTEX ?
                builder->jobs.vertex_job : builder->jobs.tiler_job;

        /* Update the invocation words. */
        store_global(b, get_address_imm(b, job_ptr, WORD(8)),
                     builder->jobs.invocation, 2);

        unsigned draw_offset =
                type == MALI_JOB_TYPE_VERTEX ?
                pan_section_offset(COMPUTE_JOB, DRAW) :
                pan_is_bifrost(builder->dev) ?
                pan_section_offset(BIFROST_TILER_JOB, DRAW) :
                pan_section_offset(MIDGARD_TILER_JOB, DRAW);
        unsigned prim_offset =
                pan_is_bifrost(builder->dev) ?
                pan_section_offset(BIFROST_TILER_JOB, PRIMITIVE) :
                pan_section_offset(MIDGARD_TILER_JOB, PRIMITIVE);
        unsigned psiz_offset =
                pan_is_bifrost(builder->dev) ?
                pan_section_offset(BIFROST_TILER_JOB, PRIMITIVE_SIZE) :
                pan_section_offset(MIDGARD_TILER_JOB, PRIMITIVE_SIZE);
        unsigned index_size = builder->index_size;

        if (type == MALI_JOB_TYPE_TILER) {
                /* Update PRIMITIVE.{base_vertex_offset,count} */
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(1)),
                             builder->jobs.base_vertex_offset, 1);
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(3)),
                             nir_iadd_imm(b, builder->draw.vertex_count, -1), 1);

                if (index_size) {
                        nir_ssa_def *addr =
                                get_address_imm(b, job_ptr, prim_offset + WORD(4));
                        nir_ssa_def *indices = load_global(b, addr, 1, 64);
                        nir_ssa_def *offset =
                                nir_imul_imm(b, builder->draw.vertex_start, index_size);

                        indices = get_address(b, indices, offset);
                        store_global(b, addr, indices, 2);
                }

                /* Update PRIMITIVE_SIZE.size_array */
                if ((builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) &&
                    (builder->flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE)) {
                        store_global(b,
                                     get_address_imm(b, job_ptr, psiz_offset + WORD(0)),
                                     builder->varyings.psiz_ptr, 2);
                }

                /* Update DRAW.position */
                store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(4)),
                             builder->varyings.pos_ptr, 2);
        }

        nir_ssa_def *draw_w01 =
                load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
        nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);

        /* Update DRAW.{instance_size,offset_start} */
        nir_ssa_def *instance_size =
                nir_bcsel(b,
                          nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)),
                          nir_imm_int(b, 0), builder->instance_size.packed);
        draw_w01 = nir_vec2(b,
                            nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
                                    nir_ishl(b, instance_size, nir_imm_int(b, 16))),
                            builder->jobs.offset_start);
        store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
                     draw_w01, 2);
}

static void
split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d)
{
        /* TODO: Lower this 64bit div to something GPU-friendly */
        nir_ssa_def *r = nir_imax(b, nir_ufind_msb(b, div), nir_imm_int(b, 0));
        nir_ssa_def *div64 = nir_u2u64(b, div);
        nir_ssa_def *half_div64 = nir_u2u64(b, nir_ushr_imm(b, div, 1));
        nir_ssa_def *f0 = nir_iadd(b,
                                   nir_ishl(b, nir_imm_int64(b, 1),
                                            nir_iadd_imm(b, r, 32)),
                                   half_div64);
        nir_ssa_def *fi = nir_idiv(b, f0, div64);
        nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64));
        nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff),
                                   nir_imm_int(b, 1 << 5), nir_imm_int(b, 0));
        *d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31));
        *r_e = nir_ior(b, r, e);
}
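
/* split_div() produces the encoding expected by NPOT instance divisors:
 * r is the divisor's MSB position, d is the fixed-point reciprocal
 * (2^(32 + r) + div/2) / div with its top bit cleared, and e (bit 5 of r_e)
 * is set when the remainder of that division exceeds div/2. Presumably the
 * hardware then evaluates index / div as a 32x32 multiply-high by d followed
 * by a shift by r, with e selecting an extra rounding step.
 */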

static void
update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder,
                         nir_ssa_def *attrib_buf_ptr,
                         enum mali_attribute_type type,
                         nir_ssa_def *div1,
                         nir_ssa_def *div2)
{
        nir_builder *b = &builder->b;
        unsigned type_mask = BITFIELD_MASK(6);
        nir_ssa_def *w01 = load_global(b, attrib_buf_ptr, 2, 32);
        nir_ssa_def *w0 = nir_channel(b, w01, 0);
        nir_ssa_def *w1 = nir_channel(b, w01, 1);

        /* Words 0 and 1 of the attribute descriptor contain the type,
         * pointer and the divisor exponent.
         */
        w0 = nir_iand_imm(b, nir_channel(b, w01, 0), ~type_mask);
        w0 = nir_ior(b, w0, nir_imm_int(b, type));
        w1 = nir_ior(b, w1, nir_ishl(b, div1, nir_imm_int(b, 24)));

        store_global(b, attrib_buf_ptr, nir_vec2(b, w0, w1), 2);

        if (type == MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR) {
                /* If the divisor is not a power of two, the divisor numerator
                 * is passed in word 1 of the continuation attribute (word 5
                 * if we consider the attribute and its continuation as a
                 * single attribute).
                 */
                assert(div2);
                store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(5)),
                             div2, 1);
        }
}

static void
zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder,
                       nir_ssa_def *attrib_buf_ptr)
{
        /* Stride is an unadorned 32-bit uint at word 2 */
        nir_builder *b = &builder->b;
        store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)),
                     nir_imm_int(b, 0), 1);
}

static void
adjust_attrib_offset(struct indirect_draw_shader_builder *builder,
                     nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr,
                     nir_ssa_def *instance_div)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *two = nir_imm_int(b, 2);
        nir_ssa_def *sub_cur_offset =
                nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero),
                         nir_uge(b, builder->draw.instance_count, two));

        nir_ssa_def *add_base_inst_offset =
                nir_iand(b, nir_ine(b, builder->draw.start_instance, zero),
                         nir_ine(b, instance_div, zero));

        IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) {
                nir_ssa_def *offset =
                        load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
                nir_ssa_def *stride =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);

                /* Per-instance data needs to be offset in response to a
                 * delayed start in an indexed draw.
                 */

                IF (add_base_inst_offset) {
                        offset = nir_iadd(b, offset,
                                          nir_idiv(b,
                                                   nir_imul(b, stride,
                                                            builder->draw.start_instance),
                                                   instance_div));
                } ENDIF

                IF (sub_cur_offset) {
                        offset = nir_isub(b, offset,
                                          nir_imul(b, stride,
                                                   builder->jobs.offset_start));
                } ENDIF

                store_global(b, get_address_imm(b, attrib_ptr, WORD(1)),
                             offset, 1);
        } ENDIF
}

/* x is power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */

static nir_ssa_def *
nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x)
{
        return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2));
}

/* Based on panfrost_emit_vertex_data() */

static void
update_vertex_attribs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_variable *attrib_idx_var =
                nir_local_variable_create(b->impl, glsl_uint_type(),
                                          "attrib_idx");
        nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1);
        nir_ssa_def *single_instance =
                nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2));

        LOOP {
                nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var);
                IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count))
                        BREAK;
                ENDIF

                nir_ssa_def *attrib_buf_ptr =
                        get_address(b, builder->attribs.attrib_bufs,
                                    nir_imul_imm(b, attrib_idx,
                                                 2 * MALI_ATTRIBUTE_BUFFER_LENGTH));
                nir_ssa_def *attrib_ptr =
                        get_address(b, builder->attribs.attribs,
                                    nir_imul_imm(b, attrib_idx,
                                                 MALI_ATTRIBUTE_LENGTH));

                nir_ssa_def *r_e, *d;

                if (!pan_is_bifrost(builder->dev)) {
                        IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
                                nir_ssa_def *r_p =
                                        nir_bcsel(b, single_instance,
                                                  nir_imm_int(b, 0x9f),
                                                  builder->instance_size.packed);

                                store_global(b,
                                             get_address_imm(b, attrib_buf_ptr, WORD(4)),
                                             nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);

                                nir_store_var(b, attrib_idx_var,
                                              nir_iadd_imm(b, attrib_idx, 1), 1);
                                CONTINUE;
                        } ENDIF

                        IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
                                split_div(b, builder->instance_size.padded,
                                          &r_e, &d);
                                nir_ssa_def *default_div =
                                        nir_ior(b, single_instance,
                                                nir_ult(b,
                                                        builder->instance_size.padded,
                                                        nir_imm_int(b, 2)));
                                r_e = nir_bcsel(b, default_div,
                                                nir_imm_int(b, 0x3f), r_e);
                                d = nir_bcsel(b, default_div,
                                              nir_imm_int(b, (1u << 31) - 1), d);
                                store_global(b,
                                             get_address_imm(b, attrib_buf_ptr, WORD(1)),
                                             nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
                                             2);
                                nir_store_var(b, attrib_idx_var,
                                              nir_iadd_imm(b, attrib_idx, 1), 1);
                                CONTINUE;
                        } ENDIF
                }

                nir_ssa_def *instance_div =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32);

                nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded);

                nir_ssa_def *multi_instance =
                        nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2));

                IF (nir_ine(b, div, nir_imm_int(b, 0))) {
                        IF (multi_instance) {
                                IF (nir_is_power_of_two_or_zero(b, div)) {
                                        nir_ssa_def *exp =
                                                nir_imax(b, nir_ufind_msb(b, div),
                                                         nir_imm_int(b, 0));
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR,
                                                                 exp, NULL);
                                } ELSE {
                                        split_div(b, div, &r_e, &d);
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR,
                                                                 r_e, d);
                                } ENDIF
                        } ELSE {
                                /* Single instance with a non-0 divisor: all
                                 * accesses should point to attribute 0 */
                                zero_attrib_buf_stride(builder, attrib_buf_ptr);
                        } ENDIF

                        adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div);
                } ELSE IF (multi_instance) {
                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                 MALI_ATTRIBUTE_TYPE_1D_MODULUS,
                                                 builder->instance_size.packed, NULL);
                } ENDIF ENDIF

                nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1);
        }
}

static nir_ssa_def *
update_varying_buf(struct indirect_draw_shader_builder *builder,
                   nir_ssa_def *varying_buf_ptr,
                   nir_ssa_def *vertex_count)
{
        nir_builder *b = &builder->b;

        nir_ssa_def *stride =
                load_global(b, get_address_imm(b, varying_buf_ptr, WORD(2)), 1, 32);
        nir_ssa_def *size = nir_imul(b, stride, vertex_count);
        nir_ssa_def *aligned_size =
                nir_iand_imm(b, nir_iadd_imm(b, size, 63), ~63);
        nir_ssa_def *var_mem_ptr =
                nir_load_var(b, builder->varyings.mem_ptr);
        nir_ssa_def *w0 =
                nir_ior(b, nir_unpack_64_2x32_split_x(b, var_mem_ptr),
                        nir_imm_int(b, MALI_ATTRIBUTE_TYPE_1D));
        nir_ssa_def *w1 = nir_unpack_64_2x32_split_y(b, var_mem_ptr);
        store_global(b, get_address_imm(b, varying_buf_ptr, WORD(0)),
                     nir_vec4(b, w0, w1, stride, size), 4);

        nir_store_var(b, builder->varyings.mem_ptr,
                      get_address(b, var_mem_ptr, aligned_size), 3);

        return var_mem_ptr;
}

/* Based on panfrost_emit_varying_descriptor() */

static void
update_varyings(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *vertex_count =
                nir_imul(b, builder->instance_size.padded,
                         builder->draw.instance_count);
        nir_ssa_def *buf_ptr =
                get_address_imm(b, builder->varyings.varying_bufs,
                                PAN_VARY_GENERAL *
                                MALI_ATTRIBUTE_BUFFER_LENGTH);
        update_varying_buf(builder, buf_ptr, vertex_count);

        buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                  PAN_VARY_POSITION *
                                  MALI_ATTRIBUTE_BUFFER_LENGTH);
        builder->varyings.pos_ptr =
                update_varying_buf(builder, buf_ptr, vertex_count);

        if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) {
                buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                          PAN_VARY_PSIZ *
                                          MALI_ATTRIBUTE_BUFFER_LENGTH);
                builder->varyings.psiz_ptr =
                        update_varying_buf(builder, buf_ptr, vertex_count);
        }

        set_draw_ctx_field(builder, varying_mem,
                           nir_load_var(b, builder->varyings.mem_ptr));
}

/* Based on panfrost_pack_work_groups_compute() */

static void
get_invocation(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *max_vertex =
                nir_usub_sat(b, builder->instance_size.raw, one);
        nir_ssa_def *max_instance =
                nir_usub_sat(b, builder->draw.instance_count, one);
        nir_ssa_def *split =
                nir_bcsel(b, nir_ieq_imm(b, max_instance, 0),
                          nir_imm_int(b, 32),
                          nir_iadd_imm(b, nir_ufind_msb(b, max_vertex), 1));

        builder->jobs.invocation =
                nir_vec2(b,
                         nir_ior(b, max_vertex,
                                 nir_ishl(b, max_instance, split)),
                         nir_ior(b, nir_ishl(b, split, nir_imm_int(b, 22)),
                                 nir_imm_int(b, 2 << 28)));
}
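
/* The invocation words built here match the layout produced by
 * panfrost_pack_work_groups_compute(): the first word packs the maximum
 * vertex index in the low bits and the maximum instance index shifted left by
 * "split" (the number of bits needed for the vertex count, or 32 when there
 * is a single instance), and the second word stores the split position at
 * bit 22 along with a constant in the top bits, presumably so the job
 * frontend can derive per-invocation vertex/instance IDs.
 */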

/* Based on panfrost_padded_vertex_count() */

static nir_ssa_def *
get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed)
{
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *eleven = nir_imm_int(b, 11);
        nir_ssa_def *four = nir_imm_int(b, 4);

        nir_ssa_def *exp =
                nir_usub_sat(b, nir_imax(b, nir_ufind_msb(b, val), zero), four);
        nir_ssa_def *base = nir_ushr(b, val, exp);

        base = nir_iadd(b, base,
                        nir_bcsel(b, nir_ine(b, val, nir_ishl(b, base, exp)), one, zero));

        nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);
        base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero));
        rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);

        *packed = nir_ior(b, exp,
                          nir_ishl(b, nir_ushr_imm(b, base, 1), nir_imm_int(b, 5)));
        return nir_ishl(b, base, exp);
}
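
/* The padded count returned above is an odd base (at most 15) times a power
 * of two, which is the form the hardware expects for instanced draws; the
 * packed form stores the exponent in the low 5 bits and (base >> 1) above it.
 * For example, a raw count of 17 yields base 9 and exponent 1: a padded count
 * of 18 and a packed value of 0x81.
 */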

static void
update_jobs(struct indirect_draw_shader_builder *builder)
{
        get_invocation(builder);
        update_job(builder, MALI_JOB_TYPE_VERTEX);
        update_job(builder, MALI_JOB_TYPE_TILER);
}

static void
get_instance_size(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        if (!builder->index_size) {
                builder->jobs.base_vertex_offset = nir_imm_int(b, 0);
                builder->jobs.offset_start = builder->draw.vertex_start;
                builder->instance_size.raw = builder->draw.vertex_count;
                return;
        }

        unsigned index_size = builder->index_size;
        nir_ssa_def *min = get_min_max_ctx_field(builder, min);
        nir_ssa_def *max = get_min_max_ctx_field(builder, max);

        /* We handle unaligned indices here to avoid the extra complexity in
         * the min/max search job.
         */
        if (builder->index_size < 4) {
                nir_variable *min_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "min");
                nir_store_var(b, min_var, min, 1);
                nir_variable *max_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "max");
                nir_store_var(b, max_var, max, 1);

                nir_ssa_def *base =
                        get_address(b, builder->draw.index_buf,
                                    nir_imul_imm(b, builder->draw.vertex_start, index_size));
                nir_ssa_def *offset = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
                nir_ssa_def *end =
                        nir_iadd(b, offset,
                                 nir_imul_imm(b, builder->draw.vertex_count, index_size));
                nir_ssa_def *aligned_end = nir_iand_imm(b, end, ~3);
                unsigned shift = index_size * 8;
                unsigned mask = (1 << shift) - 1;

                base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

                /* Unaligned start offset: we need to ignore any data that's
                 * outside the requested range. We also handle ranges covering
                 * less than two words here.
                 */
                IF (nir_ior(b, nir_ine(b, offset, nir_imm_int(b, 0)), nir_ieq(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, base, 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob =
                                        nir_ior(b,
                                                nir_ult(b, nir_imm_int(b, i), offset),
                                                nir_uge(b, nir_imm_int(b, i), end));
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                nir_ssa_def *remaining = nir_isub(b, end, aligned_end);

                /* The last word contains less than 4 bytes of data; we need to
                 * discard anything falling outside the requested range.
                 */
                IF (nir_iand(b, nir_ine(b, end, aligned_end), nir_ine(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining);
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                min = nir_load_var(b, min_var);
                max = nir_load_var(b, max_var);
        }

        builder->jobs.base_vertex_offset = nir_ineg(b, min);
        builder->jobs.offset_start = nir_iadd(b, min, builder->draw.index_bias);
        builder->instance_size.raw = nir_iadd_imm(b, nir_usub_sat(b, max, min), 1);
}

/* Patch a draw sequence */

static void
patch(struct indirect_draw_shader_builder *builder)
{
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        if (index_size) {
                builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count =
                        get_indexed_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start);
                builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias);
        } else {
                builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
        }

        assert(builder->draw.vertex_count->num_components);

        get_instance_size(builder);

        builder->instance_size.padded =
                get_padded_count(b, builder->instance_size.raw,
                                 &builder->instance_size.packed);

        update_varyings(builder);
        update_jobs(builder);
        update_vertex_attribs(builder);

        IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) {
                store_global(b, builder->jobs.first_vertex_sysval,
                             builder->jobs.offset_start, 1);
        } ENDIF

        IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) {
                store_global(b, builder->jobs.base_vertex_sysval,
                             index_size ?
                             builder->draw.index_bias :
                             nir_imm_int(b, 0),
                             1);
        } ENDIF

        IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) {
                store_global(b, builder->jobs.base_instance_sysval,
                             builder->draw.start_instance, 1);
        } ENDIF

}

/* Search the min/max index in the range covered by the indirect draw call */
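/* The search is dispatched as a compute job with MIN_MAX_JOBS threads (see
 * panfrost_emit_index_min_max_search()): each thread scans every
 * MIN_MAX_JOBS-th 32-bit word of the index buffer starting at its own offset
 * and folds its local result into the shared min_max_context with the global
 * atomics in update_min()/update_max().
 */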

static void
get_index_min_max(struct indirect_draw_shader_builder *builder)
{
        nir_ssa_def *restart_index = builder->draw.restart_index;
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
        builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);

        nir_ssa_def *thread_id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
        nir_variable *min_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "min");
        nir_store_var(b, min_var, nir_imm_int(b, UINT32_MAX), 1);
        nir_variable *max_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "max");
        nir_store_var(b, max_var, nir_imm_int(b, 0), 1);

        nir_ssa_def *base =
                get_address(b, builder->draw.index_buf,
                            nir_imul_imm(b, builder->draw.vertex_start, index_size));

        nir_ssa_def *start = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
        nir_ssa_def *end =
                nir_iadd(b, start, nir_imul_imm(b, builder->draw.vertex_count, index_size));

        base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

        /* Align on 4 bytes, non-aligned indices are handled in the indirect draw job. */
        start = nir_iand_imm(b, nir_iadd_imm(b, start, 3), ~3);
        end = nir_iand_imm(b, end, ~3);

        /* Add the job offset. */
        start = nir_iadd(b, start, nir_imul_imm(b, thread_id, sizeof(uint32_t)));

        nir_variable *offset_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "offset");
        nir_store_var(b, offset_var, start, 1);

        LOOP {
                nir_ssa_def *offset = nir_load_var(b, offset_var);
                IF (nir_uge(b, offset, end))
                        BREAK;
                ENDIF

                nir_ssa_def *val = load_global(b, get_address(b, base, offset), 1, 32);
                nir_ssa_def *old_min = nir_load_var(b, min_var);
                nir_ssa_def *old_max = nir_load_var(b, max_var);
                nir_ssa_def *new_min;
                nir_ssa_def *new_max;

                /* TODO: use 8/16 bit arithmetic when index_size < 4. */
                for (unsigned i = 0; i < 4; i += index_size) {
                        nir_ssa_def *data = nir_ushr_imm(b, val, i * 8);
                        data = nir_iand_imm(b, data, (1ULL << (index_size * 8)) - 1);
                        new_min = nir_umin(b, old_min, data);
                        new_max = nir_umax(b, old_max, data);
                        if (restart_index) {
                                new_min = nir_bcsel(b, nir_ine(b, restart_index, data), new_min, old_min);
                                new_max = nir_bcsel(b, nir_ine(b, restart_index, data), new_max, old_max);
                        }
                        old_min = new_min;
                        old_max = new_max;
                }

                nir_store_var(b, min_var, new_min, 1);
                nir_store_var(b, max_var, new_max, 1);
                nir_store_var(b, offset_var,
                              nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1);
        }

        IF (nir_ult(b, start, end))
                update_min(builder, nir_load_var(b, min_var));
                update_max(builder, nir_load_var(b, max_var));
        ENDIF
}

static unsigned
get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search)
{
        if (!index_min_max_search) {
                flags &= PAN_INDIRECT_DRAW_FLAGS_MASK;
                flags &= ~PAN_INDIRECT_DRAW_INDEX_SIZE_MASK;
                if (index_size)
                        flags |= (util_logbase2(index_size) + 1);
                return flags;
        }

        return PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX +
               util_logbase2(index_size);
}

static void
create_indirect_draw_shader(struct panfrost_device *dev,
                            unsigned flags, unsigned index_size,
                            bool index_min_max_search)
{
        assert(flags < PAN_INDIRECT_DRAW_NUM_SHADERS);
        struct indirect_draw_shader_builder builder;
        init_shader_builder(&builder, dev, flags, index_size, index_min_max_search);

        nir_builder *b = &builder.b;

        if (index_min_max_search)
                get_index_min_max(&builder);
        else
                patch(&builder);

        struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id };
        struct pan_shader_info shader_info;
        struct util_dynarray binary;

        util_dynarray_init(&binary, NULL);
        pan_shader_compile(dev, b->shader, &inputs, &binary, &shader_info);

        assert(!shader_info.tls_size);
        assert(!shader_info.wls_size);
        assert(!shader_info.sysvals.sysval_count);

        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *draw_shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        void *state = dev->indirect_draw_shaders.states->ptr.cpu +
                      (shader_id * MALI_RENDERER_STATE_LENGTH);

        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (!draw_shader->rsd) {
                mali_ptr address =
                        pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool,
                                                binary.data, binary.size,
                                                pan_is_bifrost(dev) ? 128 : 64);
                if (!pan_is_bifrost(dev))
                        address |= shader_info.midgard.first_tag;

                util_dynarray_fini(&binary);

                pan_pack(state, RENDERER_STATE, cfg) {
                        pan_shader_prepare_rsd(dev, &shader_info, address, &cfg);
                }

                draw_shader->push = shader_info.push;
                draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu +
                                   (shader_id * MALI_RENDERER_STATE_LENGTH);
        }
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);

        ralloc_free(b->shader);
}

static mali_ptr
get_renderer_state(struct panfrost_device *dev, unsigned flags,
                   unsigned index_size, bool index_min_max_search)
{
        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *info =
                &dev->indirect_draw_shaders.shaders[shader_id];

        if (!info->rsd) {
                create_indirect_draw_shader(dev, flags, index_size,
                                            index_min_max_search);
                assert(info->rsd);
        }

        return info->rsd;
}

static mali_ptr
get_tls(const struct panfrost_device *dev)
{
        return dev->indirect_draw_shaders.states->ptr.gpu +
               (PAN_INDIRECT_DRAW_NUM_SHADERS * MALI_RENDERER_STATE_LENGTH);
}

static mali_ptr
get_ubos(struct pan_pool *pool,
         const struct indirect_draw_inputs *inputs)
{
        struct panfrost_ptr inputs_buf =
                pan_pool_alloc_aligned(pool, sizeof(*inputs), 16);

        memcpy(inputs_buf.cpu, inputs, sizeof(*inputs));

        struct panfrost_ptr ubos_buf =
                pan_pool_alloc_desc(pool, UNIFORM_BUFFER);

        pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) {
                cfg.entries = DIV_ROUND_UP(sizeof(*inputs), 16);
                cfg.pointer = inputs_buf.gpu;
        }

        return ubos_buf.gpu;
}

static mali_ptr
get_push_uniforms(struct pan_pool *pool,
                  const struct pan_indirect_draw_shader *shader,
                  const struct indirect_draw_inputs *inputs)
{
        if (!shader->push.count)
                return 0;

        struct panfrost_ptr push_consts_buf =
                pan_pool_alloc_aligned(pool, shader->push.count * 4, 16);
        uint32_t *out = push_consts_buf.cpu;
        uint8_t *in = (uint8_t *)inputs;

        for (unsigned i = 0; i < shader->push.count; ++i)
                memcpy(out + i, in + shader->push.words[i].offset, 4);

        return push_consts_buf.gpu;
}

static void
panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
{
        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (dev->indirect_draw_shaders.states)
                goto out;

        unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS *
                                  MALI_RENDERER_STATE_LENGTH) +
                                 MALI_LOCAL_STORAGE_LENGTH;

        dev->indirect_draw_shaders.states =
                panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states");

        /* Prepare the thread storage descriptor now since it's invariant. */
        void *tsd = dev->indirect_draw_shaders.states->ptr.cpu +
                    (PAN_INDIRECT_DRAW_NUM_SHADERS * MALI_RENDERER_STATE_LENGTH);
        pan_pack(tsd, LOCAL_STORAGE, ls) {
                ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
        };

        /* FIXME: We currently allocate 512MB of growable memory, so we only
         * commit what we really use, but there are problems:
         * - allocation happens 2MB at a time, which might be more than we
         *   actually need
         * - the memory is attached to the device to speed up subsequent
         *   indirect draws, but that also means it's never shrunk
         */
        dev->indirect_draw_shaders.varying_heap =
                panfrost_bo_create(dev, 512 * 1024 * 1024,
                                   PAN_BO_INVISIBLE | PAN_BO_GROWABLE,
                                   "Indirect draw varying heap");

out:
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
}

static unsigned
panfrost_emit_index_min_max_search(struct pan_pool *pool,
                                   struct pan_scoreboard *scoreboard,
                                   const struct pan_indirect_draw_info *draw_info,
                                   const struct indirect_draw_inputs *inputs,
                                   struct indirect_draw_context *draw_ctx,
                                   mali_ptr ubos)
{
        struct panfrost_device *dev = pool->dev;
        unsigned index_size = draw_info->index_size;

        if (!index_size)
                return 0;

        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, true);
        unsigned shader_id =
                get_shader_id(draw_info->flags, draw_info->index_size, true);
        const struct pan_indirect_draw_shader *shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, MIN_MAX_JOBS, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 7;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.draw_descriptor_is_64b = true;
                cfg.texture_descriptor_is_64b = !pan_is_bifrost(dev);
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.uniform_buffers = ubos;
                cfg.push_uniforms = get_push_uniforms(pool, shader, inputs);
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW_PADDING, cfg);

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, false, 0, 0, &job, false);
}

unsigned
panfrost_emit_indirect_draw(struct pan_pool *pool,
                            struct pan_scoreboard *scoreboard,
                            const struct pan_indirect_draw_info *draw_info,
                            struct panfrost_ptr *ctx)
{
        struct panfrost_device *dev = pool->dev;

        /* Currently only tested on Bifrost, but the logic should be the same
         * on Midgard.
         */
        assert(pan_is_bifrost(dev));

        panfrost_indirect_draw_alloc_deps(dev);

        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, false);

        struct indirect_draw_context draw_ctx = {
                .varying_mem = dev->indirect_draw_shaders.varying_heap->ptr.gpu,
        };

        struct panfrost_ptr draw_ctx_ptr = *ctx;
        if (!draw_ctx_ptr.cpu) {
                draw_ctx_ptr = pan_pool_alloc_aligned(pool,
                                                      sizeof(draw_ctx),
                                                      sizeof(mali_ptr));
        }

        struct indirect_draw_inputs inputs = {
                .draw_ctx = draw_ctx_ptr.gpu,
                .draw_buf = draw_info->draw_buf,
                .index_buf = draw_info->index_buf,
                .first_vertex_sysval = draw_info->first_vertex_sysval,
                .base_vertex_sysval = draw_info->base_vertex_sysval,
                .base_instance_sysval = draw_info->base_instance_sysval,
                .vertex_job = draw_info->vertex_job,
                .tiler_job = draw_info->tiler_job,
                .attrib_bufs = draw_info->attrib_bufs,
                .attribs = draw_info->attribs,
                .varying_bufs = draw_info->varying_bufs,
                .attrib_count = draw_info->attrib_count,
        };

        if (draw_info->index_size) {
                inputs.restart_index = draw_info->restart_index;

                struct panfrost_ptr min_max_ctx_ptr =
                        pan_pool_alloc_aligned(pool,
                                               sizeof(struct min_max_context),
                                               4);
                struct min_max_context *ctx = min_max_ctx_ptr.cpu;

                ctx->min = UINT32_MAX;
                ctx->max = 0;
                inputs.min_max_ctx = min_max_ctx_ptr.gpu;
        }

        unsigned shader_id =
                get_shader_id(draw_info->flags, draw_info->index_size, false);
        const struct pan_indirect_draw_shader *shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        mali_ptr ubos = get_ubos(pool, &inputs);

        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, 1, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 2;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.draw_descriptor_is_64b = true;
                cfg.texture_descriptor_is_64b = !pan_is_bifrost(dev);
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.uniform_buffers = ubos;
                cfg.push_uniforms = get_push_uniforms(pool, shader, &inputs);
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW_PADDING, cfg);

        unsigned global_dep = draw_info->last_indirect_draw;
        unsigned local_dep =
                panfrost_emit_index_min_max_search(pool, scoreboard, draw_info,
                                                   &inputs, &draw_ctx, ubos);

        if (!ctx->cpu) {
                *ctx = draw_ctx_ptr;
                memcpy(ctx->cpu, &draw_ctx, sizeof(draw_ctx));
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, true, local_dep, global_dep,
                                &job, false);
}

void
panfrost_init_indirect_draw_shaders(struct panfrost_device *dev,
                                    struct pan_pool *bin_pool)
{
        /* We allocate the states and varying_heap BO lazily to avoid
         * reserving memory when indirect draws are not used.
         */
        pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL);
        dev->indirect_draw_shaders.bin_pool = bin_pool;
}

void
panfrost_cleanup_indirect_draw_shaders(struct panfrost_device *dev)
{
        panfrost_bo_unreference(dev->indirect_draw_shaders.states);
        panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap);
        pthread_mutex_destroy(&dev->indirect_draw_shaders.lock);
}