CoCalc -- pan_cmdstream.c

GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/gallium/drivers/panfrost/pan_cmdstream.c
⁴⁵⁷⁰ views
1
/*
2
 * Copyright (C) 2018 Alyssa Rosenzweig
3
 * Copyright (C) 2020 Collabora Ltd.
4
 * Copyright © 2017 Intel Corporation
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a
7
 * copy of this software and associated documentation files (the "Software"),
8
 * to deal in the Software without restriction, including without limitation
9
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10
 * and/or sell copies of the Software, and to permit persons to whom the
11
 * Software is furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice (including the next
14
 * paragraph) shall be included in all copies or substantial portions of the
15
 * Software.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
 * SOFTWARE.
24
 */
25

26
#include "util/macros.h"
27
#include "util/u_prim.h"
28
#include "util/u_vbuf.h"
29
#include "util/u_helpers.h"
30
#include "util/u_draw.h"
31
#include "util/u_memory.h"
32
#include "pipe/p_defines.h"
33
#include "pipe/p_state.h"
34
#include "indices/u_primconvert.h"
35
#include "gallium/auxiliary/util/u_blend.h"
36

37
#include "panfrost-quirks.h"
38

39
#include "pan_pool.h"
40
#include "pan_bo.h"
41
#include "pan_context.h"
42
#include "pan_job.h"
43
#include "pan_shader.h"
44
#include "pan_texture.h"
45
#include "pan_util.h"
46
#include "pan_indirect_draw.h"
47
#include "pan_indirect_dispatch.h"
48
#include "pan_blitter.h"
49

50
#include "midgard_pack.h"
51

52
/* Statically assert that PIPE_* enums match the hardware enums.
53
 * (As long as they match, we don't need to translate them.)
54
 */
55
UNUSED static void
56
pan_pipe_asserts()
57
{
58
#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
59

60
        /* Compare functions are natural in both Gallium and Mali */
61
        PIPE_ASSERT(PIPE_FUNC_NEVER    == MALI_FUNC_NEVER);
62
        PIPE_ASSERT(PIPE_FUNC_LESS     == MALI_FUNC_LESS);
63
        PIPE_ASSERT(PIPE_FUNC_EQUAL    == MALI_FUNC_EQUAL);
64
        PIPE_ASSERT(PIPE_FUNC_LEQUAL   == MALI_FUNC_LEQUAL);
65
        PIPE_ASSERT(PIPE_FUNC_GREATER  == MALI_FUNC_GREATER);
66
        PIPE_ASSERT(PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL);
67
        PIPE_ASSERT(PIPE_FUNC_GEQUAL   == MALI_FUNC_GEQUAL);
68
        PIPE_ASSERT(PIPE_FUNC_ALWAYS   == MALI_FUNC_ALWAYS);
69
}
70

71
static inline enum mali_sample_pattern
72
panfrost_sample_pattern(unsigned samples)
73
{
74
        switch (samples) {
75
        case 1:  return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED;
76
        case 4:  return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID;
77
        case 8:  return MALI_SAMPLE_PATTERN_D3D_8X_GRID;
78
        case 16: return MALI_SAMPLE_PATTERN_D3D_16X_GRID;
79
        default: unreachable("Unsupported sample count");
80
        }
81
}
82

83
/* Gets a GPU address for the associated index buffer. Only gauranteed to be
84
 * good for the duration of the draw (transient), could last longer. Also get
85
 * the bounds on the index buffer for the range accessed by the draw. We do
86
 * these operations together because there are natural optimizations which
87
 * require them to be together. */
88

89
static mali_ptr
90
panfrost_get_index_buffer_bounded(struct panfrost_batch *batch,
91
                                  const struct pipe_draw_info *info,
92
                                  const struct pipe_draw_start_count_bias *draw,
93
                                  unsigned *min_index, unsigned *max_index)
94
{
95
        struct panfrost_resource *rsrc = pan_resource(info->index.resource);
96
        struct panfrost_context *ctx = batch->ctx;
97
        off_t offset = draw->start * info->index_size;
98
        bool needs_indices = true;
99
        mali_ptr out = 0;
100

101
        if (info->index_bounds_valid) {
102
                *min_index = info->min_index;
103
                *max_index = info->max_index;
104
                needs_indices = false;
105
        }
106

107
        if (!info->has_user_indices) {
108
                /* Only resources can be directly mapped */
109
                panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
110
                out = rsrc->image.data.bo->ptr.gpu + offset;
111

112
                /* Check the cache */
113
                needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
114
                                                           draw->start,
115
                                                           draw->count,
116
                                                           min_index,
117
                                                           max_index);
118
        } else {
119
                /* Otherwise, we need to upload to transient memory */
120
                const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
121
                struct panfrost_ptr T =
122
                        pan_pool_alloc_aligned(&batch->pool.base,
123
                                               draw->count *
124
                                               info->index_size,
125
                                               info->index_size);
126

127
                memcpy(T.cpu, ibuf8 + offset, draw->count * info->index_size);
128
                out = T.gpu;
129
        }
130

131
        if (needs_indices) {
132
                /* Fallback */
133
                u_vbuf_get_minmax_index(&ctx->base, info, draw, min_index, max_index);
134

135
                if (!info->has_user_indices)
136
                        panfrost_minmax_cache_add(rsrc->index_cache,
137
                                                  draw->start, draw->count,
138
                                                  *min_index, *max_index);
139
        }
140

141
        return out;
142
}
143

144
static unsigned
145
translate_tex_wrap(enum pipe_tex_wrap w, bool supports_clamp, bool using_nearest)
146
{
147
        /* Bifrost doesn't support the GL_CLAMP wrap mode, so instead use
148
         * CLAMP_TO_EDGE and CLAMP_TO_BORDER. On Midgard, CLAMP is broken for
149
         * nearest filtering, so use CLAMP_TO_EDGE in that case. */
150

151
        switch (w) {
152
        case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
153
        case PIPE_TEX_WRAP_CLAMP:
154
                return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE :
155
                     (supports_clamp ? MALI_WRAP_MODE_CLAMP :
156
                                       MALI_WRAP_MODE_CLAMP_TO_BORDER);
157
        case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
158
        case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
159
        case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
160
        case PIPE_TEX_WRAP_MIRROR_CLAMP:
161
                return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE :
162
                     (supports_clamp ? MALI_WRAP_MODE_MIRRORED_CLAMP :
163
                                       MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER);
164
        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
165
        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
166
        default: unreachable("Invalid wrap");
167
        }
168
}
169

170
/* The hardware compares in the wrong order order, so we have to flip before
171
 * encoding. Yes, really. */
172

173
static enum mali_func
174
panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
175
{
176
        return !cso->compare_mode ? MALI_FUNC_NEVER :
177
                panfrost_flip_compare_func((enum mali_func) cso->compare_func);
178
}
179

180
static enum mali_mipmap_mode
181
pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
182
{
183
        switch (f) {
184
        case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
185
        case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
186
        case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
187
        default: unreachable("Invalid");
188
        }
189
}
190

191
static void
192
panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
193
                           struct mali_midgard_sampler_packed *hw)
194
{
195
        bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST;
196

197
        pan_pack(hw, MIDGARD_SAMPLER, cfg) {
198
                cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
199
                cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
200
                cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
201
                        MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
202
                cfg.normalized_coordinates = cso->normalized_coords;
203

204
                cfg.lod_bias = FIXED_16(cso->lod_bias, true);
205

206
                cfg.minimum_lod = FIXED_16(cso->min_lod, false);
207

208
                /* If necessary, we disable mipmapping in the sampler descriptor by
209
                 * clamping the LOD as tight as possible (from 0 to epsilon,
210
                 * essentially -- remember these are fixed point numbers, so
211
                 * epsilon=1/256) */
212

213
                cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
214
                        cfg.minimum_lod + 1 :
215
                        FIXED_16(cso->max_lod, false);
216

217
                cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, true, using_nearest);
218
                cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, true, using_nearest);
219
                cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, true, using_nearest);
220

221
                cfg.compare_function = panfrost_sampler_compare_func(cso);
222
                cfg.seamless_cube_map = cso->seamless_cube_map;
223

224
                cfg.border_color_r = cso->border_color.ui[0];
225
                cfg.border_color_g = cso->border_color.ui[1];
226
                cfg.border_color_b = cso->border_color.ui[2];
227
                cfg.border_color_a = cso->border_color.ui[3];
228
        }
229
}
230

231
static void
232
panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
233
                                   struct mali_bifrost_sampler_packed *hw)
234
{
235
        bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST;
236

237
        pan_pack(hw, BIFROST_SAMPLER, cfg) {
238
                cfg.point_sample_magnify = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
239
                cfg.point_sample_minify = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
240
                cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
241
                cfg.normalized_coordinates = cso->normalized_coords;
242

243
                cfg.lod_bias = FIXED_16(cso->lod_bias, true);
244
                cfg.minimum_lod = FIXED_16(cso->min_lod, false);
245
                cfg.maximum_lod = FIXED_16(cso->max_lod, false);
246

247
                if (cso->max_anisotropy > 1) {
248
                        cfg.maximum_anisotropy = cso->max_anisotropy;
249
                        cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC;
250
                }
251

252
                cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, false, using_nearest);
253
                cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, false, using_nearest);
254
                cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, false, using_nearest);
255

256
                cfg.compare_function = panfrost_sampler_compare_func(cso);
257
                cfg.seamless_cube_map = cso->seamless_cube_map;
258

259
                cfg.border_color_r = cso->border_color.ui[0];
260
                cfg.border_color_g = cso->border_color.ui[1];
261
                cfg.border_color_b = cso->border_color.ui[2];
262
                cfg.border_color_a = cso->border_color.ui[3];
263
        }
264
}
265

266
static void *
267
panfrost_create_sampler_state(
268
        struct pipe_context *pctx,
269
        const struct pipe_sampler_state *cso)
270
{
271
        struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state);
272
        struct panfrost_device *device = pan_device(pctx->screen);
273

274
        so->base = *cso;
275

276
        if (pan_is_bifrost(device))
277
                panfrost_sampler_desc_init_bifrost(cso, (struct mali_bifrost_sampler_packed *) &so->hw);
278
        else
279
                panfrost_sampler_desc_init(cso, &so->hw);
280

281
        return so;
282
}
283

284
static bool
285
panfrost_fs_required(
286
                struct panfrost_shader_state *fs,
287
                struct panfrost_blend_state *blend,
288
                struct pipe_framebuffer_state *state,
289
                const struct panfrost_zsa_state *zsa)
290
{
291
        /* If we generally have side effects. This inclues use of discard,
292
         * which can affect the results of an occlusion query. */
293
        if (fs->info.fs.sidefx)
294
                return true;
295

296
        /* Using an empty FS requires early-z to be enabled, but alpha test
297
         * needs it disabled */
298
        if ((enum mali_func) zsa->base.alpha_func != MALI_FUNC_ALWAYS)
299
                return true;
300

301
        /* If colour is written we need to execute */
302
        for (unsigned i = 0; i < state->nr_cbufs; ++i) {
303
                if (state->cbufs[i] && !blend->info[i].no_colour)
304
                        return true;
305
        }
306

307
        /* If depth is written and not implied we need to execute.
308
         * TODO: Predicate on Z/S writes being enabled */
309
        return (fs->info.fs.writes_depth || fs->info.fs.writes_stencil);
310
}
311

312
static void
313
panfrost_emit_bifrost_blend(struct panfrost_batch *batch,
314
                            mali_ptr *blend_shaders, void *rts)
315
{
316
        unsigned rt_count = batch->key.nr_cbufs;
317
        struct panfrost_context *ctx = batch->ctx;
318
        const struct panfrost_blend_state *so = ctx->blend;
319
        const struct panfrost_device *dev = pan_device(ctx->base.screen);
320
        struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
321

322
        /* Always have at least one render target for depth-only passes */
323
        for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {
324
                /* Disable blending for unbacked render targets */
325
                if (rt_count == 0 || !batch->key.cbufs[i] || so->info[i].no_colour) {
326
                        pan_pack(rts + i * MALI_BLEND_LENGTH, BLEND, cfg) {
327
                                cfg.enable = false;
328
                                cfg.bifrost.internal.mode = MALI_BIFROST_BLEND_MODE_OFF;
329
                        }
330

331
                        continue;
332
                }
333

334
                struct pan_blend_info info = so->info[i];
335
                enum pipe_format format = batch->key.cbufs[i]->format;
336
                const struct util_format_description *format_desc;
337
                unsigned chan_size = 0;
338

339
                format_desc = util_format_description(format);
340

341
                for (unsigned i = 0; i < format_desc->nr_channels; i++)
342
                        chan_size = MAX2(format_desc->channel[0].size, chan_size);
343

344
                /* Fixed point constant */
345
                float constant_f = pan_blend_get_constant(
346
                                info.constant_mask,
347
                                ctx->blend_color.color);
348

349
                u16 constant = constant_f * ((1 << chan_size) - 1);
350
                constant <<= 16 - chan_size;
351

352
                struct mali_blend_packed *packed = rts + (i * MALI_BLEND_LENGTH);
353

354
                /* Word 0: Flags and constant */
355
                pan_pack(packed, BLEND, cfg) {
356
                        cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
357
                        cfg.load_destination = info.load_dest;
358
                        cfg.round_to_fb_precision = !ctx->blend->base.dither;
359
                        cfg.alpha_to_one = ctx->blend->base.alpha_to_one;
360
                        cfg.bifrost.constant = constant;
361
                }
362

363
                if (!blend_shaders[i]) {
364
                        /* Word 1: Blend Equation */
365
                        STATIC_ASSERT(MALI_BLEND_EQUATION_LENGTH == 4);
366
                        packed->opaque[1] = so->equation[i];
367
                }
368

369
                /* Words 2 and 3: Internal blend */
370
                if (blend_shaders[i]) {
371
                        /* The blend shader's address needs to be at
372
                         * the same top 32 bit as the fragment shader.
373
                         * TODO: Ensure that's always the case.
374
                         */
375
                        assert(!fs->bin.bo ||
376
                                        (blend_shaders[i] & (0xffffffffull << 32)) ==
377
                                        (fs->bin.gpu & (0xffffffffull << 32)));
378

379
                        unsigned ret_offset = fs->info.bifrost.blend[i].return_offset;
380
                        assert(!(ret_offset & 0x7));
381

382
                        pan_pack(&packed->opaque[2], BIFROST_INTERNAL_BLEND, cfg) {
383
                                cfg.mode = MALI_BIFROST_BLEND_MODE_SHADER;
384
                                cfg.shader.pc = (u32) blend_shaders[i];
385
                                cfg.shader.return_value = ret_offset ?
386
                                        fs->bin.gpu + ret_offset : 0;
387
                        }
388
                } else {
389
                        pan_pack(&packed->opaque[2], BIFROST_INTERNAL_BLEND, cfg) {
390
                                cfg.mode = info.opaque ?
391
                                        MALI_BIFROST_BLEND_MODE_OPAQUE :
392
                                        MALI_BIFROST_BLEND_MODE_FIXED_FUNCTION;
393

394
                                /* If we want the conversion to work properly,
395
                                 * num_comps must be set to 4
396
                                 */
397
                                cfg.fixed_function.num_comps = 4;
398
                                cfg.fixed_function.conversion.memory_format =
399
                                        panfrost_format_to_bifrost_blend(dev, format);
400
                                cfg.fixed_function.conversion.register_format =
401
                                        fs->info.bifrost.blend[i].format;
402
                                cfg.fixed_function.rt = i;
403
                        }
404
                }
405
        }
406
}
407

408
static void
409
panfrost_emit_midgard_blend(struct panfrost_batch *batch,
410
                            mali_ptr *blend_shaders, void *rts)
411
{
412
        unsigned rt_count = batch->key.nr_cbufs;
413
        struct panfrost_context *ctx = batch->ctx;
414
        const struct panfrost_blend_state *so = ctx->blend;
415

416
        /* Always have at least one render target for depth-only passes */
417
        for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {
418
                struct mali_blend_packed *packed = rts + (i * MALI_BLEND_LENGTH);
419

420
                /* Disable blending for unbacked render targets */
421
                if (rt_count == 0 || !batch->key.cbufs[i] || so->info[i].no_colour) {
422
                        pan_pack(packed, BLEND, cfg) {
423
                                cfg.enable = false;
424
                        }
425

426
                        continue;
427
                }
428

429
                pan_pack(packed, BLEND, cfg) {
430
                        struct pan_blend_info info = so->info[i];
431

432
                        cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
433
                        cfg.load_destination = info.load_dest;
434
                        cfg.round_to_fb_precision = !ctx->blend->base.dither;
435
                        cfg.alpha_to_one = ctx->blend->base.alpha_to_one;
436
                        cfg.midgard.blend_shader = (blend_shaders[i] != 0);
437
                        if (blend_shaders[i]) {
438
                                cfg.midgard.shader_pc = blend_shaders[i];
439
                        } else {
440
                                cfg.midgard.constant = pan_blend_get_constant(
441
                                                info.constant_mask,
442
                                                ctx->blend_color.color);
443
                        }
444
                }
445

446
                if (!blend_shaders[i]) {
447
                        /* Word 2: Blend Equation */
448
                        STATIC_ASSERT(MALI_BLEND_EQUATION_LENGTH == 4);
449
                        packed->opaque[2] = so->equation[i];
450
                }
451
        }
452
}
453

454
static void
455
panfrost_emit_blend(struct panfrost_batch *batch, void *rts, mali_ptr *blend_shaders)
456
{
457
        const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
458
        struct panfrost_blend_state *so = batch->ctx->blend;
459

460
        if (pan_is_bifrost(dev))
461
                panfrost_emit_bifrost_blend(batch, blend_shaders, rts);
462
        else
463
                panfrost_emit_midgard_blend(batch, blend_shaders, rts);
464

465
        for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
466
                if (!so->info[i].no_colour && batch->key.cbufs[i]) {
467
                        batch->draws |= (PIPE_CLEAR_COLOR0 << i);
468
                        batch->resolve |= (PIPE_CLEAR_COLOR0 << i);
469
                }
470
        }
471
}
472

473
/* Construct a partial RSD corresponding to no executed fragment shader, and
474
 * merge with the existing partial RSD. This depends only on the architecture,
475
 * so packing separately allows the packs to be constant folded away. */
476

477
static void
478
pan_merge_empty_fs(struct mali_renderer_state_packed *rsd, bool is_bifrost)
479
{
480
        struct mali_renderer_state_packed empty_rsd;
481

482
        if (is_bifrost) {
483
                pan_pack(&empty_rsd, RENDERER_STATE, cfg) {
484
                        cfg.properties.bifrost.shader_modifies_coverage = true;
485
                        cfg.properties.bifrost.allow_forward_pixel_to_kill = true;
486
                        cfg.properties.bifrost.allow_forward_pixel_to_be_killed = true;
487
                        cfg.properties.bifrost.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
488
                }
489
        } else {
490
                pan_pack(&empty_rsd, RENDERER_STATE, cfg) {
491
                        cfg.shader.shader = 0x1;
492
                        cfg.properties.midgard.work_register_count = 1;
493
                        cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
494
                        cfg.properties.midgard.force_early_z = true;
495
                }
496
        }
497

498
        pan_merge((*rsd), empty_rsd, RENDERER_STATE);
499
}
500

501
/* Get the last blend shader, for an erratum workaround */
502

503
static mali_ptr
504
panfrost_last_nonnull(mali_ptr *ptrs, unsigned count)
505
{
506
        for (signed i = ((signed) count - 1); i >= 0; --i) {
507
                if (ptrs[i])
508
                        return ptrs[i];
509
        }
510

511
        return 0;
512
}
513

514
static void
515
panfrost_prepare_fs_state(struct panfrost_context *ctx,
516
                          mali_ptr *blend_shaders,
517
                          struct mali_renderer_state_packed *rsd)
518
{
519
        const struct panfrost_device *dev = pan_device(ctx->base.screen);
520
        struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
521
        const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
522
        struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
523
        struct panfrost_blend_state *so = ctx->blend;
524
        bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
525
        bool msaa = rast->multisample;
526

527
        pan_pack(rsd, RENDERER_STATE, cfg) {
528
                if (pan_is_bifrost(dev) && panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) {
529
                        /* Track if any colour buffer is reused across draws, either
530
                         * from reading it directly, or from failing to write it */
531
                        unsigned rt_mask = ctx->fb_rt_mask;
532
                        uint64_t rt_written = (fs->info.outputs_written >> FRAG_RESULT_DATA0);
533
                        bool blend_reads_dest = (so->load_dest_mask & rt_mask);
534

535
                        cfg.properties.bifrost.allow_forward_pixel_to_kill =
536
                                fs->info.fs.can_fpk &&
537
                                !(rt_mask & ~rt_written) &&
538
                                !alpha_to_coverage &&
539
                                !blend_reads_dest;
540
                } else if (!pan_is_bifrost(dev)) {
541
                        unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
542

543
                        if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) {
544
                                cfg.properties.midgard.force_early_z =
545
                                        fs->info.fs.can_early_z && !alpha_to_coverage &&
546
                                        ((enum mali_func) zsa->base.alpha_func == MALI_FUNC_ALWAYS);
547

548
                                bool has_blend_shader = false;
549

550
                                for (unsigned c = 0; c < rt_count; ++c)
551
                                        has_blend_shader |= (blend_shaders[c] != 0);
552

553
                                /* TODO: Reduce this limit? */
554
                                if (has_blend_shader)
555
                                        cfg.properties.midgard.work_register_count = MAX2(fs->info.work_reg_count, 8);
556
                                else
557
                                        cfg.properties.midgard.work_register_count = fs->info.work_reg_count;
558

559
                                /* Hardware quirks around early-zs forcing
560
                                 * without a depth buffer. Note this breaks
561
                                 * occlusion queries. */
562
                                bool has_oq = ctx->occlusion_query && ctx->active_queries;
563
                                bool force_ez_with_discard = !zsa->enabled && !has_oq;
564

565
                                cfg.properties.midgard.shader_reads_tilebuffer =
566
                                        force_ez_with_discard && fs->info.fs.can_discard;
567
                                cfg.properties.midgard.shader_contains_discard =
568
                                        !force_ez_with_discard && fs->info.fs.can_discard;
569
                        }
570

571
                        if (dev->quirks & MIDGARD_SFBD && rt_count > 0) {
572
                                cfg.multisample_misc.sfbd_load_destination = so->info[0].load_dest;
573
                                cfg.multisample_misc.sfbd_blend_shader = (blend_shaders[0] != 0);
574
                                cfg.stencil_mask_misc.sfbd_write_enable = !so->info[0].no_colour;
575
                                cfg.stencil_mask_misc.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
576
                                cfg.stencil_mask_misc.sfbd_dither_disable = !so->base.dither;
577
                                cfg.stencil_mask_misc.sfbd_alpha_to_one = so->base.alpha_to_one;
578

579
                                if (blend_shaders[0]) {
580
                                        cfg.sfbd_blend_shader = blend_shaders[0];
581
                                } else {
582
                                        cfg.sfbd_blend_constant = pan_blend_get_constant(
583
                                                        so->info[0].constant_mask,
584
                                                        ctx->blend_color.color);
585
                                }
586
                        } else if (dev->quirks & MIDGARD_SFBD) {
587
                                /* If there is no colour buffer, leaving fields default is
588
                                 * fine, except for blending which is nonnullable */
589
                                cfg.sfbd_blend_equation.color_mask = 0xf;
590
                                cfg.sfbd_blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
591
                                cfg.sfbd_blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
592
                                cfg.sfbd_blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
593
                                cfg.sfbd_blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
594
                                cfg.sfbd_blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
595
                                cfg.sfbd_blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
596
                        } else {
597
                                /* Workaround on v5 */
598
                                cfg.sfbd_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count);
599
                        }
600
                }
601

602
                cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF;
603

604
                cfg.multisample_misc.evaluate_per_sample =
605
                        msaa && (ctx->min_samples > 1);
606

607
                cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;
608
                cfg.depth_units = rast->offset_units * 2.0f;
609
                cfg.depth_factor = rast->offset_scale;
610

611
                bool back_enab = zsa->base.stencil[1].enabled;
612
                cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0];
613
                cfg.stencil_back.reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
614

615
                /* v6+ fits register preload here, no alpha testing */
616
                if (dev->arch <= 5)
617
                        cfg.alpha_reference = zsa->base.alpha_ref_value;
618
        }
619
}
620

621
static void
622
panfrost_emit_frag_shader(struct panfrost_context *ctx,
623
                          struct mali_renderer_state_packed *fragmeta,
624
                          mali_ptr *blend_shaders)
625
{
626
        struct panfrost_device *dev = pan_device(ctx->base.screen);
627
        const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
628
        const struct panfrost_rasterizer *rast = ctx->rasterizer;
629
        struct panfrost_shader_state *fs =
630
                panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
631

632
        /* We need to merge several several partial renderer state descriptors,
633
         * so stage to temporary storage rather than reading back write-combine
634
         * memory, which will trash performance. */
635
        struct mali_renderer_state_packed rsd;
636
        panfrost_prepare_fs_state(ctx, blend_shaders, &rsd);
637

638
        if ((dev->quirks & MIDGARD_SFBD)
639
                        && ctx->pipe_framebuffer.nr_cbufs > 0
640
                        && !blend_shaders[0]) {
641

642
                /* Word 14: SFBD Blend Equation */
643
                STATIC_ASSERT(MALI_BLEND_EQUATION_LENGTH == 4);
644
                rsd.opaque[14] = ctx->blend->equation[0];
645
        }
646

647
        /* Merge with CSO state and upload */
648
        if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa))
649
                pan_merge(rsd, fs->partial_rsd, RENDERER_STATE);
650
        else
651
                pan_merge_empty_fs(&rsd, pan_is_bifrost(dev));
652

653
        /* Word 8, 9 Misc state */
654
        rsd.opaque[8] |= zsa->rsd_depth.opaque[0]
655
                       | rast->multisample.opaque[0];
656

657
        rsd.opaque[9] |= zsa->rsd_stencil.opaque[0]
658
                       | rast->stencil_misc.opaque[0];
659

660
        /* Word 10, 11 Stencil Front and Back */
661
        rsd.opaque[10] |= zsa->stencil_front.opaque[0];
662
        rsd.opaque[11] |= zsa->stencil_back.opaque[0];
663

664
        memcpy(fragmeta, &rsd, sizeof(rsd));
665
}
666

667
static mali_ptr
668
panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
669
{
670
        struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
671

672
        panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX);
673
        panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX);
674

675
        return ss->state.gpu;
676
}
677

678
static mali_ptr
679
panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
680
{
681
        struct panfrost_context *ctx = batch->ctx;
682
        struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
683

684
        panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT);
685

686
        struct panfrost_device *dev = pan_device(ctx->base.screen);
687
        unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
688
        struct panfrost_ptr xfer;
689

690
        if (dev->quirks & MIDGARD_SFBD) {
691
                xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE);
692
        } else {
693
                xfer = pan_pool_alloc_desc_aggregate(&batch->pool.base,
694
                                                     PAN_DESC(RENDERER_STATE),
695
                                                     PAN_DESC_ARRAY(rt_count, BLEND));
696
        }
697

698
        mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS];
699
        unsigned shader_offset = 0;
700
        struct panfrost_bo *shader_bo = NULL;
701

702
        for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c) {
703
                if (ctx->pipe_framebuffer.cbufs[c]) {
704
                        blend_shaders[c] = panfrost_get_blend(batch,
705
                                        c, &shader_bo, &shader_offset);
706
                }
707
        }
708

709
        panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *) xfer.cpu, blend_shaders);
710

711
        if (!(dev->quirks & MIDGARD_SFBD))
712
                panfrost_emit_blend(batch, xfer.cpu + MALI_RENDERER_STATE_LENGTH, blend_shaders);
713
        else {
714
                batch->draws |= PIPE_CLEAR_COLOR0;
715
                batch->resolve |= PIPE_CLEAR_COLOR0;
716
        }
717

718
        if (ctx->depth_stencil->base.depth_enabled)
719
                batch->read |= PIPE_CLEAR_DEPTH;
720

721
        if (ctx->depth_stencil->base.stencil[0].enabled)
722
                batch->read |= PIPE_CLEAR_STENCIL;
723

724
        return xfer.gpu;
725
}
726

727
static mali_ptr
728
panfrost_emit_viewport(struct panfrost_batch *batch)
729
{
730
        struct panfrost_context *ctx = batch->ctx;
731
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
732
        const struct pipe_scissor_state *ss = &ctx->scissor;
733
        const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
734

735
        /* Derive min/max from translate/scale. Note since |x| >= 0 by
736
         * definition, we have that -|x| <= |x| hence translate - |scale| <=
737
         * translate + |scale|, so the ordering is correct here. */
738
        float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);
739
        float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);
740
        float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);
741
        float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);
742
        float minz = (vp->translate[2] - fabsf(vp->scale[2]));
743
        float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
744

745
        /* Scissor to the intersection of viewport and to the scissor, clamped
746
         * to the framebuffer */
747

748
        unsigned minx = MIN2(batch->key.width, MAX2((int) vp_minx, 0));
749
        unsigned maxx = MIN2(batch->key.width, MAX2((int) vp_maxx, 0));
750
        unsigned miny = MIN2(batch->key.height, MAX2((int) vp_miny, 0));
751
        unsigned maxy = MIN2(batch->key.height, MAX2((int) vp_maxy, 0));
752

753
        if (ss && rast->scissor) {
754
                minx = MAX2(ss->minx, minx);
755
                miny = MAX2(ss->miny, miny);
756
                maxx = MIN2(ss->maxx, maxx);
757
                maxy = MIN2(ss->maxy, maxy);
758
        }
759

760
        /* Set the range to [1, 1) so max values don't wrap round */
761
        if (maxx == 0 || maxy == 0)
762
                maxx = maxy = minx = miny = 1;
763

764
        struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT);
765

766
        pan_pack(T.cpu, VIEWPORT, cfg) {
767
                /* [minx, maxx) and [miny, maxy) are exclusive ranges, but
768
                 * these are inclusive */
769
                cfg.scissor_minimum_x = minx;
770
                cfg.scissor_minimum_y = miny;
771
                cfg.scissor_maximum_x = maxx - 1;
772
                cfg.scissor_maximum_y = maxy - 1;
773

774
                cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
775
                cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
776
        }
777

778
        panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
779
        batch->scissor_culls_everything = (minx >= maxx || miny >= maxy);
780

781
        return T.gpu;
782
}
783

784
static mali_ptr
785
panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
786
                                 enum pipe_shader_type st,
787
                                 struct panfrost_constant_buffer *buf,
788
                                 unsigned index)
789
{
790
        struct pipe_constant_buffer *cb = &buf->cb[index];
791
        struct panfrost_resource *rsrc = pan_resource(cb->buffer);
792

793
        if (rsrc) {
794
                panfrost_batch_read_rsrc(batch, rsrc, st);
795

796
                /* Alignment gauranteed by
797
                 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
798
                return rsrc->image.data.bo->ptr.gpu + cb->buffer_offset;
799
        } else if (cb->user_buffer) {
800
                return pan_pool_upload_aligned(&batch->pool.base,
801
                                               cb->user_buffer +
802
                                               cb->buffer_offset,
803
                                               cb->buffer_size, 16);
804
        } else {
805
                unreachable("No constant buffer");
806
        }
807
}
808

809
/* Storage for a single sysval slot. All members alias the same bytes, so a
 * value can be written through whichever view matches its type. */
struct sysval_uniform {
        union {
                uint64_t du[2];  /* two 64-bit words (e.g. GPU addresses) */
                uint32_t u[4];   /* four unsigned 32-bit words */
                int32_t i[4];    /* four signed 32-bit words */
                float f[4];      /* four floats */
        };
};
817

818
static void
819
panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
820
                                      struct sysval_uniform *uniform)
821
{
822
        struct panfrost_context *ctx = batch->ctx;
823
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
824

825
        uniform->f[0] = vp->scale[0];
826
        uniform->f[1] = vp->scale[1];
827
        uniform->f[2] = vp->scale[2];
828
}
829

830
static void
831
panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
832
                                       struct sysval_uniform *uniform)
833
{
834
        struct panfrost_context *ctx = batch->ctx;
835
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
836

837
        uniform->f[0] = vp->translate[0];
838
        uniform->f[1] = vp->translate[1];
839
        uniform->f[2] = vp->translate[2];
840
}
841

842
static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
843
                                       enum pipe_shader_type st,
844
                                       unsigned int sysvalid,
845
                                       struct sysval_uniform *uniform)
846
{
847
        struct panfrost_context *ctx = batch->ctx;
848
        unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
849
        unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
850
        bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
851
        struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
852

853
        assert(dim);
854

855
        if (tex->target == PIPE_BUFFER) {
856
                assert(dim == 1);
857
                uniform->i[0] =
858
                        tex->u.buf.size / util_format_get_blocksize(tex->format);
859
                return;
860
        }
861

862
        uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
863

864
        if (dim > 1)
865
                uniform->i[1] = u_minify(tex->texture->height0,
866
                                         tex->u.tex.first_level);
867

868
        if (dim > 2)
869
                uniform->i[2] = u_minify(tex->texture->depth0,
870
                                         tex->u.tex.first_level);
871

872
        if (is_array)
873
                uniform->i[dim] = tex->texture->array_size;
874
}
875

876
static void panfrost_upload_image_size_sysval(struct panfrost_batch *batch,
877
                                              enum pipe_shader_type st,
878
                                              unsigned int sysvalid,
879
                                              struct sysval_uniform *uniform)
880
{
881
        struct panfrost_context *ctx = batch->ctx;
882
        unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
883
        unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
884
        unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
885

886
        assert(dim && dim < 4);
887

888
        struct pipe_image_view *image = &ctx->images[st][idx];
889

890
        if (image->resource->target == PIPE_BUFFER) {
891
                unsigned blocksize = util_format_get_blocksize(image->format);
892
                uniform->i[0] = image->resource->width0 / blocksize;
893
                return;
894
        }
895

896
        uniform->i[0] = u_minify(image->resource->width0,
897
                                 image->u.tex.level);
898

899
        if (dim > 1)
900
                uniform->i[1] = u_minify(image->resource->height0,
901
                                         image->u.tex.level);
902

903
        if (dim > 2)
904
                uniform->i[2] = u_minify(image->resource->depth0,
905
                                         image->u.tex.level);
906

907
        if (is_array)
908
                uniform->i[dim] = image->resource->array_size;
909
}
910

911
static void
912
panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
913
                            enum pipe_shader_type st,
914
                            unsigned ssbo_id,
915
                            struct sysval_uniform *uniform)
916
{
917
        struct panfrost_context *ctx = batch->ctx;
918

919
        assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
920
        struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
921

922
        /* Compute address */
923
        struct panfrost_resource *rsrc = pan_resource(sb.buffer);
924
        struct panfrost_bo *bo = rsrc->image.data.bo;
925

926
        panfrost_batch_write_rsrc(batch, rsrc, st);
927

928
        util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
929
                        sb.buffer_offset, sb.buffer_size);
930

931
        /* Upload address and size as sysval */
932
        uniform->du[0] = bo->ptr.gpu + sb.buffer_offset;
933
        uniform->u[2] = sb.buffer_size;
934
}
935

936
static void
937
panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
938
                               enum pipe_shader_type st,
939
                               unsigned samp_idx,
940
                               struct sysval_uniform *uniform)
941
{
942
        struct panfrost_context *ctx = batch->ctx;
943
        struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
944

945
        uniform->f[0] = sampl->min_lod;
946
        uniform->f[1] = sampl->max_lod;
947
        uniform->f[2] = sampl->lod_bias;
948

949
        /* Even without any errata, Midgard represents "no mipmapping" as
950
         * fixing the LOD with the clamps; keep behaviour consistent. c.f.
951
         * panfrost_create_sampler_state which also explains our choice of
952
         * epsilon value (again to keep behaviour consistent) */
953

954
        if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
955
                uniform->f[1] = uniform->f[0] + (1.0/256.0);
956
}
957

958
static void
959
panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
960
                                       struct sysval_uniform *uniform)
961
{
962
        struct panfrost_context *ctx = batch->ctx;
963

964
        uniform->u[0] = ctx->compute_grid->grid[0];
965
        uniform->u[1] = ctx->compute_grid->grid[1];
966
        uniform->u[2] = ctx->compute_grid->grid[2];
967
}
968

969
static void
970
panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch,
971
                                        struct sysval_uniform *uniform)
972
{
973
        struct panfrost_context *ctx = batch->ctx;
974

975
        uniform->u[0] = ctx->compute_grid->block[0];
976
        uniform->u[1] = ctx->compute_grid->block[1];
977
        uniform->u[2] = ctx->compute_grid->block[2];
978
}
979

980
static void
981
panfrost_upload_work_dim_sysval(struct panfrost_batch *batch,
982
                                struct sysval_uniform *uniform)
983
{
984
        struct panfrost_context *ctx = batch->ctx;
985

986
        uniform->u[0] = ctx->compute_grid->work_dim;
987
}
988

989
/* Sample positions are pushed in a Bifrost specific format on Bifrost. On
990
 * Midgard, we emulate the Bifrost path with some extra arithmetic in the
991
 * shader, to keep the code as unified as possible. */
992

993
static void
994
panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch,
995
                                struct sysval_uniform *uniform)
996
{
997
        struct panfrost_context *ctx = batch->ctx;
998
        struct panfrost_device *dev = pan_device(ctx->base.screen);
999

1000
        unsigned samples = util_framebuffer_get_num_samples(&batch->key);
1001
        uniform->du[0] = panfrost_sample_positions(dev, panfrost_sample_pattern(samples));
1002
}
1003

1004
static void
1005
panfrost_upload_multisampled_sysval(struct panfrost_batch *batch,
1006
                                struct sysval_uniform *uniform)
1007
{
1008
        unsigned samples = util_framebuffer_get_num_samples(&batch->key);
1009
        uniform->u[0] = samples > 1;
1010
}
1011

1012
static void
1013
panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch,
1014
                unsigned size_and_rt, struct sysval_uniform *uniform)
1015
{
1016
        struct panfrost_context *ctx = batch->ctx;
1017
        struct panfrost_device *dev = pan_device(ctx->base.screen);
1018
        unsigned rt = size_and_rt & 0xF;
1019
        unsigned size = size_and_rt >> 4;
1020

1021
        if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) {
1022
                enum pipe_format format = batch->key.cbufs[rt]->format;
1023
                uniform->u[0] =
1024
                        pan_blend_get_bifrost_desc(dev, format, rt, size) >> 32;
1025
        } else {
1026
                pan_pack(&uniform->u[0], BIFROST_INTERNAL_CONVERSION, cfg)
1027
                        cfg.memory_format = dev->formats[PIPE_FORMAT_NONE].hw;
1028
        }
1029
}
1030

1031
void
1032
panfrost_analyze_sysvals(struct panfrost_shader_state *ss)
1033
{
1034
        unsigned dirty = 0;
1035
        unsigned dirty_shader =
1036
                PAN_DIRTY_STAGE_RENDERER | PAN_DIRTY_STAGE_CONST;
1037

1038
        for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) {
1039
                switch (PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[i])) {
1040
                case PAN_SYSVAL_VIEWPORT_SCALE:
1041
                case PAN_SYSVAL_VIEWPORT_OFFSET:
1042
                        dirty |= PAN_DIRTY_VIEWPORT;
1043
                        break;
1044

1045
                case PAN_SYSVAL_TEXTURE_SIZE:
1046
                        dirty_shader |= PAN_DIRTY_STAGE_TEXTURE;
1047
                        break;
1048

1049
                case PAN_SYSVAL_SSBO:
1050
                        dirty_shader |= PAN_DIRTY_STAGE_SSBO;
1051
                        break;
1052

1053
                case PAN_SYSVAL_SAMPLER:
1054
                        dirty_shader |= PAN_DIRTY_STAGE_SAMPLER;
1055
                        break;
1056

1057
                case PAN_SYSVAL_IMAGE_SIZE:
1058
                        dirty_shader |= PAN_DIRTY_STAGE_IMAGE;
1059
                        break;
1060

1061
                case PAN_SYSVAL_NUM_WORK_GROUPS:
1062
                case PAN_SYSVAL_LOCAL_GROUP_SIZE:
1063
                case PAN_SYSVAL_WORK_DIM:
1064
                case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
1065
                        dirty |= PAN_DIRTY_PARAMS;
1066
                        break;
1067

1068
                case PAN_SYSVAL_DRAWID:
1069
                        dirty |= PAN_DIRTY_DRAWID;
1070
                        break;
1071

1072
                case PAN_SYSVAL_SAMPLE_POSITIONS:
1073
                case PAN_SYSVAL_MULTISAMPLED:
1074
                case PAN_SYSVAL_RT_CONVERSION:
1075
                        /* Nothing beyond the batch itself */
1076
                        break;
1077
                default:
1078
                        unreachable("Invalid sysval");
1079
                }
1080
        }
1081

1082
        ss->dirty_3d = dirty;
1083
        ss->dirty_shader = dirty_shader;
1084
}
1085

1086
static void
1087
panfrost_upload_sysvals(struct panfrost_batch *batch,
1088
                        const struct panfrost_ptr *ptr,
1089
                        struct panfrost_shader_state *ss,
1090
                        enum pipe_shader_type st)
1091
{
1092
        struct sysval_uniform *uniforms = ptr->cpu;
1093

1094
        for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) {
1095
                int sysval = ss->info.sysvals.sysvals[i];
1096

1097
                switch (PAN_SYSVAL_TYPE(sysval)) {
1098
                case PAN_SYSVAL_VIEWPORT_SCALE:
1099
                        panfrost_upload_viewport_scale_sysval(batch,
1100
                                                              &uniforms[i]);
1101
                        break;
1102
                case PAN_SYSVAL_VIEWPORT_OFFSET:
1103
                        panfrost_upload_viewport_offset_sysval(batch,
1104
                                                               &uniforms[i]);
1105
                        break;
1106
                case PAN_SYSVAL_TEXTURE_SIZE:
1107
                        panfrost_upload_txs_sysval(batch, st,
1108
                                                   PAN_SYSVAL_ID(sysval),
1109
                                                   &uniforms[i]);
1110
                        break;
1111
                case PAN_SYSVAL_SSBO:
1112
                        panfrost_upload_ssbo_sysval(batch, st,
1113
                                                    PAN_SYSVAL_ID(sysval),
1114
                                                    &uniforms[i]);
1115
                        break;
1116
                case PAN_SYSVAL_NUM_WORK_GROUPS:
1117
                        for (unsigned j = 0; j < 3; j++) {
1118
                                batch->num_wg_sysval[j] =
1119
                                        ptr->gpu + (i * sizeof(*uniforms)) + (j * 4);
1120
                        }
1121
                        panfrost_upload_num_work_groups_sysval(batch,
1122
                                                               &uniforms[i]);
1123
                        break;
1124
                case PAN_SYSVAL_LOCAL_GROUP_SIZE:
1125
                        panfrost_upload_local_group_size_sysval(batch,
1126
                                                                &uniforms[i]);
1127
                        break;
1128
                case PAN_SYSVAL_WORK_DIM:
1129
                        panfrost_upload_work_dim_sysval(batch,
1130
                                                        &uniforms[i]);
1131
                        break;
1132
                case PAN_SYSVAL_SAMPLER:
1133
                        panfrost_upload_sampler_sysval(batch, st,
1134
                                                       PAN_SYSVAL_ID(sysval),
1135
                                                       &uniforms[i]);
1136
                        break;
1137
                case PAN_SYSVAL_IMAGE_SIZE:
1138
                        panfrost_upload_image_size_sysval(batch, st,
1139
                                                          PAN_SYSVAL_ID(sysval),
1140
                                                          &uniforms[i]);
1141
                        break;
1142
                case PAN_SYSVAL_SAMPLE_POSITIONS:
1143
                        panfrost_upload_sample_positions_sysval(batch,
1144
                                                        &uniforms[i]);
1145
                        break;
1146
                case PAN_SYSVAL_MULTISAMPLED:
1147
                        panfrost_upload_multisampled_sysval(batch,
1148
                                                               &uniforms[i]);
1149
                        break;
1150
                case PAN_SYSVAL_RT_CONVERSION:
1151
                        panfrost_upload_rt_conversion_sysval(batch,
1152
                                        PAN_SYSVAL_ID(sysval), &uniforms[i]);
1153
                        break;
1154
                case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
1155
                        batch->ctx->first_vertex_sysval_ptr =
1156
                                ptr->gpu + (i * sizeof(*uniforms));
1157
                        batch->ctx->base_vertex_sysval_ptr =
1158
                                batch->ctx->first_vertex_sysval_ptr + 4;
1159
                        batch->ctx->base_instance_sysval_ptr =
1160
                                batch->ctx->first_vertex_sysval_ptr + 8;
1161

1162
                        uniforms[i].u[0] = batch->ctx->offset_start;
1163
                        uniforms[i].u[1] = batch->ctx->base_vertex;
1164
                        uniforms[i].u[2] = batch->ctx->base_instance;
1165
                        break;
1166
                case PAN_SYSVAL_DRAWID:
1167
                        uniforms[i].u[0] = batch->ctx->drawid;
1168
                        break;
1169
                default:
1170
                        assert(0);
1171
                }
1172
        }
1173
}
1174

1175
static const void *
1176
panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx,
1177
                                 struct panfrost_constant_buffer *buf,
1178
                                 unsigned index)
1179
{
1180
        struct pipe_constant_buffer *cb = &buf->cb[index];
1181
        struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1182

1183
        if (rsrc) {
1184
                panfrost_bo_mmap(rsrc->image.data.bo);
1185
                panfrost_flush_writer(ctx, rsrc);
1186
                panfrost_bo_wait(rsrc->image.data.bo, INT64_MAX, false);
1187

1188
                return rsrc->image.data.bo->ptr.cpu + cb->buffer_offset;
1189
        } else if (cb->user_buffer) {
1190
                return cb->user_buffer + cb->buffer_offset;
1191
        } else
1192
                unreachable("No constant buffer");
1193
}
1194

1195
static mali_ptr
1196
panfrost_emit_const_buf(struct panfrost_batch *batch,
1197
                        enum pipe_shader_type stage,
1198
                        mali_ptr *push_constants)
1199
{
1200
        struct panfrost_context *ctx = batch->ctx;
1201
        struct panfrost_shader_variants *all = ctx->shader[stage];
1202

1203
        if (!all)
1204
                return 0;
1205

1206
        struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1207
        struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1208

1209
        /* Allocate room for the sysval and the uniforms */
1210
        size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count;
1211
        struct panfrost_ptr transfer =
1212
                pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16);
1213

1214
        /* Upload sysvals requested by the shader */
1215
        panfrost_upload_sysvals(batch, &transfer, ss, stage);
1216

1217
        /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
1218
        struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage);
1219
        unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0);
1220
        unsigned sysval_ubo = sys_size ? ubo_count : ~0;
1221

1222
        struct panfrost_ptr ubos =
1223
                pan_pool_alloc_desc_array(&batch->pool.base,
1224
                                          ubo_count + 1,
1225
                                          UNIFORM_BUFFER);
1226

1227
        uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1228

1229
        /* Upload sysval as a final UBO */
1230

1231
        if (sys_size) {
1232
                pan_pack(ubo_ptr + ubo_count, UNIFORM_BUFFER, cfg) {
1233
                        cfg.entries = DIV_ROUND_UP(sys_size, 16);
1234
                        cfg.pointer = transfer.gpu;
1235
                }
1236
        }
1237

1238
        /* The rest are honest-to-goodness UBOs */
1239

1240
        u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) {
1241
                size_t usz = buf->cb[ubo].buffer_size;
1242

1243
                if (usz == 0) {
1244
                        ubo_ptr[ubo] = 0;
1245
                        continue;
1246
                }
1247

1248
                /* Issue (57) for the ARB_uniform_buffer_object spec says that
1249
                 * the buffer can be larger than the uniform data inside it,
1250
                 * so clamp ubo size to what hardware supports. */
1251

1252
                pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1253
                        cfg.entries = MIN2(DIV_ROUND_UP(usz, 16), 1 << 12);
1254
                        cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1255
                                        stage, buf, ubo);
1256
                }
1257
        }
1258

1259
        if (ss->info.push.count == 0)
1260
                return ubos.gpu;
1261

1262
        /* Copy push constants required by the shader */
1263
        struct panfrost_ptr push_transfer =
1264
                pan_pool_alloc_aligned(&batch->pool.base,
1265
                                       ss->info.push.count * 4, 16);
1266

1267
        uint32_t *push_cpu = (uint32_t *) push_transfer.cpu;
1268
        *push_constants = push_transfer.gpu;
1269

1270
        for (unsigned i = 0; i < ss->info.push.count; ++i) {
1271
                struct panfrost_ubo_word src = ss->info.push.words[i];
1272

1273
                if (src.ubo == sysval_ubo) {
1274
                        unsigned sysval_idx = src.offset / 16;
1275
                        unsigned sysval_comp = (src.offset % 16) / 4;
1276
                        unsigned sysval_type = PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[sysval_idx]);
1277
                        mali_ptr ptr = push_transfer.gpu + (4 * i);
1278

1279
                        switch (sysval_type) {
1280
                        case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
1281
                                switch (sysval_comp) {
1282
                                case 0:
1283
                                        batch->ctx->first_vertex_sysval_ptr = ptr;
1284
                                        break;
1285
                                case 1:
1286
                                        batch->ctx->base_vertex_sysval_ptr = ptr;
1287
                                        break;
1288
                                case 2:
1289
                                        batch->ctx->base_instance_sysval_ptr = ptr;
1290
                                        break;
1291
                                case 3:
1292
                                        /* Spurious (Midgard doesn't pack) */
1293
                                        break;
1294
                                default:
1295
                                        unreachable("Invalid vertex/instance offset component\n");
1296
                                }
1297
                                break;
1298

1299
                        case PAN_SYSVAL_NUM_WORK_GROUPS:
1300
                                batch->num_wg_sysval[sysval_comp] = ptr;
1301
                                break;
1302

1303
                        default:
1304
                                break;
1305
                        }
1306
                }
1307
                /* Map the UBO, this should be cheap. However this is reading
1308
                 * from write-combine memory which is _very_ slow. It might pay
1309
                 * off to upload sysvals to a staging buffer on the CPU on the
1310
                 * assumption sysvals will get pushed (TODO) */
1311

1312
                const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu :
1313
                        panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo);
1314

1315
                /* TODO: Is there any benefit to combining ranges */
1316
                memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4);
1317
        }
1318

1319
        return ubos.gpu;
1320
}
1321

1322
static mali_ptr
1323
panfrost_emit_shared_memory(struct panfrost_batch *batch,
1324
                            const struct pipe_grid_info *info)
1325
{
1326
        struct panfrost_context *ctx = batch->ctx;
1327
        struct panfrost_device *dev = pan_device(ctx->base.screen);
1328
        struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1329
        struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1330
        struct panfrost_ptr t =
1331
                pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
1332

1333
        pan_pack(t.cpu, LOCAL_STORAGE, ls) {
1334
                unsigned wls_single_size =
1335
                        util_next_power_of_two(MAX2(ss->info.wls_size, 128));
1336

1337
                if (ss->info.wls_size) {
1338
                        ls.wls_instances =
1339
                                util_next_power_of_two(info->grid[0]) *
1340
                                util_next_power_of_two(info->grid[1]) *
1341
                                util_next_power_of_two(info->grid[2]);
1342

1343
                        ls.wls_size_scale = util_logbase2(wls_single_size) + 1;
1344

1345
                        unsigned wls_size = wls_single_size * ls.wls_instances * dev->core_count;
1346

1347
                        ls.wls_base_pointer =
1348
                                (panfrost_batch_get_shared_memory(batch,
1349
                                                                  wls_size,
1350
                                                                  1))->ptr.gpu;
1351
                } else {
1352
                        ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
1353
                }
1354

1355
                if (ss->info.tls_size) {
1356
                        unsigned shift =
1357
                                panfrost_get_stack_shift(ss->info.tls_size);
1358
                        struct panfrost_bo *bo =
1359
                                panfrost_batch_get_scratchpad(batch,
1360
                                                              ss->info.tls_size,
1361
                                                              dev->thread_tls_alloc,
1362
                                                              dev->core_count);
1363

1364
                        ls.tls_size = shift;
1365
                        ls.tls_base_pointer = bo->ptr.gpu;
1366
                }
1367
        };
1368

1369
        return t.gpu;
1370
}
1371

1372
static mali_ptr
1373
panfrost_get_tex_desc(struct panfrost_batch *batch,
1374
                      enum pipe_shader_type st,
1375
                      struct panfrost_sampler_view *view)
1376
{
1377
        if (!view)
1378
                return (mali_ptr) 0;
1379

1380
        struct pipe_sampler_view *pview = &view->base;
1381
        struct panfrost_resource *rsrc = pan_resource(pview->texture);
1382

1383
        panfrost_batch_read_rsrc(batch, rsrc, st);
1384
        panfrost_batch_add_bo(batch, view->state.bo, st);
1385

1386
        return view->state.gpu;
1387
}
1388

1389
static void
1390
panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1391
                             struct pipe_context *pctx)
1392
{
1393
        struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1394
        if (view->texture_bo != rsrc->image.data.bo->ptr.gpu ||
1395
            view->modifier != rsrc->image.layout.modifier) {
1396
                panfrost_bo_unreference(view->state.bo);
1397
                panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1398
        }
1399
}
1400

1401
static mali_ptr
1402
panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1403
                                  enum pipe_shader_type stage)
1404
{
1405
        struct panfrost_context *ctx = batch->ctx;
1406
        struct panfrost_device *device = pan_device(ctx->base.screen);
1407

1408
        if (!ctx->sampler_view_count[stage])
1409
                return 0;
1410

1411
        if (pan_is_bifrost(device)) {
1412
                struct panfrost_ptr T =
1413
                        pan_pool_alloc_desc_array(&batch->pool.base,
1414
                                                  ctx->sampler_view_count[stage],
1415
                                                  BIFROST_TEXTURE);
1416
                struct mali_bifrost_texture_packed *out =
1417
                        (struct mali_bifrost_texture_packed *) T.cpu;
1418

1419
                for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1420
                        struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1421
                        struct pipe_sampler_view *pview = &view->base;
1422
                        struct panfrost_resource *rsrc = pan_resource(pview->texture);
1423

1424
                        panfrost_update_sampler_view(view, &ctx->base);
1425
                        out[i] = view->bifrost_descriptor;
1426

1427
                        panfrost_batch_read_rsrc(batch, rsrc, stage);
1428
                        panfrost_batch_add_bo(batch, view->state.bo, stage);
1429
                }
1430

1431
                return T.gpu;
1432
        } else {
1433
                uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1434

1435
                for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1436
                        struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1437

1438
                        panfrost_update_sampler_view(view, &ctx->base);
1439

1440
                        trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1441
                }
1442

1443
                return pan_pool_upload_aligned(&batch->pool.base,
1444
                                               trampolines,
1445
                                               sizeof(uint64_t) *
1446
                                               ctx->sampler_view_count[stage],
1447
                                               sizeof(uint64_t));
1448
        }
1449
}
1450

1451
static mali_ptr
1452
panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1453
                                  enum pipe_shader_type stage)
1454
{
1455
        struct panfrost_context *ctx = batch->ctx;
1456

1457
        if (!ctx->sampler_count[stage])
1458
                return 0;
1459

1460
        assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1461
        assert(MALI_BIFROST_SAMPLER_ALIGN == MALI_MIDGARD_SAMPLER_ALIGN);
1462

1463
        struct panfrost_ptr T =
1464
                pan_pool_alloc_desc_array(&batch->pool.base,
1465
                                          ctx->sampler_count[stage],
1466
                                          MIDGARD_SAMPLER);
1467
        struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1468

1469
        for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1470
                out[i] = ctx->samplers[stage][i]->hw;
1471

1472
        return T.gpu;
1473
}
1474

1475
/* Packs all image attribute descs and attribute buffer descs.
1476
 * `first_image_buf_index` must be the index of the first image attribute buffer descriptor.
1477
 */
1478
static void
1479
emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader,
1480
                   struct mali_attribute_packed *attribs, unsigned first_buf)
1481
{
1482
        struct panfrost_device *dev = pan_device(ctx->base.screen);
1483
        unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1484

1485
        for (unsigned i = 0; i < last_bit; ++i) {
1486
                enum pipe_format format = ctx->images[shader][i].format;
1487

1488
                pan_pack(attribs + i, ATTRIBUTE, cfg) {
1489
                        /* Continuation record means 2 buffers per image */
1490
                        cfg.buffer_index = first_buf + (i * 2);
1491
                        cfg.offset_enable = !pan_is_bifrost(dev);
1492
                        cfg.format = dev->formats[format].hw;
1493
                }
1494
        }
1495
}
1496

1497
static enum mali_attribute_type
1498
pan_modifier_to_attr_type(uint64_t modifier)
1499
{
1500
        switch (modifier) {
1501
        case DRM_FORMAT_MOD_LINEAR:
1502
                return MALI_ATTRIBUTE_TYPE_3D_LINEAR;
1503
        case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED:
1504
                return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED;
1505
        default:
1506
                unreachable("Invalid modifier for attribute record");
1507
        }
1508
}
1509

1510
static void
1511
emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader,
1512
                struct mali_attribute_buffer_packed *bufs,
1513
                unsigned first_image_buf_index)
1514
{
1515
        struct panfrost_context *ctx = batch->ctx;
1516
        unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1517

1518
        for (unsigned i = 0; i < last_bit; ++i) {
1519
                struct pipe_image_view *image = &ctx->images[shader][i];
1520

1521
                /* TODO: understand how v3d/freedreno does it */
1522
                if (!(ctx->image_mask[shader] & (1 << i)) ||
1523
                    !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) {
1524
                        /* Unused image bindings */
1525
                        pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg);
1526
                        pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg);
1527
                        continue;
1528
                }
1529

1530
                struct panfrost_resource *rsrc = pan_resource(image->resource);
1531

1532
                /* TODO: MSAA */
1533
                assert(image->resource->nr_samples <= 1 && "MSAA'd images not supported");
1534

1535
                bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D;
1536
                bool is_buffer = rsrc->base.target == PIPE_BUFFER;
1537

1538
                unsigned offset = is_buffer ? image->u.buf.offset :
1539
                        panfrost_texture_offset(&rsrc->image.layout,
1540
                                                image->u.tex.level,
1541
                                                is_3d ? 0 : image->u.tex.first_layer,
1542
                                                is_3d ? image->u.tex.first_layer : 0);
1543

1544
                if (image->shader_access & PIPE_IMAGE_ACCESS_WRITE) {
1545
                        panfrost_batch_write_rsrc(batch, rsrc, shader);
1546

1547
                        unsigned level = is_buffer ? 0 : image->u.tex.level;
1548
                        BITSET_SET(rsrc->valid.data, level);
1549

1550
                        if (is_buffer) {
1551
                                util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
1552
                                                0, rsrc->base.width0);
1553
                        }
1554
                } else {
1555
                        panfrost_batch_read_rsrc(batch, rsrc, shader);
1556
                }
1557

1558
                pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) {
1559
                        cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier);
1560
                        cfg.pointer = rsrc->image.data.bo->ptr.gpu + offset;
1561
                        cfg.stride = util_format_get_blocksize(image->format);
1562
                        cfg.size = rsrc->image.data.bo->size - offset;
1563
                }
1564

1565
                if (is_buffer) {
1566
                        pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1567
                                cfg.s_dimension = rsrc->base.width0 /
1568
                                        util_format_get_blocksize(image->format);
1569
                                cfg.t_dimension = cfg.r_dimension = 1;
1570
                        }
1571

1572
                        continue;
1573
                }
1574

1575
                pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1576
                        unsigned level = image->u.tex.level;
1577

1578
                        cfg.s_dimension = u_minify(rsrc->base.width0, level);
1579
                        cfg.t_dimension = u_minify(rsrc->base.height0, level);
1580
                        cfg.r_dimension = is_3d ?
1581
                                u_minify(rsrc->base.depth0, level) :
1582
                                image->u.tex.last_layer - image->u.tex.first_layer + 1;
1583

1584
                        cfg.row_stride =
1585
                                rsrc->image.layout.slices[level].row_stride;
1586

1587
                        if (rsrc->base.target != PIPE_TEXTURE_2D) {
1588
                                cfg.slice_stride =
1589
                                        panfrost_get_layer_stride(&rsrc->image.layout,
1590
                                                                  level);
1591
                        }
1592
                }
1593
        }
1594
}
1595

1596
static mali_ptr
1597
panfrost_emit_image_attribs(struct panfrost_batch *batch,
1598
                            mali_ptr *buffers,
1599
                            enum pipe_shader_type type)
1600
{
1601
        struct panfrost_context *ctx = batch->ctx;
1602
        struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, type);
1603

1604
        if (!shader->info.attribute_count) {
1605
                *buffers = 0;
1606
                return 0;
1607
        }
1608

1609
        struct panfrost_device *dev = pan_device(ctx->base.screen);
1610

1611
        /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */
1612
        unsigned attr_count = shader->info.attribute_count;
1613
        unsigned buf_count = (attr_count * 2) + (pan_is_bifrost(dev) ? 1 : 0);
1614

1615
        struct panfrost_ptr bufs =
1616
                pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER);
1617

1618
        struct panfrost_ptr attribs =
1619
                pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE);
1620

1621
        emit_image_attribs(ctx, type, attribs.cpu, 0);
1622
        emit_image_bufs(batch, type, bufs.cpu, 0);
1623

1624
        /* We need an empty attrib buf to stop the prefetching on Bifrost */
1625
        if (pan_is_bifrost(dev)) {
1626
                pan_pack(bufs.cpu +
1627
                         ((buf_count - 1) * MALI_ATTRIBUTE_BUFFER_LENGTH),
1628
                         ATTRIBUTE_BUFFER, cfg);
1629
        }
1630

1631
        *buffers = bufs.gpu;
1632
        return attribs.gpu;
1633
}
1634

1635
static mali_ptr
1636
panfrost_emit_vertex_data(struct panfrost_batch *batch,
1637
                          mali_ptr *buffers)
1638
{
1639
        struct panfrost_context *ctx = batch->ctx;
1640
        struct panfrost_device *dev = pan_device(ctx->base.screen);
1641
        struct panfrost_vertex_state *so = ctx->vertex;
1642
        struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1643
        bool instanced = ctx->indirect_draw || ctx->instance_count > 1;
1644
        uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX];
1645
        unsigned nr_images = util_last_bit(image_mask);
1646

1647
        /* Worst case: everything is NPOT, which is only possible if instancing
1648
         * is enabled. Otherwise single record is gauranteed.
1649
         * Also, we allocate more memory than what's needed here if either instancing
1650
         * is enabled or images are present, this can be improved. */
1651
        unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1;
1652
        unsigned nr_bufs = ((so->nr_bufs + nr_images) * bufs_per_attrib) +
1653
                           (pan_is_bifrost(dev) ? 1 : 0);
1654

1655
        /* Midgard needs vertexid/instanceid handled specially */
1656
        bool special_vbufs = dev->arch < 6 && vs->info.attribute_count >= PAN_VERTEX_ID;
1657

1658
        if (special_vbufs)
1659
                nr_bufs += 2;
1660

1661
        if (!nr_bufs) {
1662
                *buffers = 0;
1663
                return 0;
1664
        }
1665

1666
        struct panfrost_ptr S =
1667
                pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs,
1668
                                          ATTRIBUTE_BUFFER);
1669
        struct panfrost_ptr T =
1670
                pan_pool_alloc_desc_array(&batch->pool.base,
1671
                                          vs->info.attribute_count,
1672
                                          ATTRIBUTE);
1673

1674
        struct mali_attribute_buffer_packed *bufs =
1675
                (struct mali_attribute_buffer_packed *) S.cpu;
1676

1677
        struct mali_attribute_packed *out =
1678
                (struct mali_attribute_packed *) T.cpu;
1679

1680
        unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1681
        unsigned k = 0;
1682

1683
        for (unsigned i = 0; i < so->nr_bufs; ++i) {
1684
                unsigned vbi = so->buffers[i].vbi;
1685
                unsigned divisor = so->buffers[i].divisor;
1686
                attrib_to_buffer[i] = k;
1687

1688
                if (!(ctx->vb_mask & (1 << vbi)))
1689
                        continue;
1690

1691
                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1692
                struct panfrost_resource *rsrc;
1693

1694
                rsrc = pan_resource(buf->buffer.resource);
1695
                if (!rsrc)
1696
                        continue;
1697

1698
                panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
1699

1700
                /* Mask off lower bits, see offset fixup below */
1701
                mali_ptr raw_addr = rsrc->image.data.bo->ptr.gpu + buf->buffer_offset;
1702
                mali_ptr addr = raw_addr & ~63;
1703

1704
                /* Since we advanced the base pointer, we shrink the buffer
1705
                 * size, but add the offset we subtracted */
1706
                unsigned size = rsrc->base.width0 + (raw_addr - addr)
1707
                        - buf->buffer_offset;
1708

1709
                /* When there is a divisor, the hardware-level divisor is
1710
                 * the product of the instance divisor and the padded count */
1711
                unsigned stride = buf->stride;
1712

1713
                if (ctx->indirect_draw) {
1714
                        /* We allocated 2 records for each attribute buffer */
1715
                        assert((k & 1) == 0);
1716

1717
                        /* With indirect draws we can't guess the vertex_count.
1718
                         * Pre-set the address, stride and size fields, the
1719
                         * compute shader do the rest.
1720
                         */
1721
                        pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1722
                                cfg.type = MALI_ATTRIBUTE_TYPE_1D;
1723
                                cfg.pointer = addr;
1724
                                cfg.stride = stride;
1725
                                cfg.size = size;
1726
                        }
1727

1728
                        /* We store the unmodified divisor in the continuation
1729
                         * slot so the compute shader can retrieve it.
1730
                         */
1731
                        pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1732
                                cfg.divisor = divisor;
1733
                        }
1734

1735
                        k += 2;
1736
                        continue;
1737
                }
1738

1739
                unsigned hw_divisor = ctx->padded_count * divisor;
1740

1741
                if (ctx->instance_count <= 1) {
1742
                        /* Per-instance would be every attribute equal */
1743
                        if (divisor)
1744
                                stride = 0;
1745

1746
                        pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1747
                                cfg.pointer = addr;
1748
                                cfg.stride = stride;
1749
                                cfg.size = size;
1750
                        }
1751
                } else if (!divisor) {
1752
                        pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1753
                                cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1754
                                cfg.pointer = addr;
1755
                                cfg.stride = stride;
1756
                                cfg.size = size;
1757
                                cfg.divisor = ctx->padded_count;
1758
                        }
1759
                } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1760
                        pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1761
                                cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1762
                                cfg.pointer = addr;
1763
                                cfg.stride = stride;
1764
                                cfg.size = size;
1765
                                cfg.divisor_r = __builtin_ctz(hw_divisor);
1766
                        }
1767

1768
                } else {
1769
                        unsigned shift = 0, extra_flags = 0;
1770

1771
                        unsigned magic_divisor =
1772
                                panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1773

1774
                        /* Records with continuations must be aligned */
1775
                        k = ALIGN_POT(k, 2);
1776
                        attrib_to_buffer[i] = k;
1777

1778
                        pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1779
                                cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1780
                                cfg.pointer = addr;
1781
                                cfg.stride = stride;
1782
                                cfg.size = size;
1783

1784
                                cfg.divisor_r = shift;
1785
                                cfg.divisor_e = extra_flags;
1786
                        }
1787

1788
                        pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1789
                                cfg.divisor_numerator = magic_divisor;
1790
                                cfg.divisor = divisor;
1791
                        }
1792

1793
                        ++k;
1794
                }
1795

1796
                ++k;
1797
        }
1798

1799
        /* Add special gl_VertexID/gl_InstanceID buffers */
1800
        if (unlikely(special_vbufs)) {
1801
                panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1802

1803
                pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1804
                        cfg.buffer_index = k++;
1805
                        cfg.format = so->formats[PAN_VERTEX_ID];
1806
                }
1807

1808
                panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1809

1810
                pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1811
                        cfg.buffer_index = k++;
1812
                        cfg.format = so->formats[PAN_INSTANCE_ID];
1813
                }
1814
        }
1815

1816
        k = ALIGN_POT(k, 2);
1817
        emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k);
1818
        emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k);
1819
        k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2);
1820

1821
        /* We need an empty attrib buf to stop the prefetching on Bifrost */
1822
        if (pan_is_bifrost(dev))
1823
                pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg);
1824

1825
        /* Attribute addresses require 64-byte alignment, so let:
1826
         *
1827
         *      base' = base & ~63 = base - (base & 63)
1828
         *      offset' = offset + (base & 63)
1829
         *
1830
         * Since base' + offset' = base + offset, these are equivalent
1831
         * addressing modes and now base is 64 aligned.
1832
         */
1833

1834
        for (unsigned i = 0; i < so->num_elements; ++i) {
1835
                unsigned vbi = so->pipe[i].vertex_buffer_index;
1836
                struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1837

1838
                /* BOs are aligned; just fixup for buffer_offset */
1839
                signed src_offset = so->pipe[i].src_offset;
1840
                src_offset += (buf->buffer_offset & 63);
1841

1842
                /* Base instance offset */
1843
                if (ctx->base_instance && so->pipe[i].instance_divisor) {
1844
                        src_offset += (ctx->base_instance * buf->stride) /
1845
                                      so->pipe[i].instance_divisor;
1846
                }
1847

1848
                /* Also, somewhat obscurely per-instance data needs to be
1849
                 * offset in response to a delayed start in an indexed draw */
1850

1851
                if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
1852
                        src_offset -= buf->stride * ctx->offset_start;
1853

1854
                pan_pack(out + i, ATTRIBUTE, cfg) {
1855
                        cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]];
1856
                        cfg.format = so->formats[i];
1857
                        cfg.offset = src_offset;
1858
                }
1859
        }
1860

1861
        *buffers = S.gpu;
1862
        return T.gpu;
1863
}
1864

1865
static mali_ptr
1866
panfrost_emit_varyings(struct panfrost_batch *batch,
1867
                struct mali_attribute_buffer_packed *slot,
1868
                unsigned stride, unsigned count)
1869
{
1870
        unsigned size = stride * count;
1871
        mali_ptr ptr =
1872
                batch->ctx->indirect_draw ? 0 :
1873
                pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu;
1874

1875
        pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1876
                cfg.stride = stride;
1877
                cfg.size = size;
1878
                cfg.pointer = ptr;
1879
        }
1880

1881
        return ptr;
1882
}
1883

1884
static unsigned
1885
panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target)
1886
{
1887
        return target->buffer_offset + (pan_so_target(target)->offset * stride);
1888
}
1889

1890
static void
1891
panfrost_emit_streamout(struct panfrost_batch *batch,
1892
                        struct mali_attribute_buffer_packed *slot,
1893
                        unsigned stride, unsigned count,
1894
                        struct pipe_stream_output_target *target)
1895
{
1896
        unsigned max_size = target->buffer_size;
1897
        unsigned expected_size = stride * count;
1898

1899
        /* Grab the BO and bind it to the batch */
1900
        struct panfrost_resource *rsrc = pan_resource(target->buffer);
1901
        struct panfrost_bo *bo = rsrc->image.data.bo;
1902

1903
        panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
1904
        panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_FRAGMENT);
1905

1906
        unsigned offset = panfrost_xfb_offset(stride, target);
1907

1908
        pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1909
                cfg.pointer = bo->ptr.gpu + (offset & ~63);
1910
                cfg.stride = stride;
1911
                cfg.size = MIN2(max_size, expected_size) + (offset & 63);
1912

1913
                util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
1914
                                offset, cfg.size);
1915
        }
1916
}
1917

1918
/* Helpers for manipulating stream out information so we can pack varyings
1919
 * accordingly. Compute the src_offset for a given captured varying */
1920

1921
static struct pipe_stream_output *
1922
pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1923
{
1924
        for (unsigned i = 0; i < info->num_outputs; ++i) {
1925
                if (info->output[i].register_index == loc)
1926
                        return &info->output[i];
1927
        }
1928

1929
        unreachable("Varying not captured");
1930
}
1931

1932
/* Given a varying, figure out which index it corresponds to */
1933

1934
static inline unsigned
1935
pan_varying_index(unsigned present, enum pan_special_varying v)
1936
{
1937
        return util_bitcount(present & BITFIELD_MASK(v));
1938
}
1939

1940
/* Get the base index for XFB buffers, which by convention come after
 * everything else. This wrapper exists for semantic reasons; by construction
 * the result is just the number of present varying buffers. */

static inline unsigned
pan_xfb_base(unsigned present)
{
        unsigned nr_present = util_bitcount(present);

        return nr_present;
}
1949

1950
/* Determines which varying buffers are required */
1951

1952
static inline unsigned
1953
pan_varying_present(const struct panfrost_device *dev,
1954
                    struct pan_shader_info *producer,
1955
                    struct pan_shader_info *consumer,
1956
                    uint16_t point_coord_mask)
1957
{
1958
        /* At the moment we always emit general and position buffers. Not
1959
         * strictly necessary but usually harmless */
1960

1961
        unsigned present = BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION);
1962

1963
        /* Enable special buffers by the shader info */
1964

1965
        if (producer->vs.writes_point_size)
1966
                present |= BITFIELD_BIT(PAN_VARY_PSIZ);
1967

1968
        /* On Bifrost, special fragment varyings are replaced by LD_VAR_SPECIAL */
1969
        if (pan_is_bifrost(dev))
1970
                return present;
1971

1972
        /* On Midgard, these exist as real varyings */
1973
        if (consumer->fs.reads_point_coord)
1974
                present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
1975

1976
        if (consumer->fs.reads_face)
1977
                present |= BITFIELD_BIT(PAN_VARY_FACE);
1978

1979
        if (consumer->fs.reads_frag_coord)
1980
                present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD);
1981

1982
        /* Also, if we have a point sprite, we need a point coord buffer */
1983

1984
        for (unsigned i = 0; i < consumer->varyings.input_count; i++)  {
1985
                gl_varying_slot loc = consumer->varyings.input[i].location;
1986

1987
                if (util_varying_is_point_coord(loc, point_coord_mask))
1988
                        present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
1989
        }
1990

1991
        return present;
1992
}
1993

1994
/* Emitters for varying records */
1995

1996
static void
1997
pan_emit_vary(const struct panfrost_device *dev,
1998
              struct mali_attribute_packed *out,
1999
              unsigned buffer_index,
2000
              mali_pixel_format format, unsigned offset)
2001
{
2002
        pan_pack(out, ATTRIBUTE, cfg) {
2003
                cfg.buffer_index = buffer_index;
2004
                cfg.offset_enable = !pan_is_bifrost(dev);
2005
                cfg.format = format;
2006
                cfg.offset = offset;
2007
        }
2008
}
2009

2010
/* Special records */
2011

2012
static const struct {
2013
       unsigned components;
2014
       enum mali_format format;
2015
} pan_varying_formats[PAN_VARY_MAX] = {
2016
        [PAN_VARY_POSITION]     = { 4, MALI_SNAP_4 },
2017
        [PAN_VARY_PSIZ]         = { 1, MALI_R16F },
2018
        [PAN_VARY_PNTCOORD]     = { 1, MALI_R16F },
2019
        [PAN_VARY_FACE]         = { 1, MALI_R32I },
2020
        [PAN_VARY_FRAGCOORD]    = { 4, MALI_RGBA32F },
2021
};
2022

2023
static mali_pixel_format
2024
pan_special_format(const struct panfrost_device *dev,
2025
                enum pan_special_varying buf)
2026
{
2027
        assert(buf < PAN_VARY_MAX);
2028
        mali_pixel_format format = (pan_varying_formats[buf].format << 12);
2029

2030
        if (dev->quirks & HAS_SWIZZLES) {
2031
                unsigned nr = pan_varying_formats[buf].components;
2032
                format |= panfrost_get_default_swizzle(nr);
2033
        }
2034

2035
        return format;
2036
}
2037

2038
static void
2039
pan_emit_vary_special(const struct panfrost_device *dev,
2040
                      struct mali_attribute_packed *out,
2041
                      unsigned present, enum pan_special_varying buf)
2042
{
2043
        pan_emit_vary(dev, out, pan_varying_index(present, buf),
2044
                        pan_special_format(dev, buf), 0);
2045
}
2046

2047
/* Negative indicates a varying is not found */
2048

2049
static signed
2050
pan_find_vary(const struct pan_shader_varying *vary,
2051
                unsigned vary_count, unsigned loc)
2052
{
2053
        for (unsigned i = 0; i < vary_count; ++i) {
2054
                if (vary[i].location == loc)
2055
                        return i;
2056
        }
2057

2058
        return -1;
2059
}
2060

2061
/* Assign varying locations for the general buffer. Returns the calculated
2062
 * per-vertex stride, and outputs offsets into the passed array. Negative
2063
 * offset indicates a varying is not used. */
2064

2065
static unsigned
2066
pan_assign_varyings(const struct panfrost_device *dev,
2067
                    struct pan_shader_info *producer,
2068
                    struct pan_shader_info *consumer,
2069
                    signed *offsets)
2070
{
2071
        unsigned producer_count = producer->varyings.output_count;
2072
        unsigned consumer_count = consumer->varyings.input_count;
2073

2074
        const struct pan_shader_varying *producer_vars = producer->varyings.output;
2075
        const struct pan_shader_varying *consumer_vars = consumer->varyings.input;
2076

2077
        unsigned stride = 0;
2078

2079
        for (unsigned i = 0; i < producer_count; ++i) {
2080
                signed loc = pan_find_vary(consumer_vars, consumer_count,
2081
                                producer_vars[i].location);
2082

2083
                if (loc >= 0) {
2084
                        offsets[i] = stride;
2085

2086
                        enum pipe_format format = consumer_vars[loc].format;
2087
                        stride += util_format_get_blocksize(format);
2088
                } else {
2089
                        offsets[i] = -1;
2090
                }
2091
        }
2092

2093
        return stride;
2094
}
2095

2096
/* Emitter for a single varying (attribute) descriptor */
2097

2098
static void
2099
panfrost_emit_varying(const struct panfrost_device *dev,
2100
                      struct mali_attribute_packed *out,
2101
                      const struct pan_shader_varying varying,
2102
                      enum pipe_format pipe_format,
2103
                      unsigned present,
2104
                      uint16_t point_sprite_mask,
2105
                      struct pipe_stream_output_info *xfb,
2106
                      uint64_t xfb_loc_mask,
2107
                      unsigned max_xfb,
2108
                      unsigned *xfb_offsets,
2109
                      signed offset,
2110
                      enum pan_special_varying pos_varying)
2111
{
2112
        /* Note: varying.format != pipe_format in some obscure cases due to a
2113
         * limitation of the NIR linker. This should be fixed in the future to
2114
         * eliminate the additional lookups. See:
2115
         * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex
2116
         */
2117
        gl_varying_slot loc = varying.location;
2118
        mali_pixel_format format = dev->formats[pipe_format].hw;
2119

2120
        struct pipe_stream_output *o = (xfb_loc_mask & BITFIELD64_BIT(loc)) ?
2121
                pan_get_so(xfb, loc) : NULL;
2122

2123
        if (util_varying_is_point_coord(loc, point_sprite_mask)) {
2124
                pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);
2125
        } else if (o && o->output_buffer < max_xfb) {
2126
                unsigned fixup_offset = xfb_offsets[o->output_buffer] & 63;
2127

2128
                pan_emit_vary(dev, out,
2129
                                pan_xfb_base(present) + o->output_buffer,
2130
                                format, (o->dst_offset * 4) + fixup_offset);
2131
        } else if (loc == VARYING_SLOT_POS) {
2132
                pan_emit_vary_special(dev, out, present, pos_varying);
2133
        } else if (loc == VARYING_SLOT_PSIZ) {
2134
                pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ);
2135
        } else if (loc == VARYING_SLOT_FACE) {
2136
                pan_emit_vary_special(dev, out, present, PAN_VARY_FACE);
2137
        } else if (offset < 0) {
2138
                pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0);
2139
        } else {
2140
                STATIC_ASSERT(PAN_VARY_GENERAL == 0);
2141
                pan_emit_vary(dev, out, 0, format, offset);
2142
        }
2143
}
2144

2145
/* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time,
2146
 * rather than draw time (under good conditions). */
2147

2148
static void
2149
panfrost_emit_varying_descs(
2150
                struct panfrost_pool *pool,
2151
                struct panfrost_shader_state *producer,
2152
                struct panfrost_shader_state *consumer,
2153
                struct panfrost_streamout *xfb,
2154
                uint16_t point_coord_mask,
2155
                struct pan_linkage *out)
2156
{
2157
        struct panfrost_device *dev = pool->base.dev;
2158
        struct pipe_stream_output_info *xfb_info = &producer->stream_output;
2159
        unsigned producer_count = producer->info.varyings.output_count;
2160
        unsigned consumer_count = consumer->info.varyings.input_count;
2161

2162
        /* Offsets within the general varying buffer, indexed by location */
2163
        signed offsets[PIPE_MAX_ATTRIBS];
2164
        assert(producer_count < ARRAY_SIZE(offsets));
2165
        assert(consumer_count < ARRAY_SIZE(offsets));
2166

2167
        /* Allocate enough descriptors for both shader stages */
2168
        struct panfrost_ptr T =
2169
                pan_pool_alloc_desc_array(&pool->base,
2170
                                          producer_count + consumer_count,
2171
                                          ATTRIBUTE);
2172

2173
        /* Take a reference if we're being put on the CSO */
2174
        if (!pool->owned) {
2175
                out->bo = pool->transient_bo;
2176
                panfrost_bo_reference(out->bo);
2177
        }
2178

2179
        struct mali_attribute_packed *descs = T.cpu;
2180
        out->producer = producer_count ? T.gpu : 0;
2181
        out->consumer = consumer_count ? T.gpu +
2182
                (MALI_ATTRIBUTE_LENGTH * producer_count) : 0;
2183

2184
        /* Lay out the varyings. Must use producer to lay out, in order to
2185
         * respect transform feedback precisions. */
2186
        out->present = pan_varying_present(dev, &producer->info,
2187
                        &consumer->info, point_coord_mask);
2188

2189
        out->stride = pan_assign_varyings(dev, &producer->info,
2190
                        &consumer->info, offsets);
2191

2192
        unsigned xfb_offsets[PIPE_MAX_SO_BUFFERS];
2193

2194
        for (unsigned i = 0; i < xfb->num_targets; ++i) {
2195
                xfb_offsets[i] = panfrost_xfb_offset(xfb_info->stride[i] * 4,
2196
                                xfb->targets[i]);
2197
        }
2198

2199
        for (unsigned i = 0; i < producer_count; ++i) {
2200
                signed j = pan_find_vary(consumer->info.varyings.input,
2201
                                consumer->info.varyings.input_count,
2202
                                producer->info.varyings.output[i].location);
2203

2204
                enum pipe_format format = (j >= 0) ?
2205
                        consumer->info.varyings.input[j].format :
2206
                        producer->info.varyings.output[i].format;
2207

2208
                panfrost_emit_varying(dev, descs + i,
2209
                                producer->info.varyings.output[i], format,
2210
                                out->present, 0, &producer->stream_output,
2211
                                producer->so_mask, xfb->num_targets,
2212
                                xfb_offsets, offsets[i], PAN_VARY_POSITION);
2213
        }
2214

2215
        for (unsigned i = 0; i < consumer_count; ++i) {
2216
                signed j = pan_find_vary(producer->info.varyings.output,
2217
                                producer->info.varyings.output_count,
2218
                                consumer->info.varyings.input[i].location);
2219

2220
                signed offset = (j >= 0) ? offsets[j] : -1;
2221

2222
                panfrost_emit_varying(dev, descs + producer_count + i,
2223
                                consumer->info.varyings.input[i],
2224
                                consumer->info.varyings.input[i].format,
2225
                                out->present, point_coord_mask,
2226
                                &producer->stream_output, producer->so_mask,
2227
                                xfb->num_targets, xfb_offsets, offset,
2228
                                PAN_VARY_FRAGCOORD);
2229
        }
2230
}
2231

2232
static void
2233
pan_emit_special_input(struct mali_attribute_buffer_packed *out,
2234
                unsigned present,
2235
                enum pan_special_varying v,
2236
                unsigned special)
2237
{
2238
        if (present & BITFIELD_BIT(v)) {
2239
                unsigned idx = pan_varying_index(present, v);
2240

2241
                pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
2242
                        cfg.special = special;
2243
                        cfg.type = 0;
2244
                }
2245
        }
2246
}
2247

2248
static void
2249
panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2250
                                 unsigned vertex_count,
2251
                                 mali_ptr *vs_attribs,
2252
                                 mali_ptr *fs_attribs,
2253
                                 mali_ptr *buffers,
2254
                                 unsigned *buffer_count,
2255
                                 mali_ptr *position,
2256
                                 mali_ptr *psiz,
2257
                                 bool point_coord_replace)
2258
{
2259
        /* Load the shaders */
2260
        struct panfrost_context *ctx = batch->ctx;
2261
        struct panfrost_device *dev = pan_device(ctx->base.screen);
2262
        struct panfrost_shader_state *vs, *fs;
2263

2264
        vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2265
        fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2266

2267
        uint16_t point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;
2268

2269
        /* TODO: point sprites need lowering on Bifrost */
2270
        if (!point_coord_replace || pan_is_bifrost(dev))
2271
                point_coord_mask =  0;
2272

2273
        /* In good conditions, we only need to link varyings once */
2274
        bool prelink =
2275
                (point_coord_mask == 0) &&
2276
                (ctx->streamout.num_targets == 0) &&
2277
                !vs->info.separable &&
2278
                !fs->info.separable;
2279

2280
        /* Try to reduce copies */
2281
        struct pan_linkage _linkage;
2282
        struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage;
2283

2284
        /* Emit ATTRIBUTE descriptors if needed */
2285
        if (!prelink || vs->linkage.bo == NULL) {
2286
                struct panfrost_pool *pool =
2287
                        prelink ? &ctx->descs : &batch->pool;
2288

2289
                panfrost_emit_varying_descs(pool, vs, fs, &ctx->streamout, point_coord_mask, linkage);
2290
        }
2291

2292
        struct pipe_stream_output_info *so = &vs->stream_output;
2293
        unsigned present = linkage->present, stride = linkage->stride;
2294
        unsigned xfb_base = pan_xfb_base(present);
2295
        struct panfrost_ptr T =
2296
                pan_pool_alloc_desc_array(&batch->pool.base,
2297
                                          xfb_base +
2298
                                          ctx->streamout.num_targets + 1,
2299
                                          ATTRIBUTE_BUFFER);
2300
        struct mali_attribute_buffer_packed *varyings =
2301
                (struct mali_attribute_buffer_packed *) T.cpu;
2302

2303
        if (buffer_count)
2304
                *buffer_count = xfb_base + ctx->streamout.num_targets;
2305

2306
        /* Suppress prefetch on Bifrost */
2307
        memset(varyings + (xfb_base * ctx->streamout.num_targets), 0, sizeof(*varyings));
2308

2309
        /* Emit the stream out buffers. We need enough room for all the
2310
         * vertices we emit across all instances */
2311

2312
        unsigned out_count = ctx->instance_count *
2313
                u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count);
2314

2315
        for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2316
                panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2317
                                        so->stride[i] * 4,
2318
                                        out_count,
2319
                                        ctx->streamout.targets[i]);
2320
        }
2321

2322
        if (stride) {
2323
                panfrost_emit_varyings(batch,
2324
                                &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2325
                                stride, vertex_count);
2326
        }
2327

2328
        /* fp32 vec4 gl_Position */
2329
        *position = panfrost_emit_varyings(batch,
2330
                        &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2331
                        sizeof(float) * 4, vertex_count);
2332

2333
        if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) {
2334
                *psiz = panfrost_emit_varyings(batch,
2335
                                &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2336
                                2, vertex_count);
2337
        }
2338

2339
        pan_emit_special_input(varyings, present,
2340
                        PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2341
        pan_emit_special_input(varyings, present, PAN_VARY_FACE,
2342
                        MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2343
        pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD,
2344
                        MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2345

2346
        *buffers = T.gpu;
2347
        *vs_attribs = linkage->producer;
2348
        *fs_attribs = linkage->consumer;
2349
}
2350

2351
static void
2352
panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2353
                                const struct panfrost_ptr *vertex_job,
2354
                                const struct panfrost_ptr *tiler_job)
2355
{
2356
        struct panfrost_context *ctx = batch->ctx;
2357

2358
        /* If rasterizer discard is enable, only submit the vertex. XXX - set
2359
         * job_barrier in case buffers get ping-ponged and we need to enforce
2360
         * ordering, this has a perf hit! See
2361
         * KHR-GLES31.core.vertex_attrib_binding.advanced-iterations */
2362

2363
        unsigned vertex = panfrost_add_job(&batch->pool.base, &batch->scoreboard,
2364
                                           MALI_JOB_TYPE_VERTEX, true, false,
2365
                                           ctx->indirect_draw ?
2366
                                           batch->indirect_draw_job_id : 0,
2367
                                           0, vertex_job, false);
2368

2369
        if (ctx->rasterizer->base.rasterizer_discard || batch->scissor_culls_everything)
2370
                return;
2371

2372
        panfrost_add_job(&batch->pool.base, &batch->scoreboard,
2373
                         MALI_JOB_TYPE_TILER, false, false,
2374
                         vertex, 0, tiler_job, false);
2375
}
2376

2377
static void
2378
emit_tls(struct panfrost_batch *batch)
2379
{
2380
        struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2381

2382
        /* Emitted with the FB descriptor on Midgard. */
2383
        if (!pan_is_bifrost(dev) && batch->framebuffer.gpu)
2384
                return;
2385

2386
        struct panfrost_bo *tls_bo =
2387
                batch->stack_size ?
2388
                panfrost_batch_get_scratchpad(batch,
2389
                                              batch->stack_size,
2390
                                              dev->thread_tls_alloc,
2391
                                              dev->core_count):
2392
                NULL;
2393
        struct pan_tls_info tls = {
2394
                .tls = {
2395
                        .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2396
                        .size = batch->stack_size,
2397
                },
2398
        };
2399

2400
        assert(batch->tls.cpu);
2401
        pan_emit_tls(dev, &tls, batch->tls.cpu);
2402
}
2403

2404
static void
2405
emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb)
2406
{
2407
        struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2408
        struct panfrost_bo *tls_bo =
2409
                batch->stack_size ?
2410
                panfrost_batch_get_scratchpad(batch,
2411
                                              batch->stack_size,
2412
                                              dev->thread_tls_alloc,
2413
                                              dev->core_count):
2414
                NULL;
2415
        struct pan_tls_info tls = {
2416
                .tls = {
2417
                        .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2418
                        .size = batch->stack_size,
2419
                },
2420
        };
2421

2422
        batch->framebuffer.gpu |=
2423
                pan_emit_fbd(dev, fb, &tls, &batch->tiler_ctx,
2424
                             batch->framebuffer.cpu);
2425
}
2426

2427
/* Mark a surface as written */
2428

2429
static void
2430
panfrost_initialize_surface(struct panfrost_batch *batch,
2431
                            struct pipe_surface *surf)
2432
{
2433
        if (surf) {
2434
                struct panfrost_resource *rsrc = pan_resource(surf->texture);
2435
                BITSET_SET(rsrc->valid.data, surf->u.tex.level);
2436
        }
2437
}
2438

2439
/* Generate a fragment job. This should be called once per frame. (According to
2440
 * presentations, this is supposed to correspond to eglSwapBuffers) */
2441

2442
static mali_ptr
2443
emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb)
2444
{
2445
        struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2446

2447
        /* Mark the affected buffers as initialized, since we're writing to it.
2448
         * Also, add the surfaces we're writing to to the batch */
2449

2450
        struct pipe_framebuffer_state *fb = &batch->key;
2451

2452
        for (unsigned i = 0; i < fb->nr_cbufs; ++i)
2453
                panfrost_initialize_surface(batch, fb->cbufs[i]);
2454

2455
        panfrost_initialize_surface(batch, fb->zsbuf);
2456

2457
        /* The passed tile coords can be out of range in some cases, so we need
2458
         * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT.
2459
         * Theoretically we also need to clamp the coordinates positive, but we
2460
         * avoid that edge case as all four values are unsigned. Also,
2461
         * theoretically we could clamp the minima, but if that has to happen
2462
         * the asserts would fail anyway (since the maxima would get clamped
2463
         * and then be smaller than the minima). An edge case of sorts occurs
2464
         * when no scissors are added to draw, so by default min=~0 and max=0.
2465
         * But that can't happen if any actual drawing occurs (beyond a
2466
         * wallpaper reload), so this is again irrelevant in practice. */
2467

2468
        batch->maxx = MIN2(batch->maxx, fb->width);
2469
        batch->maxy = MIN2(batch->maxy, fb->height);
2470

2471
        /* Rendering region must be at least 1x1; otherwise, there is nothing
2472
         * to do and the whole job chain should have been discarded. */
2473

2474
        assert(batch->maxx > batch->minx);
2475
        assert(batch->maxy > batch->miny);
2476

2477
        struct panfrost_ptr transfer =
2478
                pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);
2479

2480
        pan_emit_fragment_job(dev, pfb, batch->framebuffer.gpu,
2481
                              transfer.cpu);
2482

2483
        return transfer.gpu;
2484
}
2485

2486
#define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c;
2487

2488
static uint8_t
2489
pan_draw_mode(enum pipe_prim_type mode)
2490
{
2491
        switch (mode) {
2492
                DEFINE_CASE(POINTS);
2493
                DEFINE_CASE(LINES);
2494
                DEFINE_CASE(LINE_LOOP);
2495
                DEFINE_CASE(LINE_STRIP);
2496
                DEFINE_CASE(TRIANGLES);
2497
                DEFINE_CASE(TRIANGLE_STRIP);
2498
                DEFINE_CASE(TRIANGLE_FAN);
2499
                DEFINE_CASE(QUADS);
2500
                DEFINE_CASE(QUAD_STRIP);
2501
                DEFINE_CASE(POLYGON);
2502

2503
        default:
2504
                unreachable("Invalid draw mode");
2505
        }
2506
}
2507

2508
#undef DEFINE_CASE
2509

2510
/* Count generated primitives (when there is no geom/tess shaders) for
2511
 * transform feedback */
2512

2513
static void
2514
panfrost_statistics_record(
2515
                struct panfrost_context *ctx,
2516
                const struct pipe_draw_info *info,
2517
                const struct pipe_draw_start_count_bias *draw)
2518
{
2519
        if (!ctx->active_queries)
2520
                return;
2521

2522
        uint32_t prims = u_prims_for_vertices(info->mode, draw->count);
2523
        ctx->prims_generated += prims;
2524

2525
        if (!ctx->streamout.num_targets)
2526
                return;
2527

2528
        ctx->tf_prims_generated += prims;
2529
}
2530

2531
static void
2532
panfrost_update_streamout_offsets(struct panfrost_context *ctx)
2533
{
2534
        for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2535
                unsigned count;
2536

2537
                count = u_stream_outputs_for_vertices(ctx->active_prim,
2538
                                                      ctx->vertex_count);
2539
                pan_so_target(ctx->streamout.targets[i])->offset += count;
2540
        }
2541
}
2542

2543
static inline void
2544
pan_emit_draw_descs(struct panfrost_batch *batch,
2545
                struct MALI_DRAW *d, enum pipe_shader_type st)
2546
{
2547
        d->offset_start = batch->ctx->offset_start;
2548
        d->instance_size = batch->ctx->instance_count > 1 ?
2549
                           batch->ctx->padded_count : 1;
2550

2551
        d->uniform_buffers = batch->uniform_buffers[st];
2552
        d->push_uniforms = batch->push_uniforms[st];
2553
        d->textures = batch->textures[st];
2554
        d->samplers = batch->samplers[st];
2555
}
2556

2557
static inline enum mali_index_type
2558
panfrost_translate_index_size(unsigned size)
2559
{
2560
        STATIC_ASSERT(MALI_INDEX_TYPE_NONE  == 0);
2561
        STATIC_ASSERT(MALI_INDEX_TYPE_UINT8  == 1);
2562
        STATIC_ASSERT(MALI_INDEX_TYPE_UINT16 == 2);
2563

2564
        return (size == 4) ? MALI_INDEX_TYPE_UINT32 : size;
2565
}
2566

2567
static void
2568
panfrost_draw_emit_vertex(struct panfrost_batch *batch,
2569
                          const struct pipe_draw_info *info,
2570
                          void *invocation_template,
2571
                          mali_ptr vs_vary, mali_ptr varyings,
2572
                          mali_ptr attribs, mali_ptr attrib_bufs,
2573
                          void *job)
2574
{
2575
        struct panfrost_context *ctx = batch->ctx;
2576
        struct panfrost_device *device = pan_device(ctx->base.screen);
2577

2578
        void *section =
2579
                pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
2580
        memcpy(section, invocation_template, MALI_INVOCATION_LENGTH);
2581

2582
        pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
2583
                cfg.job_task_split = 5;
2584
        }
2585

2586
        pan_section_pack(job, COMPUTE_JOB, DRAW, cfg) {
2587
                cfg.draw_descriptor_is_64b = true;
2588
                if (!pan_is_bifrost(device))
2589
                        cfg.texture_descriptor_is_64b = true;
2590
                cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
2591
                cfg.attributes = attribs;
2592
                cfg.attribute_buffers = attrib_bufs;
2593
                cfg.varyings = vs_vary;
2594
                cfg.varying_buffers = vs_vary ? varyings : 0;
2595
                cfg.thread_storage = batch->tls.gpu;
2596
                pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
2597
        }
2598

2599
        pan_section_pack(job, COMPUTE_JOB, DRAW_PADDING, cfg);
2600
}
2601

2602
static void
2603
panfrost_emit_primitive_size(struct panfrost_context *ctx,
2604
                             bool points, mali_ptr size_array,
2605
                             void *prim_size)
2606
{
2607
        struct panfrost_rasterizer *rast = ctx->rasterizer;
2608

2609
        pan_pack(prim_size, PRIMITIVE_SIZE, cfg) {
2610
                if (panfrost_writes_point_size(ctx)) {
2611
                        cfg.size_array = size_array;
2612
                } else {
2613
                        cfg.constant = points ?
2614
                                       rast->base.point_size :
2615
                                       rast->base.line_width;
2616
                }
2617
        }
2618
}
2619

2620
static bool
2621
panfrost_is_implicit_prim_restart(const struct pipe_draw_info *info)
2622
{
2623
        unsigned implicit_index = (1 << (info->index_size * 8)) - 1;
2624
        bool implicit = info->restart_index == implicit_index;
2625
        return info->primitive_restart && implicit;
2626
}
2627

2628
static inline void
2629
panfrost_update_state_tex(struct panfrost_batch *batch,
2630
                          enum pipe_shader_type st)
2631
{
2632
        struct panfrost_context *ctx = batch->ctx;
2633
        struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
2634

2635
        unsigned dirty_3d = ctx->dirty;
2636
        unsigned dirty = ctx->dirty_shader[st];
2637

2638
        if (dirty & PAN_DIRTY_STAGE_TEXTURE) {
2639
                batch->textures[st] =
2640
                        panfrost_emit_texture_descriptors(batch, st);
2641
        }
2642

2643
        if (dirty & PAN_DIRTY_STAGE_SAMPLER) {
2644
                batch->samplers[st] =
2645
                        panfrost_emit_sampler_descriptors(batch, st);
2646
        }
2647

2648
        if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) {
2649
                batch->uniform_buffers[st] = panfrost_emit_const_buf(batch, st,
2650
                                &batch->push_uniforms[st]);
2651
        }
2652
}
2653

2654
static inline void
2655
panfrost_update_state_3d(struct panfrost_batch *batch)
2656
{
2657
        unsigned dirty = batch->ctx->dirty;
2658

2659
        if (dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR))
2660
                batch->viewport = panfrost_emit_viewport(batch);
2661

2662
        if (dirty & PAN_DIRTY_TLS_SIZE)
2663
                panfrost_batch_adjust_stack_size(batch);
2664
}
2665

2666
static void
2667
panfrost_update_state_vs(struct panfrost_batch *batch)
2668
{
2669
        enum pipe_shader_type st = PIPE_SHADER_VERTEX;
2670
        unsigned dirty = batch->ctx->dirty_shader[st];
2671

2672
        if (dirty & PAN_DIRTY_STAGE_RENDERER)
2673
                batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st);
2674

2675
        panfrost_update_state_tex(batch, st);
2676
}
2677

2678
static void
2679
panfrost_update_state_fs(struct panfrost_batch *batch)
2680
{
2681
        enum pipe_shader_type st = PIPE_SHADER_FRAGMENT;
2682
        unsigned dirty = batch->ctx->dirty_shader[st];
2683

2684
        if (dirty & PAN_DIRTY_STAGE_RENDERER)
2685
                batch->rsd[st] = panfrost_emit_frag_shader_meta(batch);
2686

2687
        if (dirty & PAN_DIRTY_STAGE_IMAGE) {
2688
                batch->attribs[st] = panfrost_emit_image_attribs(batch,
2689
                                &batch->attrib_bufs[st], st);
2690
        }
2691

2692
        panfrost_update_state_tex(batch, st);
2693
}
2694

2695
static void
2696
panfrost_draw_emit_tiler(struct panfrost_batch *batch,
2697
                         const struct pipe_draw_info *info,
2698
                         const struct pipe_draw_start_count_bias *draw,
2699
                         void *invocation_template,
2700
                         mali_ptr indices, mali_ptr fs_vary, mali_ptr varyings,
2701
                         mali_ptr pos, mali_ptr psiz, void *job)
2702
{
2703
        struct panfrost_context *ctx = batch->ctx;
2704
        struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
2705
        struct panfrost_device *device = pan_device(ctx->base.screen);
2706

2707
        void *section = pan_is_bifrost(device) ?
2708
                        pan_section_ptr(job, BIFROST_TILER_JOB, INVOCATION) :
2709
                        pan_section_ptr(job, MIDGARD_TILER_JOB, INVOCATION);
2710
        memcpy(section, invocation_template, MALI_INVOCATION_LENGTH);
2711

2712
        section = pan_is_bifrost(device) ?
2713
                  pan_section_ptr(job, BIFROST_TILER_JOB, PRIMITIVE) :
2714
                  pan_section_ptr(job, MIDGARD_TILER_JOB, PRIMITIVE);
2715
        pan_pack(section, PRIMITIVE, cfg) {
2716
                cfg.draw_mode = pan_draw_mode(info->mode);
2717
                if (panfrost_writes_point_size(ctx))
2718
                        cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;
2719

2720
                /* For line primitives, PRIMITIVE.first_provoking_vertex must
2721
                 * be set to true and the provoking vertex is selected with
2722
                 * DRAW.flat_shading_vertex.
2723
                 */
2724
                if (info->mode == PIPE_PRIM_LINES ||
2725
                    info->mode == PIPE_PRIM_LINE_LOOP ||
2726
                    info->mode == PIPE_PRIM_LINE_STRIP)
2727
                        cfg.first_provoking_vertex = true;
2728
                else
2729
                        cfg.first_provoking_vertex = rast->flatshade_first;
2730

2731
                if (panfrost_is_implicit_prim_restart(info)) {
2732
                        cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
2733
                } else if (info->primitive_restart) {
2734
                        cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
2735
                        cfg.primitive_restart_index = info->restart_index;
2736
                }
2737

2738
                cfg.job_task_split = 6;
2739

2740
                cfg.index_count = ctx->indirect_draw ? 1 : draw->count;
2741
                cfg.index_type = panfrost_translate_index_size(info->index_size);
2742

2743
                if (cfg.index_type) {
2744
                        cfg.indices = indices;
2745
                        cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;
2746
                }
2747
        }
2748

2749
        bool points = info->mode == PIPE_PRIM_POINTS;
2750
        void *prim_size = pan_is_bifrost(device) ?
2751
                          pan_section_ptr(job, BIFROST_TILER_JOB, PRIMITIVE_SIZE) :
2752
                          pan_section_ptr(job, MIDGARD_TILER_JOB, PRIMITIVE_SIZE);
2753

2754
        if (pan_is_bifrost(device)) {
2755
                panfrost_emit_primitive_size(ctx, points, psiz, prim_size);
2756
                pan_section_pack(job, BIFROST_TILER_JOB, TILER, cfg) {
2757
                        cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0);
2758
                }
2759
                pan_section_pack(job, BIFROST_TILER_JOB, PADDING, padding) {}
2760
        }
2761

2762
        section = pan_is_bifrost(device) ?
2763
                  pan_section_ptr(job, BIFROST_TILER_JOB, DRAW) :
2764
                  pan_section_ptr(job, MIDGARD_TILER_JOB, DRAW);
2765
        pan_pack(section, DRAW, cfg) {
2766
                cfg.four_components_per_vertex = true;
2767
                cfg.draw_descriptor_is_64b = true;
2768
                if (!pan_is_bifrost(device))
2769
                        cfg.texture_descriptor_is_64b = true;
2770
                cfg.front_face_ccw = rast->front_ccw;
2771
                cfg.cull_front_face = rast->cull_face & PIPE_FACE_FRONT;
2772
                cfg.cull_back_face = rast->cull_face & PIPE_FACE_BACK;
2773
                cfg.position = pos;
2774
                cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
2775
                cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
2776
                cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
2777
                cfg.viewport = batch->viewport;
2778
                cfg.varyings = fs_vary;
2779
                cfg.varying_buffers = fs_vary ? varyings : 0;
2780
                cfg.thread_storage = batch->tls.gpu;
2781

2782
                /* For all primitives but lines DRAW.flat_shading_vertex must
2783
                 * be set to 0 and the provoking vertex is selected with the
2784
                 * PRIMITIVE.first_provoking_vertex field.
2785
                 */
2786
                if (info->mode == PIPE_PRIM_LINES ||
2787
                    info->mode == PIPE_PRIM_LINE_LOOP ||
2788
                    info->mode == PIPE_PRIM_LINE_STRIP) {
2789
                        /* The logic is inverted on bifrost. */
2790
                        cfg.flat_shading_vertex =
2791
                                pan_is_bifrost(device) ?
2792
                                rast->flatshade_first : !rast->flatshade_first;
2793
                }
2794

2795
                pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
2796

2797
                if (ctx->occlusion_query && ctx->active_queries) {
2798
                        if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
2799
                                cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
2800
                        else
2801
                                cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;
2802

2803
                        struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);
2804
                        cfg.occlusion = rsrc->image.data.bo->ptr.gpu;
2805
                        panfrost_batch_write_rsrc(ctx->batch, rsrc,
2806
                                              PIPE_SHADER_FRAGMENT);
2807
                }
2808
        }
2809

2810
        if (!pan_is_bifrost(device))
2811
                panfrost_emit_primitive_size(ctx, points, psiz, prim_size);
2812
        else
2813
                pan_section_pack(job, BIFROST_TILER_JOB, DRAW_PADDING, cfg);
2814
}
2815

2816
static void
2817
panfrost_direct_draw(struct panfrost_batch *batch,
2818
                     const struct pipe_draw_info *info,
2819
                     unsigned drawid_offset,
2820
                     const struct pipe_draw_start_count_bias *draw)
2821
{
2822
        if (!draw->count || !info->instance_count)
2823
                return;
2824

2825
        struct panfrost_context *ctx = batch->ctx;
2826
        struct panfrost_device *device = pan_device(ctx->base.screen);
2827

2828
        /* Fallback for unsupported modes */
2829
        if (!(ctx->draw_modes & BITFIELD_BIT(info->mode))) {
2830
                if (draw->count < 4) {
2831
                        /* Degenerate case? */
2832
                        return;
2833
                }
2834

2835
                util_primconvert_save_rasterizer_state(ctx->primconvert, &ctx->rasterizer->base);
2836
                util_primconvert_draw_vbo(ctx->primconvert, info, drawid_offset, NULL, draw, 1);
2837
                return;
2838
        }
2839

2840
        /* Take into account a negative bias */
2841
        ctx->indirect_draw = false;
2842
        ctx->vertex_count = draw->count + (info->index_size ? abs(draw->index_bias) : 0);
2843
        ctx->instance_count = info->instance_count;
2844
        ctx->base_vertex = info->index_size ? draw->index_bias : 0;
2845
        ctx->base_instance = info->start_instance;
2846
        ctx->active_prim = info->mode;
2847
        ctx->drawid = drawid_offset;
2848

2849
        struct panfrost_ptr tiler =
2850
                pan_is_bifrost(device) ?
2851
                pan_pool_alloc_desc(&batch->pool.base, BIFROST_TILER_JOB) :
2852
                pan_pool_alloc_desc(&batch->pool.base, MIDGARD_TILER_JOB);
2853
        struct panfrost_ptr vertex =
2854
                pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
2855

2856
        unsigned vertex_count = ctx->vertex_count;
2857

2858
        unsigned min_index = 0, max_index = 0;
2859
        mali_ptr indices = 0;
2860

2861
        if (info->index_size) {
2862
                indices = panfrost_get_index_buffer_bounded(batch, info, draw,
2863
                                                            &min_index,
2864
                                                            &max_index);
2865

2866
                /* Use the corresponding values */
2867
                vertex_count = max_index - min_index + 1;
2868
                ctx->offset_start = min_index + draw->index_bias;
2869
        } else {
2870
                ctx->offset_start = draw->start;
2871
        }
2872

2873
        if (info->instance_count > 1)
2874
                ctx->padded_count = panfrost_padded_vertex_count(vertex_count);
2875
        else
2876
                ctx->padded_count = vertex_count;
2877

2878
        panfrost_statistics_record(ctx, info, draw);
2879

2880
        struct mali_invocation_packed invocation;
2881
        if (info->instance_count > 1) {
2882
                panfrost_pack_work_groups_compute(&invocation,
2883
                                                  1, vertex_count, info->instance_count,
2884
                                                  1, 1, 1, true, false);
2885
        } else {
2886
                pan_pack(&invocation, INVOCATION, cfg) {
2887
                        cfg.invocations = MALI_POSITIVE(vertex_count);
2888
                        cfg.size_y_shift = 0;
2889
                        cfg.size_z_shift = 0;
2890
                        cfg.workgroups_x_shift = 0;
2891
                        cfg.workgroups_y_shift = 0;
2892
                        cfg.workgroups_z_shift = 32;
2893
                        cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
2894
                }
2895
        }
2896

2897
        /* Emit all sort of descriptors. */
2898
        mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0;
2899

2900
        panfrost_emit_varying_descriptor(batch,
2901
                                         ctx->padded_count *
2902
                                         ctx->instance_count,
2903
                                         &vs_vary, &fs_vary, &varyings,
2904
                                         NULL, &pos, &psiz,
2905
                                         info->mode == PIPE_PRIM_POINTS);
2906

2907
        mali_ptr attribs, attrib_bufs;
2908
        attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);
2909

2910
        panfrost_update_state_3d(batch);
2911
        panfrost_update_state_vs(batch);
2912
        panfrost_update_state_fs(batch);
2913
        panfrost_clean_state_3d(ctx);
2914

2915
        /* Fire off the draw itself */
2916
        panfrost_draw_emit_vertex(batch, info, &invocation,
2917
                                  vs_vary, varyings, attribs, attrib_bufs, vertex.cpu);
2918
        panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices,
2919
                                 fs_vary, varyings, pos, psiz, tiler.cpu);
2920
        panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
2921

2922
        /* Increment transform feedback offsets */
2923
        panfrost_update_streamout_offsets(ctx);
2924
}
2925

2926
static void
2927
panfrost_indirect_draw(struct panfrost_batch *batch,
2928
                       const struct pipe_draw_info *info,
2929
                       unsigned drawid_offset,
2930
                       const struct pipe_draw_indirect_info *indirect,
2931
                       const struct pipe_draw_start_count_bias *draw)
2932
{
2933
        /* Indirect draw count and multi-draw not supported. */
2934
        assert(indirect->draw_count == 1 && !indirect->indirect_draw_count);
2935

2936
        struct panfrost_context *ctx = batch->ctx;
2937
        struct panfrost_device *dev = pan_device(ctx->base.screen);
2938

2939
        /* TODO: update statistics (see panfrost_statistics_record()) */
2940
        /* TODO: Increment transform feedback offsets */
2941
        assert(ctx->streamout.num_targets == 0);
2942

2943
        assert(ctx->draw_modes & (1 << info->mode));
2944
        ctx->active_prim = info->mode;
2945
        ctx->drawid = drawid_offset;
2946
        ctx->indirect_draw = true;
2947

2948
        struct panfrost_ptr tiler =
2949
                pan_pool_alloc_aligned(&batch->pool.base,
2950
                                       pan_is_bifrost(dev) ?
2951
                                       MALI_BIFROST_TILER_JOB_LENGTH :
2952
                                       MALI_MIDGARD_TILER_JOB_LENGTH,
2953
                                       64);
2954
        struct panfrost_ptr vertex =
2955
                pan_pool_alloc_aligned(&batch->pool.base,
2956
                                       MALI_COMPUTE_JOB_LENGTH,
2957
                                       64);
2958

2959
        struct panfrost_shader_state *vs =
2960
                panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2961

2962
        struct panfrost_bo *index_buf = NULL;
2963

2964
        if (info->index_size) {
2965
                assert(!info->has_user_indices);
2966
                struct panfrost_resource *rsrc = pan_resource(info->index.resource);
2967
                index_buf = rsrc->image.data.bo;
2968
                panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
2969
        }
2970

2971
        mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0;
2972
        unsigned varying_buf_count;
2973

2974
        /* We want to create templates, set all count fields to 0 to reflect
2975
         * that.
2976
         */
2977
        ctx->instance_count = ctx->vertex_count = ctx->padded_count = 0;
2978
        ctx->offset_start = 0;
2979

2980
        /* Set the {first,base}_vertex sysvals to NULL. Will be updated if the
2981
         * vertex shader uses gl_VertexID or gl_BaseVertex.
2982
         */
2983
        ctx->first_vertex_sysval_ptr = 0;
2984
        ctx->base_vertex_sysval_ptr = 0;
2985
        ctx->base_instance_sysval_ptr = 0;
2986

2987
        panfrost_update_state_3d(batch);
2988
        panfrost_update_state_vs(batch);
2989
        panfrost_update_state_fs(batch);
2990
        panfrost_clean_state_3d(ctx);
2991

2992
        bool point_coord_replace = (info->mode == PIPE_PRIM_POINTS);
2993

2994
        panfrost_emit_varying_descriptor(batch, 0,
2995
                                         &vs_vary, &fs_vary, &varyings,
2996
                                         &varying_buf_count, &pos, &psiz,
2997
                                         point_coord_replace);
2998

2999
        mali_ptr attribs, attrib_bufs;
3000
        attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);
3001

3002
        /* Zero-ed invocation, the compute job will update it. */
3003
        static struct mali_invocation_packed invocation;
3004

3005
        /* Fire off the draw itself */
3006
        panfrost_draw_emit_vertex(batch, info, &invocation, vs_vary, varyings,
3007
                                  attribs, attrib_bufs, vertex.cpu);
3008
        panfrost_draw_emit_tiler(batch, info, draw, &invocation,
3009
                                 index_buf ? index_buf->ptr.gpu : 0,
3010
                                 fs_vary, varyings, pos, psiz, tiler.cpu);
3011

3012
        /* Add the varying heap BO to the batch if we're allocating varyings. */
3013
        if (varyings) {
3014
                panfrost_batch_add_bo(batch,
3015
                                      dev->indirect_draw_shaders.varying_heap,
3016
                                      PIPE_SHADER_VERTEX);
3017
        }
3018

3019
        assert(indirect->buffer);
3020

3021
        struct panfrost_resource *draw_buf = pan_resource(indirect->buffer);
3022

3023
        /* Don't count images: those attributes don't need to be patched. */
3024
        unsigned attrib_count =
3025
                vs->info.attribute_count -
3026
                util_bitcount(ctx->image_mask[PIPE_SHADER_VERTEX]);
3027

3028
        panfrost_batch_read_rsrc(batch, draw_buf, PIPE_SHADER_VERTEX);
3029

3030
        struct pan_indirect_draw_info draw_info = {
3031
                .last_indirect_draw = batch->indirect_draw_job_id,
3032
                .draw_buf = draw_buf->image.data.bo->ptr.gpu + indirect->offset,
3033
                .index_buf = index_buf ? index_buf->ptr.gpu : 0,
3034
                .first_vertex_sysval = ctx->first_vertex_sysval_ptr,
3035
                .base_vertex_sysval = ctx->base_vertex_sysval_ptr,
3036
                .base_instance_sysval = ctx->base_instance_sysval_ptr,
3037
                .vertex_job = vertex.gpu,
3038
                .tiler_job = tiler.gpu,
3039
                .attrib_bufs = attrib_bufs,
3040
                .attribs = attribs,
3041
                .attrib_count = attrib_count,
3042
                .varying_bufs = varyings,
3043
                .index_size = info->index_size,
3044
        };
3045

3046
        if (panfrost_writes_point_size(ctx))
3047
                draw_info.flags |= PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE;
3048

3049
        if (vs->info.vs.writes_point_size)
3050
                draw_info.flags |= PAN_INDIRECT_DRAW_HAS_PSIZ;
3051

3052

3053
        if (info->primitive_restart) {
3054
                draw_info.restart_index = info->restart_index;
3055
                draw_info.flags |= PAN_INDIRECT_DRAW_PRIMITIVE_RESTART;
3056
        }
3057

3058
        batch->indirect_draw_job_id =
3059
                panfrost_emit_indirect_draw(&batch->pool.base,
3060
                                            &batch->scoreboard,
3061
                                            &draw_info,
3062
                                            &batch->indirect_draw_ctx);
3063

3064
        panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
3065
}
3066

3067
static void
3068
panfrost_draw_vbo(struct pipe_context *pipe,
3069
                  const struct pipe_draw_info *info,
3070
                  unsigned drawid_offset,
3071
                  const struct pipe_draw_indirect_info *indirect,
3072
                  const struct pipe_draw_start_count_bias *draws,
3073
                  unsigned num_draws)
3074
{
3075
        struct panfrost_context *ctx = pan_context(pipe);
3076
        struct panfrost_device *dev = pan_device(pipe->screen);
3077

3078
        if (!panfrost_render_condition_check(ctx))
3079
                return;
3080

3081
        /* Emulate indirect draws when debugging */
3082
        if (dev->debug & PAN_DBG_NOINDIRECT && indirect && indirect->buffer) {
3083
                assert(num_draws == 1);
3084
                util_draw_indirect(pipe, info, indirect);
3085
                return;
3086
        }
3087

3088
        /* Do some common setup */
3089
        struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3090

3091
        /* Don't add too many jobs to a single batch. Hardware has a hard limit
3092
         * of 65536 jobs, but we choose a smaller soft limit (arbitrary) to
3093
         * avoid the risk of timeouts. This might not be a good idea. */
3094
        if (unlikely(batch->scoreboard.job_index > 10000))
3095
                batch = panfrost_get_fresh_batch_for_fbo(ctx);
3096

3097
        unsigned zs_draws = ctx->depth_stencil->draws;
3098
        batch->draws |= zs_draws;
3099
        batch->resolve |= zs_draws;
3100

3101
        /* Mark everything dirty when debugging */
3102
        if (unlikely(dev->debug & PAN_DBG_DIRTY))
3103
                panfrost_dirty_state_all(ctx);
3104

3105
        /* Conservatively assume draw parameters always change */
3106
        ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID;
3107

3108
        if (indirect) {
3109
                assert(num_draws == 1);
3110

3111
                if (indirect->count_from_stream_output) {
3112
                        struct pipe_draw_start_count_bias tmp_draw = *draws;
3113
                        struct panfrost_streamout_target *so =
3114
                                pan_so_target(indirect->count_from_stream_output);
3115

3116
                        tmp_draw.start = 0;
3117
                        tmp_draw.count = so->offset;
3118
                        tmp_draw.index_bias = 0;
3119
                        panfrost_direct_draw(batch, info, drawid_offset, &tmp_draw);
3120
                        return;
3121
                }
3122

3123
                panfrost_indirect_draw(batch, info, drawid_offset, indirect, &draws[0]);
3124
                return;
3125
        }
3126

3127
        struct pipe_draw_info tmp_info = *info;
3128
        unsigned drawid = drawid_offset;
3129

3130
        for (unsigned i = 0; i < num_draws; i++) {
3131
                panfrost_direct_draw(batch, &tmp_info, drawid, &draws[i]);
3132

3133
                if (tmp_info.increment_draw_id) {
3134
                        ctx->dirty |= PAN_DIRTY_DRAWID;
3135
                        drawid++;
3136
                }
3137
        }
3138

3139
}
3140

3141
/* Launch grid is the compute equivalent of draw_vbo, so in this routine, we
3142
 * construct the COMPUTE job and some of its payload.
3143
 */
3144

3145
static void
3146
panfrost_launch_grid(struct pipe_context *pipe,
3147
                const struct pipe_grid_info *info)
3148
{
3149
        struct panfrost_context *ctx = pan_context(pipe);
3150
        struct panfrost_device *dev = pan_device(pipe->screen);
3151

3152
        /* XXX - shouldn't be necessary with working memory barriers. Affected
3153
         * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */
3154
        panfrost_flush_all_batches(ctx);
3155

3156
        struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3157

3158
        struct panfrost_shader_state *cs =
3159
                &ctx->shader[PIPE_SHADER_COMPUTE]->variants[0];
3160

3161
        /* Indirect dispatch can't handle workgroup local storage since that
3162
         * would require dynamic memory allocation. Bail in this case. */
3163
        if (info->indirect && !cs->info.wls_size) {
3164
                struct pipe_transfer *transfer;
3165
                uint32_t *params = pipe_buffer_map_range(pipe, info->indirect,
3166
                                info->indirect_offset,
3167
                                3 * sizeof(uint32_t),
3168
                                PIPE_MAP_READ,
3169
                                &transfer);
3170

3171
                struct pipe_grid_info direct = *info;
3172
                direct.indirect = NULL;
3173
                direct.grid[0] = params[0];
3174
                direct.grid[1] = params[1];
3175
                direct.grid[2] = params[2];
3176
                pipe_buffer_unmap(pipe, transfer);
3177

3178
                if (params[0] && params[1] && params[2])
3179
                        panfrost_launch_grid(pipe, &direct);
3180

3181
                return;
3182
        }
3183

3184
        ctx->compute_grid = info;
3185

3186
        struct panfrost_ptr t =
3187
                pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
3188

3189
        /* We implement OpenCL inputs as uniforms (or a UBO -- same thing), so
3190
         * reuse the graphics path for this by lowering to Gallium */
3191

3192
        struct pipe_constant_buffer ubuf = {
3193
                .buffer = NULL,
3194
                .buffer_offset = 0,
3195
                .buffer_size = ctx->shader[PIPE_SHADER_COMPUTE]->cbase.req_input_mem,
3196
                .user_buffer = info->input
3197
        };
3198

3199
        if (info->input)
3200
                pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, false, &ubuf);
3201

3202
        /* Invoke according to the grid info */
3203

3204
        void *invocation =
3205
                pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION);
3206
        unsigned num_wg[3] = { info->grid[0], info->grid[1], info->grid[2] };
3207

3208
        if (info->indirect)
3209
                num_wg[0] = num_wg[1] = num_wg[2] = 1;
3210

3211
        panfrost_pack_work_groups_compute(invocation,
3212
                                          num_wg[0], num_wg[1], num_wg[2],
3213
                                          info->block[0], info->block[1],
3214
                                          info->block[2],
3215
                                          false, info->indirect != NULL);
3216

3217
        pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
3218
                cfg.job_task_split =
3219
                        util_logbase2_ceil(info->block[0] + 1) +
3220
                        util_logbase2_ceil(info->block[1] + 1) +
3221
                        util_logbase2_ceil(info->block[2] + 1);
3222
        }
3223

3224
        pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
3225
                cfg.draw_descriptor_is_64b = true;
3226
                if (!pan_is_bifrost(dev))
3227
                        cfg.texture_descriptor_is_64b = true;
3228
                cfg.state = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_COMPUTE);
3229
                cfg.attributes = panfrost_emit_image_attribs(batch, &cfg.attribute_buffers, PIPE_SHADER_COMPUTE);
3230
                cfg.thread_storage = panfrost_emit_shared_memory(batch, info);
3231
                cfg.uniform_buffers = panfrost_emit_const_buf(batch,
3232
                                PIPE_SHADER_COMPUTE, &cfg.push_uniforms);
3233
                cfg.textures = panfrost_emit_texture_descriptors(batch,
3234
                                PIPE_SHADER_COMPUTE);
3235
                cfg.samplers = panfrost_emit_sampler_descriptors(batch,
3236
                                PIPE_SHADER_COMPUTE);
3237
        }
3238

3239
        pan_section_pack(t.cpu, COMPUTE_JOB, DRAW_PADDING, cfg);
3240

3241
        unsigned indirect_dep = 0;
3242
        if (info->indirect) {
3243
                struct pan_indirect_dispatch_info indirect = {
3244
                        .job = t.gpu,
3245
                        .indirect_dim = pan_resource(info->indirect)->image.data.bo->ptr.gpu +
3246
                                        info->indirect_offset,
3247
                        .num_wg_sysval = {
3248
                                batch->num_wg_sysval[0],
3249
                                batch->num_wg_sysval[1],
3250
                                batch->num_wg_sysval[2],
3251
                        },
3252
                };
3253

3254
                indirect_dep = pan_indirect_dispatch_emit(&batch->pool.base,
3255
                                                          &batch->scoreboard,
3256
                                                          &indirect);
3257
        }
3258

3259
        panfrost_add_job(&batch->pool.base, &batch->scoreboard,
3260
                         MALI_JOB_TYPE_COMPUTE, true, false,
3261
                         indirect_dep, 0, &t, false);
3262
        panfrost_flush_all_batches(ctx);
3263
}
3264

3265
static void *
3266
panfrost_create_rasterizer_state(
3267
        struct pipe_context *pctx,
3268
        const struct pipe_rasterizer_state *cso)
3269
{
3270
        struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer);
3271

3272
        so->base = *cso;
3273

3274
        /* Gauranteed with the core GL call, so don't expose ARB_polygon_offset */
3275
        assert(cso->offset_clamp == 0.0);
3276

3277
        pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) {
3278
                cfg.multisample_enable = cso->multisample;
3279
                cfg.fixed_function_near_discard = cso->depth_clip_near;
3280
                cfg.fixed_function_far_discard = cso->depth_clip_far;
3281
                cfg.shader_depth_range_fixed = true;
3282
        }
3283

3284
        pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) {
3285
                cfg.depth_range_1 = cso->offset_tri;
3286
                cfg.depth_range_2 = cso->offset_tri;
3287
                cfg.single_sampled_lines = !cso->multisample;
3288
        }
3289

3290
        return so;
3291
}
3292

3293
/* Assigns a vertex buffer for a given (index, divisor) tuple */
3294

3295
static unsigned
3296
pan_assign_vertex_buffer(struct pan_vertex_buffer *buffers,
3297
                         unsigned *nr_bufs,
3298
                         unsigned vbi,
3299
                         unsigned divisor)
3300
{
3301
        /* Look up the buffer */
3302
        for (unsigned i = 0; i < (*nr_bufs); ++i) {
3303
                if (buffers[i].vbi == vbi && buffers[i].divisor == divisor)
3304
                        return i;
3305
        }
3306

3307
        /* Else, create a new buffer */
3308
        unsigned idx = (*nr_bufs)++;
3309

3310
        buffers[idx] = (struct pan_vertex_buffer) {
3311
                .vbi = vbi,
3312
                .divisor = divisor
3313
        };
3314

3315
        return idx;
3316
}
3317

3318
static void *
3319
panfrost_create_vertex_elements_state(
3320
        struct pipe_context *pctx,
3321
        unsigned num_elements,
3322
        const struct pipe_vertex_element *elements)
3323
{
3324
        struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state);
3325
        struct panfrost_device *dev = pan_device(pctx->screen);
3326

3327
        so->num_elements = num_elements;
3328
        memcpy(so->pipe, elements, sizeof(*elements) * num_elements);
3329

3330
        /* Assign attribute buffers corresponding to the vertex buffers, keyed
3331
         * for a particular divisor since that's how instancing works on Mali */
3332
        for (unsigned i = 0; i < num_elements; ++i) {
3333
                so->element_buffer[i] = pan_assign_vertex_buffer(
3334
                                so->buffers, &so->nr_bufs,
3335
                                elements[i].vertex_buffer_index,
3336
                                elements[i].instance_divisor);
3337
        }
3338

3339
        for (int i = 0; i < num_elements; ++i) {
3340
                enum pipe_format fmt = elements[i].src_format;
3341
                const struct util_format_description *desc = util_format_description(fmt);
3342
                so->formats[i] = dev->formats[desc->format].hw;
3343
                assert(so->formats[i]);
3344
        }
3345

3346
        /* Let's also prepare vertex builtins */
3347
        so->formats[PAN_VERTEX_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;
3348
        so->formats[PAN_INSTANCE_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;
3349

3350
        return so;
3351
}
3352

3353
static inline unsigned
3354
pan_pipe_to_stencil_op(enum pipe_stencil_op in)
3355
{
3356
        switch (in) {
3357
        case PIPE_STENCIL_OP_KEEP: return MALI_STENCIL_OP_KEEP;
3358
        case PIPE_STENCIL_OP_ZERO: return MALI_STENCIL_OP_ZERO;
3359
        case PIPE_STENCIL_OP_REPLACE: return MALI_STENCIL_OP_REPLACE;
3360
        case PIPE_STENCIL_OP_INCR: return MALI_STENCIL_OP_INCR_SAT;
3361
        case PIPE_STENCIL_OP_DECR: return MALI_STENCIL_OP_DECR_SAT;
3362
        case PIPE_STENCIL_OP_INCR_WRAP: return MALI_STENCIL_OP_INCR_WRAP;
3363
        case PIPE_STENCIL_OP_DECR_WRAP: return MALI_STENCIL_OP_DECR_WRAP;
3364
        case PIPE_STENCIL_OP_INVERT: return MALI_STENCIL_OP_INVERT;
3365
        default: unreachable("Invalid stencil op");
3366
        }
3367
}
3368

3369
static inline void
3370
pan_pipe_to_stencil(const struct pipe_stencil_state *in,
3371
                    struct mali_stencil_packed *out)
3372
{
3373
        pan_pack(out, STENCIL, s) {
3374
                s.mask = in->valuemask;
3375
                s.compare_function = (enum mali_func) in->func;
3376
                s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op);
3377
                s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op);
3378
                s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op);
3379
        }
3380
}
3381

3382
static void *
3383
panfrost_create_depth_stencil_state(struct pipe_context *pipe,
3384
                                    const struct pipe_depth_stencil_alpha_state *zsa)
3385
{
3386
        struct panfrost_device *dev = pan_device(pipe->screen);
3387
        struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state);
3388
        so->base = *zsa;
3389

3390
        /* Normalize (there's no separate enable) */
3391
        if (!zsa->alpha_enabled)
3392
                so->base.alpha_func = MALI_FUNC_ALWAYS;
3393

3394
        /* Prepack relevant parts of the Renderer State Descriptor. They will
3395
         * be ORed in at draw-time */
3396
        pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) {
3397
                cfg.depth_function = zsa->depth_enabled ?
3398
                        (enum mali_func) zsa->depth_func : MALI_FUNC_ALWAYS;
3399

3400
                cfg.depth_write_mask = zsa->depth_writemask;
3401
        }
3402

3403
        pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) {
3404
                cfg.stencil_enable = zsa->stencil[0].enabled;
3405

3406
                cfg.stencil_mask_front = zsa->stencil[0].writemask;
3407
                cfg.stencil_mask_back = zsa->stencil[1].enabled ?
3408
                        zsa->stencil[1].writemask : zsa->stencil[0].writemask;
3409

3410
                if (dev->arch < 6) {
3411
                        cfg.alpha_test_compare_function =
3412
                                (enum mali_func) so->base.alpha_func;
3413
                }
3414
        }
3415

3416
        /* Stencil tests have their own words in the RSD */
3417
        pan_pipe_to_stencil(&zsa->stencil[0], &so->stencil_front);
3418

3419
        if (zsa->stencil[1].enabled)
3420
                pan_pipe_to_stencil(&zsa->stencil[1], &so->stencil_back);
3421
	else
3422
                so->stencil_back = so->stencil_front;
3423

3424
        so->enabled = zsa->stencil[0].enabled ||
3425
                (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS);
3426

3427
        /* Write masks need tracking together */
3428
        if (zsa->depth_writemask)
3429
                so->draws |= PIPE_CLEAR_DEPTH;
3430

3431
        if (zsa->stencil[0].enabled)
3432
                so->draws |= PIPE_CLEAR_STENCIL;
3433

3434
        /* TODO: Bounds test should be easy */
3435
        assert(!zsa->depth_bounds_test);
3436

3437
        return so;
3438
}
3439

3440
void
3441
panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so,
3442
                                struct pipe_context *pctx,
3443
                                struct pipe_resource *texture)
3444
{
3445
        struct panfrost_device *device = pan_device(pctx->screen);
3446
        struct panfrost_context *ctx = pan_context(pctx);
3447
        struct panfrost_resource *prsrc = (struct panfrost_resource *)texture;
3448
        enum pipe_format format = so->base.format;
3449
        assert(prsrc->image.data.bo);
3450

3451
        /* Format to access the stencil portion of a Z32_S8 texture */
3452
        if (format == PIPE_FORMAT_X32_S8X24_UINT) {
3453
                assert(prsrc->separate_stencil);
3454
                texture = &prsrc->separate_stencil->base;
3455
                prsrc = (struct panfrost_resource *)texture;
3456
                format = texture->format;
3457
        }
3458

3459
        const struct util_format_description *desc = util_format_description(format);
3460

3461
        bool fake_rgtc = !panfrost_supports_compressed_format(device, MALI_BC4_UNORM);
3462

3463
        if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC && fake_rgtc) {
3464
                if (desc->is_snorm)
3465
                        format = PIPE_FORMAT_R8G8B8A8_SNORM;
3466
                else
3467
                        format = PIPE_FORMAT_R8G8B8A8_UNORM;
3468
                desc = util_format_description(format);
3469
        }
3470

3471
        so->texture_bo = prsrc->image.data.bo->ptr.gpu;
3472
        so->modifier = prsrc->image.layout.modifier;
3473

3474
        /* MSAA only supported for 2D textures */
3475

3476
        assert(texture->nr_samples <= 1 ||
3477
               so->base.target == PIPE_TEXTURE_2D ||
3478
               so->base.target == PIPE_TEXTURE_2D_ARRAY);
3479

3480
        enum mali_texture_dimension type =
3481
                panfrost_translate_texture_dimension(so->base.target);
3482

3483
        bool is_buffer = (so->base.target == PIPE_BUFFER);
3484

3485
        unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level;
3486
        unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level;
3487
        unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer;
3488
        unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer;
3489
        unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0;
3490
        unsigned buf_size = (is_buffer ? so->base.u.buf.size : 0) /
3491
                            util_format_get_blocksize(format);
3492

3493
        if (so->base.target == PIPE_TEXTURE_3D) {
3494
                first_layer /= prsrc->image.layout.depth;
3495
                last_layer /= prsrc->image.layout.depth;
3496
                assert(!first_layer && !last_layer);
3497
        }
3498

3499
        struct pan_image_view iview = {
3500
                .format = format,
3501
                .dim = type,
3502
                .first_level = first_level,
3503
                .last_level = last_level,
3504
                .first_layer = first_layer,
3505
                .last_layer = last_layer,
3506
                .swizzle = {
3507
                        so->base.swizzle_r,
3508
                        so->base.swizzle_g,
3509
                        so->base.swizzle_b,
3510
                        so->base.swizzle_a,
3511
                },
3512
                .image = &prsrc->image,
3513

3514
                .buf.offset = buf_offset,
3515
                .buf.size = buf_size,
3516
        };
3517

3518
        unsigned size =
3519
                (pan_is_bifrost(device) ? 0 : MALI_MIDGARD_TEXTURE_LENGTH) +
3520
                panfrost_estimate_texture_payload_size(device, &iview);
3521

3522
        struct panfrost_ptr payload = pan_pool_alloc_aligned(&ctx->descs.base, size, 64);
3523
        so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu);
3524

3525
        void *tex = pan_is_bifrost(device) ?
3526
                    &so->bifrost_descriptor : payload.cpu;
3527

3528
        if (!pan_is_bifrost(device)) {
3529
                payload.cpu += MALI_MIDGARD_TEXTURE_LENGTH;
3530
                payload.gpu += MALI_MIDGARD_TEXTURE_LENGTH;
3531
        }
3532

3533
        panfrost_new_texture(device, &iview, tex, &payload);
3534
}
3535

3536
static struct pipe_sampler_view *
3537
panfrost_create_sampler_view(
3538
        struct pipe_context *pctx,
3539
        struct pipe_resource *texture,
3540
        const struct pipe_sampler_view *template)
3541
{
3542
        struct panfrost_sampler_view *so = rzalloc(pctx, struct panfrost_sampler_view);
3543

3544
        pipe_reference(NULL, &texture->reference);
3545

3546
        so->base = *template;
3547
        so->base.texture = texture;
3548
        so->base.reference.count = 1;
3549
        so->base.context = pctx;
3550

3551
        panfrost_create_sampler_view_bo(so, pctx, texture);
3552

3553
        return (struct pipe_sampler_view *) so;
3554
}
3555

3556
/* A given Gallium blend state can be encoded to the hardware in numerous,
 * dramatically divergent ways due to the interactions of blending with
 * framebuffer formats. Conceptually, there are two modes:
 *
 * - Fixed-function blending (for suitable framebuffer formats, suitable blend
 *   state, and suitable blend constant)
 *
 * - Blend shaders (for everything else)
 *
 * A given Gallium blend configuration will compile to exactly one
 * fixed-function blend state, if it compiles to any, although the constant
 * will vary across runs as that is tracked outside of the Gallium CSO.
 *
 * However, that same blend configuration will compile to many different blend
 * shaders, depending on the framebuffer formats active. The rationale is that
 * blend shaders override not just fixed-function blending but also
 * fixed-function format conversion, so blend shaders are keyed to a particular
 * framebuffer format. As an example, the tilebuffer format is identical for
 * RG16F and RG16UI -- both are simply 32-bit raw pixels -- so both require
 * blend shaders.
 *
 * All of this state is encapsulated in the panfrost_blend_state struct
 * (our subclass of pipe_blend_state).
 */
3580

3581
/* Create a blend CSO. Essentially, try to compile a fixed-function
3582
 * expression and initialize blend shaders */
3583

3584
static void *
3585
panfrost_create_blend_state(struct pipe_context *pipe,
3586
                            const struct pipe_blend_state *blend)
3587
{
3588
        struct panfrost_device *dev = pan_device(pipe->screen);
3589
        struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state);
3590
        so->base = *blend;
3591

3592
        so->pan.logicop_enable = blend->logicop_enable;
3593
        so->pan.logicop_func = blend->logicop_func;
3594
        so->pan.rt_count = blend->max_rt + 1;
3595

3596
        for (unsigned c = 0; c < so->pan.rt_count; ++c) {
3597
                unsigned g = blend->independent_blend_enable ? c : 0;
3598
                const struct pipe_rt_blend_state pipe = blend->rt[g];
3599
                struct pan_blend_equation equation = {0};
3600

3601
                equation.color_mask = pipe.colormask;
3602
                equation.blend_enable = pipe.blend_enable;
3603

3604
                if (pipe.blend_enable) {
3605
                        equation.rgb_func = util_blend_func_to_shader(pipe.rgb_func);
3606
                        equation.rgb_src_factor = util_blend_factor_to_shader(pipe.rgb_src_factor);
3607
                        equation.rgb_invert_src_factor = util_blend_factor_is_inverted(pipe.rgb_src_factor);
3608
                        equation.rgb_dst_factor = util_blend_factor_to_shader(pipe.rgb_dst_factor);
3609
                        equation.rgb_invert_dst_factor = util_blend_factor_is_inverted(pipe.rgb_dst_factor);
3610
                        equation.alpha_func = util_blend_func_to_shader(pipe.alpha_func);
3611
                        equation.alpha_src_factor = util_blend_factor_to_shader(pipe.alpha_src_factor);
3612
                        equation.alpha_invert_src_factor = util_blend_factor_is_inverted(pipe.alpha_src_factor);
3613
                        equation.alpha_dst_factor = util_blend_factor_to_shader(pipe.alpha_dst_factor);
3614
                        equation.alpha_invert_dst_factor = util_blend_factor_is_inverted(pipe.alpha_dst_factor);
3615
                }
3616

3617
                /* Determine some common properties */
3618
                unsigned constant_mask = pan_blend_constant_mask(equation);
3619
                so->info[c] = (struct pan_blend_info) {
3620
                        .no_colour = (equation.color_mask == 0),
3621
                        .opaque = pan_blend_is_opaque(equation),
3622
                        .constant_mask = constant_mask,
3623

3624
                        /* TODO: check the dest for the logicop */
3625
                        .load_dest = blend->logicop_enable ||
3626
                                pan_blend_reads_dest(equation),
3627

3628
                        /* Could this possibly be fixed-function? */
3629
                        .fixed_function = !blend->logicop_enable &&
3630
                                pan_blend_can_fixed_function(equation) &&
3631
                                (!constant_mask ||
3632
                                 pan_blend_supports_constant(dev->arch, c))
3633
                };
3634

3635
                so->pan.rts[c].equation = equation;
3636

3637
                /* Bifrost needs to know if any render target loads its
3638
                 * destination in the hot draw path, so precompute this */
3639
                if (so->info[c].load_dest)
3640
                        so->load_dest_mask |= BITFIELD_BIT(c);
3641

3642
                /* Converting equations to Mali style is expensive, do it at
3643
                 * CSO create time instead of draw-time */
3644
                if (so->info[c].fixed_function) {
3645
                        pan_pack(&so->equation[c], BLEND_EQUATION, cfg)
3646
                                pan_blend_to_fixed_function_equation(equation, &cfg);
3647
                }
3648
        }
3649

3650
        return so;
3651
}
3652

3653
static void
3654
prepare_rsd(struct panfrost_device *dev,
3655
            struct panfrost_shader_state *state,
3656
            struct panfrost_pool *pool, bool upload)
3657
{
3658
        struct mali_renderer_state_packed *out = &state->partial_rsd;
3659

3660
        if (upload) {
3661
                struct panfrost_ptr ptr =
3662
                        pan_pool_alloc_desc(&pool->base, RENDERER_STATE);
3663

3664
                state->state = panfrost_pool_take_ref(pool, ptr.gpu);
3665
                out = ptr.cpu;
3666
        }
3667

3668
        pan_pack(out, RENDERER_STATE, cfg) {
3669
                pan_shader_prepare_rsd(dev, &state->info, state->bin.gpu,
3670
                                       &cfg);
3671
        }
3672
}
3673

3674
/* pipe_context::get_sample_position hook: look up the sample pattern used
 * for `sample_count` samples and write the position of sample
 * `sample_index` to `out_value`. The context itself is not consulted. */
static void
panfrost_get_sample_position(struct pipe_context *context,
                             unsigned sample_count,
                             unsigned sample_index,
                             float *out_value)
{
        panfrost_query_sample_position(panfrost_sample_pattern(sample_count),
                                       sample_index, out_value);
}
3685

3686
/* Screen vtable hook: release the blitter state owned by the screen's
 * device. */
static void
screen_destroy(struct pipe_screen *pscreen)
{
        pan_blitter_cleanup(pan_device(pscreen));
}
3692

3693
static void
3694
preload(struct panfrost_batch *batch, struct pan_fb_info *fb)
3695
{
3696
        struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
3697

3698
        pan_preload_fb(&batch->pool.base, &batch->scoreboard, fb, batch->tls.gpu,
3699
                       pan_is_bifrost(dev) ? batch->tiler_ctx.bifrost : 0);
3700
}
3701

3702
void
3703
panfrost_cmdstream_screen_init(struct panfrost_screen *screen)
3704
{
3705
        struct panfrost_device *dev = &screen->dev;
3706

3707
        screen->vtbl.prepare_rsd = prepare_rsd;
3708
        screen->vtbl.emit_tls    = emit_tls;
3709
        screen->vtbl.emit_fbd    = emit_fbd;
3710
        screen->vtbl.emit_fragment_job = emit_fragment_job;
3711
        screen->vtbl.screen_destroy = screen_destroy;
3712
        screen->vtbl.preload     = preload;
3713

3714
        pan_blitter_init(dev, &screen->blitter.bin_pool.base,
3715
                         &screen->blitter.desc_pool.base);
3716
}
3717

3718
void
3719
panfrost_cmdstream_context_init(struct pipe_context *pipe)
3720
{
3721
        pipe->draw_vbo           = panfrost_draw_vbo;
3722
        pipe->launch_grid        = panfrost_launch_grid;
3723

3724
        pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state;
3725
        pipe->create_rasterizer_state = panfrost_create_rasterizer_state;
3726
        pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state;
3727
        pipe->create_sampler_view = panfrost_create_sampler_view;
3728
        pipe->create_sampler_state = panfrost_create_sampler_state;
3729
        pipe->create_blend_state = panfrost_create_blend_state;
3730

3731
        pipe->get_sample_position = panfrost_get_sample_position;
3732
}
3733

3734
Product

Resources

Company