/*
 * Copyright 2011 Adam Rak <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <[email protected]>
 */

#ifdef HAVE_OPENCL
#include <gelf.h>
#include <libelf.h>
#endif
#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "tgsi/tgsi_parse.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#include <inttypes.h>

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
CONST0 binds the smaller input parameter buffer, and is used for constant
indexing; it is also constant cached
VTX0 is for indirect/non-constant indexing, or if the input is bigger than
the constant cache can handle

RATs are limited to 12, so we can only bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too => 10 image bindings for writing at most.

from Nvidia OpenCL:
CL_DEVICE_MAX_READ_IMAGE_ARGS:  128
CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first < 10, so their id corresponds to RAT(id+1)
writable images will consume TEX slots, and VTX slots too, because of linear indexing

*/
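
/* Illustrative sketch only, not driver code: one way to express the RAT/TEX
 * numbering described above. The helper name and the MAX_WRITABLE_IMAGES
 * constant are hypothetical, introduced just to make the comment concrete;
 * RAT0 stays reserved for the global binding.
 */
#if 0
#define MAX_WRITABLE_IMAGES 10 /* 12 RATs - RAT0 (global) - 1 reserved for byte addressing */

static unsigned writable_image_to_rat(unsigned image_id)
{
	assert(image_id < MAX_WRITABLE_IMAGES);
	return image_id + 1; /* writable image i is written through RAT(i+1) */
}
#endif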

#ifdef HAVE_OPENCL
static void radeon_shader_binary_init(struct r600_shader_binary *b)
{
	memset(b, 0, sizeof(*b));
}

static void radeon_shader_binary_clean(struct r600_shader_binary *b)
{
	if (!b)
		return;
	FREE(b->code);
	FREE(b->config);
	FREE(b->rodata);
	FREE(b->global_symbol_offsets);
	FREE(b->relocs);
	FREE(b->disasm_string);
}
#endif

struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
						     unsigned size)
{
	struct pipe_resource *buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create((struct pipe_screen*) screen,
				    0, PIPE_USAGE_IMMUTABLE, size);

	return (struct r600_resource *)buffer;
}


static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers. Drop the old buffer first. */
	pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop. cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}

static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
					   unsigned vb_index,
					   unsigned offset,
					   struct pipe_resource *buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer.resource = buffer;
	vb->is_user_buffer = false;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	r600_mark_atom_dirty(rctx, &state->atom);
}

static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
					     unsigned cb_index,
					     unsigned offset,
					     unsigned size,
					     struct pipe_resource *buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, false, &cb);
}

/* We need to define these R600 registers here, because we can't include
 * evergreend.h and r600d.h.
 */
#define R_028868_SQ_PGM_RESOURCES_VS 0x028868
#define R_028850_SQ_PGM_RESOURCES_PS 0x028850

#ifdef HAVE_OPENCL
static void parse_symbol_table(Elf_Data *symbol_table_data,
			       const GElf_Shdr *symbol_table_header,
			       struct r600_shader_binary *binary)
{
	GElf_Sym symbol;
	unsigned i = 0;
	unsigned symbol_count =
		symbol_table_header->sh_size / symbol_table_header->sh_entsize;

	/* We are over allocating this list, because symbol_count gives the
	 * total number of symbols, and we will only be filling the list
	 * with offsets of global symbols. The memory savings from
	 * allocating the correct size of this list will be small, and
	 * I don't think it is worth the cost of pre-computing the number
	 * of global symbols.
	 */
	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));

	while (gelf_getsym(symbol_table_data, i++, &symbol)) {
		unsigned i;
		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
		    symbol.st_shndx == 0 /* Undefined symbol */) {
			continue;
		}

		binary->global_symbol_offsets[binary->global_symbol_count] =
			symbol.st_value;

		/* Sort the list using bubble sort. This list will usually
		 * be small. */
		for (i = binary->global_symbol_count; i > 0; --i) {
			uint64_t lhs = binary->global_symbol_offsets[i - 1];
			uint64_t rhs = binary->global_symbol_offsets[i];
			if (lhs < rhs) {
				break;
			}
			binary->global_symbol_offsets[i] = lhs;
			binary->global_symbol_offsets[i - 1] = rhs;
		}
		++binary->global_symbol_count;
	}
}
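
/* Standalone sketch (hypothetical helper, not part of the driver) of the
 * insertion step performed above: the new offset is appended and then
 * bubbled toward the front so the offset array stays in ascending order.
 */
#if 0
static void insert_sorted_offset(uint64_t *offsets, unsigned count, uint64_t value)
{
	unsigned i;

	offsets[count] = value;
	for (i = count; i > 0 && offsets[i - 1] >= offsets[i]; --i) {
		uint64_t tmp = offsets[i - 1];
		offsets[i - 1] = offsets[i];
		offsets[i] = tmp;
	}
}
#endif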


static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
			 unsigned symbol_sh_link,
			 struct r600_shader_binary *binary)
{
	unsigned i;

	if (!relocs || !symbols || !binary->reloc_count) {
		return;
	}
	binary->relocs = CALLOC(binary->reloc_count,
				sizeof(struct r600_shader_reloc));
	for (i = 0; i < binary->reloc_count; i++) {
		GElf_Sym symbol;
		GElf_Rel rel;
		char *symbol_name;
		struct r600_shader_reloc *reloc = &binary->relocs[i];

		gelf_getrel(relocs, i, &rel);
		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
		symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);

		reloc->offset = rel.r_offset;
		strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
		reloc->name[sizeof(reloc->name)-1] = 0;
	}
}

static void r600_elf_read(const char *elf_data, unsigned elf_size,
			  struct r600_shader_binary *binary)
{
	char *elf_buffer;
	Elf *elf;
	Elf_Scn *section = NULL;
	Elf_Data *symbols = NULL, *relocs = NULL;
	size_t section_str_index;
	unsigned symbol_sh_link = 0;

	/* One of the libelf implementations
	 * (http://www.mr511.de/software/english.htm) requires calling
	 * elf_version() before elf_memory().
	 */
	elf_version(EV_CURRENT);
	elf_buffer = MALLOC(elf_size);
	memcpy(elf_buffer, elf_data, elf_size);

	elf = elf_memory(elf_buffer, elf_size);

	elf_getshdrstrndx(elf, &section_str_index);

	while ((section = elf_nextscn(elf, section))) {
		const char *name;
		Elf_Data *section_data = NULL;
		GElf_Shdr section_header;
		if (gelf_getshdr(section, &section_header) != &section_header) {
			fprintf(stderr, "Failed to read ELF section header\n");
			return;
		}
		name = elf_strptr(elf, section_str_index, section_header.sh_name);
		if (!strcmp(name, ".text")) {
			section_data = elf_getdata(section, section_data);
			binary->code_size = section_data->d_size;
			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
			memcpy(binary->code, section_data->d_buf, binary->code_size);
		} else if (!strcmp(name, ".AMDGPU.config")) {
			section_data = elf_getdata(section, section_data);
			binary->config_size = section_data->d_size;
			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
			memcpy(binary->config, section_data->d_buf, binary->config_size);
		} else if (!strcmp(name, ".AMDGPU.disasm")) {
			/* Always read disassembly if it's available. */
			section_data = elf_getdata(section, section_data);
			binary->disasm_string = strndup(section_data->d_buf,
							section_data->d_size);
		} else if (!strncmp(name, ".rodata", 7)) {
			section_data = elf_getdata(section, section_data);
			binary->rodata_size = section_data->d_size;
			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
		} else if (!strncmp(name, ".symtab", 7)) {
			symbols = elf_getdata(section, section_data);
			symbol_sh_link = section_header.sh_link;
			parse_symbol_table(symbols, &section_header, binary);
		} else if (!strcmp(name, ".rel.text")) {
			relocs = elf_getdata(section, section_data);
			binary->reloc_count = section_header.sh_size /
					      section_header.sh_entsize;
		}
	}

	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);

	if (elf) {
		elf_end(elf);
	}
	FREE(elf_buffer);

	/* Cache the config size per symbol */
	if (binary->global_symbol_count) {
		binary->config_size_per_symbol =
			binary->config_size / binary->global_symbol_count;
	} else {
		binary->global_symbol_count = 1;
		binary->config_size_per_symbol = binary->config_size;
	}
}

static const unsigned char *r600_shader_binary_config_start(
	const struct r600_shader_binary *binary,
	uint64_t symbol_offset)
{
	unsigned i;
	for (i = 0; i < binary->global_symbol_count; ++i) {
		if (binary->global_symbol_offsets[i] == symbol_offset) {
			unsigned offset = i * binary->config_size_per_symbol;
			return binary->config + offset;
		}
	}
	return binary->config;
}

static void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
					   struct r600_bytecode *bc,
					   uint64_t symbol_offset,
					   boolean *use_kill)
{
	unsigned i;
	const unsigned char *config =
		r600_shader_binary_config_start(binary, symbol_offset);

	for (i = 0; i < binary->config_size_per_symbol; i += 8) {
		unsigned reg =
			util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value =
			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		/* R600 / R700 */
		case R_028850_SQ_PGM_RESOURCES_PS:
		case R_028868_SQ_PGM_RESOURCES_VS:
		/* Evergreen / Northern Islands */
		case R_028844_SQ_PGM_RESOURCES_PS:
		case R_028860_SQ_PGM_RESOURCES_VS:
		case R_0288D4_SQ_PGM_RESOURCES_LS:
			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
			break;
		case R_02880C_DB_SHADER_CONTROL:
			*use_kill = G_02880C_KILL_ENABLE(value);
			break;
		case R_0288E8_SQ_LDS_ALLOC:
			bc->nlds_dw = value;
			break;
		}
	}
}
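
/* Minimal sketch (hypothetical, for illustration only) of the framing that
 * r600_shader_binary_read_config() above relies on: the .AMDGPU.config blob
 * is a flat array of little-endian (register, value) pairs, 8 bytes each.
 */
#if 0
static void for_each_config_pair(const unsigned char *config, unsigned size,
				 void (*cb)(uint32_t reg, uint32_t value))
{
	unsigned i;

	for (i = 0; i + 8 <= size; i += 8) {
		uint32_t reg = util_le32_to_cpu(*(const uint32_t*)(config + i));
		uint32_t value = util_le32_to_cpu(*(const uint32_t*)(config + i + 4));
		cb(reg, value);
	}
}
#endif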

static unsigned r600_create_shader(struct r600_bytecode *bc,
				   const struct r600_shader_binary *binary,
				   boolean *use_kill)
{
	assert(binary->code_size % 4 == 0);
	bc->bytecode = CALLOC(1, binary->code_size);
	memcpy(bc->bytecode, binary->code, binary->code_size);
	bc->ndw = binary->code_size / 4;

	r600_shader_binary_read_config(binary, bc, 0, use_kill);
	return 0;
}

#endif

static void r600_destroy_shader(struct r600_bytecode *bc)
{
	FREE(bc->bytecode);
}

static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_binary_program_header *header;
	void *p;
	boolean use_kill;
#endif

	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	shader->ir_type = cso->ir_type;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, cso->ir_type, PIPE_SHADER_COMPUTE);
		return shader;
	}
#ifdef HAVE_OPENCL
	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	radeon_shader_binary_init(&shader->binary);
	r600_elf_read(header->blob, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							 shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(
		&rctx->b, shader->code_bo,
		PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(rctx->b.ws, shader->code_bo->buf);
#endif

	return shader;
}

static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		r600_delete_shader_selector(ctx, shader->sel);
	} else {
#ifdef HAVE_OPENCL
		radeon_shader_binary_clean(&shader->binary);
		pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
		pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
#endif
		r600_destroy_shader(&shader->bc);
	}
	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");

	if (!state) {
		rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
		return;
	}

	if (cstate->ir_type == PIPE_SHADER_IR_TGSI ||
	    cstate->ir_type == PIPE_SHADER_IR_NIR) {
		bool compute_dirty;
		cstate->sel->ir_type = cstate->ir_type;
		if (r600_shader_select(ctx, cstate->sel, &compute_dirty))
			R600_ERR("Failed to select compute shader\n");
	}

	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0); besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well. Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
static void evergreen_compute_upload_input(struct pipe_context *ctx,
					   const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (!shader)
		return;
	if (shader->input_size == 0) {
		return;
	}
	input_size = shader->input_size + 36;
	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx->screen, 0,
					   PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx->buffer_map(ctx,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE,
			&box, &transfer);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);

	/* Copy the work group size */
	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = info->grid[i] * info->block[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, info->block, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, info->input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->buffer_unmap(ctx, transfer);

	/* ID=0 and ID=3 are reserved for the parameters.
	 * LLVM will preferably use ID=0, but it does not work for dynamic
	 * indices. */
	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
			(struct pipe_resource*)shader->kernel_param);
	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}
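
/* The buffer layout documented above evergreen_compute_upload_input(),
 * restated as a struct purely for illustration. The driver fills the
 * mapping with pointer arithmetic; this type is hypothetical and unused.
 */
#if 0
struct evergreen_kernel_input_header {
	uint32_t num_work_groups[3]; /* DWORDS 0-2 */
	uint32_t global_size[3];     /* DWORDS 3-5: grid[i] * block[i] */
	uint32_t local_size[3];      /* DWORDS 6-8 */
	/* DWORDS 9+: shader->input_size bytes of kernel parameters follow */
};
#endif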

static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info,
				    uint32_t indirect_grid[3])
{
	int i;
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4;

	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    shader->ir_type != PIPE_SHADER_IR_NIR)
		lds_size += shader->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= info->grid[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
		     wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
			      group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
				       lds_size | (num_waves << 14));

	if (info->indirect) {
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, indirect_grid[0]);
		radeon_emit(cs, indirect_grid[1]);
		radeon_emit(cs, indirect_grid[2]);
		radeon_emit(cs, 1);
	} else {
		/* Dispatch packet */
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
		radeon_emit(cs, 1);
	}

	if (rctx->is_debug)
		eg_trace_emit(rctx);
}
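
/* Worked example of the wave/LDS math above, with values chosen only for
 * illustration: for a 16x16x1 block on a part with r600_max_quad_pipes == 8,
 * wave_divisor = 16 * 8 = 128 and num_waves = ceil(256 / 128) = 2; with
 * local_size == 4096 bytes, lds_size = 4096 / 4 = 1024 dwords, so
 * SQ_LDS_ALLOC is programmed to 1024 | (2 << 14).
 */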

static void compute_setup_cbs(struct r600_context *rctx)
{
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	unsigned i;

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						(struct r600_resource*)cb->base.texture,
						RADEON_USAGE_READWRITE,
						RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);   /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);  /* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);  /* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);   /* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);   /* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);    /* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	for (; i < 8; i++)
		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
	for (; i < 12; i++)
		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
				       rctx->compute_cb_target_mask);
}

static void compute_emit_cs(struct r600_context *rctx,
			    const struct pipe_grid_info *info)
{
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	bool compute_dirty = false;
	struct r600_pipe_shader *current;
	struct r600_shader_atomic combined_atomics[8];
	uint8_t atomic_used_mask;
	uint32_t indirect_grid[3] = { 0, 0, 0 };

	/* Make sure the gfx ring is the only active one. */
	if (radeon_emitted(&rctx->b.dma.cs, 0)) {
		rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
	}

	r600_update_compressed_resource_state(rctx, true);

	if (!rctx->cmd_buf_is_compute) {
		rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
		rctx->cmd_buf_is_compute = true;
	}

	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
		if (r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty)) {
			R600_ERR("Failed to select compute shader\n");
			return;
		}

		current = rctx->cs_shader_state.shader->sel->current;
		if (compute_dirty) {
			rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
			r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
			r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
		}

		bool need_buf_const = current->shader.uses_tex_buffers ||
			current->shader.has_txq_cube_array_z_comp;

		if (info->indirect) {
			struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
			unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_MAP_READ);
			unsigned offset = info->indirect_offset / 4;
			indirect_grid[0] = data[offset];
			indirect_grid[1] = data[offset + 1];
			indirect_grid[2] = data[offset + 2];
		}
		for (int i = 0; i < 3; i++) {
			rctx->cs_block_grid_sizes[i] = info->block[i];
			rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
		}
		rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
		rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;

		evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
		r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));

		if (need_buf_const) {
			eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
		}
		r600_update_driver_const_buffers(rctx, true);

		evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
		if (atomic_used_mask) {
			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		}
	} else
		r600_need_cs_space(rctx, 0, true, 0);

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);

	/* emit config state */
	if (rctx->b.chip_class == EVERGREEN) {
		if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
		    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
			radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
			radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
			radeon_emit(cs, 0);
			radeon_emit(cs, 0);
			radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
		} else
			r600_emit_atom(rctx, &rctx->config_state.atom);
	}

	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(rctx);

	if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_NIR) {

		compute_setup_cbs(rctx);

		/* Emit vertex buffer state */
		rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
		r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
	} else {
		uint32_t rat_mask;

		rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
		radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					       rat_mask);
	}

	r600_emit_atom(rctx, &rctx->b.render_cond_atom);

	/* Emit constant buffer state */
	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit images state */
	r600_emit_atom(rctx, &rctx->compute_images.atom);

	/* Emit buffers state */
	r600_emit_atom(rctx, &rctx->compute_buffers.atom);

	/* Emit shader state */
	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_dispatch(rctx, info, indirect_grid);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(rctx);
	rctx->b.flags = 0;

	if (rctx->b.chip_class >= CAYMAN) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
		radeon_emit(cs, 0);
	}
	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR)
		evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);

#if 0
	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif

}


/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
		(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		code_bo = shader->sel->current->bo;
		va = shader->sel->current->bo->gpu_address;
		ngpr = shader->sel->current->shader.bc.ngpr;
		nstack = shader->sel->current->shader.bc.nstack;
	} else {
		code_bo = shader->code_bo;
		va = shader->code_bo->gpu_address + state->pc;
		ngpr = shader->bc.ngpr;
		nstack = shader->bc.nstack;
	}

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr) |
			S_0288D4_DX10_CLAMP(1) |
			S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						  code_bo, RADEON_USAGE_READ,
						  RADEON_PRIO_SHADER_BINARY));
}

static void evergreen_launch_grid(struct pipe_context *ctx,
				  const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	boolean use_kill;

	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    shader->ir_type != PIPE_SHADER_IR_NIR) {
		rctx->cs_shader_state.pc = info->pc;
		/* Get the config information for this kernel. */
		r600_shader_binary_read_config(&shader->binary, &shader->bc,
					       info->pc, &use_kill);
	} else {
		use_kill = false;
		rctx->cs_shader_state.pc = 0;
	}
#endif

	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);

	evergreen_compute_upload_input(ctx, info);
	compute_emit_cs(rctx, info);
}

static void evergreen_set_compute_resources(struct pipe_context *ctx,
					    unsigned start, unsigned count,
					    struct pipe_surface **surfaces)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
		    start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first four vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 4 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
					(struct r600_resource *)resources[i]->base.texture,
					buffer->chunk->start_in_dw*4,
					resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

static void evergreen_set_global_binding(struct pipe_context *ctx,
					 unsigned first, unsigned n,
					 struct pipe_resource **resources,
					 uint32_t **handles)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
		    first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx) == -1) {
		/* XXX: Unset */
		return;
	}

	for (i = first; i < first + n; i++) {
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}

	/* globals for writing */
	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	/* globals for reading */
	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
				(struct pipe_resource*)pool->bo);

	/* constants for reading, LLVM puts them in text segment */
	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
}
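
/* Worked example of the handle patching above, with made-up numbers: if a
 * kernel argument initially holds byte offset 16 into its global buffer and
 * that buffer's chunk starts at start_in_dw == 256 in the pool, the patched
 * handle becomes 16 + 256 * 4 = 1040, i.e. a byte offset from the start of
 * the pool BO that RAT0 (writes) and VTX1 (reads) are bound to.
 */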

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream. Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function. The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
{
	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (rctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
			      V_008958_DI_PT_POINTLIST);

	if (rctx->b.chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage. It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate. When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (rctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
			S_0286FC_NUM_PS_LDS(0) |
			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (rctx->b.chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
			       S_0286E8_TID_IN_GROUP_ENA(1) |
			       S_0286E8_TGID_ENA(1) |
			       S_0286E8_DISABLE_INDEX_PACK(1));

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that the hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to keep the loop counter in your shader code. We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops. However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed. This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *rctx)
{
	rctx->b.b.create_compute_state = evergreen_create_compute_state;
	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	rctx->b.b.set_global_binding = evergreen_set_global_binding;
	rctx->b.b.launch_grid = evergreen_launch_grid;
}

void *r600_compute_global_transfer_map(struct pipe_context *ctx,
				       struct pipe_resource *resource,
				       unsigned level,
				       unsigned usage,
				       const struct pipe_box *box,
				       struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx);
	} else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
				r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	if (usage & PIPE_MAP_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
			"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	/* TODO: do it better, mapping is not possible if the pool is too big */
	return pipe_buffer_map_range(ctx, dst,
			offset, box->width, usage, ptransfer);
}

void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
					struct pipe_transfer *transfer)
{
	/* struct r600_resource_global is not a real resource; it just maps
	 * to an offset within the compute memory pool. The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument, and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called, it uses the memory pool's
	 * vtable, which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert(!"This function should not be called");
}

void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
					struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
							 const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	result->base.compute_global_bo = true;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0+3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL) {
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}
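
/* Example of the rounding above, for illustration only: a request with
 * width0 == 10 bytes gives size_in_dw == (10 + 3) / 4 == 3, so the pool
 * allocation is rounded up to 3 dwords (12 bytes).
 */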