/*
 * Copyright 2011 Adam Rak <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Adam Rak <[email protected]>
 */

#ifdef HAVE_OPENCL
#include <gelf.h>
#include <libelf.h>
#endif
#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "tgsi/tgsi_parse.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include "sb/sb_public.h"
#include <inttypes.h>

/**
RAT0 is for global binding write
VTX1 is for global binding read

for writing images RAT1...
for reading images TEX2...
TEX2-RAT1 is paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
CONST0 binds the smaller input parameter buffer, and is used for constant
indexing; it is also constant cached
VTX0 is for indirect/non-constant indexing, or if the input is bigger than
the constant cache can handle

RATs are limited to 12, so we can only bind at most 11 textures for writing
because we reserve RAT0 for global bindings. With byte addressing enabled,
we should reserve another one too => 10 image bindings for writing at most.

from Nvidia OpenCL:
CL_DEVICE_MAX_READ_IMAGE_ARGS:  128
CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8

so 10 for writing is enough. 176 is the max for reading according to the docs

writable images should be listed first < 10, so their id corresponds to RAT(id+1)
writable images will consume TEX slots, and VTX slots too, because of linear indexing

*/
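
/* Illustrative sketch only, not driver code: one way to express the RAT/TEX
 * numbering described above. The helper name and the MAX_WRITABLE_IMAGES
 * constant are hypothetical, introduced just to make the comment concrete;
 * RAT0 stays reserved for the global binding.
 */
#if 0
#define MAX_WRITABLE_IMAGES 10 /* 12 RATs - RAT0 (global) - 1 reserved for byte addressing */

static unsigned writable_image_to_rat(unsigned image_id)
{
	assert(image_id < MAX_WRITABLE_IMAGES);
	return image_id + 1; /* writable image i is written through RAT(i+1) */
}
#endif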

#ifdef HAVE_OPENCL
static void radeon_shader_binary_init(struct r600_shader_binary *b)
{
	memset(b, 0, sizeof(*b));
}

static void radeon_shader_binary_clean(struct r600_shader_binary *b)
{
	if (!b)
		return;
	FREE(b->code);
	FREE(b->config);
	FREE(b->rodata);
	FREE(b->global_symbol_offsets);
	FREE(b->relocs);
	FREE(b->disasm_string);
}
#endif

struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
						     unsigned size)
{
	struct pipe_resource *buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create((struct pipe_screen*) screen,
				    0, PIPE_USAGE_IMMUTABLE, size);

	return (struct r600_resource *)buffer;
}


static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers. Drop the old buffer first. */
	pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop. cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}

static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
					   unsigned vb_index,
					   unsigned offset,
					   struct pipe_resource *buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->stride = 1;
	vb->buffer_offset = offset;
	vb->buffer.resource = buffer;
	vb->is_user_buffer = false;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	r600_mark_atom_dirty(rctx, &state->atom);
}

static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
					     unsigned cb_index,
					     unsigned offset,
					     unsigned size,
					     struct pipe_resource *buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, false, &cb);
}

/* We need to define these R600 registers here, because we can't include
 * evergreend.h and r600d.h.
 */
#define R_028868_SQ_PGM_RESOURCES_VS 0x028868
#define R_028850_SQ_PGM_RESOURCES_PS 0x028850

#ifdef HAVE_OPENCL
static void parse_symbol_table(Elf_Data *symbol_table_data,
			       const GElf_Shdr *symbol_table_header,
			       struct r600_shader_binary *binary)
{
	GElf_Sym symbol;
	unsigned i = 0;
	unsigned symbol_count =
		symbol_table_header->sh_size / symbol_table_header->sh_entsize;

	/* We are over allocating this list, because symbol_count gives the
	 * total number of symbols, and we will only be filling the list
	 * with offsets of global symbols. The memory savings from
	 * allocating the correct size of this list will be small, and
	 * I don't think it is worth the cost of pre-computing the number
	 * of global symbols.
	 */
	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));

	while (gelf_getsym(symbol_table_data, i++, &symbol)) {
		unsigned i;
		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
		    symbol.st_shndx == 0 /* Undefined symbol */) {
			continue;
		}

		binary->global_symbol_offsets[binary->global_symbol_count] =
			symbol.st_value;

		/* Sort the list using bubble sort. This list will usually
		 * be small. */
		for (i = binary->global_symbol_count; i > 0; --i) {
			uint64_t lhs = binary->global_symbol_offsets[i - 1];
			uint64_t rhs = binary->global_symbol_offsets[i];
			if (lhs < rhs) {
				break;
			}
			binary->global_symbol_offsets[i] = lhs;
			binary->global_symbol_offsets[i - 1] = rhs;
		}
		++binary->global_symbol_count;
	}
}
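
/* Standalone sketch (hypothetical helper, not part of the driver) of the
 * insertion step performed above: the new offset is appended and then
 * bubbled toward the front so the offset array stays in ascending order.
 */
#if 0
static void insert_sorted_offset(uint64_t *offsets, unsigned count, uint64_t value)
{
	unsigned i;

	offsets[count] = value;
	for (i = count; i > 0 && offsets[i - 1] >= offsets[i]; --i) {
		uint64_t tmp = offsets[i - 1];
		offsets[i - 1] = offsets[i];
		offsets[i] = tmp;
	}
}
#endif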


static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
			 unsigned symbol_sh_link,
			 struct r600_shader_binary *binary)
{
	unsigned i;

	if (!relocs || !symbols || !binary->reloc_count) {
		return;
	}
	binary->relocs = CALLOC(binary->reloc_count,
				sizeof(struct r600_shader_reloc));
	for (i = 0; i < binary->reloc_count; i++) {
		GElf_Sym symbol;
		GElf_Rel rel;
		char *symbol_name;
		struct r600_shader_reloc *reloc = &binary->relocs[i];

		gelf_getrel(relocs, i, &rel);
		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
		symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);

		reloc->offset = rel.r_offset;
		strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
		reloc->name[sizeof(reloc->name)-1] = 0;
	}
}

static void r600_elf_read(const char *elf_data, unsigned elf_size,
			  struct r600_shader_binary *binary)
{
	char *elf_buffer;
	Elf *elf;
	Elf_Scn *section = NULL;
	Elf_Data *symbols = NULL, *relocs = NULL;
	size_t section_str_index;
	unsigned symbol_sh_link = 0;

	/* One of the libelf implementations
	 * (http://www.mr511.de/software/english.htm) requires calling
	 * elf_version() before elf_memory().
	 */
	elf_version(EV_CURRENT);
	elf_buffer = MALLOC(elf_size);
	memcpy(elf_buffer, elf_data, elf_size);

	elf = elf_memory(elf_buffer, elf_size);

	elf_getshdrstrndx(elf, &section_str_index);

	while ((section = elf_nextscn(elf, section))) {
		const char *name;
		Elf_Data *section_data = NULL;
		GElf_Shdr section_header;
		if (gelf_getshdr(section, &section_header) != &section_header) {
			fprintf(stderr, "Failed to read ELF section header\n");
			return;
		}
		name = elf_strptr(elf, section_str_index, section_header.sh_name);
		if (!strcmp(name, ".text")) {
			section_data = elf_getdata(section, section_data);
			binary->code_size = section_data->d_size;
			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
			memcpy(binary->code, section_data->d_buf, binary->code_size);
		} else if (!strcmp(name, ".AMDGPU.config")) {
			section_data = elf_getdata(section, section_data);
			binary->config_size = section_data->d_size;
			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
			memcpy(binary->config, section_data->d_buf, binary->config_size);
		} else if (!strcmp(name, ".AMDGPU.disasm")) {
			/* Always read disassembly if it's available. */
			section_data = elf_getdata(section, section_data);
			binary->disasm_string = strndup(section_data->d_buf,
							section_data->d_size);
		} else if (!strncmp(name, ".rodata", 7)) {
			section_data = elf_getdata(section, section_data);
			binary->rodata_size = section_data->d_size;
			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
		} else if (!strncmp(name, ".symtab", 7)) {
			symbols = elf_getdata(section, section_data);
			symbol_sh_link = section_header.sh_link;
			parse_symbol_table(symbols, &section_header, binary);
		} else if (!strcmp(name, ".rel.text")) {
			relocs = elf_getdata(section, section_data);
			binary->reloc_count = section_header.sh_size /
					      section_header.sh_entsize;
		}
	}

	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);

	if (elf) {
		elf_end(elf);
	}
	FREE(elf_buffer);

	/* Cache the config size per symbol */
	if (binary->global_symbol_count) {
		binary->config_size_per_symbol =
			binary->config_size / binary->global_symbol_count;
	} else {
		binary->global_symbol_count = 1;
		binary->config_size_per_symbol = binary->config_size;
	}
}

static const unsigned char *r600_shader_binary_config_start(
	const struct r600_shader_binary *binary,
	uint64_t symbol_offset)
{
	unsigned i;
	for (i = 0; i < binary->global_symbol_count; ++i) {
		if (binary->global_symbol_offsets[i] == symbol_offset) {
			unsigned offset = i * binary->config_size_per_symbol;
			return binary->config + offset;
		}
	}
	return binary->config;
}

static void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
					   struct r600_bytecode *bc,
					   uint64_t symbol_offset,
					   boolean *use_kill)
{
	unsigned i;
	const unsigned char *config =
		r600_shader_binary_config_start(binary, symbol_offset);

	for (i = 0; i < binary->config_size_per_symbol; i += 8) {
		unsigned reg =
			util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value =
			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		/* R600 / R700 */
		case R_028850_SQ_PGM_RESOURCES_PS:
		case R_028868_SQ_PGM_RESOURCES_VS:
		/* Evergreen / Northern Islands */
		case R_028844_SQ_PGM_RESOURCES_PS:
		case R_028860_SQ_PGM_RESOURCES_VS:
		case R_0288D4_SQ_PGM_RESOURCES_LS:
			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
			break;
		case R_02880C_DB_SHADER_CONTROL:
			*use_kill = G_02880C_KILL_ENABLE(value);
			break;
		case R_0288E8_SQ_LDS_ALLOC:
			bc->nlds_dw = value;
			break;
		}
	}
}
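
/* Minimal sketch (hypothetical, for illustration only) of the framing that
 * r600_shader_binary_read_config() above relies on: the .AMDGPU.config blob
 * is a flat array of little-endian (register, value) pairs, 8 bytes each.
 */
#if 0
static void for_each_config_pair(const unsigned char *config, unsigned size,
				 void (*cb)(uint32_t reg, uint32_t value))
{
	unsigned i;

	for (i = 0; i + 8 <= size; i += 8) {
		uint32_t reg = util_le32_to_cpu(*(const uint32_t*)(config + i));
		uint32_t value = util_le32_to_cpu(*(const uint32_t*)(config + i + 4));
		cb(reg, value);
	}
}
#endif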

static unsigned r600_create_shader(struct r600_bytecode *bc,
				   const struct r600_shader_binary *binary,
				   boolean *use_kill)
{
	assert(binary->code_size % 4 == 0);
	bc->bytecode = CALLOC(1, binary->code_size);
	memcpy(bc->bytecode, binary->code, binary->code_size);
	bc->ndw = binary->code_size / 4;

	r600_shader_binary_read_config(binary, bc, 0, use_kill);
	return 0;
}

#endif

static void r600_destroy_shader(struct r600_bytecode *bc)
{
	FREE(bc->bytecode);
}

static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_binary_program_header *header;
	void *p;
	boolean use_kill;
#endif

	shader->ctx = rctx;
	shader->local_size = cso->req_local_mem;
	shader->private_size = cso->req_private_mem;
	shader->input_size = cso->req_input_mem;

	shader->ir_type = cso->ir_type;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, cso->ir_type, PIPE_SHADER_COMPUTE);
		return shader;
	}
#ifdef HAVE_OPENCL
	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	radeon_shader_binary_init(&shader->binary);
	r600_elf_read(header->blob, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							 shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(
		&rctx->b, shader->code_bo,
		PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(rctx->b.ws, shader->code_bo->buf);
#endif

	return shader;
}

static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		r600_delete_shader_selector(ctx, shader->sel);
	} else {
#ifdef HAVE_OPENCL
		radeon_shader_binary_clean(&shader->binary);
		pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
		pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
#endif
		r600_destroy_shader(&shader->bc);
	}
	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");

	if (!state) {
		rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
		return;
	}

	if (cstate->ir_type == PIPE_SHADER_IR_TGSI ||
	    cstate->ir_type == PIPE_SHADER_IR_NIR) {
		bool compute_dirty;
		cstate->sel->ir_type = cstate->ir_type;
		if (r600_shader_select(ctx, cstate->sel, &compute_dirty))
			R600_ERR("Failed to select compute shader\n");
	}

	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0); besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well. Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
static void evergreen_compute_upload_input(struct pipe_context *ctx,
					   const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (!shader)
		return;
	if (shader->input_size == 0) {
		return;
	}
	input_size = shader->input_size + 36;
	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx->screen, 0,
					   PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx->buffer_map(ctx,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE,
			&box, &transfer);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);

	/* Copy the work group size */
	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = info->grid[i] * info->block[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, info->block, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, info->input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->buffer_unmap(ctx, transfer);

	/* ID=0 and ID=3 are reserved for the parameters.
	 * LLVM will preferably use ID=0, but it does not work for dynamic
	 * indices. */
	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
			(struct pipe_resource*)shader->kernel_param);
	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}
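
/* The buffer layout documented above evergreen_compute_upload_input(),
 * restated as a struct purely for illustration. The driver fills the
 * mapping with pointer arithmetic; this type is hypothetical and unused.
 */
#if 0
struct evergreen_kernel_input_header {
	uint32_t num_work_groups[3]; /* DWORDS 0-2 */
	uint32_t global_size[3];     /* DWORDS 3-5: grid[i] * block[i] */
	uint32_t local_size[3];      /* DWORDS 6-8 */
	/* DWORDS 9+: shader->input_size bytes of kernel parameters follow */
};
#endif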

static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info,
				    uint32_t indirect_grid[3])
{
	int i;
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4;

	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    shader->ir_type != PIPE_SHADER_IR_NIR)
		lds_size += shader->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	for (i = 0; i < 3; i++) {
		grid_size *= info->grid[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
		     wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
			      group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
				       lds_size | (num_waves << 14));

	if (info->indirect) {
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, indirect_grid[0]);
		radeon_emit(cs, indirect_grid[1]);
		radeon_emit(cs, indirect_grid[2]);
		radeon_emit(cs, 1);
	} else {
		/* Dispatch packet */
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
		radeon_emit(cs, 1);
	}

	if (rctx->is_debug)
		eg_trace_emit(rctx);
}
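
/* Worked example of the wave/LDS math above, with values chosen only for
 * illustration: for a 16x16x1 block on a part with r600_max_quad_pipes == 8,
 * wave_divisor = 16 * 8 = 128 and num_waves = ceil(256 / 128) = 2; with
 * local_size == 4096 bytes, lds_size = 4096 / 4 = 1024 dwords, so
 * SQ_LDS_ALLOC is programmed to 1024 | (2 << 14).
 */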

static void compute_setup_cbs(struct r600_context *rctx)
{
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	unsigned i;

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						(struct r600_resource*)cb->base.texture,
						RADEON_USAGE_READWRITE,
						RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);   /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);  /* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);  /* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);   /* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);   /* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);    /* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	for (; i < 8; i++)
		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
	for (; i < 12; i++)
		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
				       rctx->compute_cb_target_mask);
}

static void compute_emit_cs(struct r600_context *rctx,
			    const struct pipe_grid_info *info)
{
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	bool compute_dirty = false;
	struct r600_pipe_shader *current;
	struct r600_shader_atomic combined_atomics[8];
	uint8_t atomic_used_mask;
	uint32_t indirect_grid[3] = { 0, 0, 0 };

	/* Make sure the gfx ring is the only active one. */
	if (radeon_emitted(&rctx->b.dma.cs, 0)) {
		rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
	}

	r600_update_compressed_resource_state(rctx, true);

	if (!rctx->cmd_buf_is_compute) {
		rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
		rctx->cmd_buf_is_compute = true;
	}

	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
		if (r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty)) {
			R600_ERR("Failed to select compute shader\n");
			return;
		}

		current = rctx->cs_shader_state.shader->sel->current;
		if (compute_dirty) {
			rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
			r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
			r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
		}

		bool need_buf_const = current->shader.uses_tex_buffers ||
			current->shader.has_txq_cube_array_z_comp;

		if (info->indirect) {
			struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
			unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_MAP_READ);
			unsigned offset = info->indirect_offset / 4;
			indirect_grid[0] = data[offset];
			indirect_grid[1] = data[offset + 1];
			indirect_grid[2] = data[offset + 2];
		}
		for (int i = 0; i < 3; i++) {
			rctx->cs_block_grid_sizes[i] = info->block[i];
			rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
		}
		rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
		rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;

		evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
		r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));

		if (need_buf_const) {
			eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
		}
		r600_update_driver_const_buffers(rctx, true);

		evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
		if (atomic_used_mask) {
			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		}
	} else
		r600_need_cs_space(rctx, 0, true, 0);

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);

	/* emit config state */
	if (rctx->b.chip_class == EVERGREEN) {
		if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
		    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
			radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
			radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
			radeon_emit(cs, 0);
			radeon_emit(cs, 0);
			radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
		} else
			r600_emit_atom(rctx, &rctx->config_state.atom);
	}

	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(rctx);

	if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_NIR) {

		compute_setup_cbs(rctx);

		/* Emit vertex buffer state */
		rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
		r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
	} else {
		uint32_t rat_mask;

		rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
		radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					       rat_mask);
	}

	r600_emit_atom(rctx, &rctx->b.render_cond_atom);

	/* Emit constant buffer state */
	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit images state */
	r600_emit_atom(rctx, &rctx->compute_images.atom);

	/* Emit buffers state */
	r600_emit_atom(rctx, &rctx->compute_buffers.atom);

	/* Emit shader state */
	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_dispatch(rctx, info, indirect_grid);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(rctx);
	rctx->b.flags = 0;

	if (rctx->b.chip_class >= CAYMAN) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
		radeon_emit(cs, 0);
	}
	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR)
		evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);

#if 0
	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif

}


/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
		(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		code_bo = shader->sel->current->bo;
		va = shader->sel->current->bo->gpu_address;
		ngpr = shader->sel->current->shader.bc.ngpr;
		nstack = shader->sel->current->shader.bc.nstack;
	} else {
		code_bo = shader->code_bo;
		va = shader->code_bo->gpu_address + state->pc;
		ngpr = shader->bc.ngpr;
		nstack = shader->bc.nstack;
	}

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr) |
			S_0288D4_DX10_CLAMP(1) |
			S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						  code_bo, RADEON_USAGE_READ,
						  RADEON_PRIO_SHADER_BINARY));
}

static void evergreen_launch_grid(struct pipe_context *ctx,
				  const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	boolean use_kill;

	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    shader->ir_type != PIPE_SHADER_IR_NIR) {
		rctx->cs_shader_state.pc = info->pc;
		/* Get the config information for this kernel. */
		r600_shader_binary_read_config(&shader->binary, &shader->bc,
					       info->pc, &use_kill);
	} else {
		use_kill = false;
		rctx->cs_shader_state.pc = 0;
	}
#endif

	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);

	evergreen_compute_upload_input(ctx, info);
	compute_emit_cs(rctx, info);
}

static void evergreen_set_compute_resources(struct pipe_context *ctx,
					    unsigned start, unsigned count,
					    struct pipe_surface **surfaces)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
		    start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first four vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 4 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
					(struct r600_resource *)resources[i]->base.texture,
					buffer->chunk->start_in_dw*4,
					resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

static void evergreen_set_global_binding(struct pipe_context *ctx,
					 unsigned first, unsigned n,
					 struct pipe_resource **resources,
					 uint32_t **handles)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
		    first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx) == -1) {
		/* XXX: Unset */
		return;
	}

	for (i = first; i < first + n; i++) {
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}

	/* globals for writing */
	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	/* globals for reading */
	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
				(struct pipe_resource*)pool->bo);

	/* constants for reading, LLVM puts them in text segment */
	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
}
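
/* Worked example of the handle patching above, with made-up numbers: if a
 * kernel argument initially holds byte offset 16 into its global buffer and
 * that buffer's chunk starts at start_in_dw == 256 in the pool, the patched
 * handle becomes 16 + 256 * 4 = 1040, i.e. a byte offset from the start of
 * the pool BO that RAT0 (writes) and VTX1 (reads) are bound to.
 */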

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream. Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function. The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
{
	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (rctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
			      V_008958_DI_PT_POINTLIST);

	if (rctx->b.chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage. It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate. When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (rctx->b.chip_class < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
			S_0286FC_NUM_PS_LDS(0) |
			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (rctx->b.chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
			       S_0286E8_TID_IN_GROUP_ENA(1) |
			       S_0286E8_TGID_ENA(1) |
			       S_0286E8_DISABLE_INDEX_PACK(1));

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that the hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to keep the loop counter in your shader code. We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops. However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed. This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

void evergreen_init_compute_state_functions(struct r600_context *rctx)
{
	rctx->b.b.create_compute_state = evergreen_create_compute_state;
	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	rctx->b.b.set_global_binding = evergreen_set_global_binding;
	rctx->b.b.launch_grid = evergreen_launch_grid;
}

void *r600_compute_global_transfer_map(struct pipe_context *ctx,
				       struct pipe_resource *resource,
				       unsigned level,
				       unsigned usage,
				       const struct pipe_box *box,
				       struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx);
	} else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
				r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	if (usage & PIPE_MAP_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
			"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	/* TODO: do it better, mapping is not possible if the pool is too big */
	return pipe_buffer_map_range(ctx, dst,
			offset, box->width, usage, ptransfer);
}

void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
					struct pipe_transfer *transfer)
{
	/* struct r600_resource_global is not a real resource; it just maps
	 * to an offset within the compute memory pool. The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument, and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called, it uses the memory pool's
	 * vtable, which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert(!"This function should not be called");
}

void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
					struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);

	buffer->chunk = NULL;
	free(res);
}

struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
							 const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	result->base.compute_global_bo = true;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0+3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL) {
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}
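
/* Example of the rounding above, for illustration only: a request with
 * width0 == 10 bytes gives size_in_dw == (10 + 3) / 4 == 3, so the pool
 * allocation is rounded up to 3 dwords (12 bytes).
 */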