/* Path: blob/21.2-virgl/src/gallium/drivers/nouveau/nv50/nv50_compute.c
 * 4574 views */
/*1* Copyright 2012 Francisco Jerez2* Copyright 2015 Samuel Pitoiset3*4* Permission is hereby granted, free of charge, to any person obtaining5* a copy of this software and associated documentation files (the6* "Software"), to deal in the Software without restriction, including7* without limitation the rights to use, copy, modify, merge, publish,8* distribute, sublicense, and/or sell copies of the Software, and to9* permit persons to whom the Software is furnished to do so, subject to10* the following conditions:11*12* The above copyright notice and this permission notice (including the13* next paragraph) shall be included in all copies or substantial14* portions of the Software.15*16* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,17* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF18* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.19* IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE20* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION21* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION22* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.23*24*/2526#include "util/format/u_format.h"27#include "nv50/nv50_context.h"28#include "nv50/nv50_compute.xml.h"2930#include "codegen/nv50_ir_driver.h"3132int33nv50_screen_compute_setup(struct nv50_screen *screen,34struct nouveau_pushbuf *push)35{36struct nouveau_device *dev = screen->base.device;37struct nouveau_object *chan = screen->base.channel;38struct nv04_fifo *fifo = (struct nv04_fifo *)chan->data;39unsigned obj_class;40int i, ret;4142switch (dev->chipset & 0xf0) {43case 0x50:44case 0x80:45case 0x90:46obj_class = NV50_COMPUTE_CLASS;47break;48case 0xa0:49switch (dev->chipset) {50case 0xa3:51case 0xa5:52case 0xa8:53obj_class = NVA3_COMPUTE_CLASS;54break;55default:56obj_class = NV50_COMPUTE_CLASS;57break;58}59break;60default:61NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);62return 
-1;63}6465ret = nouveau_object_new(chan, 0xbeef50c0, obj_class, NULL, 0,66&screen->compute);67if (ret)68return ret;6970BEGIN_NV04(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);71PUSH_DATA (push, screen->compute->handle);7273BEGIN_NV04(push, NV50_CP(UNK02A0), 1);74PUSH_DATA (push, 1);75BEGIN_NV04(push, NV50_CP(DMA_STACK), 1);76PUSH_DATA (push, fifo->vram);77BEGIN_NV04(push, NV50_CP(STACK_ADDRESS_HIGH), 2);78PUSH_DATAh(push, screen->stack_bo->offset);79PUSH_DATA (push, screen->stack_bo->offset);80BEGIN_NV04(push, NV50_CP(STACK_SIZE_LOG), 1);81PUSH_DATA (push, 4);8283BEGIN_NV04(push, NV50_CP(UNK0290), 1);84PUSH_DATA (push, 1);85BEGIN_NV04(push, NV50_CP(LANES32_ENABLE), 1);86PUSH_DATA (push, 1);87BEGIN_NV04(push, NV50_CP(REG_MODE), 1);88PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED);89BEGIN_NV04(push, NV50_CP(UNK0384), 1);90PUSH_DATA (push, 0x100);91BEGIN_NV04(push, NV50_CP(DMA_GLOBAL), 1);92PUSH_DATA (push, fifo->vram);9394for (i = 0; i < 15; i++) {95BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(i)), 2);96PUSH_DATA (push, 0);97PUSH_DATA (push, 0);98BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(i)), 1);99PUSH_DATA (push, 0);100BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(i)), 1);101PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);102}103104BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(15)), 2);105PUSH_DATA (push, 0);106PUSH_DATA (push, 0);107BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(15)), 1);108PUSH_DATA (push, ~0);109BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(15)), 1);110PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);111112BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_LOG_ALLOC), 1);113PUSH_DATA (push, 7);114BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_NO_CLAMP), 1);115PUSH_DATA (push, 1);116BEGIN_NV04(push, NV50_CP(STACK_WARPS_LOG_ALLOC), 1);117PUSH_DATA (push, 7);118BEGIN_NV04(push, NV50_CP(STACK_WARPS_NO_CLAMP), 1);119PUSH_DATA (push, 1);120BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1);121PUSH_DATA (push, 0);122123BEGIN_NV04(push, NV50_CP(DMA_TEXTURE), 1);124PUSH_DATA (push, fifo->vram);125BEGIN_NV04(push, 
NV50_CP(TEX_LIMITS), 1);126PUSH_DATA (push, 0x54);127BEGIN_NV04(push, NV50_CP(LINKED_TSC), 1);128PUSH_DATA (push, 0);129130BEGIN_NV04(push, NV50_CP(DMA_TIC), 1);131PUSH_DATA (push, fifo->vram);132BEGIN_NV04(push, NV50_CP(TIC_ADDRESS_HIGH), 3);133PUSH_DATAh(push, screen->txc->offset);134PUSH_DATA (push, screen->txc->offset);135PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1);136137BEGIN_NV04(push, NV50_CP(DMA_TSC), 1);138PUSH_DATA (push, fifo->vram);139BEGIN_NV04(push, NV50_CP(TSC_ADDRESS_HIGH), 3);140PUSH_DATAh(push, screen->txc->offset + 65536);141PUSH_DATA (push, screen->txc->offset + 65536);142PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1);143144BEGIN_NV04(push, NV50_CP(DMA_CODE_CB), 1);145PUSH_DATA (push, fifo->vram);146147BEGIN_NV04(push, NV50_CP(DMA_LOCAL), 1);148PUSH_DATA (push, fifo->vram);149BEGIN_NV04(push, NV50_CP(LOCAL_ADDRESS_HIGH), 2);150PUSH_DATAh(push, screen->tls_bo->offset + 65536);151PUSH_DATA (push, screen->tls_bo->offset + 65536);152BEGIN_NV04(push, NV50_CP(LOCAL_SIZE_LOG), 1);153PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2));154155BEGIN_NV04(push, NV50_CP(CB_DEF_ADDRESS_HIGH), 3);156PUSH_DATAh(push, screen->uniforms->offset + (3 << 16));157PUSH_DATA (push, screen->uniforms->offset + (3 << 16));158PUSH_DATA (push, (NV50_CB_PCP << 16) | 0x0000);159160BEGIN_NV04(push, NV50_CP(QUERY_ADDRESS_HIGH), 2);161PUSH_DATAh(push, screen->fence.bo->offset + 16);162PUSH_DATA (push, screen->fence.bo->offset + 16);163164return 0;165}166167static void168nv50_compute_validate_samplers(struct nv50_context *nv50)169{170bool need_flush = nv50_validate_tsc(nv50, NV50_SHADER_STAGE_COMPUTE);171if (need_flush) {172BEGIN_NV04(nv50->base.pushbuf, NV50_CP(TSC_FLUSH), 1);173PUSH_DATA (nv50->base.pushbuf, 0);174}175176/* Invalidate all 3D samplers because they are aliased. 
*/177nv50->dirty_3d |= NV50_NEW_3D_SAMPLERS;178}179180static void181nv50_compute_validate_textures(struct nv50_context *nv50)182{183bool need_flush = nv50_validate_tic(nv50, NV50_SHADER_STAGE_COMPUTE);184if (need_flush) {185BEGIN_NV04(nv50->base.pushbuf, NV50_CP(TIC_FLUSH), 1);186PUSH_DATA (nv50->base.pushbuf, 0);187}188189/* Invalidate all 3D textures because they are aliased. */190nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);191nv50->dirty_3d |= NV50_NEW_3D_TEXTURES;192}193194static inline void195nv50_compute_invalidate_constbufs(struct nv50_context *nv50)196{197int s;198199/* Invalidate all 3D constbufs because they are aliased with COMPUTE. */200for (s = 0; s < NV50_MAX_3D_SHADER_STAGES; s++) {201nv50->constbuf_dirty[s] |= nv50->constbuf_valid[s];202nv50->state.uniform_buffer_bound[s] = false;203}204nv50->dirty_3d |= NV50_NEW_3D_CONSTBUF;205}206207static void208nv50_compute_validate_constbufs(struct nv50_context *nv50)209{210struct nouveau_pushbuf *push = nv50->base.pushbuf;211const int s = NV50_SHADER_STAGE_COMPUTE;212213while (nv50->constbuf_dirty[s]) {214int i = ffs(nv50->constbuf_dirty[s]) - 1;215nv50->constbuf_dirty[s] &= ~(1 << i);216217if (nv50->constbuf[s][i].user) {218const unsigned b = NV50_CB_PVP + s;219unsigned start = 0;220unsigned words = nv50->constbuf[s][0].size / 4;221if (i) {222NOUVEAU_ERR("user constbufs only supported in slot 0\n");223continue;224}225if (!nv50->state.uniform_buffer_bound[s]) {226nv50->state.uniform_buffer_bound[s] = true;227BEGIN_NV04(push, NV50_CP(SET_PROGRAM_CB), 1);228PUSH_DATA (push, (b << 12) | (i << 8) | 1);229}230while (words) {231unsigned nr = MIN2(words, NV04_PFIFO_MAX_PACKET_LEN);232233PUSH_SPACE(push, nr + 3);234BEGIN_NV04(push, NV50_CP(CB_ADDR), 1);235PUSH_DATA (push, (start << 8) | b);236BEGIN_NI04(push, NV50_CP(CB_DATA(0)), nr);237PUSH_DATAp(push, &nv50->constbuf[s][0].u.data[start * 4], nr);238239start += nr;240words -= nr;241}242} else {243struct nv04_resource *res 
=244nv04_resource(nv50->constbuf[s][i].u.buf);245if (res) {246/* TODO: allocate persistent bindings */247const unsigned b = s * 16 + i;248249assert(nouveau_resource_mapped_by_gpu(&res->base));250251BEGIN_NV04(push, NV50_CP(CB_DEF_ADDRESS_HIGH), 3);252PUSH_DATAh(push, res->address + nv50->constbuf[s][i].offset);253PUSH_DATA (push, res->address + nv50->constbuf[s][i].offset);254PUSH_DATA (push, (b << 16) |255(nv50->constbuf[s][i].size & 0xffff));256BEGIN_NV04(push, NV50_CP(SET_PROGRAM_CB), 1);257PUSH_DATA (push, (b << 12) | (i << 8) | 1);258259BCTX_REFN(nv50->bufctx_cp, CP_CB(i), res, RD);260261nv50->cb_dirty = 1; /* Force cache flush for UBO. */262res->cb_bindings[s] |= 1 << i;263} else {264BEGIN_NV04(push, NV50_CP(SET_PROGRAM_CB), 1);265PUSH_DATA (push, (i << 8) | 0);266}267if (i == 0)268nv50->state.uniform_buffer_bound[s] = false;269}270}271272// TODO: Check if having orthogonal slots means the two don't trample over273// each other.274nv50_compute_invalidate_constbufs(nv50);275}276277static void278nv50_get_surface_dims(const struct pipe_image_view *view,279int *width, int *height, int *depth)280{281struct nv04_resource *res = nv04_resource(view->resource);282int level;283284*width = *height = *depth = 1;285if (res->base.target == PIPE_BUFFER) {286*width = view->u.buf.size / util_format_get_blocksize(view->format);287return;288}289290level = view->u.tex.level;291*width = u_minify(view->resource->width0, level);292*height = u_minify(view->resource->height0, level);293*depth = u_minify(view->resource->depth0, level);294295switch (res->base.target) {296case PIPE_TEXTURE_1D_ARRAY:297case PIPE_TEXTURE_2D_ARRAY:298case PIPE_TEXTURE_CUBE:299case PIPE_TEXTURE_CUBE_ARRAY:300*depth = view->u.tex.last_layer - view->u.tex.first_layer + 1;301break;302case PIPE_TEXTURE_1D:303case PIPE_TEXTURE_2D:304case PIPE_TEXTURE_RECT:305case PIPE_TEXTURE_3D:306break;307default:308assert(!"unexpected texture target");309break;310}311}312313static void314nv50_mark_image_range_valid(const 
struct pipe_image_view *view)315{316struct nv04_resource *res = (struct nv04_resource *)view->resource;317318assert(view->resource->target == PIPE_BUFFER);319320util_range_add(&res->base, &res->valid_buffer_range,321view->u.buf.offset,322view->u.buf.offset + view->u.buf.size);323}324325static inline void326nv50_set_surface_info(struct nouveau_pushbuf *push,327const struct pipe_image_view *view,328int width, int height, int depth)329{330struct nv04_resource *res;331uint32_t *const info = push->cur;332333push->cur += 12;334335/* Make sure to always initialize the surface information area because it's336* used to check if the given image is bound or not. */337memset(info, 0, 12 * sizeof(*info));338339if (!view || !view->resource)340return;341res = nv04_resource(view->resource);342343/* Stick the image dimensions for the imageSize() builtin. */344info[0] = width;345info[1] = height;346info[2] = depth;347348/* Stick the blockwidth (ie. number of bytes per pixel) to calculate pixel349* offset and to check if the format doesn't mismatch. 
*/350info[3] = util_format_get_blocksize(view->format);351352if (res->base.target != PIPE_BUFFER) {353struct nv50_miptree *mt = nv50_miptree(&res->base);354struct nv50_miptree_level *lvl = &mt->level[view->u.tex.level];355unsigned nby = align(util_format_get_nblocksy(view->format, height),356NV50_TILE_SIZE_Y(lvl->tile_mode));357358if (mt->layout_3d) {359info[4] = nby;360info[11] = view->u.tex.first_layer;361} else {362info[4] = mt->layer_stride / lvl->pitch;363}364info[6] = mt->ms_x;365info[7] = mt->ms_y;366info[8] = NV50_TILE_SHIFT_X(lvl->tile_mode);367info[9] = NV50_TILE_SHIFT_Y(lvl->tile_mode);368info[10] = NV50_TILE_SHIFT_Z(lvl->tile_mode);369}370}371372static void373nv50_compute_validate_surfaces(struct nv50_context *nv50)374{375struct nouveau_pushbuf *push = nv50->base.pushbuf;376int i;377378for (i = 0; i < NV50_MAX_GLOBALS - 1; i++) {379struct nv50_gmem_state *gmem = &nv50->compprog->cp.gmem[i];380int width, height, depth;381uint64_t address = 0;382383BEGIN_NV04(push, NV50_CP(GLOBAL(i)), 5);384385if (gmem->valid && !gmem->image && nv50->buffers[gmem->slot].buffer) {386struct pipe_shader_buffer *buffer = &nv50->buffers[gmem->slot];387struct nv04_resource *res = nv04_resource(buffer->buffer);388PUSH_DATAh(push, res->address + buffer->buffer_offset);389PUSH_DATA (push, res->address + buffer->buffer_offset);390PUSH_DATA (push, 0); /* pitch? 
*/391PUSH_DATA (push, ALIGN(buffer->buffer_size, 256) - 1);392PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);393BCTX_REFN(nv50->bufctx_cp, CP_BUF, res, RDWR);394util_range_add(&res->base, &res->valid_buffer_range,395buffer->buffer_offset,396buffer->buffer_offset +397buffer->buffer_size);398399PUSH_SPACE(push, 1 + 3);400BEGIN_NV04(push, NV50_CP(CB_ADDR), 1);401PUSH_DATA (push, NV50_CB_AUX_BUF_INFO(i) << (8 - 2) | NV50_CB_AUX);402BEGIN_NI04(push, NV50_CP(CB_DATA(0)), 1);403PUSH_DATA (push, buffer->buffer_size);404} else if (gmem->valid && gmem->image && nv50->images[gmem->slot].resource) {405struct pipe_image_view *view = &nv50->images[gmem->slot];406struct nv04_resource *res = nv04_resource(view->resource);407408/* get surface dimensions based on the target. */409nv50_get_surface_dims(view, &width, &height, &depth);410411address = res->address;412if (res->base.target == PIPE_BUFFER) {413address += view->u.buf.offset;414assert(!(address & 0xff));415416if (view->access & PIPE_IMAGE_ACCESS_WRITE)417nv50_mark_image_range_valid(view);418419PUSH_DATAh(push, address);420PUSH_DATA (push, address);421PUSH_DATA (push, 0); /* pitch? */422PUSH_DATA (push, ALIGN(view->u.buf.size, 0x100) - 1);423PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);424} else {425struct nv50_miptree *mt = nv50_miptree(view->resource);426struct nv50_miptree_level *lvl = &mt->level[view->u.tex.level];427const unsigned z = view->u.tex.first_layer;428unsigned max_size;429430if (mt->layout_3d) {431address += nv50_mt_zslice_offset(mt, view->u.tex.level, 0);432max_size = mt->total_size;433} else {434address += mt->layer_stride * z;435max_size = mt->layer_stride * (view->u.tex.last_layer - view->u.tex.first_layer + 1);436}437address += lvl->offset;438439PUSH_DATAh(push, address);440PUSH_DATA (push, address);441if (mt->layout_3d) {442// We have to adjust the size of the 3d surface to be443// accessible within 2d limits. 
The size of each z tile goes444// into the x direction, while the number of z tiles goes into445// the y direction.446const unsigned nby = util_format_get_nblocksy(view->format, height);447const unsigned tsy = NV50_TILE_SIZE_Y(lvl->tile_mode);448const unsigned tsz = NV50_TILE_SIZE_Z(lvl->tile_mode);449const unsigned pitch = lvl->pitch * tsz;450const unsigned maxy = align(nby, tsy) * align(depth, tsz) >> NV50_TILE_SHIFT_Z(lvl->tile_mode);451PUSH_DATA (push, pitch * tsy);452PUSH_DATA (push, (maxy - 1) << 16 | (pitch - 1));453PUSH_DATA (push, (lvl->tile_mode & 0xff) << 4);454} else if (nouveau_bo_memtype(res->bo)) {455PUSH_DATA (push, lvl->pitch * NV50_TILE_SIZE_Y(lvl->tile_mode));456PUSH_DATA (push, (max_size / lvl->pitch - 1) << 16 | (lvl->pitch - 1));457PUSH_DATA (push, (lvl->tile_mode & 0xff) << 4);458} else {459PUSH_DATA (push, lvl->pitch);460PUSH_DATA (push, align(lvl->pitch * height, 0x100) - 1);461PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);462}463}464465BCTX_REFN(nv50->bufctx_cp, CP_SUF, res, RDWR);466467PUSH_SPACE(push, 12 + 3);468BEGIN_NV04(push, NV50_CP(CB_ADDR), 1);469PUSH_DATA (push, NV50_CB_AUX_BUF_INFO(i) << (8 - 2) | NV50_CB_AUX);470BEGIN_NI04(push, NV50_CP(CB_DATA(0)), 12);471nv50_set_surface_info(push, view, width, height, depth);472} else {473PUSH_DATA (push, 0);474PUSH_DATA (push, 0);475PUSH_DATA (push, 0);476PUSH_DATA (push, 0);477PUSH_DATA (push, 0);478}479}480}481482static void483nv50_compute_validate_globals(struct nv50_context *nv50)484{485unsigned i;486487for (i = 0; i < nv50->global_residents.size / sizeof(struct pipe_resource *);488++i) {489struct pipe_resource *res = *util_dynarray_element(490&nv50->global_residents, struct pipe_resource *, i);491if (res)492nv50_add_bufctx_resident(nv50->bufctx_cp, NV50_BIND_CP_GLOBAL,493nv04_resource(res), NOUVEAU_BO_RDWR);494}495}496497static struct nv50_state_validate498validate_list_cp[] = {499{ nv50_compprog_validate, NV50_NEW_CP_PROGRAM },500{ nv50_compute_validate_constbufs, 
NV50_NEW_CP_CONSTBUF },501{ nv50_compute_validate_surfaces, NV50_NEW_CP_SURFACES |502NV50_NEW_CP_BUFFERS |503NV50_NEW_CP_PROGRAM },504{ nv50_compute_validate_textures, NV50_NEW_CP_TEXTURES },505{ nv50_compute_validate_samplers, NV50_NEW_CP_SAMPLERS },506{ nv50_compute_validate_globals, NV50_NEW_CP_GLOBALS },507};508509static bool510nv50_state_validate_cp(struct nv50_context *nv50, uint32_t mask)511{512bool ret;513514/* TODO: validate textures, samplers, surfaces */515ret = nv50_state_validate(nv50, mask, validate_list_cp,516ARRAY_SIZE(validate_list_cp), &nv50->dirty_cp,517nv50->bufctx_cp);518519if (unlikely(nv50->state.flushed))520nv50_bufctx_fence(nv50->bufctx_cp, true);521return ret;522}523524static void525nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)526{527struct nv50_screen *screen = nv50->screen;528struct nouveau_pushbuf *push = screen->base.pushbuf;529unsigned size = align(nv50->compprog->parm_size, 0x4);530531BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1);532PUSH_DATA (push, (1 + (size / 4)) << 8);533534if (size) {535struct nouveau_mm_allocation *mm;536struct nouveau_bo *bo = NULL;537unsigned offset;538539mm = nouveau_mm_allocate(screen->base.mm_GART, size, &bo, &offset);540assert(mm);541542nouveau_bo_map(bo, 0, screen->base.client);543memcpy(bo->map + offset, input, size);544545nouveau_bufctx_refn(nv50->bufctx, 0, bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);546nouveau_pushbuf_bufctx(push, nv50->bufctx);547nouveau_pushbuf_validate(push);548549nouveau_pushbuf_space(push, 0, 0, 1);550551BEGIN_NV04(push, NV50_CP(USER_PARAM(1)), size / 4);552nouveau_pushbuf_data(push, bo, offset, size);553554nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, mm);555nouveau_bo_ref(NULL, &bo);556nouveau_bufctx_reset(nv50->bufctx, 0);557}558}559560void561nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)562{563struct nv50_context *nv50 = nv50_context(pipe);564struct nouveau_pushbuf *push = 
nv50->base.pushbuf;565unsigned block_size = info->block[0] * info->block[1] * info->block[2];566struct nv50_program *cp = nv50->compprog;567bool ret;568569ret = !nv50_state_validate_cp(nv50, ~0);570if (ret) {571NOUVEAU_ERR("Failed to launch grid !\n");572return;573}574575nv50_compute_upload_input(nv50, info->input);576577BEGIN_NV04(push, NV50_CP(CP_START_ID), 1);578PUSH_DATA (push, cp->code_base);579580BEGIN_NV04(push, NV50_CP(SHARED_SIZE), 1);581PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x14, 0x40));582BEGIN_NV04(push, NV50_CP(CP_REG_ALLOC_TEMP), 1);583PUSH_DATA (push, cp->max_gpr);584585/* no indirect support - just read the parameters out */586uint32_t grid[3];587if (unlikely(info->indirect)) {588pipe_buffer_read(pipe, info->indirect, info->indirect_offset,589sizeof(grid), grid);590} else {591memcpy(grid, info->grid, sizeof(grid));592}593594/* grid/block setup */595BEGIN_NV04(push, NV50_CP(BLOCKDIM_XY), 2);596PUSH_DATA (push, info->block[1] << 16 | info->block[0]);597PUSH_DATA (push, info->block[2]);598BEGIN_NV04(push, NV50_CP(BLOCK_ALLOC), 1);599PUSH_DATA (push, 1 << 16 | block_size);600BEGIN_NV04(push, NV50_CP(BLOCKDIM_LATCH), 1);601PUSH_DATA (push, 1);602BEGIN_NV04(push, NV50_CP(GRIDDIM), 1);603PUSH_DATA (push, grid[1] << 16 | grid[0]);604BEGIN_NV04(push, NV50_CP(GRIDID), 1);605PUSH_DATA (push, 1);606607for (int i = 0; i < grid[2]; i++) {608BEGIN_NV04(push, NV50_CP(USER_PARAM(0)), 1);609PUSH_DATA (push, grid[2] | i << 16);610611/* kernel launching */612BEGIN_NV04(push, NV50_CP(LAUNCH), 1);613PUSH_DATA (push, 0);614}615616BEGIN_NV04(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);617PUSH_DATA (push, 0);618619/* bind a compute shader clobbers fragment shader state */620nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG;621622nv50->compute_invocations += info->block[0] * info->block[1] * info->block[2] *623grid[0] * grid[1] * grid[2];624}625626627