Path: blob/21.2-virgl/src/broadcom/vulkan/v3dv_uniforms.c
/*
 * Copyright © 2019 Raspberry Pi
 *
 * Based in part on v3d driver which is:
 *
 * Copyright © 2014-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "vk_format_info.h"

/* The only version specific structure that we need is
 * TMU_CONFIG_PARAMETER_1. This didn't seem to change significantly from
 * previous V3D versions and we don't expect that to change, so for now let's
 * just hardcode the V3D version here.
 */
#define V3D_VERSION 41
#include "broadcom/common/v3d_macros.h"
#include "broadcom/cle/v3dx_pack.h"

/* Our Vulkan resource indices represent indices in descriptor maps which
 * include all shader stages, so we need to size the arrays below
 * accordingly. For now we only support a maximum of 3 stages: VS, GS, FS.
 */
#define MAX_STAGES 3

#define MAX_TOTAL_TEXTURE_SAMPLERS (V3D_MAX_TEXTURE_SAMPLERS * MAX_STAGES)
struct texture_bo_list {
   struct v3dv_bo *tex[MAX_TOTAL_TEXTURE_SAMPLERS];
};

/* This tracks state BOs for both textures and samplers, so we
 * multiply by 2.
 */
#define MAX_TOTAL_STATES (2 * V3D_MAX_TEXTURE_SAMPLERS * MAX_STAGES)
struct state_bo_list {
   uint32_t count;
   struct v3dv_bo *states[MAX_TOTAL_STATES];
};

#define MAX_TOTAL_UNIFORM_BUFFERS (1 + MAX_UNIFORM_BUFFERS * MAX_STAGES)
#define MAX_TOTAL_STORAGE_BUFFERS (MAX_STORAGE_BUFFERS * MAX_STAGES)
struct buffer_bo_list {
   struct v3dv_bo *ubo[MAX_TOTAL_UNIFORM_BUFFERS];
   struct v3dv_bo *ssbo[MAX_TOTAL_STORAGE_BUFFERS];
};

static bool
state_bo_in_list(struct state_bo_list *list, struct v3dv_bo *bo)
{
   for (int i = 0; i < list->count; i++) {
      if (list->states[i] == bo)
         return true;
   }
   return false;
}
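
/* Illustrative usage sketch (not part of the driver code): the TMU helpers
 * below use state_bo_in_list() as a dedup filter before flagging a state BO
 * on the job, following this pattern:
 *
 *    if (!state_bo_in_list(&state_bos, reloc.bo))
 *       state_bos.states[state_bos.count++] = reloc.bo;
 */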
/*
 * This method checks whether the UBO used for push constants needs to be
 * updated and, if so, uploads the current push constant data to it.
 *
 * The push constants UBO is only used for push constants accessed with a
 * non-const index.
 *
 * FIXME: right now for these cases we are uploading the full
 * push_constants_data. An improvement would be to upload only the data that
 * we need to rely on a UBO.
 */
static void
check_push_constants_ubo(struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_pipeline *pipeline)
{
   if (!(cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PUSH_CONSTANTS) ||
       pipeline->layout->push_constant_size == 0)
      return;

   if (cmd_buffer->push_constants_resource.bo == NULL) {
      cmd_buffer->push_constants_resource.bo =
         v3dv_bo_alloc(cmd_buffer->device, MAX_PUSH_CONSTANTS_SIZE,
                       "push constants", true);

      if (!cmd_buffer->push_constants_resource.bo) {
         fprintf(stderr, "Failed to allocate memory for push constants\n");
         abort();
      }

      bool ok = v3dv_bo_map(cmd_buffer->device,
                            cmd_buffer->push_constants_resource.bo,
                            MAX_PUSH_CONSTANTS_SIZE);
      if (!ok) {
         fprintf(stderr, "failed to map push constants buffer\n");
         abort();
      }
   } else {
      if (cmd_buffer->push_constants_resource.offset + MAX_PUSH_CONSTANTS_SIZE <=
          cmd_buffer->push_constants_resource.bo->size) {
         cmd_buffer->push_constants_resource.offset += MAX_PUSH_CONSTANTS_SIZE;
      } else {
         /* FIXME: we ran out of space for push descriptors. Should we create
          * a new BO? This could be easier with an uploader.
          */
      }
   }

   memcpy(cmd_buffer->push_constants_resource.bo->map +
          cmd_buffer->push_constants_resource.offset,
          cmd_buffer->push_constants_data,
          MAX_PUSH_CONSTANTS_SIZE);

   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PUSH_CONSTANTS;
}

/** V3D 4.x TMU configuration parameter 0 (texture) */
static void
write_tmu_p0(struct v3dv_cmd_buffer *cmd_buffer,
             struct v3dv_pipeline *pipeline,
             enum broadcom_shader_stage stage,
             struct v3dv_cl_out **uniforms,
             uint32_t data,
             struct texture_bo_list *tex_bos,
             struct state_bo_list *state_bos)
{
   uint32_t texture_idx = v3d_unit_data_get_unit(data);

   struct v3dv_descriptor_state *descriptor_state =
      v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline);

   /* We need to ensure that the texture bo is added to the job */
   struct v3dv_bo *texture_bo =
      v3dv_descriptor_map_get_texture_bo(descriptor_state,
                                         &pipeline->shared_data->maps[stage]->texture_map,
                                         pipeline->layout, texture_idx);
   assert(texture_bo);
   assert(texture_idx < V3D_MAX_TEXTURE_SAMPLERS);
   tex_bos->tex[texture_idx] = texture_bo;

   struct v3dv_cl_reloc state_reloc =
      v3dv_descriptor_map_get_texture_shader_state(cmd_buffer->device, descriptor_state,
                                                   &pipeline->shared_data->maps[stage]->texture_map,
                                                   pipeline->layout,
                                                   texture_idx);

   cl_aligned_u32(uniforms, state_reloc.bo->offset +
                            state_reloc.offset +
                            v3d_unit_data_get_offset(data));

   /* Texture and Sampler states are typically suballocated, so they are
    * usually the same BO: only flag them once to avoid trying to add them
    * multiple times to the job later.
    */
   if (!state_bo_in_list(state_bos, state_reloc.bo)) {
      assert(state_bos->count < 2 * V3D_MAX_TEXTURE_SAMPLERS);
      state_bos->states[state_bos->count++] = state_reloc.bo;
   }
}
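
/* For reference: the 'data' word consumed by write_tmu_p0() above and
 * write_tmu_p1() below packs a texture/sampler unit index together with a
 * byte offset (or packed P1 bits). A minimal sketch of the decode, assuming
 * the unit-data helpers from the shared v3d compiler keep their usual
 * 8/24-bit split (the authoritative definitions live in the compiler
 * headers):
 *
 *    uint32_t unit   = data >> 24;        // v3d_unit_data_get_unit()
 *    uint32_t offset = data & 0xffffff;   // v3d_unit_data_get_offset()
 */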
/** V3D 4.x TMU configuration parameter 1 (sampler) */
static void
write_tmu_p1(struct v3dv_cmd_buffer *cmd_buffer,
             struct v3dv_pipeline *pipeline,
             enum broadcom_shader_stage stage,
             struct v3dv_cl_out **uniforms,
             uint32_t data,
             struct state_bo_list *state_bos)
{
   uint32_t sampler_idx = v3d_unit_data_get_unit(data);
   struct v3dv_descriptor_state *descriptor_state =
      v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline);

   assert(sampler_idx != V3DV_NO_SAMPLER_16BIT_IDX &&
          sampler_idx != V3DV_NO_SAMPLER_32BIT_IDX);

   struct v3dv_cl_reloc sampler_state_reloc =
      v3dv_descriptor_map_get_sampler_state(cmd_buffer->device, descriptor_state,
                                            &pipeline->shared_data->maps[stage]->sampler_map,
                                            pipeline->layout, sampler_idx);

   const struct v3dv_sampler *sampler =
      v3dv_descriptor_map_get_sampler(descriptor_state,
                                      &pipeline->shared_data->maps[stage]->sampler_map,
                                      pipeline->layout, sampler_idx);
   assert(sampler);

   /* Set unnormalized coordinates flag from sampler object */
   uint32_t p1_packed = v3d_unit_data_get_offset(data);
   if (sampler->unnormalized_coordinates) {
      struct V3DX(TMU_CONFIG_PARAMETER_1) p1_unpacked;
      V3DX(TMU_CONFIG_PARAMETER_1_unpack)((uint8_t *)&p1_packed, &p1_unpacked);
      p1_unpacked.unnormalized_coordinates = true;
      V3DX(TMU_CONFIG_PARAMETER_1_pack)(NULL, (uint8_t *)&p1_packed,
                                        &p1_unpacked);
   }

   cl_aligned_u32(uniforms, sampler_state_reloc.bo->offset +
                            sampler_state_reloc.offset +
                            p1_packed);

   /* Texture and Sampler states are typically suballocated, so they are
    * usually the same BO: only flag them once to avoid trying to add them
    * multiple times to the job later.
    */
   if (!state_bo_in_list(state_bos, sampler_state_reloc.bo)) {
      assert(state_bos->count < 2 * V3D_MAX_TEXTURE_SAMPLERS);
      state_bos->states[state_bos->count++] = sampler_state_reloc.bo;
   }
}
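
/* UBO unit numbering used by the helper below: unit 0 is reserved for the
 * driver's push constants UBO, so descriptor-mapped UBOs start at unit 1.
 * A sketch of the mapping, for illustration:
 *
 *    unit 0 -> cmd_buffer->push_constants_resource.bo
 *    unit N -> ubo_map descriptor at index N - 1    (N > 0)
 */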
static void
write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
                        struct v3dv_pipeline *pipeline,
                        enum broadcom_shader_stage stage,
                        struct v3dv_cl_out **uniforms,
                        enum quniform_contents content,
                        uint32_t data,
                        struct buffer_bo_list *buffer_bos)
{
   struct v3dv_descriptor_state *descriptor_state =
      v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline);

   struct v3dv_descriptor_map *map =
      content == QUNIFORM_UBO_ADDR || content == QUNIFORM_GET_UBO_SIZE ?
      &pipeline->shared_data->maps[stage]->ubo_map :
      &pipeline->shared_data->maps[stage]->ssbo_map;

   uint32_t offset =
      content == QUNIFORM_UBO_ADDR ?
      v3d_unit_data_get_offset(data) :
      0;

   uint32_t dynamic_offset = 0;

   /* For UBOs, the index is shifted, as unit 0 is reserved for push
    * constants.
    */
   if (content == QUNIFORM_UBO_ADDR &&
       v3d_unit_data_get_unit(data) == 0) {
      /* This call ensures that the push constants UBO is updated. It
       * already takes into account whether the update is needed or not.
       */
      check_push_constants_ubo(cmd_buffer, pipeline);

      struct v3dv_cl_reloc *resource =
         &cmd_buffer->push_constants_resource;
      assert(resource->bo);

      cl_aligned_u32(uniforms, resource->bo->offset +
                               resource->offset +
                               offset + dynamic_offset);
      buffer_bos->ubo[0] = resource->bo;
   } else {
      uint32_t index =
         content == QUNIFORM_UBO_ADDR ?
         v3d_unit_data_get_unit(data) - 1 :
         data;

      struct v3dv_descriptor *descriptor =
         v3dv_descriptor_map_get_descriptor(descriptor_state, map,
                                            pipeline->layout,
                                            index, &dynamic_offset);
      assert(descriptor);
      assert(descriptor->buffer);
      assert(descriptor->buffer->mem);
      assert(descriptor->buffer->mem->bo);

      if (content == QUNIFORM_GET_SSBO_SIZE ||
          content == QUNIFORM_GET_UBO_SIZE) {
         cl_aligned_u32(uniforms, descriptor->range);
      } else {
         cl_aligned_u32(uniforms, descriptor->buffer->mem->bo->offset +
                                  descriptor->buffer->mem_offset +
                                  descriptor->offset +
                                  offset + dynamic_offset);

         if (content == QUNIFORM_UBO_ADDR) {
            assert(index + 1 < MAX_TOTAL_UNIFORM_BUFFERS);
            buffer_bos->ubo[index + 1] = descriptor->buffer->mem->bo;
         } else {
            assert(index < MAX_TOTAL_STORAGE_BUFFERS);
            buffer_bos->ssbo[index] = descriptor->buffer->mem->bo;
         }
      }
   }
}

static uint32_t
get_texture_size_from_image_view(struct v3dv_image_view *image_view,
                                 enum quniform_contents contents,
                                 uint32_t data)
{
   switch (contents) {
   case QUNIFORM_IMAGE_WIDTH:
   case QUNIFORM_TEXTURE_WIDTH:
      /* We don't u_minify the values, as we are using the image_view
       * extents.
       */
      return image_view->extent.width;
   case QUNIFORM_IMAGE_HEIGHT:
   case QUNIFORM_TEXTURE_HEIGHT:
      return image_view->extent.height;
   case QUNIFORM_IMAGE_DEPTH:
   case QUNIFORM_TEXTURE_DEPTH:
      return image_view->extent.depth;
   case QUNIFORM_IMAGE_ARRAY_SIZE:
   case QUNIFORM_TEXTURE_ARRAY_SIZE:
      if (image_view->type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) {
         return image_view->last_layer - image_view->first_layer + 1;
      } else {
         assert((image_view->last_layer - image_view->first_layer + 1) % 6 == 0);
         return (image_view->last_layer - image_view->first_layer + 1) / 6;
      }
   case QUNIFORM_TEXTURE_LEVELS:
      return image_view->max_level - image_view->base_level + 1;
   case QUNIFORM_TEXTURE_SAMPLES:
      assert(image_view->image);
      return image_view->image->samples;
   default:
      unreachable("Bad texture size field");
   }
}
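
/* Worked example for the cube array case above: a
 * VK_IMAGE_VIEW_TYPE_CUBE_ARRAY view covering layers 0..11 spans 12 layers,
 * which is 12 / 6 = 2 cubes, matching what textureSize()/imageSize() must
 * report for the array dimension of a cube array.
 */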
static uint32_t
get_texture_size_from_buffer_view(struct v3dv_buffer_view *buffer_view,
                                  enum quniform_contents contents,
                                  uint32_t data)
{
   switch (contents) {
   case QUNIFORM_IMAGE_WIDTH:
   case QUNIFORM_TEXTURE_WIDTH:
      return buffer_view->num_elements;
   /* Only size can be queried for texel buffers */
   default:
      unreachable("Bad texture size field for texel buffers");
   }
}

static uint32_t
get_texture_size(struct v3dv_cmd_buffer *cmd_buffer,
                 struct v3dv_pipeline *pipeline,
                 enum broadcom_shader_stage stage,
                 enum quniform_contents contents,
                 uint32_t data)
{
   uint32_t texture_idx = data;

   struct v3dv_descriptor_state *descriptor_state =
      v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline);

   struct v3dv_descriptor *descriptor =
      v3dv_descriptor_map_get_descriptor(descriptor_state,
                                         &pipeline->shared_data->maps[stage]->texture_map,
                                         pipeline->layout,
                                         texture_idx, NULL);

   assert(descriptor);

   switch (descriptor->type) {
   case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
   case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
   case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
   case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
      return get_texture_size_from_image_view(descriptor->image_view,
                                              contents, data);
   case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
   case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
      return get_texture_size_from_buffer_view(descriptor->buffer_view,
                                               contents, data);
   default:
      unreachable("Wrong descriptor for getting texture size");
   }
}
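
/* Worked example for the prefetch padding in the function below: a shader
 * with uinfo->count == 7 reserves (7 + 1) * 4 = 32 bytes, so even if the 7
 * real uniform slots end exactly at a page boundary of the indirect BO, the
 * extra slot keeps the hardware's next-uniform prefetch inside mapped
 * memory.
 */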
struct v3dv_cl_reloc
v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_pipeline *pipeline,
                               struct v3dv_shader_variant *variant,
                               uint32_t **wg_count_offsets)
{
   struct v3d_uniform_list *uinfo =
      &variant->prog_data.base->uniforms;
   struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;

   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);
   assert(job->cmd_buffer == cmd_buffer);

   struct texture_bo_list tex_bos = { 0 };
   struct state_bo_list state_bos = { 0 };
   struct buffer_bo_list buffer_bos = { 0 };

   /* The hardware always pre-fetches the next uniform (also when there
    * aren't any), so we always allocate space for an extra slot. This
    * fixes MMU exceptions reported since Linux kernel 5.4 when the
    * uniforms fill up the tail bytes of a page in the indirect
    * BO. In that scenario, when the hardware pre-fetches after reading
    * the last uniform it will read beyond the end of the page and trigger
    * the MMU exception.
    */
   v3dv_cl_ensure_space(&job->indirect, (uinfo->count + 1) * 4, 4);

   struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect);

   struct v3dv_cl_out *uniforms = cl_start(&job->indirect);

   for (int i = 0; i < uinfo->count; i++) {
      uint32_t data = uinfo->data[i];

      switch (uinfo->contents[i]) {
      case QUNIFORM_CONSTANT:
         cl_aligned_u32(&uniforms, data);
         break;

      case QUNIFORM_UNIFORM:
         cl_aligned_u32(&uniforms, cmd_buffer->push_constants_data[data]);
         break;

      case QUNIFORM_VIEWPORT_X_SCALE:
         cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f);
         break;

      case QUNIFORM_VIEWPORT_Y_SCALE:
         cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f);
         break;

      case QUNIFORM_VIEWPORT_Z_OFFSET:
         cl_aligned_f(&uniforms, dynamic->viewport.translate[0][2]);
         break;

      case QUNIFORM_VIEWPORT_Z_SCALE:
         cl_aligned_f(&uniforms, dynamic->viewport.scale[0][2]);
         break;

      case QUNIFORM_SSBO_OFFSET:
      case QUNIFORM_UBO_ADDR:
      case QUNIFORM_GET_SSBO_SIZE:
      case QUNIFORM_GET_UBO_SIZE:
         write_ubo_ssbo_uniforms(cmd_buffer, pipeline, variant->stage, &uniforms,
                                 uinfo->contents[i], data, &buffer_bos);
         break;

      case QUNIFORM_IMAGE_TMU_CONFIG_P0:
      case QUNIFORM_TMU_CONFIG_P0:
         write_tmu_p0(cmd_buffer, pipeline, variant->stage,
                      &uniforms, data, &tex_bos, &state_bos);
         break;

      case QUNIFORM_TMU_CONFIG_P1:
         write_tmu_p1(cmd_buffer, pipeline, variant->stage,
                      &uniforms, data, &state_bos);
         break;

      case QUNIFORM_IMAGE_WIDTH:
      case QUNIFORM_IMAGE_HEIGHT:
      case QUNIFORM_IMAGE_DEPTH:
      case QUNIFORM_IMAGE_ARRAY_SIZE:
      case QUNIFORM_TEXTURE_WIDTH:
      case QUNIFORM_TEXTURE_HEIGHT:
      case QUNIFORM_TEXTURE_DEPTH:
      case QUNIFORM_TEXTURE_ARRAY_SIZE:
      case QUNIFORM_TEXTURE_LEVELS:
      case QUNIFORM_TEXTURE_SAMPLES:
         cl_aligned_u32(&uniforms,
                        get_texture_size(cmd_buffer,
                                         pipeline,
                                         variant->stage,
                                         uinfo->contents[i],
                                         data));
         break;

      /* We generate this from geometry shaders to cap the generated gl_Layer
       * to be within the number of layers of the framebuffer, so we prevent
       * the binner from trying to access tile state memory out of bounds (for
       * layers that don't exist).
       *
       * Unfortunately, for secondary command buffers we may not know the
       * number of layers in the framebuffer at this stage. Since we are
       * only using this to sanitize the shader and it should not have any
       * impact on correct shaders that emit valid values for gl_Layer,
       * we just work around it by using the largest number of layers we
       * support.
       *
       * FIXME: we could do better than this by recording in the job that
       * the value at this uniform offset is not correct, and patch it when
       * we execute the secondary command buffer into a primary, since we do
       * have the correct number of layers at that point, but again, since
       * this is only for sanitizing the shader and it only affects the
       * specific case of secondary command buffers without framebuffer info
       * available it might not be worth the trouble.
       */
      case QUNIFORM_FB_LAYERS: {
         uint32_t num_layers;
         if (job->frame_tiling.layers != 0) {
            num_layers = job->frame_tiling.layers;
         } else if (cmd_buffer->state.framebuffer) {
            num_layers = cmd_buffer->state.framebuffer->layers;
         } else {
            assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
            num_layers = 2048;
#if DEBUG
            fprintf(stderr, "Skipping gl_LayerID shader sanity check for "
                            "secondary command buffer\n");
#endif
         }
         cl_aligned_u32(&uniforms, num_layers);
         break;
      }

      case QUNIFORM_NUM_WORK_GROUPS:
         assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
         assert(job->csd.wg_count[data] > 0);
         if (wg_count_offsets)
            wg_count_offsets[data] = (uint32_t *) uniforms;
         cl_aligned_u32(&uniforms, job->csd.wg_count[data]);
         break;

      case QUNIFORM_WORK_GROUP_BASE:
         assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
         cl_aligned_u32(&uniforms, job->csd.wg_base[data]);
         break;

      case QUNIFORM_SHARED_OFFSET:
         assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
         assert(job->csd.shared_memory);
         cl_aligned_u32(&uniforms, job->csd.shared_memory->offset);
         break;

      case QUNIFORM_SPILL_OFFSET:
         assert(pipeline->spill.bo);
         cl_aligned_u32(&uniforms, pipeline->spill.bo->offset);
         break;

      case QUNIFORM_SPILL_SIZE_PER_THREAD:
         assert(pipeline->spill.size_per_thread > 0);
         cl_aligned_u32(&uniforms, pipeline->spill.size_per_thread);
         break;

      default:
         unreachable("unsupported quniform_contents uniform type\n");
      }
   }

   cl_end(&job->indirect, uniforms);

   for (int i = 0; i < MAX_TOTAL_TEXTURE_SAMPLERS; i++) {
      if (tex_bos.tex[i])
         v3dv_job_add_bo(job, tex_bos.tex[i]);
   }

   for (int i = 0; i < state_bos.count; i++)
      v3dv_job_add_bo(job, state_bos.states[i]);

   for (int i = 0; i < MAX_TOTAL_UNIFORM_BUFFERS; i++) {
      if (buffer_bos.ubo[i])
         v3dv_job_add_bo(job, buffer_bos.ubo[i]);
   }

   for (int i = 0; i < MAX_TOTAL_STORAGE_BUFFERS; i++) {
      if (buffer_bos.ssbo[i])
         v3dv_job_add_bo(job, buffer_bos.ssbo[i]);
   }

   if (job->csd.shared_memory)
      v3dv_job_add_bo(job, job->csd.shared_memory);

   if (pipeline->spill.bo)
      v3dv_job_add_bo(job, pipeline->spill.bo);

   return uniform_stream;
}

struct v3dv_cl_reloc
v3dv_write_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
                    struct v3dv_pipeline *pipeline,
                    struct v3dv_shader_variant *variant)
{
   return v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline, variant, NULL);
}
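
/* Hedged usage sketch (hypothetical caller, for illustration only): a draw
 * or dispatch path would typically write the uniform stream right before
 * emitting the shader state record, along the lines of:
 *
 *    struct v3dv_cl_reloc fs_uniforms =
 *       v3dv_write_uniforms(cmd_buffer, pipeline, fs_variant);
 *
 * and then program the record's uniforms address from
 * fs_uniforms.bo->offset + fs_uniforms.offset.
 */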