Path: blob/21.2-virgl/src/intel/vulkan/anv_batch_chain.c
4547 views
/*1* Copyright © 2015 Intel Corporation2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice (including the next11* paragraph) shall be included in all copies or substantial portions of the12* Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER18* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING19* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS20* IN THE SOFTWARE.21*/2223#include <assert.h>24#include <stdbool.h>25#include <string.h>26#include <unistd.h>27#include <fcntl.h>2829#include "anv_private.h"30#include "anv_measure.h"3132#include "genxml/gen8_pack.h"33#include "genxml/genX_bits.h"34#include "perf/intel_perf.h"3536#include "util/debug.h"3738/** \file anv_batch_chain.c39*40* This file contains functions related to anv_cmd_buffer as a data41* structure. This involves everything required to create and destroy42* the actual batch buffers as well as link them together and handle43* relocations and surface state. It specifically does *not* contain any44* handling of actual vkCmd calls beyond vkCmdExecuteCommands.45*/4647/*-----------------------------------------------------------------------*48* Functions related to anv_reloc_list49*-----------------------------------------------------------------------*/5051VkResult52anv_reloc_list_init(struct anv_reloc_list *list,53const VkAllocationCallbacks *alloc)54{55memset(list, 0, sizeof(*list));56return VK_SUCCESS;57}5859static VkResult60anv_reloc_list_init_clone(struct anv_reloc_list *list,61const VkAllocationCallbacks *alloc,62const struct anv_reloc_list *other_list)63{64list->num_relocs = other_list->num_relocs;65list->array_length = other_list->array_length;6667if (list->num_relocs > 0) {68list->relocs =69vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8,70VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);71if (list->relocs == NULL)72return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);7374list->reloc_bos =75vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8,76VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);77if (list->reloc_bos == NULL) {78vk_free(alloc, list->relocs);79return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);80}8182memcpy(list->relocs, other_list->relocs,83list->array_length * sizeof(*list->relocs));84memcpy(list->reloc_bos, other_list->reloc_bos,85list->array_length * sizeof(*list->reloc_bos));86} else {87list->relocs = NULL;88list->reloc_bos = NULL;89}9091list->dep_words = other_list->dep_words;9293if (list->dep_words > 0) {94list->deps =95vk_alloc(alloc, list->dep_words * sizeof(BITSET_WORD), 8,96VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);97memcpy(list->deps, other_list->deps,98list->dep_words * sizeof(BITSET_WORD));99} else {100list->deps = NULL;101}102103return VK_SUCCESS;104}105106void107anv_reloc_list_finish(struct anv_reloc_list *list,108const VkAllocationCallbacks 
*alloc)109{110vk_free(alloc, list->relocs);111vk_free(alloc, list->reloc_bos);112vk_free(alloc, list->deps);113}114115static VkResult116anv_reloc_list_grow(struct anv_reloc_list *list,117const VkAllocationCallbacks *alloc,118size_t num_additional_relocs)119{120if (list->num_relocs + num_additional_relocs <= list->array_length)121return VK_SUCCESS;122123size_t new_length = MAX2(16, list->array_length * 2);124while (new_length < list->num_relocs + num_additional_relocs)125new_length *= 2;126127struct drm_i915_gem_relocation_entry *new_relocs =128vk_realloc(alloc, list->relocs,129new_length * sizeof(*list->relocs), 8,130VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);131if (new_relocs == NULL)132return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);133list->relocs = new_relocs;134135struct anv_bo **new_reloc_bos =136vk_realloc(alloc, list->reloc_bos,137new_length * sizeof(*list->reloc_bos), 8,138VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);139if (new_reloc_bos == NULL)140return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);141list->reloc_bos = new_reloc_bos;142143list->array_length = new_length;144145return VK_SUCCESS;146}147148static VkResult149anv_reloc_list_grow_deps(struct anv_reloc_list *list,150const VkAllocationCallbacks *alloc,151uint32_t min_num_words)152{153if (min_num_words <= list->dep_words)154return VK_SUCCESS;155156uint32_t new_length = MAX2(32, list->dep_words * 2);157while (new_length < min_num_words)158new_length *= 2;159160BITSET_WORD *new_deps =161vk_realloc(alloc, list->deps, new_length * sizeof(BITSET_WORD), 8,162VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);163if (new_deps == NULL)164return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);165list->deps = new_deps;166167/* Zero out the new data */168memset(list->deps + list->dep_words, 0,169(new_length - list->dep_words) * sizeof(BITSET_WORD));170list->dep_words = new_length;171172return VK_SUCCESS;173}174175#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))176177VkResult178anv_reloc_list_add_bo(struct anv_reloc_list *list,179const VkAllocationCallbacks *alloc,180struct anv_bo *target_bo)181{182assert(!target_bo->is_wrapper);183assert(target_bo->flags & EXEC_OBJECT_PINNED);184185uint32_t idx = target_bo->gem_handle;186VkResult result = anv_reloc_list_grow_deps(list, alloc,187(idx / BITSET_WORDBITS) + 1);188if (unlikely(result != VK_SUCCESS))189return result;190191BITSET_SET(list->deps, idx);192193return VK_SUCCESS;194}195196VkResult197anv_reloc_list_add(struct anv_reloc_list *list,198const VkAllocationCallbacks *alloc,199uint32_t offset, struct anv_bo *target_bo, uint32_t delta,200uint64_t *address_u64_out)201{202struct drm_i915_gem_relocation_entry *entry;203int index;204205struct anv_bo *unwrapped_target_bo = anv_bo_unwrap(target_bo);206uint64_t target_bo_offset = READ_ONCE(unwrapped_target_bo->offset);207if (address_u64_out)208*address_u64_out = target_bo_offset + delta;209210assert(unwrapped_target_bo->gem_handle > 0);211assert(unwrapped_target_bo->refcount > 0);212213if (unwrapped_target_bo->flags & EXEC_OBJECT_PINNED)214return anv_reloc_list_add_bo(list, alloc, unwrapped_target_bo);215216VkResult result = anv_reloc_list_grow(list, alloc, 1);217if (result != VK_SUCCESS)218return result;219220/* XXX: Can we use I915_EXEC_HANDLE_LUT? 
*/221index = list->num_relocs++;222list->reloc_bos[index] = target_bo;223entry = &list->relocs[index];224entry->target_handle = -1; /* See also anv_cmd_buffer_process_relocs() */225entry->delta = delta;226entry->offset = offset;227entry->presumed_offset = target_bo_offset;228entry->read_domains = 0;229entry->write_domain = 0;230VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry)));231232return VK_SUCCESS;233}234235static void236anv_reloc_list_clear(struct anv_reloc_list *list)237{238list->num_relocs = 0;239if (list->dep_words > 0)240memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));241}242243static VkResult244anv_reloc_list_append(struct anv_reloc_list *list,245const VkAllocationCallbacks *alloc,246struct anv_reloc_list *other, uint32_t offset)247{248VkResult result = anv_reloc_list_grow(list, alloc, other->num_relocs);249if (result != VK_SUCCESS)250return result;251252if (other->num_relocs > 0) {253memcpy(&list->relocs[list->num_relocs], &other->relocs[0],254other->num_relocs * sizeof(other->relocs[0]));255memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0],256other->num_relocs * sizeof(other->reloc_bos[0]));257258for (uint32_t i = 0; i < other->num_relocs; i++)259list->relocs[i + list->num_relocs].offset += offset;260261list->num_relocs += other->num_relocs;262}263264anv_reloc_list_grow_deps(list, alloc, other->dep_words);265for (uint32_t w = 0; w < other->dep_words; w++)266list->deps[w] |= other->deps[w];267268return VK_SUCCESS;269}270271/*-----------------------------------------------------------------------*272* Functions related to anv_batch273*-----------------------------------------------------------------------*/274275void *276anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)277{278if (batch->next + num_dwords * 4 > batch->end) {279VkResult result = batch->extend_cb(batch, batch->user_data);280if (result != VK_SUCCESS) {281anv_batch_set_error(batch, result);282return NULL;283}284}285286void *p = batch->next;287288batch->next += num_dwords * 4;289assert(batch->next <= batch->end);290291return p;292}293294struct anv_address295anv_batch_address(struct anv_batch *batch, void *batch_location)296{297assert(batch->start < batch_location);298299/* Allow a jump at the current location of the batch. 
*/300assert(batch->next >= batch_location);301302return anv_address_add(batch->start_addr, batch_location - batch->start);303}304305void306anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)307{308uint32_t size, offset;309310size = other->next - other->start;311assert(size % 4 == 0);312313if (batch->next + size > batch->end) {314VkResult result = batch->extend_cb(batch, batch->user_data);315if (result != VK_SUCCESS) {316anv_batch_set_error(batch, result);317return;318}319}320321assert(batch->next + size <= batch->end);322323VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size));324memcpy(batch->next, other->start, size);325326offset = batch->next - batch->start;327VkResult result = anv_reloc_list_append(batch->relocs, batch->alloc,328other->relocs, offset);329if (result != VK_SUCCESS) {330anv_batch_set_error(batch, result);331return;332}333334batch->next += size;335}336337/*-----------------------------------------------------------------------*338* Functions related to anv_batch_bo339*-----------------------------------------------------------------------*/340341static VkResult342anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,343uint32_t size,344struct anv_batch_bo **bbo_out)345{346VkResult result;347348struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),3498, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);350if (bbo == NULL)351return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);352353result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,354size, &bbo->bo);355if (result != VK_SUCCESS)356goto fail_alloc;357358result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->pool->alloc);359if (result != VK_SUCCESS)360goto fail_bo_alloc;361362*bbo_out = bbo;363364return VK_SUCCESS;365366fail_bo_alloc:367anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);368fail_alloc:369vk_free(&cmd_buffer->pool->alloc, bbo);370371return result;372}373374static VkResult375anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,376const struct anv_batch_bo *other_bbo,377struct anv_batch_bo **bbo_out)378{379VkResult result;380381struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),3828, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);383if (bbo == NULL)384return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);385386result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,387other_bbo->bo->size, &bbo->bo);388if (result != VK_SUCCESS)389goto fail_alloc;390391result = anv_reloc_list_init_clone(&bbo->relocs, &cmd_buffer->pool->alloc,392&other_bbo->relocs);393if (result != VK_SUCCESS)394goto fail_bo_alloc;395396bbo->length = other_bbo->length;397memcpy(bbo->bo->map, other_bbo->bo->map, other_bbo->length);398*bbo_out = bbo;399400return VK_SUCCESS;401402fail_bo_alloc:403anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);404fail_alloc:405vk_free(&cmd_buffer->pool->alloc, bbo);406407return result;408}409410static void411anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch,412size_t batch_padding)413{414anv_batch_set_storage(batch, (struct anv_address) { .bo = bbo->bo, },415bbo->bo->map, bbo->bo->size - batch_padding);416batch->relocs = &bbo->relocs;417anv_reloc_list_clear(&bbo->relocs);418}419420static void421anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch,422size_t batch_padding)423{424batch->start_addr = (struct anv_address) { .bo = bbo->bo, };425batch->start = bbo->bo->map;426batch->next = bbo->bo->map + bbo->length;427batch->end = bbo->bo->map + bbo->bo->size - batch_padding;428batch->relocs = &bbo->relocs;429}430431static 
void432anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch)433{434assert(batch->start == bbo->bo->map);435bbo->length = batch->next - batch->start;436VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length));437}438439static VkResult440anv_batch_bo_grow(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo *bbo,441struct anv_batch *batch, size_t aditional,442size_t batch_padding)443{444assert(batch->start == bbo->bo->map);445bbo->length = batch->next - batch->start;446447size_t new_size = bbo->bo->size;448while (new_size <= bbo->length + aditional + batch_padding)449new_size *= 2;450451if (new_size == bbo->bo->size)452return VK_SUCCESS;453454struct anv_bo *new_bo;455VkResult result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,456new_size, &new_bo);457if (result != VK_SUCCESS)458return result;459460memcpy(new_bo->map, bbo->bo->map, bbo->length);461462anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);463464bbo->bo = new_bo;465anv_batch_bo_continue(bbo, batch, batch_padding);466467return VK_SUCCESS;468}469470static void471anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer,472struct anv_batch_bo *prev_bbo,473struct anv_batch_bo *next_bbo,474uint32_t next_bbo_offset)475{476const uint32_t bb_start_offset =477prev_bbo->length - GFX8_MI_BATCH_BUFFER_START_length * 4;478ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset;479480/* Make sure we're looking at a MI_BATCH_BUFFER_START */481assert(((*bb_start >> 29) & 0x07) == 0);482assert(((*bb_start >> 23) & 0x3f) == 49);483484if (cmd_buffer->device->physical->use_softpin) {485assert(prev_bbo->bo->flags & EXEC_OBJECT_PINNED);486assert(next_bbo->bo->flags & EXEC_OBJECT_PINNED);487488write_reloc(cmd_buffer->device,489prev_bbo->bo->map + bb_start_offset + 4,490next_bbo->bo->offset + next_bbo_offset, true);491} else {492uint32_t reloc_idx = prev_bbo->relocs.num_relocs - 1;493assert(prev_bbo->relocs.relocs[reloc_idx].offset == bb_start_offset + 4);494495prev_bbo->relocs.reloc_bos[reloc_idx] = next_bbo->bo;496prev_bbo->relocs.relocs[reloc_idx].delta = next_bbo_offset;497498/* Use a bogus presumed offset to force a relocation */499prev_bbo->relocs.relocs[reloc_idx].presumed_offset = -1;500}501}502503static void504anv_batch_bo_destroy(struct anv_batch_bo *bbo,505struct anv_cmd_buffer *cmd_buffer)506{507anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->pool->alloc);508anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);509vk_free(&cmd_buffer->pool->alloc, bbo);510}511512static VkResult513anv_batch_bo_list_clone(const struct list_head *list,514struct anv_cmd_buffer *cmd_buffer,515struct list_head *new_list)516{517VkResult result = VK_SUCCESS;518519list_inithead(new_list);520521struct anv_batch_bo *prev_bbo = NULL;522list_for_each_entry(struct anv_batch_bo, bbo, list, link) {523struct anv_batch_bo *new_bbo = NULL;524result = anv_batch_bo_clone(cmd_buffer, bbo, &new_bbo);525if (result != VK_SUCCESS)526break;527list_addtail(&new_bbo->link, new_list);528529if (prev_bbo)530anv_batch_bo_link(cmd_buffer, prev_bbo, new_bbo, 0);531532prev_bbo = new_bbo;533}534535if (result != VK_SUCCESS) {536list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link) {537list_del(&bbo->link);538anv_batch_bo_destroy(bbo, cmd_buffer);539}540}541542return result;543}544545/*-----------------------------------------------------------------------*546* Functions related to anv_batch_bo547*-----------------------------------------------------------------------*/548549static struct anv_batch_bo 
*550anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer)551{552return LIST_ENTRY(struct anv_batch_bo, cmd_buffer->batch_bos.prev, link);553}554555struct anv_address556anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)557{558struct anv_state_pool *pool = anv_binding_table_pool(cmd_buffer->device);559struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);560return (struct anv_address) {561.bo = pool->block_pool.bo,562.offset = bt_block->offset - pool->start_offset,563};564}565566static void567emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer,568struct anv_bo *bo, uint32_t offset)569{570/* In gfx8+ the address field grew to two dwords to accomodate 48 bit571* offsets. The high 16 bits are in the last dword, so we can use the gfx8572* version in either case, as long as we set the instruction length in the573* header accordingly. This means that we always emit three dwords here574* and all the padding and adjustment we do in this file works for all575* gens.576*/577578#define GFX7_MI_BATCH_BUFFER_START_length 2579#define GFX7_MI_BATCH_BUFFER_START_length_bias 2580581const uint32_t gfx7_length =582GFX7_MI_BATCH_BUFFER_START_length - GFX7_MI_BATCH_BUFFER_START_length_bias;583const uint32_t gfx8_length =584GFX8_MI_BATCH_BUFFER_START_length - GFX8_MI_BATCH_BUFFER_START_length_bias;585586anv_batch_emit(&cmd_buffer->batch, GFX8_MI_BATCH_BUFFER_START, bbs) {587bbs.DWordLength = cmd_buffer->device->info.ver < 8 ?588gfx7_length : gfx8_length;589bbs.SecondLevelBatchBuffer = Firstlevelbatch;590bbs.AddressSpaceIndicator = ASI_PPGTT;591bbs.BatchBufferStartAddress = (struct anv_address) { bo, offset };592}593}594595static void596cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,597struct anv_batch_bo *bbo)598{599struct anv_batch *batch = &cmd_buffer->batch;600struct anv_batch_bo *current_bbo =601anv_cmd_buffer_current_batch_bo(cmd_buffer);602603/* We set the end of the batch a little short so we would be sure we604* have room for the chaining command. 
Since we're about to emit the605* chaining command, let's set it back where it should go.606*/607batch->end += GFX8_MI_BATCH_BUFFER_START_length * 4;608assert(batch->end == current_bbo->bo->map + current_bbo->bo->size);609610emit_batch_buffer_start(cmd_buffer, bbo->bo, 0);611612anv_batch_bo_finish(current_bbo, batch);613}614615static void616anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,617struct anv_cmd_buffer *cmd_buffer_to)618{619assert(cmd_buffer_from->device->physical->use_softpin);620621uint32_t *bb_start = cmd_buffer_from->batch_end;622623struct anv_batch_bo *last_bbo =624list_last_entry(&cmd_buffer_from->batch_bos, struct anv_batch_bo, link);625struct anv_batch_bo *first_bbo =626list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link);627628struct GFX8_MI_BATCH_BUFFER_START gen_bb_start = {629__anv_cmd_header(GFX8_MI_BATCH_BUFFER_START),630.SecondLevelBatchBuffer = Firstlevelbatch,631.AddressSpaceIndicator = ASI_PPGTT,632.BatchBufferStartAddress = (struct anv_address) { first_bbo->bo, 0 },633};634struct anv_batch local_batch = {635.start = last_bbo->bo->map,636.end = last_bbo->bo->map + last_bbo->bo->size,637.relocs = &last_bbo->relocs,638.alloc = &cmd_buffer_from->pool->alloc,639};640641__anv_cmd_pack(GFX8_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start);642643last_bbo->chained = true;644}645646static void647anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer)648{649assert(cmd_buffer->device->physical->use_softpin);650651struct anv_batch_bo *last_bbo =652list_last_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);653last_bbo->chained = false;654655uint32_t *batch = cmd_buffer->batch_end;656anv_pack_struct(batch, GFX8_MI_BATCH_BUFFER_END,657__anv_cmd_header(GFX8_MI_BATCH_BUFFER_END));658}659660static VkResult661anv_cmd_buffer_chain_batch(struct anv_batch *batch, void *_data)662{663struct anv_cmd_buffer *cmd_buffer = _data;664struct anv_batch_bo *new_bbo;665/* Cap reallocation to chunk. */666uint32_t alloc_size = MIN2(cmd_buffer->total_batch_size,667ANV_MAX_CMD_BUFFER_BATCH_SIZE);668669VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);670if (result != VK_SUCCESS)671return result;672673cmd_buffer->total_batch_size += alloc_size;674675struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);676if (seen_bbo == NULL) {677anv_batch_bo_destroy(new_bbo, cmd_buffer);678return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);679}680*seen_bbo = new_bbo;681682cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo);683684list_addtail(&new_bbo->link, &cmd_buffer->batch_bos);685686anv_batch_bo_start(new_bbo, batch, GFX8_MI_BATCH_BUFFER_START_length * 4);687688return VK_SUCCESS;689}690691static VkResult692anv_cmd_buffer_grow_batch(struct anv_batch *batch, void *_data)693{694struct anv_cmd_buffer *cmd_buffer = _data;695struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);696697anv_batch_bo_grow(cmd_buffer, bbo, &cmd_buffer->batch, 4096,698GFX8_MI_BATCH_BUFFER_START_length * 4);699700return VK_SUCCESS;701}702703/** Allocate a binding table704*705* This function allocates a binding table. This is a bit more complicated706* than one would think due to a combination of Vulkan driver design and some707* unfortunate hardware restrictions.708*709* The 3DSTATE_BINDING_TABLE_POINTERS_* packets only have a 16-bit field for710* the binding table pointer which means that all binding tables need to live711* in the bottom 64k of surface state base address. 
The way the GL driver has712* classically dealt with this restriction is to emit all surface states713* on-the-fly into the batch and have a batch buffer smaller than 64k. This714* isn't really an option in Vulkan for a couple of reasons:715*716* 1) In Vulkan, we have growing (or chaining) batches so surface states have717* to live in their own buffer and we have to be able to re-emit718* STATE_BASE_ADDRESS as needed which requires a full pipeline stall. In719* order to avoid emitting STATE_BASE_ADDRESS any more often than needed720* (it's not that hard to hit 64k of just binding tables), we allocate721* surface state objects up-front when VkImageView is created. In order722* for this to work, surface state objects need to be allocated from a723* global buffer.724*725* 2) We tried to design the surface state system in such a way that it's726* already ready for bindless texturing. The way bindless texturing works727* on our hardware is that you have a big pool of surface state objects728* (with its own state base address) and the bindless handles are simply729* offsets into that pool. With the architecture we chose, we already730* have that pool and it's exactly the same pool that we use for regular731* surface states so we should already be ready for bindless.732*733* 3) For render targets, we need to be able to fill out the surface states734* later in vkBeginRenderPass so that we can assign clear colors735* correctly. One way to do this would be to just create the surface736* state data and then repeatedly copy it into the surface state BO every737* time we have to re-emit STATE_BASE_ADDRESS. While this works, it's738* rather annoying and just being able to allocate them up-front and739* re-use them for the entire render pass.740*741* While none of these are technically blockers for emitting state on the fly742* like we do in GL, the ability to have a single surface state pool is743* simplifies things greatly. Unfortunately, it comes at a cost...744*745* Because of the 64k limitation of 3DSTATE_BINDING_TABLE_POINTERS_*, we can't746* place the binding tables just anywhere in surface state base address.747* Because 64k isn't a whole lot of space, we can't simply restrict the748* surface state buffer to 64k, we have to be more clever. The solution we've749* chosen is to have a block pool with a maximum size of 2G that starts at750* zero and grows in both directions. All surface states are allocated from751* the top of the pool (positive offsets) and we allocate blocks (< 64k) of752* binding tables from the bottom of the pool (negative offsets). Every time753* we allocate a new binding table block, we set surface state base address to754* point to the bottom of the binding table block. This way all of the755* binding tables in the block are in the bottom 64k of surface state base756* address. When we fill out the binding table, we add the distance between757* the bottom of our binding table block and zero of the block pool to the758* surface state offsets so that they are correct relative to out new surface759* state base address at the bottom of the binding table block.760*761* \see adjust_relocations_from_block_pool()762* \see adjust_relocations_too_block_pool()763*764* \param[in] entries The number of surface state entries the binding765* table should be able to hold.766*767* \param[out] state_offset The offset surface surface state base address768* where the surface states live. 
This must be769* added to the surface state offset when it is770* written into the binding table entry.771*772* \return An anv_state representing the binding table773*/774struct anv_state775anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,776uint32_t entries, uint32_t *state_offset)777{778struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);779780uint32_t bt_size = align_u32(entries * 4, 32);781782struct anv_state state = cmd_buffer->bt_next;783if (bt_size > state.alloc_size)784return (struct anv_state) { 0 };785786state.alloc_size = bt_size;787cmd_buffer->bt_next.offset += bt_size;788cmd_buffer->bt_next.map += bt_size;789cmd_buffer->bt_next.alloc_size -= bt_size;790791assert(bt_block->offset < 0);792*state_offset = -bt_block->offset;793794return state;795}796797struct anv_state798anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer)799{800struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;801return anv_state_stream_alloc(&cmd_buffer->surface_state_stream,802isl_dev->ss.size, isl_dev->ss.align);803}804805struct anv_state806anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,807uint32_t size, uint32_t alignment)808{809return anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,810size, alignment);811}812813VkResult814anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)815{816struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states);817if (bt_block == NULL) {818anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);819return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);820}821822*bt_block = anv_binding_table_pool_alloc(cmd_buffer->device);823824/* The bt_next state is a rolling state (we update it as we suballocate825* from it) which is relative to the start of the binding table block.826*/827cmd_buffer->bt_next = *bt_block;828cmd_buffer->bt_next.offset = 0;829830return VK_SUCCESS;831}832833VkResult834anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)835{836struct anv_batch_bo *batch_bo;837VkResult result;838839list_inithead(&cmd_buffer->batch_bos);840841cmd_buffer->total_batch_size = ANV_MIN_CMD_BUFFER_BATCH_SIZE;842843result = anv_batch_bo_create(cmd_buffer,844cmd_buffer->total_batch_size,845&batch_bo);846if (result != VK_SUCCESS)847return result;848849list_addtail(&batch_bo->link, &cmd_buffer->batch_bos);850851cmd_buffer->batch.alloc = &cmd_buffer->pool->alloc;852cmd_buffer->batch.user_data = cmd_buffer;853854if (cmd_buffer->device->can_chain_batches) {855cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;856} else {857cmd_buffer->batch.extend_cb = anv_cmd_buffer_grow_batch;858}859860anv_batch_bo_start(batch_bo, &cmd_buffer->batch,861GFX8_MI_BATCH_BUFFER_START_length * 4);862863int success = u_vector_init(&cmd_buffer->seen_bbos,864sizeof(struct anv_bo *),8658 * sizeof(struct anv_bo *));866if (!success)867goto fail_batch_bo;868869*(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo;870871/* u_vector requires power-of-two size elements */872unsigned pow2_state_size = util_next_power_of_two(sizeof(struct anv_state));873success = u_vector_init(&cmd_buffer->bt_block_states,874pow2_state_size, 8 * pow2_state_size);875if (!success)876goto fail_seen_bbos;877878result = anv_reloc_list_init(&cmd_buffer->surface_relocs,879&cmd_buffer->pool->alloc);880if (result != VK_SUCCESS)881goto fail_bt_blocks;882cmd_buffer->last_ss_pool_center = 0;883884result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);885if (result != 
VK_SUCCESS)886goto fail_bt_blocks;887888return VK_SUCCESS;889890fail_bt_blocks:891u_vector_finish(&cmd_buffer->bt_block_states);892fail_seen_bbos:893u_vector_finish(&cmd_buffer->seen_bbos);894fail_batch_bo:895anv_batch_bo_destroy(batch_bo, cmd_buffer);896897return result;898}899900void901anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)902{903struct anv_state *bt_block;904u_vector_foreach(bt_block, &cmd_buffer->bt_block_states)905anv_binding_table_pool_free(cmd_buffer->device, *bt_block);906u_vector_finish(&cmd_buffer->bt_block_states);907908anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc);909910u_vector_finish(&cmd_buffer->seen_bbos);911912/* Destroy all of the batch buffers */913list_for_each_entry_safe(struct anv_batch_bo, bbo,914&cmd_buffer->batch_bos, link) {915list_del(&bbo->link);916anv_batch_bo_destroy(bbo, cmd_buffer);917}918}919920void921anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)922{923/* Delete all but the first batch bo */924assert(!list_is_empty(&cmd_buffer->batch_bos));925while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) {926struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);927list_del(&bbo->link);928anv_batch_bo_destroy(bbo, cmd_buffer);929}930assert(!list_is_empty(&cmd_buffer->batch_bos));931932anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer),933&cmd_buffer->batch,934GFX8_MI_BATCH_BUFFER_START_length * 4);935936while (u_vector_length(&cmd_buffer->bt_block_states) > 1) {937struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states);938anv_binding_table_pool_free(cmd_buffer->device, *bt_block);939}940assert(u_vector_length(&cmd_buffer->bt_block_states) == 1);941cmd_buffer->bt_next = *(struct anv_state *)u_vector_head(&cmd_buffer->bt_block_states);942cmd_buffer->bt_next.offset = 0;943944anv_reloc_list_clear(&cmd_buffer->surface_relocs);945cmd_buffer->last_ss_pool_center = 0;946947/* Reset the list of seen buffers */948cmd_buffer->seen_bbos.head = 0;949cmd_buffer->seen_bbos.tail = 0;950951struct anv_batch_bo *first_bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);952953*(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = first_bbo;954955956assert(!cmd_buffer->device->can_chain_batches ||957first_bbo->bo->size == ANV_MIN_CMD_BUFFER_BATCH_SIZE);958cmd_buffer->total_batch_size = first_bbo->bo->size;959}960961void962anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)963{964struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);965966if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {967/* When we start a batch buffer, we subtract a certain amount of968* padding from the end to ensure that we always have room to emit a969* BATCH_BUFFER_START to chain to the next BO. We need to remove970* that padding before we end the batch; otherwise, we may end up971* with our BATCH_BUFFER_END in another BO.972*/973cmd_buffer->batch.end += GFX8_MI_BATCH_BUFFER_START_length * 4;974assert(cmd_buffer->batch.start == batch_bo->bo->map);975assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);976977/* Save end instruction location to override it later. 
*/978cmd_buffer->batch_end = cmd_buffer->batch.next;979980/* If we can chain this command buffer to another one, leave some place981* for the jump instruction.982*/983batch_bo->chained = anv_cmd_buffer_is_chainable(cmd_buffer);984if (batch_bo->chained)985emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0);986else987anv_batch_emit(&cmd_buffer->batch, GFX8_MI_BATCH_BUFFER_END, bbe);988989/* Round batch up to an even number of dwords. */990if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4)991anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop);992993cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;994} else {995assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);996/* If this is a secondary command buffer, we need to determine the997* mode in which it will be executed with vkExecuteCommands. We998* determine this statically here so that this stays in sync with the999* actual ExecuteCommands implementation.1000*/1001const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start;1002if (!cmd_buffer->device->can_chain_batches) {1003cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT;1004} else if (cmd_buffer->device->physical->use_call_secondary) {1005cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN;1006/* If the secondary command buffer begins & ends in the same BO and1007* its length is less than the length of CS prefetch, add some NOOPs1008* instructions so the last MI_BATCH_BUFFER_START is outside the CS1009* prefetch.1010*/1011if (cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) {1012const struct intel_device_info *devinfo = &cmd_buffer->device->info;1013/* Careful to have everything in signed integer. */1014int32_t prefetch_len = devinfo->cs_prefetch_size;1015int32_t batch_len =1016cmd_buffer->batch.next - cmd_buffer->batch.start;10171018for (int32_t i = 0; i < (prefetch_len - batch_len); i += 4)1019anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop);1020}10211022void *jump_addr =1023anv_batch_emitn(&cmd_buffer->batch,1024GFX8_MI_BATCH_BUFFER_START_length,1025GFX8_MI_BATCH_BUFFER_START,1026.AddressSpaceIndicator = ASI_PPGTT,1027.SecondLevelBatchBuffer = Firstlevelbatch) +1028(GFX8_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8);1029cmd_buffer->return_addr = anv_batch_address(&cmd_buffer->batch, jump_addr);10301031/* The emit above may have caused us to chain batch buffers which1032* would mean that batch_bo is no longer valid.1033*/1034batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);1035} else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) &&1036(length < ANV_MIN_CMD_BUFFER_BATCH_SIZE / 2)) {1037/* If the secondary has exactly one batch buffer in its list *and*1038* that batch buffer is less than half of the maximum size, we're1039* probably better of simply copying it into our batch.1040*/1041cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_EMIT;1042} else if (!(cmd_buffer->usage_flags &1043VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {1044cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN;10451046/* In order to chain, we need this command buffer to contain an1047* MI_BATCH_BUFFER_START which will jump back to the calling batch.1048* It doesn't matter where it points now so long as has a valid1049* relocation. We'll adjust it later as part of the chaining1050* process.1051*1052* We set the end of the batch a little short so we would be sure we1053* have room for the chaining command. 
Since we're about to emit the1054* chaining command, let's set it back where it should go.1055*/1056cmd_buffer->batch.end += GFX8_MI_BATCH_BUFFER_START_length * 4;1057assert(cmd_buffer->batch.start == batch_bo->bo->map);1058assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);10591060emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0);1061assert(cmd_buffer->batch.start == batch_bo->bo->map);1062} else {1063cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN;1064}1065}10661067anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);1068}10691070static VkResult1071anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer,1072struct list_head *list)1073{1074list_for_each_entry(struct anv_batch_bo, bbo, list, link) {1075struct anv_batch_bo **bbo_ptr = u_vector_add(&cmd_buffer->seen_bbos);1076if (bbo_ptr == NULL)1077return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);10781079*bbo_ptr = bbo;1080}10811082return VK_SUCCESS;1083}10841085void1086anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,1087struct anv_cmd_buffer *secondary)1088{1089anv_measure_add_secondary(primary, secondary);1090switch (secondary->exec_mode) {1091case ANV_CMD_BUFFER_EXEC_MODE_EMIT:1092anv_batch_emit_batch(&primary->batch, &secondary->batch);1093break;1094case ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT: {1095struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(primary);1096unsigned length = secondary->batch.end - secondary->batch.start;1097anv_batch_bo_grow(primary, bbo, &primary->batch, length,1098GFX8_MI_BATCH_BUFFER_START_length * 4);1099anv_batch_emit_batch(&primary->batch, &secondary->batch);1100break;1101}1102case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: {1103struct anv_batch_bo *first_bbo =1104list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);1105struct anv_batch_bo *last_bbo =1106list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link);11071108emit_batch_buffer_start(primary, first_bbo->bo, 0);11091110struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary);1111assert(primary->batch.start == this_bbo->bo->map);1112uint32_t offset = primary->batch.next - primary->batch.start;11131114/* Make the tail of the secondary point back to right after the1115* MI_BATCH_BUFFER_START in the primary batch.1116*/1117anv_batch_bo_link(primary, last_bbo, this_bbo, offset);11181119anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);1120break;1121}1122case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: {1123struct list_head copy_list;1124VkResult result = anv_batch_bo_list_clone(&secondary->batch_bos,1125secondary,1126©_list);1127if (result != VK_SUCCESS)1128return; /* FIXME */11291130anv_cmd_buffer_add_seen_bbos(primary, ©_list);11311132struct anv_batch_bo *first_bbo =1133list_first_entry(©_list, struct anv_batch_bo, link);1134struct anv_batch_bo *last_bbo =1135list_last_entry(©_list, struct anv_batch_bo, link);11361137cmd_buffer_chain_to_batch_bo(primary, first_bbo);11381139list_splicetail(©_list, &primary->batch_bos);11401141anv_batch_bo_continue(last_bbo, &primary->batch,1142GFX8_MI_BATCH_BUFFER_START_length * 4);1143break;1144}1145case ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN: {1146struct anv_batch_bo *first_bbo =1147list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);11481149uint64_t *write_return_addr =1150anv_batch_emitn(&primary->batch,1151GFX8_MI_STORE_DATA_IMM_length + 1 /* QWord write */,1152GFX8_MI_STORE_DATA_IMM,1153.Address = secondary->return_addr)1154+ (GFX8_MI_STORE_DATA_IMM_ImmediateData_start / 
8);11551156emit_batch_buffer_start(primary, first_bbo->bo, 0);11571158*write_return_addr =1159anv_address_physical(anv_batch_address(&primary->batch,1160primary->batch.next));11611162anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);1163break;1164}1165default:1166assert(!"Invalid execution mode");1167}11681169anv_reloc_list_append(&primary->surface_relocs, &primary->pool->alloc,1170&secondary->surface_relocs, 0);1171}11721173struct anv_execbuf {1174struct drm_i915_gem_execbuffer2 execbuf;11751176struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences;11771178struct drm_i915_gem_exec_object2 * objects;1179uint32_t bo_count;1180struct anv_bo ** bos;11811182/* Allocated length of the 'objects' and 'bos' arrays */1183uint32_t array_length;11841185/* List of relocations for surface states, only used with platforms not1186* using softpin.1187*/1188void * surface_states_relocs;11891190/* Indicates whether any of the command buffers have relocations. This1191* doesn't not necessarily mean we'll need the kernel to process them. It1192* might be that a previous execbuf has already placed things in the VMA1193* and we can make i915 skip the relocations.1194*/1195bool has_relocs;11961197const VkAllocationCallbacks * alloc;1198VkSystemAllocationScope alloc_scope;11991200int perf_query_pass;1201};12021203static void1204anv_execbuf_init(struct anv_execbuf *exec)1205{1206memset(exec, 0, sizeof(*exec));1207}12081209static void1210anv_execbuf_finish(struct anv_execbuf *exec)1211{1212vk_free(exec->alloc, exec->surface_states_relocs);1213vk_free(exec->alloc, exec->objects);1214vk_free(exec->alloc, exec->bos);1215}12161217static void1218anv_execbuf_add_ext(struct anv_execbuf *exec,1219uint32_t ext_name,1220struct i915_user_extension *ext)1221{1222__u64 *iter = &exec->execbuf.cliprects_ptr;12231224exec->execbuf.flags |= I915_EXEC_USE_EXTENSIONS;12251226while (*iter != 0) {1227iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension;1228}12291230ext->name = ext_name;12311232*iter = (uintptr_t) ext;1233}12341235static VkResult1236anv_execbuf_add_bo_bitset(struct anv_device *device,1237struct anv_execbuf *exec,1238uint32_t dep_words,1239BITSET_WORD *deps,1240uint32_t extra_flags);12411242static VkResult1243anv_execbuf_add_bo(struct anv_device *device,1244struct anv_execbuf *exec,1245struct anv_bo *bo,1246struct anv_reloc_list *relocs,1247uint32_t extra_flags)1248{1249struct drm_i915_gem_exec_object2 *obj = NULL;12501251bo = anv_bo_unwrap(bo);12521253if (bo->index < exec->bo_count && exec->bos[bo->index] == bo)1254obj = &exec->objects[bo->index];12551256if (obj == NULL) {1257/* We've never seen this one before. Add it to the list and assign1258* an id that we can use later.1259*/1260if (exec->bo_count >= exec->array_length) {1261uint32_t new_len = exec->objects ? 
exec->array_length * 2 : 64;12621263struct drm_i915_gem_exec_object2 *new_objects =1264vk_alloc(exec->alloc, new_len * sizeof(*new_objects), 8, exec->alloc_scope);1265if (new_objects == NULL)1266return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);12671268struct anv_bo **new_bos =1269vk_alloc(exec->alloc, new_len * sizeof(*new_bos), 8, exec->alloc_scope);1270if (new_bos == NULL) {1271vk_free(exec->alloc, new_objects);1272return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);1273}12741275if (exec->objects) {1276memcpy(new_objects, exec->objects,1277exec->bo_count * sizeof(*new_objects));1278memcpy(new_bos, exec->bos,1279exec->bo_count * sizeof(*new_bos));1280}12811282vk_free(exec->alloc, exec->objects);1283vk_free(exec->alloc, exec->bos);12841285exec->objects = new_objects;1286exec->bos = new_bos;1287exec->array_length = new_len;1288}12891290assert(exec->bo_count < exec->array_length);12911292bo->index = exec->bo_count++;1293obj = &exec->objects[bo->index];1294exec->bos[bo->index] = bo;12951296obj->handle = bo->gem_handle;1297obj->relocation_count = 0;1298obj->relocs_ptr = 0;1299obj->alignment = 0;1300obj->offset = bo->offset;1301obj->flags = bo->flags | extra_flags;1302obj->rsvd1 = 0;1303obj->rsvd2 = 0;1304}13051306if (extra_flags & EXEC_OBJECT_WRITE) {1307obj->flags |= EXEC_OBJECT_WRITE;1308obj->flags &= ~EXEC_OBJECT_ASYNC;1309}13101311if (relocs != NULL) {1312assert(obj->relocation_count == 0);13131314if (relocs->num_relocs > 0) {1315/* This is the first time we've ever seen a list of relocations for1316* this BO. Go ahead and set the relocations and then walk the list1317* of relocations and add them all.1318*/1319exec->has_relocs = true;1320obj->relocation_count = relocs->num_relocs;1321obj->relocs_ptr = (uintptr_t) relocs->relocs;13221323for (size_t i = 0; i < relocs->num_relocs; i++) {1324VkResult result;13251326/* A quick sanity check on relocations */1327assert(relocs->relocs[i].offset < bo->size);1328result = anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i],1329NULL, extra_flags);1330if (result != VK_SUCCESS)1331return result;1332}1333}13341335return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words,1336relocs->deps, extra_flags);1337}13381339return VK_SUCCESS;1340}13411342/* Add BO dependencies to execbuf */1343static VkResult1344anv_execbuf_add_bo_bitset(struct anv_device *device,1345struct anv_execbuf *exec,1346uint32_t dep_words,1347BITSET_WORD *deps,1348uint32_t extra_flags)1349{1350for (uint32_t w = 0; w < dep_words; w++) {1351BITSET_WORD mask = deps[w];1352while (mask) {1353int i = u_bit_scan(&mask);1354uint32_t gem_handle = w * BITSET_WORDBITS + i;1355struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);1356assert(bo->refcount > 0);1357VkResult result =1358anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags);1359if (result != VK_SUCCESS)1360return result;1361}1362}13631364return VK_SUCCESS;1365}13661367static void1368anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer,1369struct anv_reloc_list *list)1370{1371for (size_t i = 0; i < list->num_relocs; i++)1372list->relocs[i].target_handle = anv_bo_unwrap(list->reloc_bos[i])->index;1373}13741375static void1376adjust_relocations_from_state_pool(struct anv_state_pool *pool,1377struct anv_reloc_list *relocs,1378uint32_t last_pool_center_bo_offset)1379{1380assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset);1381uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset;13821383for (size_t i = 0; i < relocs->num_relocs; i++) {1384/* All of the relocations from this block 
pool to other BO's should1385* have been emitted relative to the surface block pool center. We1386* need to add the center offset to make them relative to the1387* beginning of the actual GEM bo.1388*/1389relocs->relocs[i].offset += delta;1390}1391}13921393static void1394adjust_relocations_to_state_pool(struct anv_state_pool *pool,1395struct anv_bo *from_bo,1396struct anv_reloc_list *relocs,1397uint32_t last_pool_center_bo_offset)1398{1399assert(!from_bo->is_wrapper);1400assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset);1401uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset;14021403/* When we initially emit relocations into a block pool, we don't1404* actually know what the final center_bo_offset will be so we just emit1405* it as if center_bo_offset == 0. Now that we know what the center1406* offset is, we need to walk the list of relocations and adjust any1407* relocations that point to the pool bo with the correct offset.1408*/1409for (size_t i = 0; i < relocs->num_relocs; i++) {1410if (relocs->reloc_bos[i] == pool->block_pool.bo) {1411/* Adjust the delta value in the relocation to correctly1412* correspond to the new delta. Initially, this value may have1413* been negative (if treated as unsigned), but we trust in1414* uint32_t roll-over to fix that for us at this point.1415*/1416relocs->relocs[i].delta += delta;14171418/* Since the delta has changed, we need to update the actual1419* relocated value with the new presumed value. This function1420* should only be called on batch buffers, so we know it isn't in1421* use by the GPU at the moment.1422*/1423assert(relocs->relocs[i].offset < from_bo->size);1424write_reloc(pool->block_pool.device,1425from_bo->map + relocs->relocs[i].offset,1426relocs->relocs[i].presumed_offset +1427relocs->relocs[i].delta, false);1428}1429}1430}14311432static void1433anv_reloc_list_apply(struct anv_device *device,1434struct anv_reloc_list *list,1435struct anv_bo *bo,1436bool always_relocate)1437{1438bo = anv_bo_unwrap(bo);14391440for (size_t i = 0; i < list->num_relocs; i++) {1441struct anv_bo *target_bo = anv_bo_unwrap(list->reloc_bos[i]);1442if (list->relocs[i].presumed_offset == target_bo->offset &&1443!always_relocate)1444continue;14451446void *p = bo->map + list->relocs[i].offset;1447write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true);1448list->relocs[i].presumed_offset = target_bo->offset;1449}1450}14511452/**1453* This function applies the relocation for a command buffer and writes the1454* actual addresses into the buffers as per what we were told by the kernel on1455* the previous execbuf2 call. This should be safe to do because, for each1456* relocated address, we have two cases:1457*1458* 1) The target BO is inactive (as seen by the kernel). In this case, it is1459* not in use by the GPU so updating the address is 100% ok. It won't be1460* in-use by the GPU (from our context) again until the next execbuf21461* happens. If the kernel decides to move it in the next execbuf2, it1462* will have to do the relocations itself, but that's ok because it should1463* have all of the information needed to do so.1464*1465* 2) The target BO is active (as seen by the kernel). In this case, it1466* hasn't moved since the last execbuffer2 call because GTT shuffling1467* *only* happens when the BO is idle. 
(From our perspective, it only1468* happens inside the execbuffer2 ioctl, but the shuffling may be1469* triggered by another ioctl, with full-ppgtt this is limited to only1470* execbuffer2 ioctls on the same context, or memory pressure.) Since the1471* target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT1472* address and the relocated value we are writing into the BO will be the1473* same as the value that is already there.1474*1475* There is also a possibility that the target BO is active but the exact1476* RENDER_SURFACE_STATE object we are writing the relocation into isn't in1477* use. In this case, the address currently in the RENDER_SURFACE_STATE1478* may be stale but it's still safe to write the relocation because that1479* particular RENDER_SURFACE_STATE object isn't in-use by the GPU and1480* won't be until the next execbuf2 call.1481*1482* By doing relocations on the CPU, we can tell the kernel that it doesn't1483* need to bother. We want to do this because the surface state buffer is1484* used by every command buffer so, if the kernel does the relocations, it1485* will always be busy and the kernel will always stall. This is also1486* probably the fastest mechanism for doing relocations since the kernel would1487* have to make a full copy of all the relocations lists.1488*/1489static bool1490execbuf_can_skip_relocations(struct anv_execbuf *exec)1491{1492if (!exec->has_relocs)1493return true;14941495static int userspace_relocs = -1;1496if (userspace_relocs < 0)1497userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true);1498if (!userspace_relocs)1499return false;15001501/* First, we have to check to see whether or not we can even do the1502* relocation. New buffers which have never been submitted to the kernel1503* don't have a valid offset so we need to let the kernel do relocations so1504* that we can get offsets for them. On future execbuf2 calls, those1505* buffers will have offsets and we will be able to skip relocating.1506* Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1.1507*/1508for (uint32_t i = 0; i < exec->bo_count; i++) {1509assert(!exec->bos[i]->is_wrapper);1510if (exec->bos[i]->offset == (uint64_t)-1)1511return false;1512}15131514return true;1515}15161517static void1518relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,1519struct anv_execbuf *exec)1520{1521/* Since surface states are shared between command buffers and we don't1522* know what order they will be submitted to the kernel, we don't know1523* what address is actually written in the surface state object at any1524* given time. 
The only option is to always relocate them.1525*/1526struct anv_bo *surface_state_bo =1527anv_bo_unwrap(cmd_buffer->device->surface_state_pool.block_pool.bo);1528anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs,1529surface_state_bo,1530true /* always relocate surface states */);15311532/* Since we own all of the batch buffers, we know what values are stored1533* in the relocated addresses and only have to update them if the offsets1534* have changed.1535*/1536struct anv_batch_bo **bbo;1537u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {1538anv_reloc_list_apply(cmd_buffer->device,1539&(*bbo)->relocs, (*bbo)->bo, false);1540}15411542for (uint32_t i = 0; i < exec->bo_count; i++)1543exec->objects[i].offset = exec->bos[i]->offset;1544}15451546static void1547reset_cmd_buffer_surface_offsets(struct anv_cmd_buffer *cmd_buffer)1548{1549/* In the case where we fall back to doing kernel relocations, we need to1550* ensure that the relocation list is valid. All relocations on the batch1551* buffers are already valid and kept up-to-date. Since surface states are1552* shared between command buffers and we don't know what order they will be1553* submitted to the kernel, we don't know what address is actually written1554* in the surface state object at any given time. The only option is to set1555* a bogus presumed offset and let the kernel relocate them.1556*/1557for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)1558cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;1559}15601561static VkResult1562setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,1563struct anv_cmd_buffer *cmd_buffer)1564{1565struct anv_state_pool *ss_pool =1566&cmd_buffer->device->surface_state_pool;15671568adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs,1569cmd_buffer->last_ss_pool_center);1570VkResult result;1571if (cmd_buffer->device->physical->use_softpin) {1572/* Add surface dependencies (BOs) to the execbuf */1573anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf,1574cmd_buffer->surface_relocs.dep_words,1575cmd_buffer->surface_relocs.deps, 0);1576} else {1577/* Since we aren't in the softpin case, all of our STATE_BASE_ADDRESS BOs1578* will get added automatically by processing relocations on the batch1579* buffer. 
We have to add the surface state BO manually because it has1580* relocations of its own that we need to be sure are processsed.1581*/1582result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,1583ss_pool->block_pool.bo,1584&cmd_buffer->surface_relocs, 0);1585if (result != VK_SUCCESS)1586return result;1587}15881589/* First, we walk over all of the bos we've seen and add them and their1590* relocations to the validate list.1591*/1592struct anv_batch_bo **bbo;1593u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {1594adjust_relocations_to_state_pool(ss_pool, (*bbo)->bo, &(*bbo)->relocs,1595cmd_buffer->last_ss_pool_center);15961597result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,1598(*bbo)->bo, &(*bbo)->relocs, 0);1599if (result != VK_SUCCESS)1600return result;1601}16021603/* Now that we've adjusted all of the surface state relocations, we need to1604* record the surface state pool center so future executions of the command1605* buffer can adjust correctly.1606*/1607cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset;16081609return VK_SUCCESS;1610}16111612static void1613chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,1614uint32_t num_cmd_buffers)1615{1616if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) {1617assert(num_cmd_buffers == 1);1618return;1619}16201621/* Chain the N-1 first batch buffers */1622for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++)1623anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]);16241625/* Put an end to the last one */1626anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]);1627}16281629static VkResult1630setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,1631struct anv_queue *queue,1632struct anv_cmd_buffer **cmd_buffers,1633uint32_t num_cmd_buffers)1634{1635struct anv_device *device = queue->device;1636struct anv_state_pool *ss_pool = &device->surface_state_pool;1637VkResult result;16381639/* Edit the tail of the command buffers to chain them all together if they1640* can be.1641*/1642chain_command_buffers(cmd_buffers, num_cmd_buffers);16431644for (uint32_t i = 0; i < num_cmd_buffers; i++) {1645result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]);1646if (result != VK_SUCCESS)1647return result;1648}16491650/* Add all the global BOs to the object list for softpin case. 
*/1651if (device->physical->use_softpin) {1652anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) {1653result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);1654if (result != VK_SUCCESS)1655return result;1656}16571658struct anv_block_pool *pool;1659pool = &device->dynamic_state_pool.block_pool;1660anv_block_pool_foreach_bo(bo, pool) {1661result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);1662if (result != VK_SUCCESS)1663return result;1664}16651666pool = &device->general_state_pool.block_pool;1667anv_block_pool_foreach_bo(bo, pool) {1668result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);1669if (result != VK_SUCCESS)1670return result;1671}16721673pool = &device->instruction_state_pool.block_pool;1674anv_block_pool_foreach_bo(bo, pool) {1675result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);1676if (result != VK_SUCCESS)1677return result;1678}16791680pool = &device->binding_table_pool.block_pool;1681anv_block_pool_foreach_bo(bo, pool) {1682result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);1683if (result != VK_SUCCESS)1684return result;1685}16861687/* Add the BOs for all user allocated memory objects because we can't1688* track after binding updates of VK_EXT_descriptor_indexing.1689*/1690list_for_each_entry(struct anv_device_memory, mem,1691&device->memory_objects, link) {1692result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0);1693if (result != VK_SUCCESS)1694return result;1695}1696} else {1697/* We do not support chaining primary command buffers without1698* softpin.1699*/1700assert(num_cmd_buffers == 1);1701}17021703bool no_reloc = true;1704if (execbuf->has_relocs) {1705no_reloc = execbuf_can_skip_relocations(execbuf);1706if (no_reloc) {1707/* If we were able to successfully relocate everything, tell the1708* kernel that it can skip doing relocations. The requirement for1709* using NO_RELOC is:1710*1711* 1) The addresses written in the objects must match the1712* corresponding reloc.presumed_offset which in turn must match1713* the corresponding execobject.offset.1714*1715* 2) To avoid stalling, execobject.offset should match the current1716* address of that object within the active context.1717*1718* In order to satisfy all of the invariants that make userspace1719* relocations to be safe (see relocate_cmd_buffer()), we need to1720* further ensure that the addresses we use match those used by the1721* kernel for the most recent execbuf2.1722*1723* The kernel may still choose to do relocations anyway if something1724* has moved in the GTT. In this case, the relocation list still1725* needs to be valid. All relocations on the batch buffers are1726* already valid and kept up-to-date. For surface state relocations,1727* by applying the relocations in relocate_cmd_buffer, we ensured1728* that the address in the RENDER_SURFACE_STATE matches1729* presumed_offset, so it should be safe for the kernel to relocate1730* them as needed.1731*/1732for (uint32_t i = 0; i < num_cmd_buffers; i++) {1733relocate_cmd_buffer(cmd_buffers[i], execbuf);17341735anv_reloc_list_apply(device, &cmd_buffers[i]->surface_relocs,1736device->surface_state_pool.block_pool.bo,1737true /* always relocate surface states */);1738}1739} else {1740/* In the case where we fall back to doing kernel relocations, we1741* need to ensure that the relocation list is valid. All relocations1742* on the batch buffers are already valid and kept up-to-date. 
   bool no_reloc = true;
   if (execbuf->has_relocs) {
      no_reloc = execbuf_can_skip_relocations(execbuf);
      if (no_reloc) {
         /* If we were able to successfully relocate everything, tell the
          * kernel that it can skip doing relocations. The requirements for
          * using NO_RELOC are:
          *
          *    1) The addresses written in the objects must match the
          *       corresponding reloc.presumed_offset, which in turn must
          *       match the corresponding execobject.offset.
          *
          *    2) To avoid stalling, execobject.offset should match the
          *       current address of that object within the active context.
          *
          * In order to satisfy all of the invariants that make userspace
          * relocations safe (see relocate_cmd_buffer()), we need to further
          * ensure that the addresses we use match those used by the kernel
          * for the most recent execbuf2.
          *
          * The kernel may still choose to do relocations anyway if something
          * has moved in the GTT. In this case, the relocation list still
          * needs to be valid. All relocations on the batch buffers are
          * already valid and kept up-to-date. For surface state relocations,
          * by applying the relocations in relocate_cmd_buffer, we ensured
          * that the address in the RENDER_SURFACE_STATE matches
          * presumed_offset, so it should be safe for the kernel to relocate
          * them as needed.
          */
         for (uint32_t i = 0; i < num_cmd_buffers; i++) {
            relocate_cmd_buffer(cmd_buffers[i], execbuf);

            anv_reloc_list_apply(device, &cmd_buffers[i]->surface_relocs,
                                 device->surface_state_pool.block_pool.bo,
                                 true /* always relocate surface states */);
         }
      } else {
         /* In the case where we fall back to doing kernel relocations, we
          * need to ensure that the relocation list is valid. All relocations
          * on the batch buffers are already valid and kept up-to-date. Since
          * surface states are shared between command buffers and we don't
          * know what order they will be submitted to the kernel, we don't
          * know what address is actually written in the surface state object
          * at any given time. The only option is to set a bogus presumed
          * offset and let the kernel relocate them.
          */
         for (uint32_t i = 0; i < num_cmd_buffers; i++)
            reset_cmd_buffer_surface_offsets(cmd_buffers[i]);
      }
   }

   struct anv_batch_bo *first_batch_bo =
      list_first_entry(&cmd_buffers[0]->batch_bos, struct anv_batch_bo, link);

   /* The kernel requires that the last entry in the validation list be the
    * batch buffer to execute. We can simply swap the element corresponding
    * to the first batch_bo in the chain with the last element in the list.
    */
   if (first_batch_bo->bo->index != execbuf->bo_count - 1) {
      uint32_t idx = first_batch_bo->bo->index;
      uint32_t last_idx = execbuf->bo_count - 1;

      struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
      assert(execbuf->bos[idx] == first_batch_bo->bo);

      execbuf->objects[idx] = execbuf->objects[last_idx];
      execbuf->bos[idx] = execbuf->bos[last_idx];
      execbuf->bos[idx]->index = idx;

      execbuf->objects[last_idx] = tmp_obj;
      execbuf->bos[last_idx] = first_batch_bo->bo;
      first_batch_bo->bo->index = last_idx;
   }

   /* If we are pinning our BOs, we shouldn't have to relocate anything */
   if (device->physical->use_softpin)
      assert(!execbuf->has_relocs);

   /* Now we go through and fix up all of the relocation lists to point to
    * the correct indices in the object array (I915_EXEC_HANDLE_LUT). We have
    * to do this after we reorder the list above as some of the indices may
    * have changed.
    */
   struct anv_batch_bo **bbo;
   if (execbuf->has_relocs) {
      assert(num_cmd_buffers == 1);
      u_vector_foreach(bbo, &cmd_buffers[0]->seen_bbos)
         anv_cmd_buffer_process_relocs(cmd_buffers[0], &(*bbo)->relocs);

      anv_cmd_buffer_process_relocs(cmd_buffers[0], &cmd_buffers[0]->surface_relocs);
   }
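
   /* On platforms without a coherent LLC the GPU is not guaranteed to see
    * batch contents that are still sitting in the CPU cache, so fence and
    * then flush every cacheline of every batch BO before handing the work to
    * the kernel.
    */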
   if (!device->info.has_llc) {
      __builtin_ia32_mfence();
      for (uint32_t i = 0; i < num_cmd_buffers; i++) {
         u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
            for (uint32_t off = 0; off < (*bbo)->length; off += CACHELINE_SIZE)
               __builtin_ia32_clflush((*bbo)->bo->map + off);
         }
      }
   }

   struct anv_batch *batch = &cmd_buffers[0]->batch;
   execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
      .buffers_ptr = (uintptr_t) execbuf->objects,
      .buffer_count = execbuf->bo_count,
      .batch_start_offset = 0,
      /* On platforms that cannot chain batch buffers because of the i915
       * command parser, we have to provide the batch length. Everywhere else
       * we chain batches, so there is no point in passing a length.
       */
      .batch_len = device->can_chain_batches ? 0 : batch->next - batch->start,
      .cliprects_ptr = 0,
      .num_cliprects = 0,
      .DR1 = 0,
      .DR4 = 0,
      .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags |
               (no_reloc ? I915_EXEC_NO_RELOC : 0),
      .rsvd1 = device->context_id,
      .rsvd2 = 0,
   };

   return VK_SUCCESS;
}

static VkResult
setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue)
{
   struct anv_device *device = queue->device;
   VkResult result = anv_execbuf_add_bo(device, execbuf,
                                        device->trivial_batch_bo,
                                        NULL, 0);
   if (result != VK_SUCCESS)
      return result;

   execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
      .buffers_ptr = (uintptr_t) execbuf->objects,
      .buffer_count = execbuf->bo_count,
      .batch_start_offset = 0,
      .batch_len = 8, /* GFX7_MI_BATCH_BUFFER_END and NOOP */
      .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC,
      .rsvd1 = device->context_id,
      .rsvd2 = 0,
   };

   return VK_SUCCESS;
}

/* We lock around execbuf for three main reasons:
 *
 * 1) When a block pool is resized, we create a new gem handle with a
 *    different size and, in the case of surface states, possibly a different
 *    center offset, but we re-use the same anv_bo struct when we do so. If
 *    this happens in the middle of setting up an execbuf, we could end up
 *    with our list of BOs out of sync with our list of gem handles.
 *
 * 2) The algorithm we use for building the list of unique buffers isn't
 *    thread-safe. While the client is supposed to synchronize around
 *    QueueSubmit, this would be extremely difficult to debug if it ever came
 *    up in the wild due to a broken app. It's better to play it safe and
 *    just lock around QueueSubmit.
 *
 * 3) The anv_cmd_buffer_execbuf function may perform relocations in
 *    userspace. Because the surface state buffer is shared between batches,
 *    we can't afford to have that happen from multiple threads at the same
 *    time. Even though the user is supposed to ensure this doesn't happen,
 *    we play it safe as in (2) above.
 *
 * Since the only other operations that ever take the device lock, such as
 * block pool resizes, happen rarely, the lock is almost never contended and
 * taking it isn't really an expensive operation in this case.
 */
VkResult
anv_queue_execbuf_locked(struct anv_queue *queue,
                         struct anv_queue_submit *submit)
{
   struct anv_device *device = queue->device;
   struct anv_execbuf execbuf;
   anv_execbuf_init(&execbuf);
   execbuf.alloc = submit->alloc;
   execbuf.alloc_scope = submit->alloc_scope;
   execbuf.perf_query_pass = submit->perf_query_pass;

   /* Always add the workaround BO as it includes a driver identifier for the
    * error_state.
    */
   VkResult result =
      anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0);
   if (result != VK_SUCCESS)
      goto error;
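
   /* Each fence_bos entry is a pointer tagged with one bit that records
    * whether this submission signals the fence. BOs for signaled fences are
    * added with EXEC_OBJECT_WRITE so the kernel treats this batch as a
    * writer of the BO, which is what lets a later wait on that BO cover
    * this batch.
    */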
   for (uint32_t i = 0; i < submit->fence_bo_count; i++) {
      int signaled;
      struct anv_bo *bo = anv_unpack_ptr(submit->fence_bos[i], 1, &signaled);

      result = anv_execbuf_add_bo(device, &execbuf, bo, NULL,
                                  signaled ? EXEC_OBJECT_WRITE : 0);
      if (result != VK_SUCCESS)
         goto error;
   }

   if (submit->cmd_buffer_count) {
      result = setup_execbuf_for_cmd_buffers(&execbuf, queue,
                                             submit->cmd_buffers,
                                             submit->cmd_buffer_count);
   } else if (submit->simple_bo) {
      result = anv_execbuf_add_bo(device, &execbuf, submit->simple_bo, NULL, 0);
      if (result != VK_SUCCESS)
         goto error;

      execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
         .buffers_ptr = (uintptr_t) execbuf.objects,
         .buffer_count = execbuf.bo_count,
         .batch_start_offset = 0,
         .batch_len = submit->simple_bo_size,
         .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC,
         .rsvd1 = device->context_id,
         .rsvd2 = 0,
      };
   } else {
      result = setup_empty_execbuf(&execbuf, queue);
   }

   if (result != VK_SUCCESS)
      goto error;

   const bool has_perf_query =
      submit->perf_query_pass >= 0 &&
      submit->cmd_buffer_count &&
      submit->perf_query_pool;

   if (INTEL_DEBUG & DEBUG_SUBMIT) {
      fprintf(stderr, "Batch offset=0x%x len=0x%x on queue 0\n",
              execbuf.execbuf.batch_start_offset, execbuf.execbuf.batch_len);
      for (uint32_t i = 0; i < execbuf.bo_count; i++) {
         const struct anv_bo *bo = execbuf.bos[i];

         fprintf(stderr, " BO: addr=0x%016"PRIx64" size=%010"PRIx64" handle=%05u name=%s\n",
                 bo->offset, bo->size, bo->gem_handle, bo->name);
      }
   }

   if (INTEL_DEBUG & DEBUG_BATCH) {
      fprintf(stderr, "Batch on queue %d\n", (int)(queue - device->queues));
      if (submit->cmd_buffer_count) {
         if (has_perf_query) {
            struct anv_query_pool *query_pool = submit->perf_query_pool;
            struct anv_bo *pass_batch_bo = query_pool->bo;
            uint64_t pass_batch_offset =
               khr_perf_query_preamble_offset(query_pool,
                                              submit->perf_query_pass);

            intel_print_batch(&device->decoder_ctx,
                              pass_batch_bo->map + pass_batch_offset, 64,
                              pass_batch_bo->offset + pass_batch_offset, false);
         }

         for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
            struct anv_batch_bo **bo =
               u_vector_tail(&submit->cmd_buffers[i]->seen_bbos);
            device->cmd_buffer_being_decoded = submit->cmd_buffers[i];
            intel_print_batch(&device->decoder_ctx, (*bo)->bo->map,
                              (*bo)->bo->size, (*bo)->bo->offset, false);
            device->cmd_buffer_being_decoded = NULL;
         }
      } else if (submit->simple_bo) {
         intel_print_batch(&device->decoder_ctx, submit->simple_bo->map,
                           submit->simple_bo->size, submit->simple_bo->offset, false);
      } else {
         intel_print_batch(&device->decoder_ctx,
                           device->trivial_batch_bo->map,
                           device->trivial_batch_bo->size,
                           device->trivial_batch_bo->offset, false);
      }
   }
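
   /* Plumb the submit's fences into the execbuf. Syncobj fences go in the
    * fence array (or the timeline-fence extension when thread submit is
    * enabled); note that I915_EXEC_FENCE_ARRAY reuses the otherwise unused
    * cliprects fields. Sync-file in/out fences travel through rsvd2: the
    * input fd goes in the low 32 bits and the kernel returns the output fd
    * in the high 32 bits (read back near the end of this function).
    */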
   if (submit->fence_count > 0) {
      assert(device->physical->has_syncobj);
      if (device->has_thread_submit) {
         execbuf.timeline_fences.fence_count = submit->fence_count;
         execbuf.timeline_fences.handles_ptr = (uintptr_t)submit->fences;
         execbuf.timeline_fences.values_ptr = (uintptr_t)submit->fence_values;
         anv_execbuf_add_ext(&execbuf,
                             DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES,
                             &execbuf.timeline_fences.base);
      } else {
         execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY;
         execbuf.execbuf.num_cliprects = submit->fence_count;
         execbuf.execbuf.cliprects_ptr = (uintptr_t)submit->fences;
      }
   }

   if (submit->in_fence != -1) {
      assert(!device->has_thread_submit);
      execbuf.execbuf.flags |= I915_EXEC_FENCE_IN;
      execbuf.execbuf.rsvd2 |= (uint32_t)submit->in_fence;
   }

   if (submit->need_out_fence) {
      assert(!device->has_thread_submit);
      execbuf.execbuf.flags |= I915_EXEC_FENCE_OUT;
   }

   if (has_perf_query) {
      struct anv_query_pool *query_pool = submit->perf_query_pool;
      assert(submit->perf_query_pass < query_pool->n_passes);
      struct intel_perf_query_info *query_info =
         query_pool->pass_query[submit->perf_query_pass];

      /* Some performance queries use only the pipeline statistics HW; OA is
       * not needed in that case, so there is no need to reconfigure.
       */
      if ((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0 &&
          (query_info->kind == INTEL_PERF_QUERY_TYPE_OA ||
           query_info->kind == INTEL_PERF_QUERY_TYPE_RAW)) {
         int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
                               (void *)(uintptr_t) query_info->oa_metrics_set_id);
         if (ret < 0) {
            result = anv_device_set_lost(device,
                                         "i915-perf config failed: %s",
                                         strerror(errno));
         }
      }

      struct anv_bo *pass_batch_bo = query_pool->bo;

      struct drm_i915_gem_exec_object2 query_pass_object = {
         .handle = pass_batch_bo->gem_handle,
         .offset = pass_batch_bo->offset,
         .flags = pass_batch_bo->flags,
      };
      struct drm_i915_gem_execbuffer2 query_pass_execbuf = {
         .buffers_ptr = (uintptr_t) &query_pass_object,
         .buffer_count = 1,
         .batch_start_offset = khr_perf_query_preamble_offset(query_pool,
                                                              submit->perf_query_pass),
         .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags,
         .rsvd1 = device->context_id,
      };

      int ret = queue->device->no_hw ? 0 :
         anv_gem_execbuffer(queue->device, &query_pass_execbuf);
      if (ret)
         result = anv_queue_set_lost(queue, "execbuf2 failed: %m");
   }

   int ret = queue->device->no_hw ? 0 :
      anv_gem_execbuffer(queue->device, &execbuf.execbuf);
   if (ret)
      result = anv_queue_set_lost(queue, "execbuf2 failed: %m");

   struct drm_i915_gem_exec_object2 *objects = execbuf.objects;
   for (uint32_t k = 0; k < execbuf.bo_count; k++) {
      if (execbuf.bos[k]->flags & EXEC_OBJECT_PINNED)
         assert(execbuf.bos[k]->offset == objects[k].offset);
      execbuf.bos[k]->offset = objects[k].offset;
   }

   if (result == VK_SUCCESS && submit->need_out_fence)
      submit->out_fence = execbuf.execbuf.rsvd2 >> 32;

 error:
   pthread_cond_broadcast(&device->queue_submit);

   anv_execbuf_finish(&execbuf);

   return result;
}