Path: blob/21.2-virgl/src/intel/vulkan/anv_allocator.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <stdlib.h>
#include <unistd.h>
#include <limits.h>
#include <assert.h>
#include <sys/mman.h>

#include "anv_private.h"

#include "common/intel_aux_map.h"
#include "util/anon_file.h"

#ifdef HAVE_VALGRIND
#define VG_NOACCESS_READ(__ptr) ({                         \
   VALGRIND_MAKE_MEM_DEFINED((__ptr), sizeof(*(__ptr)));   \
   __typeof(*(__ptr)) __val = *(__ptr);                    \
   VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));  \
   __val;                                                  \
})
#define VG_NOACCESS_WRITE(__ptr, __val) ({                  \
   VALGRIND_MAKE_MEM_UNDEFINED((__ptr), sizeof(*(__ptr)));  \
   *(__ptr) = (__val);                                      \
   VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));   \
})
#else
#define VG_NOACCESS_READ(__ptr) (*(__ptr))
#define VG_NOACCESS_WRITE(__ptr, __val) (*(__ptr) = (__val))
#endif

#ifndef MAP_POPULATE
#define MAP_POPULATE 0
#endif

/* Design goals:
 *
 * - Lock free (except when resizing underlying bos)
 *
 * - Constant time allocation with typically only one atomic
 *
 * - Multiple allocation sizes without fragmentation
 *
 * - Can grow while keeping addresses and offset of contents stable
 *
 * - All allocations within one bo so we can point one of the
 *   STATE_BASE_ADDRESS pointers at it.
 *
 * The overall design is a two-level allocator: top level is a fixed size, big
 * block (8k) allocator, which operates out of a bo.  Allocation is done by
 * either pulling a block from the free list or growing the used range of the
 * bo.  Growing the range may run out of space in the bo which we then need to
 * grow.  Growing the bo is tricky in a multi-threaded, lockless environment:
 * we need to keep all pointers and contents in the old map valid.  GEM bos in
 * general can't grow, but we use a trick: we create a memfd and use ftruncate
 * to grow it as necessary.  We mmap the new size and then create a gem bo for
 * it using the new gem userptr ioctl.  Without heavy-handed locking around
 * our allocation fast-path, there isn't really a way to munmap the old mmap,
 * so we just keep it around until garbage collection time.  While the block
 * allocator is lockless for normal operations, we block other threads trying
 * to allocate while we're growing the map.  It shouldn't happen often, and
 * growing is fast anyway.
 *
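 * As a rough usage sketch (illustrative only, not a quote from any real
 * caller), a consumer of the top-level block pool does little more than:
 *
 *    int32_t offset = anv_block_pool_alloc(&pool, 8192, NULL);
 *    void *map = anv_block_pool_map(&pool, offset, 8192);
 *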
 * At the next level we can use various sub-allocators.  The state pool is a
 * pool of smaller, fixed size objects, which operates much like the block
 * pool.  It uses a free list for freeing objects, but when it runs out of
 * space it just allocates a new block from the block pool.  This allocator is
 * intended for longer lived state objects such as SURFACE_STATE and most
 * other persistent state objects in the API.  We may need to track more info
 * with these objects and a pointer back to the CPU object (e.g. VkImage).  In
 * those cases we just allocate a slightly bigger object and put the extra
 * state after the GPU state object.
 *
 * The state stream allocator works similarly to how the i965 DRI driver
 * streams all its state.  Even with Vulkan, we need to emit transient state
 * (whether surface state base or dynamic state base), and for that we can
 * just get a block and fill it up.  These cases are local to a command
 * buffer and the sub-allocator need not be thread safe.  The streaming
 * allocator gets a new block when it runs out of space and chains them
 * together so they can be easily freed.
 */

/* Allocations are always at least 64 byte aligned, so 1 is an invalid value.
 * We use it to indicate the free list is empty. */
#define EMPTY UINT32_MAX

/* On FreeBSD PAGE_SIZE is already defined in
 * /usr/include/machine/param.h that is indirectly
 * included here.
 */
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

struct anv_mmap_cleanup {
   void *map;
   size_t size;
};

static inline uint32_t
ilog2_round_up(uint32_t value)
{
   assert(value != 0);
   return 32 - __builtin_clz(value - 1);
}

static inline uint32_t
round_to_power_of_two(uint32_t value)
{
   return 1 << ilog2_round_up(value);
}

struct anv_state_table_cleanup {
   void *map;
   size_t size;
};

#define ANV_STATE_TABLE_CLEANUP_INIT ((struct anv_state_table_cleanup){0})
#define ANV_STATE_ENTRY_SIZE (sizeof(struct anv_free_entry))

static VkResult
anv_state_table_expand_range(struct anv_state_table *table, uint32_t size);

VkResult
anv_state_table_init(struct anv_state_table *table,
                     struct anv_device *device,
                     uint32_t initial_entries)
{
   VkResult result;

   table->device = device;

   /* Just make it 2GB up-front.  The Linux kernel won't actually back it
    * with pages until we either map and fault on one of them or we use
    * userptr and send a chunk of it off to the GPU.
    */
   table->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "state table");
   if (table->fd == -1) {
      result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
      goto fail_fd;
   }

   if (!u_vector_init(&table->cleanups,
                      round_to_power_of_two(sizeof(struct anv_state_table_cleanup)),
                      128)) {
      result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
      goto fail_fd;
   }

   table->state.next = 0;
   table->state.end = 0;
   table->size = 0;

   uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE;
   result = anv_state_table_expand_range(table, initial_size);
   if (result != VK_SUCCESS)
      goto fail_cleanups;

   return VK_SUCCESS;

fail_cleanups:
   u_vector_finish(&table->cleanups);
fail_fd:
   close(table->fd);

   return result;
}

static VkResult
anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
{
   void *map;
   struct anv_state_table_cleanup *cleanup;

   /* Assert that we only ever grow the pool */
   assert(size >= table->state.end);

   /* Make sure that we don't go outside the bounds of the memfd */
   if (size > BLOCK_POOL_MEMFD_SIZE)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   cleanup = u_vector_add(&table->cleanups);
   if (!cleanup)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   *cleanup = ANV_STATE_TABLE_CLEANUP_INIT;

   /* Just leak the old map until we destroy the pool.  We can't munmap it
    * without races or imposing locking on the block allocate fast path.  On
    * the whole the leaked maps add up to less than the size of the
    * current map.  MAP_POPULATE seems like the right thing to do, but we
    * should try to get some numbers.
    */
   map = mmap(NULL, size, PROT_READ | PROT_WRITE,
              MAP_SHARED | MAP_POPULATE, table->fd, 0);
   if (map == MAP_FAILED) {
      return vk_errorf(table->device, &table->device->vk.base,
                       VK_ERROR_OUT_OF_HOST_MEMORY, "mmap failed: %m");
   }

   cleanup->map = map;
   cleanup->size = size;

   table->map = map;
   table->size = size;

   return VK_SUCCESS;
}

static VkResult
anv_state_table_grow(struct anv_state_table *table)
{
   VkResult result = VK_SUCCESS;

   uint32_t used = align_u32(table->state.next * ANV_STATE_ENTRY_SIZE,
                             PAGE_SIZE);
   uint32_t old_size = table->size;

   /* The block pool is always initialized to a nonzero size and this function
    * is always called after initialization.
    */
   assert(old_size > 0);

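   /* Worked example with illustrative numbers only: if old_size is 64 KiB and
    * the entries in use round up to 48 KiB, then required = MAX2(48k, 64k) =
    * 64 KiB while used * 2 = 96 KiB > required, so we fall through to growing.
    * The new size starts at old_size * 2 = 128 KiB, which already satisfies
    * the loop below and leaves at least double the space currently in use.
    */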
   uint32_t required = MAX2(used, old_size);
   if (used * 2 <= required) {
      /* If we're in this case then this isn't the first allocation and we
       * already have enough space on both sides to hold double what we
       * have allocated.  There's nothing for us to do.
       */
      goto done;
   }

   uint32_t size = old_size * 2;
   while (size < required)
      size *= 2;

   assert(size > table->size);

   result = anv_state_table_expand_range(table, size);

done:
   return result;
}

void
anv_state_table_finish(struct anv_state_table *table)
{
   struct anv_state_table_cleanup *cleanup;

   u_vector_foreach(cleanup, &table->cleanups) {
      if (cleanup->map)
         munmap(cleanup->map, cleanup->size);
   }

   u_vector_finish(&table->cleanups);

   close(table->fd);
}

VkResult
anv_state_table_add(struct anv_state_table *table, uint32_t *idx,
                    uint32_t count)
{
   struct anv_block_state state, old, new;
   VkResult result;

   assert(idx);

   while (1) {
      state.u64 = __sync_fetch_and_add(&table->state.u64, count);
      if (state.next + count <= state.end) {
         assert(table->map);
         struct anv_free_entry *entry = &table->map[state.next];
         for (int i = 0; i < count; i++) {
            entry[i].state.idx = state.next + i;
         }
         *idx = state.next;
         return VK_SUCCESS;
      } else if (state.next <= state.end) {
         /* We allocated the first block outside the pool so we have to grow
          * the pool.  pool_state->next acts as a mutex: threads who try to
          * allocate now will get block indexes above the current limit and
          * hit futex_wait below.
          */
         new.next = state.next + count;
         do {
            result = anv_state_table_grow(table);
            if (result != VK_SUCCESS)
               return result;
            new.end = table->size / ANV_STATE_ENTRY_SIZE;
         } while (new.end < new.next);

         old.u64 = __sync_lock_test_and_set(&table->state.u64, new.u64);
         if (old.next != state.next)
            futex_wake(&table->state.end, INT_MAX);
      } else {
         futex_wait(&table->state.end, state.end, NULL);
         continue;
      }
   }
}

void
anv_free_list_push(union anv_free_list *list,
                   struct anv_state_table *table,
                   uint32_t first, uint32_t count)
{
   union anv_free_list current, old, new;
   uint32_t last = first;

   for (uint32_t i = 1; i < count; i++, last++)
      table->map[last].next = last + 1;

   old.u64 = list->u64;
   do {
      current = old;
      table->map[last].next = current.offset;
      new.offset = first;
      new.count = current.count + 1;
      old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
   } while (old.u64 != current.u64);
}

struct anv_state *
anv_free_list_pop(union anv_free_list *list,
                  struct anv_state_table *table)
{
   union anv_free_list current, new, old;

   current.u64 = list->u64;
   while (current.offset != EMPTY) {
      __sync_synchronize();
      new.offset = table->map[current.offset].next;
      new.count = current.count + 1;
      old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
      if (old.u64 == current.u64) {
         struct anv_free_entry *entry = &table->map[current.offset];
         return &entry->state;
      }
      current = old;
   }

   return NULL;
}

static VkResult
anv_block_pool_expand_range(struct anv_block_pool *pool,
                            uint32_t center_bo_offset, uint32_t size);

VkResult
anv_block_pool_init(struct anv_block_pool *pool,
                    struct anv_device *device,
                    const char *name,
                    uint64_t start_address,
                    uint32_t initial_size)
{
   VkResult result;

   pool->name = name;
   pool->device = device;
   pool->use_softpin = device->physical->use_softpin;
   pool->nbos = 0;
   pool->size = 0;
   pool->center_bo_offset = 0;
   pool->start_address = intel_canonical_address(start_address);
   pool->map = NULL;

   if (pool->use_softpin) {
      pool->bo = NULL;
      pool->fd = -1;
   } else {
      /* Just make it 2GB up-front.  The Linux kernel won't actually back it
       * with pages until we either map and fault on one of them or we use
       * userptr and send a chunk of it off to the GPU.
       */
      pool->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "block pool");
      if (pool->fd == -1)
         return vk_error(VK_ERROR_INITIALIZATION_FAILED);

      pool->wrapper_bo = (struct anv_bo) {
         .refcount = 1,
         .offset = -1,
         .is_wrapper = true,
      };
      pool->bo = &pool->wrapper_bo;
   }

   if (!u_vector_init(&pool->mmap_cleanups,
                      round_to_power_of_two(sizeof(struct anv_mmap_cleanup)),
                      128)) {
      result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
      goto fail_fd;
   }

   pool->state.next = 0;
   pool->state.end = 0;
   pool->back_state.next = 0;
   pool->back_state.end = 0;

   result = anv_block_pool_expand_range(pool, 0, initial_size);
   if (result != VK_SUCCESS)
      goto fail_mmap_cleanups;

   /* Make the entire pool available in the front of the pool.  If back
    * allocation needs to use this space, the "ends" will be re-arranged.
    */
   pool->state.end = pool->size;

   return VK_SUCCESS;

fail_mmap_cleanups:
   u_vector_finish(&pool->mmap_cleanups);
fail_fd:
   if (pool->fd >= 0)
      close(pool->fd);

   return result;
}

void
anv_block_pool_finish(struct anv_block_pool *pool)
{
   anv_block_pool_foreach_bo(bo, pool) {
      if (bo->map)
         anv_gem_munmap(pool->device, bo->map, bo->size);
      anv_gem_close(pool->device, bo->gem_handle);
   }

   struct anv_mmap_cleanup *cleanup;
   u_vector_foreach(cleanup, &pool->mmap_cleanups)
      munmap(cleanup->map, cleanup->size);
   u_vector_finish(&pool->mmap_cleanups);

   if (pool->fd >= 0)
      close(pool->fd);
}

static VkResult
anv_block_pool_expand_range(struct anv_block_pool *pool,
                            uint32_t center_bo_offset, uint32_t size)
{
   /* Assert that we only ever grow the pool */
   assert(center_bo_offset >= pool->back_state.end);
   assert(size - center_bo_offset >= pool->state.end);

   /* Assert that we don't go outside the bounds of the memfd */
   assert(center_bo_offset <= BLOCK_POOL_MEMFD_CENTER);
   assert(pool->use_softpin ||
          size - center_bo_offset <=
          BLOCK_POOL_MEMFD_SIZE - BLOCK_POOL_MEMFD_CENTER);

   /* For state pool BOs we have to be a bit careful about where we place them
    * in the GTT.  There are two documented workarounds for state base address
    * placement: Wa32bitGeneralStateOffset and Wa32bitInstructionBaseOffset
    * which state that those two base addresses do not support 48-bit
    * addresses and need to be placed in the bottom 32-bit range.
    * Unfortunately, this is not quite accurate.
    *
    * The real problem is that we always set the size of our state pools in
    * STATE_BASE_ADDRESS to 0xfffff (the maximum) even though the BO is most
    * likely significantly smaller.  We do this because we do not know at the
    * time we emit STATE_BASE_ADDRESS whether or not we will need to expand
    * the pool during command buffer building so we don't actually have a
    * valid final size.  If the address + size, as seen by STATE_BASE_ADDRESS,
    * overflows 48 bits, the GPU appears to treat all accesses to the buffer
    * as being out of bounds and returns zero.  For dynamic state, this
    * usually just leads to rendering corruptions, but shaders that are all
    * zero hang the GPU immediately.
    *
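    * As a rough illustration (our numbers, not from the documentation): with
    * a 48-bit limit of 0x1000000000000 and roughly 4 GiB programmed as the
    * pool size, any base address above about 0xffff00000000 would make
    * address + size wrap past 48 bits and reads would come back as zero.
    *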
    * The easiest solution is to do exactly what the bogus workarounds say to
    * do: restrict these buffers to 32-bit addresses.  We could also pin the
    * BO to some particular location of our choosing, but that's significantly
    * more work than just not setting a flag.  So, we explicitly DO NOT set
    * the EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and the kernel does all of the
    * hard work for us.  When using softpin, we're in control and the fixed
    * addresses we choose are fine for base addresses.
    */
   enum anv_bo_alloc_flags bo_alloc_flags = ANV_BO_ALLOC_CAPTURE;
   if (!pool->use_softpin)
      bo_alloc_flags |= ANV_BO_ALLOC_32BIT_ADDRESS;

   if (pool->use_softpin) {
      uint32_t new_bo_size = size - pool->size;
      struct anv_bo *new_bo;
      assert(center_bo_offset == 0);
      VkResult result = anv_device_alloc_bo(pool->device,
                                            pool->name,
                                            new_bo_size,
                                            bo_alloc_flags |
                                            ANV_BO_ALLOC_FIXED_ADDRESS |
                                            ANV_BO_ALLOC_MAPPED |
                                            ANV_BO_ALLOC_SNOOPED,
                                            pool->start_address + pool->size,
                                            &new_bo);
      if (result != VK_SUCCESS)
         return result;

      pool->bos[pool->nbos++] = new_bo;

      /* This pointer will always point to the first BO in the list */
      pool->bo = pool->bos[0];
   } else {
      /* Just leak the old map until we destroy the pool.  We can't munmap it
       * without races or imposing locking on the block allocate fast path.  On
       * the whole the leaked maps add up to less than the size of the
       * current map.  MAP_POPULATE seems like the right thing to do, but we
       * should try to get some numbers.
       */
      void *map = mmap(NULL, size, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_POPULATE, pool->fd,
                       BLOCK_POOL_MEMFD_CENTER - center_bo_offset);
      if (map == MAP_FAILED)
         return vk_errorf(pool->device, &pool->device->vk.base,
                          VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m");

      struct anv_bo *new_bo;
      VkResult result = anv_device_import_bo_from_host_ptr(pool->device,
                                                           map, size,
                                                           bo_alloc_flags,
                                                           0 /* client_address */,
                                                           &new_bo);
      if (result != VK_SUCCESS) {
         munmap(map, size);
         return result;
      }

      struct anv_mmap_cleanup *cleanup = u_vector_add(&pool->mmap_cleanups);
      if (!cleanup) {
         munmap(map, size);
         anv_device_release_bo(pool->device, new_bo);
         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
      }
      cleanup->map = map;
      cleanup->size = size;

      /* Now that we mapped the new memory, we can write the new
       * center_bo_offset back into pool and update pool->map. */
      pool->center_bo_offset = center_bo_offset;
      pool->map = map + center_bo_offset;

      pool->bos[pool->nbos++] = new_bo;
      pool->wrapper_bo.map = new_bo;
   }

   assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS);
   pool->size = size;

   return VK_SUCCESS;
}

/** Returns current memory map of the block pool.
 *
 * The returned pointer points to the map for the memory at the specified
 * offset.  The offset parameter is relative to the "center" of the block pool
 * rather than the start of the block pool BO map.
 */
void*
anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size)
{
   if (pool->use_softpin) {
      struct anv_bo *bo = NULL;
      int32_t bo_offset = 0;
      anv_block_pool_foreach_bo(iter_bo, pool) {
         if (offset < bo_offset + iter_bo->size) {
            bo = iter_bo;
            break;
         }
         bo_offset += iter_bo->size;
      }
      assert(bo != NULL);
      assert(offset >= bo_offset);
      assert((offset - bo_offset) + size <= bo->size);

      return bo->map + (offset - bo_offset);
   } else {
      return pool->map + offset;
   }
}

/** Grows and re-centers the block pool.
 *
 * We grow the block pool in one or both directions in such a way that the
 * following conditions are met:
 *
 * 1) The size of the entire pool is always a power of two.
 *
 * 2) The pool only grows on both ends.  Neither end can get
 *    shortened.
 *
 * 3) At the end of the allocation, we have about twice as much space
 *    allocated for each end as we have used.  This way the pool doesn't
 *    grow too far in one direction or the other.
 *
 * 4) If the _alloc_back() has never been called, then the back portion of
 *    the pool retains a size of zero.  (This makes it easier for users of
 *    the block pool that only want a one-sided pool.)
 *
 * 5) We have enough space allocated for at least one more block in
 *    whichever side `state` points to.
 *
 * 6) The center of the pool is always aligned to both the block_size of
 *    the pool and a 4K CPU page.
 */
static uint32_t
anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state,
                    uint32_t contiguous_size)
{
   VkResult result = VK_SUCCESS;

   pthread_mutex_lock(&pool->device->mutex);

   assert(state == &pool->state || state == &pool->back_state);

   /* Gather a little usage information on the pool.  Since we may have
    * threads waiting in queue to get some storage while we resize, it's
    * actually possible that total_used will be larger than old_size.  In
    * particular, block_pool_alloc() increments state->next prior to
    * calling block_pool_grow, so this ensures that we get enough space for
    * whichever side tries to grow the pool.
    *
    * We align to a page size because it makes it easier to do our
    * calculations later in such a way that we stay page-aligned.
    */
   uint32_t back_used = align_u32(pool->back_state.next, PAGE_SIZE);
   uint32_t front_used = align_u32(pool->state.next, PAGE_SIZE);
   uint32_t total_used = front_used + back_used;

   assert(state == &pool->state || back_used > 0);

   uint32_t old_size = pool->size;

   /* The block pool is always initialized to a nonzero size and this function
    * is always called after initialization.
    */
   assert(old_size > 0);

   const uint32_t old_back = pool->center_bo_offset;
   const uint32_t old_front = old_size - pool->center_bo_offset;

   /* The back_used and front_used may actually be smaller than the actual
    * requirement because they are based on the next pointers which are
    * updated prior to calling this function.
    */
   uint32_t back_required = MAX2(back_used, old_back);
   uint32_t front_required = MAX2(front_used, old_front);

   if (pool->use_softpin) {
      /* With softpin, the pool is made up of a bunch of buffers with separate
       * maps.  Make sure we have enough contiguous space that we can get a
       * properly contiguous map for the next chunk.
       */
      assert(old_back == 0);
      front_required = MAX2(front_required, old_front + contiguous_size);
   }

   if (back_used * 2 <= back_required && front_used * 2 <= front_required) {
      /* If we're in this case then this isn't the first allocation and we
       * already have enough space on both sides to hold double what we
       * have allocated.  There's nothing for us to do.
       */
      goto done;
   }

   uint32_t size = old_size * 2;
   while (size < back_required + front_required)
      size *= 2;

   assert(size > pool->size);

   /* We compute a new center_bo_offset such that, when we double the size
    * of the pool, we maintain the ratio of how much is used by each side.
    * This way things should remain more-or-less balanced.
    */
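   /* Worked example, with illustrative numbers only and assuming the
    * non-softpin path: doubling a 1 MiB pool that has back_used = 256 KiB and
    * front_used = 512 KiB gives size = 2 MiB and a tentative center_bo_offset
    * of 2 MiB * 256 KiB / 768 KiB (one third of the new size, about 699 KiB),
    * which is then aligned down to a page and clamped so neither end shrinks.
    */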
   uint32_t center_bo_offset;
   if (back_used == 0) {
      /* If we're in this case then we have never called alloc_back().  In
       * this case, we want to keep the offset at 0 to make things as simple
       * as possible for users that don't care about back allocations.
       */
      center_bo_offset = 0;
   } else {
      /* Try to "center" the allocation based on how much is currently in
       * use on each side of the center line.
       */
      center_bo_offset = ((uint64_t)size * back_used) / total_used;

      /* Align down to a multiple of the page size */
      center_bo_offset &= ~(PAGE_SIZE - 1);

      assert(center_bo_offset >= back_used);

      /* Make sure we don't shrink the back end of the pool */
      if (center_bo_offset < back_required)
         center_bo_offset = back_required;

      /* Make sure that we don't shrink the front end of the pool */
      if (size - center_bo_offset < front_required)
         center_bo_offset = size - front_required;
   }

   assert(center_bo_offset % PAGE_SIZE == 0);

   result = anv_block_pool_expand_range(pool, center_bo_offset, size);

done:
   pthread_mutex_unlock(&pool->device->mutex);

   if (result == VK_SUCCESS) {
      /* Return the appropriate new size.  This function never actually
       * updates state->next.  Instead, we let the caller do that because it
       * needs to do so in order to maintain its concurrency model.
       */
      if (state == &pool->state) {
         return pool->size - pool->center_bo_offset;
      } else {
         assert(pool->center_bo_offset > 0);
         return pool->center_bo_offset;
      }
   } else {
      return 0;
   }
}

static uint32_t
anv_block_pool_alloc_new(struct anv_block_pool *pool,
                         struct anv_block_state *pool_state,
                         uint32_t block_size, uint32_t *padding)
{
   struct anv_block_state state, old, new;

   /* Most allocations won't generate any padding */
   if (padding)
      *padding = 0;

   while (1) {
      state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size);
      if (state.next + block_size <= state.end) {
         return state.next;
      } else if (state.next <= state.end) {
         if (pool->use_softpin && state.next < state.end) {
            /* We need to grow the block pool, but still have some leftover
             * space that can't be used by that particular allocation.  So we
             * add that as a "padding", and return it.
             */
            uint32_t leftover = state.end - state.next;

            /* If there is some leftover space in the pool, the caller must
             * deal with it.
             */
            assert(leftover == 0 || padding);
            if (padding)
               *padding = leftover;
            state.next += leftover;
         }

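         /* Illustrative numbers only (softpin path): if state.next were
          * 20 KiB, state.end 24 KiB and block_size 8 KiB, the 4 KiB of
          * leftover space would be reported back through *padding and the
          * allocation itself would start at 24 KiB, the old end of the pool.
          */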
         /* We allocated the first block outside the pool so we have to grow
          * the pool.  pool_state->next acts as a mutex: threads who try to
          * allocate now will get block indexes above the current limit and
          * hit futex_wait below.
          */
         new.next = state.next + block_size;
         do {
            new.end = anv_block_pool_grow(pool, pool_state, block_size);
         } while (new.end < new.next);

         old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64);
         if (old.next != state.next)
            futex_wake(&pool_state->end, INT_MAX);
         return state.next;
      } else {
         futex_wait(&pool_state->end, state.end, NULL);
         continue;
      }
   }
}

int32_t
anv_block_pool_alloc(struct anv_block_pool *pool,
                     uint32_t block_size, uint32_t *padding)
{
   uint32_t offset;

   offset = anv_block_pool_alloc_new(pool, &pool->state, block_size, padding);

   return offset;
}

/* Allocates a block out of the back of the block pool.
 *
 * This will allocate a block earlier than the "start" of the block pool.
 * The offsets returned from this function will be negative but will still
 * be correct relative to the block pool's map pointer.
 *
 * If you ever use anv_block_pool_alloc_back, then you will have to do
 * gymnastics with the block pool's BO when doing relocations.
 */
int32_t
anv_block_pool_alloc_back(struct anv_block_pool *pool,
                          uint32_t block_size)
{
   int32_t offset = anv_block_pool_alloc_new(pool, &pool->back_state,
                                             block_size, NULL);

   /* The offset we get out of anv_block_pool_alloc_new() is actually the
    * number of bytes downwards from the middle to the end of the block.
    * We need to turn it into a (negative) offset from the middle to the
    * start of the block.
    */
   assert(offset >= 0);
   return -(offset + block_size);
}
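
/* Worked example for anv_block_pool_alloc_back(), with illustrative numbers
 * only: if anv_block_pool_alloc_new() hands back 8192 for a 4096-byte block,
 * that value measures from the pool's center down to the end of the block;
 * the start sits one block further down, so we return -(8192 + 4096) = -12288.
 */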

VkResult
anv_state_pool_init(struct anv_state_pool *pool,
                    struct anv_device *device,
                    const char *name,
                    uint64_t base_address,
                    int32_t start_offset,
                    uint32_t block_size)
{
   /* We don't want to ever see signed overflow */
   assert(start_offset < INT32_MAX - (int32_t)BLOCK_POOL_MEMFD_SIZE);

   VkResult result = anv_block_pool_init(&pool->block_pool, device, name,
                                         base_address + start_offset,
                                         block_size * 16);
   if (result != VK_SUCCESS)
      return result;

   pool->start_offset = start_offset;

   result = anv_state_table_init(&pool->table, device, 64);
   if (result != VK_SUCCESS) {
      anv_block_pool_finish(&pool->block_pool);
      return result;
   }

   assert(util_is_power_of_two_or_zero(block_size));
   pool->block_size = block_size;
   pool->back_alloc_free_list = ANV_FREE_LIST_EMPTY;
   for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
      pool->buckets[i].free_list = ANV_FREE_LIST_EMPTY;
      pool->buckets[i].block.next = 0;
      pool->buckets[i].block.end = 0;
   }
   VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));

   return VK_SUCCESS;
}

void
anv_state_pool_finish(struct anv_state_pool *pool)
{
   VG(VALGRIND_DESTROY_MEMPOOL(pool));
   anv_state_table_finish(&pool->table);
   anv_block_pool_finish(&pool->block_pool);
}

static uint32_t
anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
                                    struct anv_block_pool *block_pool,
                                    uint32_t state_size,
                                    uint32_t block_size,
                                    uint32_t *padding)
{
   struct anv_block_state block, old, new;
   uint32_t offset;

   /* We don't always use anv_block_pool_alloc(), which would set *padding to
    * zero for us.  So if we have a pointer to padding, we must zero it out
    * ourselves here, to make sure we always return some sensible value.
    */
   if (padding)
      *padding = 0;

   /* If our state is large, we don't need any sub-allocation from a block.
    * Instead, we just grab whole (potentially large) blocks.
    */
   if (state_size >= block_size)
      return anv_block_pool_alloc(block_pool, state_size, padding);

restart:
   block.u64 = __sync_fetch_and_add(&pool->block.u64, state_size);

   if (block.next < block.end) {
      return block.next;
   } else if (block.next == block.end) {
      offset = anv_block_pool_alloc(block_pool, block_size, padding);
      new.next = offset + state_size;
      new.end = offset + block_size;
      old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64);
      if (old.next != block.next)
         futex_wake(&pool->block.end, INT_MAX);
      return offset;
   } else {
      futex_wait(&pool->block.end, block.end, NULL);
      goto restart;
   }
}

static uint32_t
anv_state_pool_get_bucket(uint32_t size)
{
   unsigned size_log2 = ilog2_round_up(size);
   assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
   if (size_log2 < ANV_MIN_STATE_SIZE_LOG2)
      size_log2 = ANV_MIN_STATE_SIZE_LOG2;
   return size_log2 - ANV_MIN_STATE_SIZE_LOG2;
}

static uint32_t
anv_state_pool_get_bucket_size(uint32_t bucket)
{
   uint32_t size_log2 = bucket + ANV_MIN_STATE_SIZE_LOG2;
   return 1 << size_log2;
}
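
/* Worked example for the bucket helpers above, with illustrative numbers and
 * assuming the usual 64-byte minimum state size (ANV_MIN_STATE_SIZE_LOG2 = 6):
 * a 48-byte request rounds up to 64 bytes and lands in bucket 0, while a
 * 100-byte request rounds up to 128 bytes and lands in bucket 1.
 */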

/** Helper to push a chunk into the state table.
 *
 * It creates 'count' entries into the state table and updates their sizes,
 * offsets and maps, also pushing them as "free" states.
 */
static void
anv_state_pool_return_blocks(struct anv_state_pool *pool,
                             uint32_t chunk_offset, uint32_t count,
                             uint32_t block_size)
{
   /* Disallow returning 0 chunks */
   assert(count != 0);

   /* Make sure we always return chunks aligned to the block_size */
   assert(chunk_offset % block_size == 0);

   uint32_t st_idx;
   UNUSED VkResult result = anv_state_table_add(&pool->table, &st_idx, count);
   assert(result == VK_SUCCESS);
   for (int i = 0; i < count; i++) {
      /* update states that were added back to the state table */
      struct anv_state *state_i = anv_state_table_get(&pool->table,
                                                      st_idx + i);
      state_i->alloc_size = block_size;
      state_i->offset = pool->start_offset + chunk_offset + block_size * i;
      state_i->map = anv_block_pool_map(&pool->block_pool,
                                        state_i->offset,
                                        state_i->alloc_size);
   }

   uint32_t block_bucket = anv_state_pool_get_bucket(block_size);
   anv_free_list_push(&pool->buckets[block_bucket].free_list,
                      &pool->table, st_idx, count);
}

/** Returns a chunk of memory back to the state pool.
 *
 * Do a two-level split.  If chunk_size is bigger than divisor
 * (pool->block_size), we return as many divisor sized blocks as we can, from
 * the end of the chunk.
 *
 * The remainder is then split into smaller blocks (starting at small_size if
 * it is non-zero), with larger blocks always being taken from the end of the
 * chunk.
 */
static void
anv_state_pool_return_chunk(struct anv_state_pool *pool,
                            uint32_t chunk_offset, uint32_t chunk_size,
                            uint32_t small_size)
{
   uint32_t divisor = pool->block_size;
   uint32_t nblocks = chunk_size / divisor;
   uint32_t rest = chunk_size - nblocks * divisor;

   if (nblocks > 0) {
      /* First return divisor aligned and sized chunks.  We start returning
       * larger blocks from the end of the chunk, since they should already be
       * aligned to divisor.  Also anv_state_pool_return_blocks() only accepts
       * aligned chunks.
       */
      uint32_t offset = chunk_offset + rest;
      anv_state_pool_return_blocks(pool, offset, nblocks, divisor);
   }

   chunk_size = rest;
   divisor /= 2;

   if (small_size > 0 && small_size < divisor)
      divisor = small_size;

   uint32_t min_size = 1 << ANV_MIN_STATE_SIZE_LOG2;

   /* Just as before, return larger divisor aligned blocks from the end of the
    * chunk first.
    */
   while (chunk_size > 0 && divisor >= min_size) {
      nblocks = chunk_size / divisor;
      rest = chunk_size - nblocks * divisor;
      if (nblocks > 0) {
         anv_state_pool_return_blocks(pool, chunk_offset + rest,
                                      nblocks, divisor);
         chunk_size = rest;
      }
      divisor /= 2;
   }
}
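
/* Worked example for anv_state_pool_return_chunk(), with illustrative numbers
 * only: returning a 9 KiB chunk with a 4 KiB pool block_size and a 64-byte
 * small_size first hands back two 4 KiB blocks from the end of the chunk,
 * then returns the remaining 1 KiB at the front as sixteen 64-byte states.
 */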

static struct anv_state
anv_state_pool_alloc_no_vg(struct anv_state_pool *pool,
                           uint32_t size, uint32_t align)
{
   uint32_t bucket = anv_state_pool_get_bucket(MAX2(size, align));

   struct anv_state *state;
   uint32_t alloc_size = anv_state_pool_get_bucket_size(bucket);
   int32_t offset;

   /* Try free list first. */
   state = anv_free_list_pop(&pool->buckets[bucket].free_list,
                             &pool->table);
   if (state) {
      assert(state->offset >= pool->start_offset);
      goto done;
   }

   /* Try to grab a chunk from some larger bucket and split it up */
   for (unsigned b = bucket + 1; b < ANV_STATE_BUCKETS; b++) {
      state = anv_free_list_pop(&pool->buckets[b].free_list, &pool->table);
      if (state) {
         unsigned chunk_size = anv_state_pool_get_bucket_size(b);
         int32_t chunk_offset = state->offset;

         /* First lets update the state we got to its new size.  offset and map
          * remain the same.
          */
         state->alloc_size = alloc_size;

         /* Now return the unused part of the chunk back to the pool as free
          * blocks
          *
          * There are a couple of options as to what we do with it:
          *
          * 1) We could fully split the chunk into state.alloc_size sized
          *    pieces.  However, this would mean that allocating a 16B
          *    state could potentially split a 2MB chunk into 512K smaller
          *    chunks.  This would lead to unnecessary fragmentation.
          *
          * 2) The classic "buddy allocator" method would have us split the
          *    chunk in half and return one half.  Then we would split the
          *    remaining half in half and return one half, and repeat as
          *    needed until we get down to the size we want.  However, if
          *    you are allocating a bunch of the same size state (which is
          *    the common case), this means that every other allocation has
          *    to go up a level and every fourth goes up two levels, etc.
          *    This is not nearly as efficient as it could be if we did a
          *    little more work up-front.
          *
          * 3) Split the difference between (1) and (2) by doing a
          *    two-level split.  If it's bigger than some fixed block_size,
          *    we split it into block_size sized chunks and return all but
          *    one of them.  Then we split what remains into
          *    state.alloc_size sized chunks and return them.
          *
          * We choose something close to option (3), which is implemented with
          * anv_state_pool_return_chunk().  That is done by returning the
          * remainder of the chunk, with alloc_size as a hint of the size that
          * we want the smaller chunk split into.
          */
         anv_state_pool_return_chunk(pool, chunk_offset + alloc_size,
                                     chunk_size - alloc_size, alloc_size);
         goto done;
      }
   }

   uint32_t padding;
   offset = anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket],
                                                &pool->block_pool,
                                                alloc_size,
                                                pool->block_size,
                                                &padding);
   /* Every time we allocate a new state, add it to the state pool */
   uint32_t idx;
   UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1);
   assert(result == VK_SUCCESS);

   state = anv_state_table_get(&pool->table, idx);
   state->offset = pool->start_offset + offset;
   state->alloc_size = alloc_size;
   state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size);

   if (padding > 0) {
      uint32_t return_offset = offset - padding;
      anv_state_pool_return_chunk(pool, return_offset, padding, 0);
   }

done:
   return *state;
}

struct anv_state
anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t size, uint32_t align)
{
   if (size == 0)
      return ANV_STATE_NULL;

   struct anv_state state = anv_state_pool_alloc_no_vg(pool, size, align);
   VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, size));
   return state;
}

struct anv_state
anv_state_pool_alloc_back(struct anv_state_pool *pool)
{
   struct anv_state *state;
   uint32_t alloc_size = pool->block_size;

   /* This function is only used with pools where start_offset == 0 */
   assert(pool->start_offset == 0);

   state = anv_free_list_pop(&pool->back_alloc_free_list, &pool->table);
   if (state) {
      assert(state->offset < pool->start_offset);
      goto done;
   }

   int32_t offset;
   offset = anv_block_pool_alloc_back(&pool->block_pool,
                                      pool->block_size);
   uint32_t idx;
   UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1);
   assert(result == VK_SUCCESS);

   state = anv_state_table_get(&pool->table, idx);
   state->offset = pool->start_offset + offset;
   state->alloc_size = alloc_size;
   state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size);

done:
   VG(VALGRIND_MEMPOOL_ALLOC(pool, state->map, state->alloc_size));
   return *state;
}

static void
anv_state_pool_free_no_vg(struct anv_state_pool *pool, struct anv_state state)
{
   assert(util_is_power_of_two_or_zero(state.alloc_size));
   unsigned bucket = anv_state_pool_get_bucket(state.alloc_size);

   if (state.offset < pool->start_offset) {
      assert(state.alloc_size == pool->block_size);
      anv_free_list_push(&pool->back_alloc_free_list,
                         &pool->table, state.idx, 1);
   } else {
      anv_free_list_push(&pool->buckets[bucket].free_list,
                         &pool->table, state.idx, 1);
   }
}

void
anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state)
{
   if (state.alloc_size == 0)
      return;

   VG(VALGRIND_MEMPOOL_FREE(pool, state.map));
   anv_state_pool_free_no_vg(pool, state);
}

struct anv_state_stream_block {
   struct anv_state block;

   /* The next block */
   struct anv_state_stream_block *next;

#ifdef HAVE_VALGRIND
   /* A pointer to the first user-allocated thing in this block.  This is
    * what valgrind sees as the start of the block.
    */
   void *_vg_ptr;
#endif
};
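
/* Rough usage sketch for the state stream allocator below (illustrative only,
 * error handling omitted; the pool chosen here is just an example):
 *
 *    struct anv_state_stream stream;
 *    anv_state_stream_init(&stream, &device->dynamic_state_pool, 16384);
 *    struct anv_state s = anv_state_stream_alloc(&stream, 64, 64);
 *    ... emit state into s.map ...
 *    anv_state_stream_finish(&stream);
 */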

/* The state stream allocator is a one-shot, single threaded allocator for
 * variable sized blocks.  We use it for allocating dynamic state.
 */
void
anv_state_stream_init(struct anv_state_stream *stream,
                      struct anv_state_pool *state_pool,
                      uint32_t block_size)
{
   stream->state_pool = state_pool;
   stream->block_size = block_size;

   stream->block = ANV_STATE_NULL;

   /* Ensure that next + whatever > block_size.  This way the first call to
    * state_stream_alloc fetches a new block.
    */
   stream->next = block_size;

   util_dynarray_init(&stream->all_blocks, NULL);

   VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false));
}

void
anv_state_stream_finish(struct anv_state_stream *stream)
{
   util_dynarray_foreach(&stream->all_blocks, struct anv_state, block) {
      VG(VALGRIND_MEMPOOL_FREE(stream, block->map));
      VG(VALGRIND_MAKE_MEM_NOACCESS(block->map, block->alloc_size));
      anv_state_pool_free_no_vg(stream->state_pool, *block);
   }
   util_dynarray_fini(&stream->all_blocks);

   VG(VALGRIND_DESTROY_MEMPOOL(stream));
}

struct anv_state
anv_state_stream_alloc(struct anv_state_stream *stream,
                       uint32_t size, uint32_t alignment)
{
   if (size == 0)
      return ANV_STATE_NULL;

   assert(alignment <= PAGE_SIZE);

   uint32_t offset = align_u32(stream->next, alignment);
   if (offset + size > stream->block.alloc_size) {
      uint32_t block_size = stream->block_size;
      if (block_size < size)
         block_size = round_to_power_of_two(size);

      stream->block = anv_state_pool_alloc_no_vg(stream->state_pool,
                                                 block_size, PAGE_SIZE);
      util_dynarray_append(&stream->all_blocks,
                           struct anv_state, stream->block);
      VG(VALGRIND_MAKE_MEM_NOACCESS(stream->block.map, block_size));

      /* Reset back to the start */
      stream->next = offset = 0;
      assert(offset + size <= stream->block.alloc_size);
   }
   const bool new_block = stream->next == 0;

   struct anv_state state = stream->block;
   state.offset += offset;
   state.alloc_size = size;
   state.map += offset;

   stream->next = offset + size;

   if (new_block) {
      assert(state.map == stream->block.map);
      VG(VALGRIND_MEMPOOL_ALLOC(stream, state.map, size));
   } else {
      /* This only updates the mempool.  The newly allocated chunk is still
       * marked as NOACCESS. */
      VG(VALGRIND_MEMPOOL_CHANGE(stream, stream->block.map, stream->block.map,
                                 stream->next));
      /* Mark the newly allocated chunk as undefined */
      VG(VALGRIND_MAKE_MEM_UNDEFINED(state.map, state.alloc_size));
   }

   return state;
}

void
anv_state_reserved_pool_init(struct anv_state_reserved_pool *pool,
                             struct anv_state_pool *parent,
                             uint32_t count, uint32_t size, uint32_t alignment)
{
   pool->pool = parent;
   pool->reserved_blocks = ANV_FREE_LIST_EMPTY;
   pool->count = count;

   for (unsigned i = 0; i < count; i++) {
      struct anv_state state = anv_state_pool_alloc(pool->pool, size, alignment);
      anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
   }
}

void
anv_state_reserved_pool_finish(struct anv_state_reserved_pool *pool)
{
   struct anv_state *state;

   while ((state = anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table))) {
      anv_state_pool_free(pool->pool, *state);
      pool->count--;
   }
   assert(pool->count == 0);
}

struct anv_state
anv_state_reserved_pool_alloc(struct anv_state_reserved_pool *pool)
{
   return *anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table);
}

void
anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool,
                             struct anv_state state)
{
   anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
}

void
anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device,
                 const char *name)
{
   pool->name = name;
   pool->device = device;
   for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
      util_sparse_array_free_list_init(&pool->free_list[i],
                                       &device->bo_cache.bo_map, 0,
                                       offsetof(struct anv_bo, free_index));
   }

   VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
}

void
anv_bo_pool_finish(struct anv_bo_pool *pool)
{
   for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
      while (1) {
         struct anv_bo *bo =
            util_sparse_array_free_list_pop_elem(&pool->free_list[i]);
         if (bo == NULL)
            break;

         /* anv_device_release_bo is going to "free" it */
         VG(VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, 1));
         anv_device_release_bo(pool->device, bo);
      }
   }

   VG(VALGRIND_DESTROY_MEMPOOL(pool));
}

VkResult
anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size,
                  struct anv_bo **bo_out)
{
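   /* Illustrative numbers only: a 5000-byte request rounds up to the 8 KiB
    * bucket (size_log2 = 13, bucket 1), while anything at or below 4 KiB
    * shares bucket 0.
    */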
   const unsigned size_log2 = size < 4096 ? 12 : ilog2_round_up(size);
   const unsigned pow2_size = 1 << size_log2;
   const unsigned bucket = size_log2 - 12;
   assert(bucket < ARRAY_SIZE(pool->free_list));

   struct anv_bo *bo =
      util_sparse_array_free_list_pop_elem(&pool->free_list[bucket]);
   if (bo != NULL) {
      VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));
      *bo_out = bo;
      return VK_SUCCESS;
   }

   VkResult result = anv_device_alloc_bo(pool->device,
                                         pool->name,
                                         pow2_size,
                                         ANV_BO_ALLOC_MAPPED |
                                         ANV_BO_ALLOC_SNOOPED |
                                         ANV_BO_ALLOC_CAPTURE,
                                         0 /* explicit_address */,
                                         &bo);
   if (result != VK_SUCCESS)
      return result;

   /* We want it to look like it came from this pool */
   VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0));
   VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));

   *bo_out = bo;

   return VK_SUCCESS;
}

void
anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo)
{
   VG(VALGRIND_MEMPOOL_FREE(pool, bo->map));

   assert(util_is_power_of_two_or_zero(bo->size));
   const unsigned size_log2 = ilog2_round_up(bo->size);
   const unsigned bucket = size_log2 - 12;
   assert(bucket < ARRAY_SIZE(pool->free_list));

   assert(util_sparse_array_get(&pool->device->bo_cache.bo_map,
                                bo->gem_handle) == bo);
   util_sparse_array_free_list_push(&pool->free_list[bucket],
                                    &bo->gem_handle, 1);
}

// Scratch pool

void
anv_scratch_pool_init(struct anv_device *device, struct anv_scratch_pool *pool)
{
   memset(pool, 0, sizeof(*pool));
}

void
anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool)
{
   for (unsigned s = 0; s < ARRAY_SIZE(pool->bos[0]); s++) {
      for (unsigned i = 0; i < 16; i++) {
         if (pool->bos[i][s] != NULL)
            anv_device_release_bo(device, pool->bos[i][s]);
      }
   }

   for (unsigned i = 0; i < 16; i++) {
      if (pool->surf_states[i].map != NULL) {
         anv_state_pool_free(&device->surface_state_pool,
                             pool->surf_states[i]);
      }
   }
}

struct anv_bo *
anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
                       gl_shader_stage stage, unsigned per_thread_scratch)
{
   if (per_thread_scratch == 0)
      return NULL;

   unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
   assert(scratch_size_log2 < 16);

   assert(stage < ARRAY_SIZE(pool->bos));

   const struct intel_device_info *devinfo = &device->info;

   /* On GFX version 12.5, scratch access changed to a surface-based model.
    * Instead of each shader type having its own layout based on IDs passed
    * from the relevant fixed-function unit, all scratch access is based on
    * thread IDs like it always has been for compute.
    */
   if (devinfo->verx10 >= 125)
      stage = MESA_SHADER_COMPUTE;

   struct anv_bo *bo = p_atomic_read(&pool->bos[scratch_size_log2][stage]);

   if (bo != NULL)
      return bo;

   unsigned subslices = MAX2(device->physical->subslice_total, 1);

   /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
    *
    *    "Scratch Space per slice is computed based on 4 sub-slices.  SW
    *     must allocate scratch space enough so that each slice has 4
    *     slices allowed."
    *
    * According to the other driver team, this applies to compute shaders
    * as well.  This is not currently documented at all.
    *
    * This hack is no longer necessary on Gfx11+.
    *
    * For Gfx11+, scratch space allocation is based on the number of threads
    * in the base configuration.
    */
   if (devinfo->verx10 == 125)
      subslices = 32;
   else if (devinfo->ver == 12)
      subslices = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2);
   else if (devinfo->ver == 11)
      subslices = 8;
   else if (devinfo->ver >= 9)
      subslices = 4 * devinfo->num_slices;

   unsigned scratch_ids_per_subslice;
   if (devinfo->ver >= 12) {
      /* Same as ICL below, but with 16 EUs. */
      scratch_ids_per_subslice = 16 * 8;
   } else if (devinfo->ver == 11) {
      /* The MEDIA_VFE_STATE docs say:
       *
       *    "Starting with this configuration, the Maximum Number of
       *     Threads must be set to (#EU * 8) for GPGPU dispatches.
       *
       *     Although there are only 7 threads per EU in the configuration,
       *     the FFTID is calculated as if there are 8 threads per EU,
       *     which in turn requires a larger amount of Scratch Space to be
       *     allocated by the driver."
       */
      scratch_ids_per_subslice = 8 * 8;
   } else if (devinfo->is_haswell) {
      /* WaCSScratchSize:hsw
       *
       * Haswell's scratch space address calculation appears to be sparse
       * rather than tightly packed.  The Thread ID has bits indicating
       * which subslice, EU within a subslice, and thread within an EU it
       * is.  There's a maximum of two slices and two subslices, so these
       * can be stored with a single bit.  Even though there are only 10 EUs
       * per subslice, this is stored in 4 bits, so there's an effective
       * maximum value of 16 EUs.  Similarly, although there are only 7
       * threads per EU, this is stored in a 3 bit number, giving an
       * effective maximum value of 8 threads per EU.
       *
       * This means that we need to use 16 * 8 instead of 10 * 7 for the
       * number of threads per subslice.
       */
      scratch_ids_per_subslice = 16 * 8;
   } else if (devinfo->is_cherryview) {
      /* Cherryview devices have either 6 or 8 EUs per subslice, and each EU
       * has 7 threads.  The 6 EU devices appear to calculate thread IDs as if
       * it had 8 EUs.
       */
      scratch_ids_per_subslice = 8 * 7;
   } else {
      scratch_ids_per_subslice = devinfo->max_cs_threads;
   }

   uint32_t max_threads[] = {
      [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
      [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
      [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
      [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
      [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
      [MESA_SHADER_COMPUTE]   = scratch_ids_per_subslice * subslices,
   };

   uint32_t size = per_thread_scratch * max_threads[stage];

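   /* Illustrative numbers only: with 2 KiB of per-thread scratch and, say,
    * 128 scratch IDs per subslice across 8 subslices in the compute case,
    * this works out to 2 KiB * 1024 threads = 2 MiB for the BO below.
    */
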
   /* Even though the Scratch base pointers in 3DSTATE_*S are 64 bits, they
    * are still relative to the general state base address.  When we emit
    * STATE_BASE_ADDRESS, we set general state base address to 0 and the size
    * to the maximum (1 page under 4GB).  This allows us to just place the
    * scratch buffers anywhere we wish in the bottom 32 bits of address space
    * and just set the scratch base pointer in 3DSTATE_*S using a relocation.
    * However, in order to do so, we need to ensure that the kernel does not
    * place the scratch BO above the 32-bit boundary.
    *
    * NOTE: Technically, it can't go "anywhere" because the top page is off
    * limits.  However, when EXEC_OBJECT_SUPPORTS_48B_ADDRESS is set, the
    * kernel allocates space using
    *
    *    end = min_t(u64, end, (1ULL << 32) - I915_GTT_PAGE_SIZE);
    *
    * so nothing will ever touch the top page.
    */
   VkResult result = anv_device_alloc_bo(device, "scratch", size,
                                         ANV_BO_ALLOC_32BIT_ADDRESS |
                                         ANV_BO_ALLOC_LOCAL_MEM,
                                         0 /* explicit_address */,
                                         &bo);
   if (result != VK_SUCCESS)
      return NULL; /* TODO */

   struct anv_bo *current_bo =
      p_atomic_cmpxchg(&pool->bos[scratch_size_log2][stage], NULL, bo);
   if (current_bo) {
      anv_device_release_bo(device, bo);
      return current_bo;
   } else {
      return bo;
   }
}

uint32_t
anv_scratch_pool_get_surf(struct anv_device *device,
                          struct anv_scratch_pool *pool,
                          unsigned per_thread_scratch)
{
   if (per_thread_scratch == 0)
      return 0;

   unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
   assert(scratch_size_log2 < 16);

   uint32_t surf = p_atomic_read(&pool->surfs[scratch_size_log2]);
   if (surf > 0)
      return surf;

   struct anv_bo *bo =
      anv_scratch_pool_alloc(device, pool, MESA_SHADER_COMPUTE,
                             per_thread_scratch);
   struct anv_address addr = { .bo = bo };

   struct anv_state state =
      anv_state_pool_alloc(&device->surface_state_pool,
                           device->isl_dev.ss.size, 64);

   isl_buffer_fill_state(&device->isl_dev, state.map,
                         .address = anv_address_physical(addr),
                         .size_B = bo->size,
                         .mocs = anv_mocs(device, bo, 0),
                         .format = ISL_FORMAT_RAW,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .stride_B = per_thread_scratch,
                         .is_scratch = true);

   uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2],
                                       0, state.offset);
   if (current) {
      anv_state_pool_free(&device->surface_state_pool, state);
      return current;
   } else {
      pool->surf_states[scratch_size_log2] = state;
      return state.offset;
   }
}

VkResult
anv_bo_cache_init(struct anv_bo_cache *cache)
{
   util_sparse_array_init(&cache->bo_map, sizeof(struct anv_bo), 1024);

   if (pthread_mutex_init(&cache->mutex, NULL)) {
      util_sparse_array_finish(&cache->bo_map);
      return vk_errorf(NULL, NULL, VK_ERROR_OUT_OF_HOST_MEMORY,
                       "pthread_mutex_init failed: %m");
   }

   return VK_SUCCESS;
}

void
anv_bo_cache_finish(struct anv_bo_cache *cache)
{
   util_sparse_array_finish(&cache->bo_map);
   pthread_mutex_destroy(&cache->mutex);
}

#define ANV_BO_CACHE_SUPPORTED_FLAGS \
   (EXEC_OBJECT_WRITE | \
    EXEC_OBJECT_ASYNC | \
    EXEC_OBJECT_SUPPORTS_48B_ADDRESS | \
    EXEC_OBJECT_PINNED | \
    EXEC_OBJECT_CAPTURE)

static uint32_t
anv_bo_alloc_flags_to_bo_flags(struct anv_device *device,
                               enum anv_bo_alloc_flags alloc_flags)
{
   struct anv_physical_device *pdevice = device->physical;

   uint64_t bo_flags = 0;
   if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS) &&
       pdevice->supports_48bit_addresses)
      bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if ((alloc_flags & ANV_BO_ALLOC_CAPTURE) && pdevice->has_exec_capture)
      bo_flags |= EXEC_OBJECT_CAPTURE;

   if (alloc_flags & ANV_BO_ALLOC_IMPLICIT_WRITE) {
      assert(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC);
      bo_flags |= EXEC_OBJECT_WRITE;
   }

   if (!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC) && pdevice->has_exec_async)
      bo_flags |= EXEC_OBJECT_ASYNC;

   if (pdevice->use_softpin)
      bo_flags |= EXEC_OBJECT_PINNED;

   return bo_flags;
}

static uint32_t
anv_device_get_bo_align(struct anv_device *device,
                        enum anv_bo_alloc_flags alloc_flags)
{
   /* Gfx12 CCS surface addresses need to be 64K aligned. */
   if (device->info.ver >= 12 && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS))
      return 64 * 1024;

   return 4096;
}

VkResult
anv_device_alloc_bo(struct anv_device *device,
                    const char *name,
                    uint64_t size,
                    enum anv_bo_alloc_flags alloc_flags,
                    uint64_t explicit_address,
                    struct anv_bo **bo_out)
{
   if (!device->physical->has_implicit_ccs)
      assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));

   const uint32_t bo_flags =
      anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
   assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));

   /* The kernel is going to give us whole pages anyway */
   size = align_u64(size, 4096);

   const uint32_t align = anv_device_get_bo_align(device, alloc_flags);

   uint64_t ccs_size = 0;
   if (device->info.has_aux_map && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)) {
      /* Align the size up to the next multiple of 64K so we don't have any
       * AUX-TT entries pointing from a 64K page to itself.
       */
      size = align_u64(size, 64 * 1024);

      /* See anv_bo::_ccs_size */
      ccs_size = align_u64(DIV_ROUND_UP(size, INTEL_AUX_MAP_GFX12_CCS_SCALE), 4096);
   }

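   /* Illustrative numbers only, assuming the usual 256:1 main-to-CCS ratio on
    * Gfx12: a 1 MiB allocation needs 4 KiB of CCS, so the GEM object below is
    * created with size + ccs_size bytes and the CCS data lives at the tail.
    */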
   uint32_t gem_handle;

   /* If we have vram size, we have multiple memory regions and should choose
    * one of them.
    */
   if (device->physical->vram.size > 0) {
      struct drm_i915_gem_memory_class_instance regions[2];
      uint32_t nregions = 0;

      if (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM) {
         /* For vram allocation, still use system memory as a fallback. */
         regions[nregions++] = device->physical->vram.region;
         regions[nregions++] = device->physical->sys.region;
      } else {
         regions[nregions++] = device->physical->sys.region;
      }

      gem_handle = anv_gem_create_regions(device, size + ccs_size,
                                          nregions, regions);
   } else {
      gem_handle = anv_gem_create(device, size + ccs_size);
   }

   if (gem_handle == 0)
      return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);

   struct anv_bo new_bo = {
      .name = name,
      .gem_handle = gem_handle,
      .refcount = 1,
      .offset = -1,
      .size = size,
      ._ccs_size = ccs_size,
      .flags = bo_flags,
      .is_external = (alloc_flags & ANV_BO_ALLOC_EXTERNAL),
      .has_client_visible_address =
         (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,
      .has_implicit_ccs = ccs_size > 0,
   };

   if (alloc_flags & ANV_BO_ALLOC_MAPPED) {
      new_bo.map = anv_gem_mmap(device, new_bo.gem_handle, 0, size, 0);
      if (new_bo.map == MAP_FAILED) {
         anv_gem_close(device, new_bo.gem_handle);
         return vk_errorf(device, &device->vk.base,
                          VK_ERROR_OUT_OF_HOST_MEMORY,
                          "mmap failed: %m");
      }
   }

   if (alloc_flags & ANV_BO_ALLOC_SNOOPED) {
      assert(alloc_flags & ANV_BO_ALLOC_MAPPED);
      /* We don't want to change these defaults if it's going to be shared
       * with another process.
       */
      assert(!(alloc_flags & ANV_BO_ALLOC_EXTERNAL));

      /* Regular objects are created I915_CACHING_CACHED on LLC platforms and
       * I915_CACHING_NONE on non-LLC platforms.  For many internal state
       * objects, we'd rather take the snooping overhead than risk forgetting
       * a CLFLUSH somewhere.  Userptr objects are always created as
       * I915_CACHING_CACHED, which on non-LLC means snooped so there's no
       * need to do this there.
       */
      if (!device->info.has_llc) {
         anv_gem_set_caching(device, new_bo.gem_handle,
                             I915_CACHING_CACHED);
      }
   }

   if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) {
      new_bo.has_fixed_address = true;
      new_bo.offset = explicit_address;
   } else if (new_bo.flags & EXEC_OBJECT_PINNED) {
      new_bo.offset = anv_vma_alloc(device, new_bo.size + new_bo._ccs_size,
                                    align, alloc_flags, explicit_address);
      if (new_bo.offset == 0) {
         if (new_bo.map)
            anv_gem_munmap(device, new_bo.map, size);
         anv_gem_close(device, new_bo.gem_handle);
         return vk_errorf(device, NULL, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                          "failed to allocate virtual address for BO");
      }
   } else {
      assert(!new_bo.has_client_visible_address);
   }

   if (new_bo._ccs_size > 0) {
      assert(device->info.has_aux_map);
      intel_aux_map_add_mapping(device->aux_map_ctx,
                                intel_canonical_address(new_bo.offset),
                                intel_canonical_address(new_bo.offset + new_bo.size),
                                new_bo.size, 0 /* format_bits */);
   }

   assert(new_bo.gem_handle);

   /* If we just got this gem_handle from anv_bo_init_new then we know no one
    * else is touching this BO at the moment so we don't need to lock here.
    */
   struct anv_bo *bo = anv_device_lookup_bo(device, new_bo.gem_handle);
   *bo = new_bo;

   *bo_out = bo;

   return VK_SUCCESS;
}

VkResult
anv_device_import_bo_from_host_ptr(struct anv_device *device,
                                   void *host_ptr, uint32_t size,
                                   enum anv_bo_alloc_flags alloc_flags,
                                   uint64_t client_address,
                                   struct anv_bo **bo_out)
{
   assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
                           ANV_BO_ALLOC_SNOOPED |
                           ANV_BO_ALLOC_FIXED_ADDRESS)));

   /* We can't do implicit CCS with an aux table on shared memory */
   if (!device->physical->has_implicit_ccs || device->info.has_aux_map)
      assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));

   struct anv_bo_cache *cache = &device->bo_cache;
   const uint32_t bo_flags =
      anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
   assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));

   uint32_t gem_handle = anv_gem_userptr(device, host_ptr, size);
   if (!gem_handle)
      return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);

   pthread_mutex_lock(&cache->mutex);

   struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
   if (bo->refcount > 0) {
      /* VK_EXT_external_memory_host doesn't require handling importing the
       * same pointer twice at the same time, but we don't get in the way.
If1858* kernel gives us the same gem_handle, only succeed if the flags match.1859*/1860assert(bo->gem_handle == gem_handle);1861if (bo_flags != bo->flags) {1862pthread_mutex_unlock(&cache->mutex);1863return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,1864"same host pointer imported two different ways");1865}18661867if (bo->has_client_visible_address !=1868((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) {1869pthread_mutex_unlock(&cache->mutex);1870return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,1871"The same BO was imported with and without buffer "1872"device address");1873}18741875if (client_address && client_address != intel_48b_address(bo->offset)) {1876pthread_mutex_unlock(&cache->mutex);1877return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,1878"The same BO was imported at two different "1879"addresses");1880}18811882__sync_fetch_and_add(&bo->refcount, 1);1883} else {1884struct anv_bo new_bo = {1885.name = "host-ptr",1886.gem_handle = gem_handle,1887.refcount = 1,1888.offset = -1,1889.size = size,1890.map = host_ptr,1891.flags = bo_flags,1892.is_external = true,1893.from_host_ptr = true,1894.has_client_visible_address =1895(alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,1896};18971898assert(client_address == intel_48b_address(client_address));1899if (new_bo.flags & EXEC_OBJECT_PINNED) {1900assert(new_bo._ccs_size == 0);1901new_bo.offset = anv_vma_alloc(device, new_bo.size,1902anv_device_get_bo_align(device,1903alloc_flags),1904alloc_flags, client_address);1905if (new_bo.offset == 0) {1906anv_gem_close(device, new_bo.gem_handle);1907pthread_mutex_unlock(&cache->mutex);1908return vk_errorf(device, NULL, VK_ERROR_OUT_OF_DEVICE_MEMORY,1909"failed to allocate virtual address for BO");1910}1911} else {1912assert(!new_bo.has_client_visible_address);1913}19141915*bo = new_bo;1916}19171918pthread_mutex_unlock(&cache->mutex);1919*bo_out = bo;19201921return VK_SUCCESS;1922}19231924VkResult1925anv_device_import_bo(struct anv_device *device,1926int fd,1927enum anv_bo_alloc_flags alloc_flags,1928uint64_t client_address,1929struct anv_bo **bo_out)1930{1931assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |1932ANV_BO_ALLOC_SNOOPED |1933ANV_BO_ALLOC_FIXED_ADDRESS)));19341935/* We can't do implicit CCS with an aux table on shared memory */1936if (!device->physical->has_implicit_ccs || device->info.has_aux_map)1937assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));19381939struct anv_bo_cache *cache = &device->bo_cache;1940const uint32_t bo_flags =1941anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);1942assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));19431944pthread_mutex_lock(&cache->mutex);19451946uint32_t gem_handle = anv_gem_fd_to_handle(device, fd);1947if (!gem_handle) {1948pthread_mutex_unlock(&cache->mutex);1949return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);1950}19511952struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);1953if (bo->refcount > 0) {1954/* We have to be careful how we combine flags so that it makes sense.1955* Really, though, if we get to this case and it actually matters, the1956* client has imported a BO twice in different ways and they get what1957* they have coming.1958*/1959uint64_t new_flags = 0;1960new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_WRITE;1961new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_ASYNC;1962new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_SUPPORTS_48B_ADDRESS;1963new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_PINNED;1964new_flags |= (bo->flags | bo_flags) & 

VkResult
anv_device_export_bo(struct anv_device *device,
                     struct anv_bo *bo, int *fd_out)
{
   assert(anv_device_lookup_bo(device, bo->gem_handle) == bo);

   /* This BO must have been flagged external in order for us to be able
    * to export it.  This is done based on external options passed into
    * anv_AllocateMemory.
    */
   assert(bo->is_external);

   int fd = anv_gem_handle_to_fd(device, bo->gem_handle);
   if (fd < 0)
      return vk_error(VK_ERROR_TOO_MANY_OBJECTS);

   *fd_out = fd;

   return VK_SUCCESS;
}
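
/* Illustrative sketch (not part of the driver, kept out of the build):
 * exporting an external BO as a dma-buf fd.  That the returned fd is owned
 * by the caller, who either hands it on (e.g. for vkGetMemoryFdKHR) or
 * closes it, follows the usual DRM PRIME convention and is an assumption of
 * this example rather than something enforced here.
 */
#if 0
static VkResult
example_export_and_close(struct anv_device *device, struct anv_bo *bo)
{
   int fd;
   VkResult result = anv_device_export_bo(device, bo, &fd);
   if (result != VK_SUCCESS)
      return result;

   /* ... pass fd to another process or API ... */

   close(fd); /* we now own the descriptor, so we must close it */
   return VK_SUCCESS;
}
#endif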

static bool
atomic_dec_not_one(uint32_t *counter)
{
   uint32_t old, val;

   val = *counter;
   while (1) {
      if (val == 1)
         return false;

      old = __sync_val_compare_and_swap(counter, val, val - 1);
      if (old == val)
         return true;

      val = old;
   }
}

void
anv_device_release_bo(struct anv_device *device,
                      struct anv_bo *bo)
{
   struct anv_bo_cache *cache = &device->bo_cache;
   assert(anv_device_lookup_bo(device, bo->gem_handle) == bo);

   /* Try to decrement the counter but don't go below one.  If this succeeds
    * then the refcount has been decremented and we are not the last
    * reference.
    */
   if (atomic_dec_not_one(&bo->refcount))
      return;

   pthread_mutex_lock(&cache->mutex);

   /* We are probably the last reference since our attempt to decrement above
    * failed.  However, we can't actually know until we are inside the mutex.
    * Otherwise, someone could import the BO between the decrement and our
    * taking the mutex.
    */
   if (unlikely(__sync_sub_and_fetch(&bo->refcount, 1) > 0)) {
      /* Turns out we're not the last reference.  Unlock and bail. */
      pthread_mutex_unlock(&cache->mutex);
      return;
   }
   assert(bo->refcount == 0);

   if (bo->map && !bo->from_host_ptr)
      anv_gem_munmap(device, bo->map, bo->size);

   if (bo->_ccs_size > 0) {
      assert(device->physical->has_implicit_ccs);
      assert(device->info.has_aux_map);
      assert(bo->has_implicit_ccs);
      intel_aux_map_unmap_range(device->aux_map_ctx,
                                intel_canonical_address(bo->offset),
                                bo->size);
   }

   if ((bo->flags & EXEC_OBJECT_PINNED) && !bo->has_fixed_address)
      anv_vma_free(device, bo->offset, bo->size + bo->_ccs_size);

   uint32_t gem_handle = bo->gem_handle;

   /* Memset the BO just in case.  The refcount being zero should be enough
    * to prevent someone from assuming the data is valid, but it's safer to
    * stomp it to zero anyway.  We explicitly do this *before* we close the
    * GEM handle to ensure that if anyone allocates something and gets the
    * same GEM handle, the memset has already happened and won't stomp all
    * over any data they may write in this BO.
    */
   memset(bo, 0, sizeof(*bo));

   anv_gem_close(device, gem_handle);

   /* Don't unlock until we've actually closed the BO.  The whole point of
    * the BO cache is to ensure that we correctly handle races with creating
    * and releasing GEM handles, and we don't want to let someone import the
    * BO again between mutex unlock and closing the GEM handle.
    */
   pthread_mutex_unlock(&cache->mutex);
}
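
/* Illustrative sketch (not part of the driver, kept out of the build): the
 * behaviour of atomic_dec_not_one() above, spelled out single-threaded.  It
 * decrements the counter unless that would take it from 1 to 0, in which
 * case it returns false and leaves the counter untouched; the last reference
 * is then dropped under the cache mutex in anv_device_release_bo().
 */
#if 0
static void
example_atomic_dec_not_one(void)
{
   uint32_t refcount = 3;

   assert(atomic_dec_not_one(&refcount) && refcount == 2);
   assert(atomic_dec_not_one(&refcount) && refcount == 1);

   /* Last reference: the helper refuses to drop it, so the caller has to
    * take the cache mutex and do the final decrement there.
    */
   assert(!atomic_dec_not_one(&refcount) && refcount == 1);
}
#endif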