Path: blob/21.2-virgl/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
/*
 * Copyright © 2011 Marek Olšák <[email protected]>
 * Copyright © 2015 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

#include "amdgpu_cs.h"

#include "util/hash_table.h"
#include "util/os_time.h"
#include "util/u_hash_table.h"
#include "frontend/drm_driver.h"
#include "drm-uapi/amdgpu_drm.h"
#include <xf86drm.h>
#include <stdio.h>
#include <inttypes.h>

#ifndef AMDGPU_VA_RANGE_HIGH
#define AMDGPU_VA_RANGE_HIGH 0x2
#endif

/* Set to 1 for verbose output showing committed sparse buffer ranges. */
#define DEBUG_SPARSE_COMMITS 0

struct amdgpu_sparse_backing_chunk {
   uint32_t begin, end;
};

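/* Wait (or poll, when timeout == 0) until a buffer is idle. Shared buffers
 * fall back to the kernel's amdgpu_bo_wait_for_idle because user fences are
 * local to this process; all other buffers are checked against the per-buffer
 * fence list under ws->bo_fence_lock.
 */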
static bool amdgpu_bo_wait(struct radeon_winsys *rws,
                           struct pb_buffer *_buf, uint64_t timeout,
                           enum radeon_bo_usage usage)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   int64_t abs_timeout = 0;

   if (timeout == 0) {
      if (p_atomic_read(&bo->num_active_ioctls))
         return false;

   } else {
      abs_timeout = os_time_get_absolute_timeout(timeout);

      /* Wait if any ioctl is being submitted with this buffer. */
      if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
         return false;
   }

   if (bo->bo && bo->u.real.is_shared) {
      /* We can't use user fences for shared buffers, because user fences
       * are local to this process only. If we want to wait for all buffer
       * uses in all processes, we have to use amdgpu_bo_wait_for_idle.
       */
      bool buffer_busy = true;
      int r;

      r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
      if (r)
         fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__,
                 r);
      return !buffer_busy;
   }

   if (timeout == 0) {
      unsigned idle_fences;
      bool buffer_idle;

      simple_mtx_lock(&ws->bo_fence_lock);

      for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
         if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
            break;
      }

      /* Release the idle fences to avoid checking them again later. */
      for (unsigned i = 0; i < idle_fences; ++i)
         amdgpu_fence_reference(&bo->fences[i], NULL);

      memmove(&bo->fences[0], &bo->fences[idle_fences],
              (bo->num_fences - idle_fences) * sizeof(*bo->fences));
      bo->num_fences -= idle_fences;

      buffer_idle = !bo->num_fences;
      simple_mtx_unlock(&ws->bo_fence_lock);

      return buffer_idle;
   } else {
      bool buffer_idle = true;

      simple_mtx_lock(&ws->bo_fence_lock);
      while (bo->num_fences && buffer_idle) {
         struct pipe_fence_handle *fence = NULL;
         bool fence_idle = false;

         amdgpu_fence_reference(&fence, bo->fences[0]);

         /* Wait for the fence. */
         simple_mtx_unlock(&ws->bo_fence_lock);
         if (amdgpu_fence_wait(fence, abs_timeout, true))
            fence_idle = true;
         else
            buffer_idle = false;
         simple_mtx_lock(&ws->bo_fence_lock);

         /* Release an idle fence to avoid checking it again later, keeping in
          * mind that the fence array may have been modified by other threads.
          */
         if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
            amdgpu_fence_reference(&bo->fences[0], NULL);
            memmove(&bo->fences[0], &bo->fences[1],
                    (bo->num_fences - 1) * sizeof(*bo->fences));
            bo->num_fences--;
         }

         amdgpu_fence_reference(&fence, NULL);
      }
      simple_mtx_unlock(&ws->bo_fence_lock);

      return buffer_idle;
   }
}

static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
      struct pb_buffer *buf)
{
   return ((struct amdgpu_winsys_bo*)buf)->base.placement;
}

static enum radeon_bo_flag amdgpu_bo_get_flags(
      struct pb_buffer *buf)
{
   return ((struct amdgpu_winsys_bo*)buf)->base.usage;
}

static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
{
   for (unsigned i = 0; i < bo->num_fences; ++i)
      amdgpu_fence_reference(&bo->fences[i], NULL);

   FREE(bo->fences);
   bo->num_fences = 0;
   bo->max_fences = 0;
}

void amdgpu_bo_destroy(struct amdgpu_winsys *ws, struct pb_buffer *_buf)
{
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   struct amdgpu_screen_winsys *sws_iter;

   assert(bo->bo && "must not be called for slab entries");

   if (!bo->u.real.is_user_ptr && bo->u.real.cpu_ptr) {
      bo->u.real.cpu_ptr = NULL;
      amdgpu_bo_unmap(&ws->dummy_ws.base, &bo->base);
   }
   assert(bo->u.real.is_user_ptr || bo->u.real.map_count == 0);

#if DEBUG
   if (ws->debug_all_bos) {
      simple_mtx_lock(&ws->global_bo_list_lock);
      list_del(&bo->u.real.global_list_item);
      ws->num_buffers--;
      simple_mtx_unlock(&ws->global_bo_list_lock);
   }
#endif

   /* Close all KMS handles retrieved for other DRM file descriptions */
   simple_mtx_lock(&ws->sws_list_lock);
   for (sws_iter = ws->sws_list; sws_iter; sws_iter = sws_iter->next) {
      struct hash_entry *entry;

      if (!sws_iter->kms_handles)
         continue;

      entry = _mesa_hash_table_search(sws_iter->kms_handles, bo);
      if (entry) {
         struct drm_gem_close args = { .handle = (uintptr_t)entry->data };

         drmIoctl(sws_iter->fd, DRM_IOCTL_GEM_CLOSE, &args);
         _mesa_hash_table_remove(sws_iter->kms_handles, entry);
      }
   }
   simple_mtx_unlock(&ws->sws_list_lock);

   simple_mtx_lock(&ws->bo_export_table_lock);
   _mesa_hash_table_remove_key(ws->bo_export_table, bo->bo);
   simple_mtx_unlock(&ws->bo_export_table_lock);

   if (bo->base.placement & RADEON_DOMAIN_VRAM_GTT) {
      amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
      amdgpu_va_range_free(bo->u.real.va_handle);
   }
   amdgpu_bo_free(bo->bo);

   amdgpu_bo_remove_fences(bo);

   if (bo->base.placement & RADEON_DOMAIN_VRAM)
      ws->allocated_vram -= align64(bo->base.size, ws->info.gart_page_size);
   else if (bo->base.placement & RADEON_DOMAIN_GTT)
      ws->allocated_gtt -= align64(bo->base.size, ws->info.gart_page_size);

   simple_mtx_destroy(&bo->lock);
   FREE(bo);
}

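/* Destruction entry point for real (non-slab) buffers: buffers that came from
 * the reusable pool are returned to pb_cache instead of being freed outright.
 */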
static void amdgpu_bo_destroy_or_cache(struct radeon_winsys *rws, struct pb_buffer *_buf)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);

   assert(bo->bo); /* slab buffers have a separate vtbl */

   if (bo->u.real.use_reusable_pool)
      pb_cache_add_buffer(bo->cache_entry);
   else
      amdgpu_bo_destroy(ws, _buf);
}

static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws)
{
   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
      pb_slabs_reclaim(&ws->bo_slabs[i]);
      if (ws->info.has_tmz_support)
         pb_slabs_reclaim(&ws->bo_slabs_encrypted[i]);
   }

   pb_cache_release_all_buffers(&ws->bo_cache);
}

static bool amdgpu_bo_do_map(struct radeon_winsys *rws, struct amdgpu_winsys_bo *bo, void **cpu)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);

   assert(!(bo->base.usage & RADEON_FLAG_SPARSE) && bo->bo && !bo->u.real.is_user_ptr);
   int r = amdgpu_bo_cpu_map(bo->bo, cpu);
   if (r) {
      /* Clean up buffer managers and try again. */
      amdgpu_clean_up_buffer_managers(ws);
      r = amdgpu_bo_cpu_map(bo->bo, cpu);
      if (r)
         return false;
   }

   if (p_atomic_inc_return(&bo->u.real.map_count) == 1) {
      if (bo->base.placement & RADEON_DOMAIN_VRAM)
         ws->mapped_vram += bo->base.size;
      else if (bo->base.placement & RADEON_DOMAIN_GTT)
         ws->mapped_gtt += bo->base.size;
      ws->num_mapped_buffers++;
   }

   return true;
}

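/* CPU mapping. Unless PIPE_MAP_UNSYNCHRONIZED is set, this first synchronizes
 * against GPU use: read-only maps only wait for writers, write maps wait for
 * all users, and PIPE_MAP_DONTBLOCK turns the wait into a flush-and-fail.
 */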
void *amdgpu_bo_map(struct radeon_winsys *rws,
                    struct pb_buffer *buf,
                    struct radeon_cmdbuf *rcs,
                    enum pipe_map_flags usage)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
   struct amdgpu_winsys_bo *real;
   struct amdgpu_cs *cs = rcs ? amdgpu_cs(rcs) : NULL;

   assert(!(bo->base.usage & RADEON_FLAG_SPARSE));

   /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
   if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) {
      /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
      if (usage & PIPE_MAP_DONTBLOCK) {
         if (!(usage & PIPE_MAP_WRITE)) {
            /* Mapping for read.
             *
             * Since we are mapping for read, we don't need to wait
             * if the GPU is using the buffer for read too
             * (neither one is changing it).
             *
             * Only check whether the buffer is being used for write. */
            if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
                                                               RADEON_USAGE_WRITE)) {
               cs->flush_cs(cs->flush_data,
                            RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
               return NULL;
            }

            if (!amdgpu_bo_wait(rws, (struct pb_buffer*)bo, 0,
                                RADEON_USAGE_WRITE)) {
               return NULL;
            }
         } else {
            if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
               cs->flush_cs(cs->flush_data,
                            RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
               return NULL;
            }

            if (!amdgpu_bo_wait(rws, (struct pb_buffer*)bo, 0,
                                RADEON_USAGE_READWRITE)) {
               return NULL;
            }
         }
      } else {
         uint64_t time = os_time_get_nano();

         if (!(usage & PIPE_MAP_WRITE)) {
            /* Mapping for read.
             *
             * Since we are mapping for read, we don't need to wait
             * if the GPU is using the buffer for read too
             * (neither one is changing it).
             *
             * Only check whether the buffer is being used for write. */
            if (cs) {
               if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
                                                            RADEON_USAGE_WRITE)) {
                  cs->flush_cs(cs->flush_data,
                               RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
               } else {
                  /* Try to avoid busy-waiting in amdgpu_bo_wait. */
                  if (p_atomic_read(&bo->num_active_ioctls))
                     amdgpu_cs_sync_flush(rcs);
               }
            }

            amdgpu_bo_wait(rws, (struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                           RADEON_USAGE_WRITE);
         } else {
            /* Mapping for write. */
            if (cs) {
               if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
                  cs->flush_cs(cs->flush_data,
                               RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
               } else {
                  /* Try to avoid busy-waiting in amdgpu_bo_wait. */
                  if (p_atomic_read(&bo->num_active_ioctls))
                     amdgpu_cs_sync_flush(rcs);
               }
            }

            amdgpu_bo_wait(rws, (struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                           RADEON_USAGE_READWRITE);
         }

         ws->buffer_wait_time += os_time_get_nano() - time;
      }
   }

   /* Buffer synchronization has been checked, now actually map the buffer. */
   void *cpu = NULL;
   uint64_t offset = 0;

   if (bo->bo) {
      real = bo;
   } else {
      real = bo->u.slab.real;
      offset = bo->va - real->va;
   }

   if (usage & RADEON_MAP_TEMPORARY) {
      if (real->u.real.is_user_ptr) {
         cpu = real->u.real.cpu_ptr;
      } else {
         if (!amdgpu_bo_do_map(rws, real, &cpu))
            return NULL;
      }
   } else {
      cpu = p_atomic_read(&real->u.real.cpu_ptr);
      if (!cpu) {
         simple_mtx_lock(&real->lock);
         /* Must re-check due to the possibility of a race. Re-check need not
          * be atomic thanks to the lock. */
         cpu = real->u.real.cpu_ptr;
         if (!cpu) {
            if (!amdgpu_bo_do_map(rws, real, &cpu)) {
               simple_mtx_unlock(&real->lock);
               return NULL;
            }
            p_atomic_set(&real->u.real.cpu_ptr, cpu);
         }
         simple_mtx_unlock(&real->lock);
      }
   }

   return (uint8_t*)cpu + offset;
}

void amdgpu_bo_unmap(struct radeon_winsys *rws, struct pb_buffer *buf)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
   struct amdgpu_winsys_bo *real;

   assert(!(bo->base.usage & RADEON_FLAG_SPARSE));

   real = bo->bo ? bo : bo->u.slab.real;

   if (real->u.real.is_user_ptr)
      return;

   assert(real->u.real.map_count != 0 && "too many unmaps");
   if (p_atomic_dec_zero(&real->u.real.map_count)) {
      assert(!real->u.real.cpu_ptr &&
             "too many unmaps or forgot RADEON_MAP_TEMPORARY flag");

      if (real->base.placement & RADEON_DOMAIN_VRAM)
         ws->mapped_vram -= real->base.size;
      else if (real->base.placement & RADEON_DOMAIN_GTT)
         ws->mapped_gtt -= real->base.size;
      ws->num_mapped_buffers--;
   }

   amdgpu_bo_cpu_unmap(real->bo);
}

static const struct pb_vtbl amdgpu_winsys_bo_vtbl = {
   /* Cast to void* because one of the function parameters is a struct pointer instead of void*. */
   (void*)amdgpu_bo_destroy_or_cache
   /* other functions are never called */
};

static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys *ws, struct amdgpu_winsys_bo *bo)
{
#if DEBUG
   assert(bo->bo);

   if (ws->debug_all_bos) {
      simple_mtx_lock(&ws->global_bo_list_lock);
      list_addtail(&bo->u.real.global_list_item, &ws->global_bo_list);
      ws->num_buffers++;
      simple_mtx_unlock(&ws->global_bo_list_lock);
   }
#endif
}

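/* Pick a larger alignment than requested when it is cheap to do so: for
 * example, with a 64 KiB PTE fragment size a 256 KiB request is aligned to
 * 64 KiB, while a 12 KiB request is aligned to 8 KiB (the largest power of
 * two not exceeding its size). This helps address translation.
 */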
static unsigned amdgpu_get_optimal_alignment(struct amdgpu_winsys *ws,
                                             uint64_t size, unsigned alignment)
{
   /* Increase the alignment for faster address translation and better memory
    * access pattern.
    */
   if (size >= ws->info.pte_fragment_size) {
      alignment = MAX2(alignment, ws->info.pte_fragment_size);
   } else if (size) {
      unsigned msb = util_last_bit(size);

      alignment = MAX2(alignment, 1u << (msb - 1));
   }
   return alignment;
}

static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
                                                 uint64_t size,
                                                 unsigned alignment,
                                                 enum radeon_bo_domain initial_domain,
                                                 unsigned flags,
                                                 int heap)
{
   struct amdgpu_bo_alloc_request request = {0};
   amdgpu_bo_handle buf_handle;
   uint64_t va = 0;
   struct amdgpu_winsys_bo *bo;
   amdgpu_va_handle va_handle = NULL;
   int r;
   bool init_pb_cache;

   /* VRAM or GTT must be specified, but not both at the same time. */
   assert(util_bitcount(initial_domain & (RADEON_DOMAIN_VRAM_GTT |
                                          RADEON_DOMAIN_GDS |
                                          RADEON_DOMAIN_OA)) == 1);

   alignment = amdgpu_get_optimal_alignment(ws, size, alignment);

   init_pb_cache = heap >= 0 && (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING);

   bo = CALLOC(1, sizeof(struct amdgpu_winsys_bo) +
                  init_pb_cache * sizeof(struct pb_cache_entry));
   if (!bo) {
      return NULL;
   }

   if (init_pb_cache) {
      bo->u.real.use_reusable_pool = true;
      pb_cache_init_entry(&ws->bo_cache, bo->cache_entry, &bo->base,
                          heap);
   }
   request.alloc_size = size;
   request.phys_alignment = alignment;

   if (initial_domain & RADEON_DOMAIN_VRAM) {
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;

      /* Since VRAM and GTT have almost the same performance on APUs, we could
       * just set GTT. However, in order to decrease GTT(RAM) usage, which is
       * shared with the OS, allow VRAM placements too. The idea is not to use
       * VRAM usefully, but to use it so that it's not unused and wasted.
       */
      if (!ws->info.has_dedicated_vram)
         request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
   }

   if (initial_domain & RADEON_DOMAIN_GTT)
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
   if (initial_domain & RADEON_DOMAIN_GDS)
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS;
   if (initial_domain & RADEON_DOMAIN_OA)
      request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA;

   if (flags & RADEON_FLAG_NO_CPU_ACCESS)
      request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
   if (flags & RADEON_FLAG_GTT_WC)
      request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
   if (ws->zero_all_vram_allocs &&
       (request.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM))
      request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
   if ((flags & RADEON_FLAG_ENCRYPTED) &&
       ws->info.has_tmz_support) {
      request.flags |= AMDGPU_GEM_CREATE_ENCRYPTED;

      if (!(flags & RADEON_FLAG_DRIVER_INTERNAL)) {
         struct amdgpu_screen_winsys *sws_iter;
         simple_mtx_lock(&ws->sws_list_lock);
         for (sws_iter = ws->sws_list; sws_iter; sws_iter = sws_iter->next) {
            *((bool*) &sws_iter->base.uses_secure_bos) = true;
         }
         simple_mtx_unlock(&ws->sws_list_lock);
      }
   }

   r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
   if (r) {
      fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
      fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size);
      fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment);
      fprintf(stderr, "amdgpu: domains : %u\n", initial_domain);
      fprintf(stderr, "amdgpu: flags : %" PRIx64 "\n", request.flags);
      goto error_bo_alloc;
   }

   if (initial_domain & RADEON_DOMAIN_VRAM_GTT) {
      unsigned va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;

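      /* Allocate a GPU virtual address range for the buffer and map it.
       * Pages are always readable and executable; writes are disabled only
       * for RADEON_FLAG_READ_ONLY, and RADEON_FLAG_UNCACHED selects the UC
       * MTYPE.
       */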
      r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                                size + va_gap_size, alignment,
                                0, &va, &va_handle,
                                (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
                                AMDGPU_VA_RANGE_HIGH);
      if (r)
         goto error_va_alloc;

      unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |
                          AMDGPU_VM_PAGE_EXECUTABLE;

      if (!(flags & RADEON_FLAG_READ_ONLY))
         vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;

      if (flags & RADEON_FLAG_UNCACHED)
         vm_flags |= AMDGPU_VM_MTYPE_UC;

      r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,
                              AMDGPU_VA_OP_MAP);
      if (r)
         goto error_va_map;
   }

   simple_mtx_init(&bo->lock, mtx_plain);
   pipe_reference_init(&bo->base.reference, 1);
   bo->base.alignment_log2 = util_logbase2(alignment);
   bo->base.size = size;
   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
   bo->bo = buf_handle;
   bo->va = va;
   bo->u.real.va_handle = va_handle;
   bo->base.placement = initial_domain;
   bo->base.usage = flags;
   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);

   if (initial_domain & RADEON_DOMAIN_VRAM)
      ws->allocated_vram += align64(size, ws->info.gart_page_size);
   else if (initial_domain & RADEON_DOMAIN_GTT)
      ws->allocated_gtt += align64(size, ws->info.gart_page_size);

   amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);

   amdgpu_add_buffer_to_global_list(ws, bo);

   return bo;

error_va_map:
   amdgpu_va_range_free(va_handle);

error_va_alloc:
   amdgpu_bo_free(buf_handle);

error_bo_alloc:
   FREE(bo);
   return NULL;
}

bool amdgpu_bo_can_reclaim(struct amdgpu_winsys *ws, struct pb_buffer *_buf)
{
   return amdgpu_bo_wait(&ws->dummy_ws.base, _buf, 0, RADEON_USAGE_READWRITE);
}

bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
{
   struct amdgpu_winsys_bo *bo = container_of(entry, struct amdgpu_winsys_bo, u.slab.entry);

   return amdgpu_bo_can_reclaim(priv, &bo->base);
}

static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size,
                                  enum radeon_bo_flag flags)
{
   struct pb_slabs *bo_slabs = ((flags & RADEON_FLAG_ENCRYPTED) && ws->info.has_tmz_support) ?
      ws->bo_slabs_encrypted : ws->bo_slabs;
   /* Find the correct slab allocator for the given size. */
   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
      struct pb_slabs *slabs = &bo_slabs[i];

      if (size <= 1 << (slabs->min_order + slabs->num_orders - 1))
         return slabs;
   }

   assert(0);
   return NULL;
}

static unsigned get_slab_wasted_size(struct amdgpu_winsys *ws, struct amdgpu_winsys_bo *bo)
{
   assert(bo->base.size <= bo->u.slab.entry.entry_size);
   assert(bo->base.size < (1 << bo->base.alignment_log2) ||
          bo->base.size < 1 << ws->bo_slabs[0].min_order ||
          bo->base.size > bo->u.slab.entry.entry_size / 2);
   return bo->u.slab.entry.entry_size - bo->base.size;
}

static void amdgpu_bo_slab_destroy(struct radeon_winsys *rws, struct pb_buffer *_buf)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   struct pb_slabs *slabs;

   assert(!bo->bo);

   slabs = get_slabs(ws, bo->base.size, bo->base.usage & RADEON_FLAG_ENCRYPTED);

   if (bo->base.placement & RADEON_DOMAIN_VRAM)
      ws->slab_wasted_vram -= get_slab_wasted_size(ws, bo);
   else
      ws->slab_wasted_gtt -= get_slab_wasted_size(ws, bo);

   pb_slab_free(slabs, &bo->u.slab.entry);
}

static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
   /* Cast to void* because one of the function parameters is a struct pointer instead of void*. */
   (void*)amdgpu_bo_slab_destroy
   /* other functions are never called */
};

/* Return the power of two size of a slab entry matching the input size. */
static unsigned get_slab_pot_entry_size(struct amdgpu_winsys *ws, unsigned size)
{
   unsigned entry_size = util_next_power_of_two(size);
   unsigned min_entry_size = 1 << ws->bo_slabs[0].min_order;

   return MAX2(entry_size, min_entry_size);
}

/* Return the slab entry alignment. */
static unsigned get_slab_entry_alignment(struct amdgpu_winsys *ws, unsigned size)
{
   unsigned entry_size = get_slab_pot_entry_size(ws, size);

   if (size <= entry_size * 3 / 4)
      return entry_size / 4;

   return entry_size;
}

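/* Example for the two helpers above: a 96 KiB request rounds up to a 128 KiB
 * power-of-two entry, and because 96 KiB <= 3/4 of 128 KiB the entry only
 * needs 32 KiB alignment, while a 100 KiB request must be aligned to the full
 * 128 KiB.
 */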
static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
                                            unsigned entry_size,
                                            unsigned group_index,
                                            bool encrypted)
{
   struct amdgpu_winsys *ws = priv;
   struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
   enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
   enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
   uint32_t base_id;
   unsigned slab_size = 0;

   if (!slab)
      return NULL;

   if (encrypted)
      flags |= RADEON_FLAG_ENCRYPTED;

   struct pb_slabs *slabs = ((flags & RADEON_FLAG_ENCRYPTED) && ws->info.has_tmz_support) ?
      ws->bo_slabs_encrypted : ws->bo_slabs;

   /* Determine the slab buffer size. */
   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
      unsigned max_entry_size = 1 << (slabs[i].min_order + slabs[i].num_orders - 1);

      if (entry_size <= max_entry_size) {
         /* The slab size is twice the size of the largest possible entry. */
         slab_size = max_entry_size * 2;

         if (!util_is_power_of_two_nonzero(entry_size)) {
            assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));

            /* If the entry size is 3/4 of a power of two, we would waste space and not gain
             * anything if we allocated only twice the power of two for the backing buffer:
             *   2 * 3/4 = 1.5 usable with buffer size 2
             *
             * Allocating 5 times the entry size leads us to the next power of two and results
             * in a much better memory utilization:
             *   5 * 3/4 = 3.75 usable with buffer size 4
             */
            if (entry_size * 5 > slab_size)
               slab_size = util_next_power_of_two(entry_size * 5);
         }

         /* The largest slab should have the same size as the PTE fragment
          * size to get faster address translation.
          */
         if (i == NUM_SLAB_ALLOCATORS - 1 &&
             slab_size < ws->info.pte_fragment_size)
            slab_size = ws->info.pte_fragment_size;
         break;
      }
   }
   assert(slab_size != 0);

   slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(ws,
                                                    slab_size, slab_size,
                                                    domains, flags));
   if (!slab->buffer)
      goto fail;

   slab_size = slab->buffer->base.size;

   slab->base.num_entries = slab_size / entry_size;
   slab->base.num_free = slab->base.num_entries;
   slab->entry_size = entry_size;
   slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
   if (!slab->entries)
      goto fail_buffer;

   list_inithead(&slab->base.free);

   base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries);

   for (unsigned i = 0; i < slab->base.num_entries; ++i) {
      struct amdgpu_winsys_bo *bo = &slab->entries[i];

      simple_mtx_init(&bo->lock, mtx_plain);
      bo->base.alignment_log2 = util_logbase2(get_slab_entry_alignment(ws, entry_size));
      bo->base.size = entry_size;
      bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl;
      bo->va = slab->buffer->va + i * entry_size;
      bo->base.placement = domains;
      bo->unique_id = base_id + i;
      bo->u.slab.entry.slab = &slab->base;
      bo->u.slab.entry.group_index = group_index;
      bo->u.slab.entry.entry_size = entry_size;

      if (slab->buffer->bo) {
         /* The slab is not suballocated. */
         bo->u.slab.real = slab->buffer;
      } else {
         /* The slab is allocated out of a bigger slab. */
         bo->u.slab.real = slab->buffer->u.slab.real;
         assert(bo->u.slab.real->bo);
      }

      list_addtail(&bo->u.slab.entry.head, &slab->base.free);
   }

   /* Wasted alignment due to slabs with 3/4 allocations being aligned to a power of two. */
   assert(slab->base.num_entries * entry_size <= slab_size);
   if (domains & RADEON_DOMAIN_VRAM)
      ws->slab_wasted_vram += slab_size - slab->base.num_entries * entry_size;
   else
      ws->slab_wasted_gtt += slab_size - slab->base.num_entries * entry_size;

   return &slab->base;

fail_buffer:
   amdgpu_winsys_bo_reference(ws, &slab->buffer, NULL);
fail:
   FREE(slab);
   return NULL;
}

struct pb_slab *amdgpu_bo_slab_alloc_encrypted(void *priv, unsigned heap,
                                               unsigned entry_size,
                                               unsigned group_index)
{
   return amdgpu_bo_slab_alloc(priv, heap, entry_size, group_index, true);
}

struct pb_slab *amdgpu_bo_slab_alloc_normal(void *priv, unsigned heap,
                                            unsigned entry_size,
                                            unsigned group_index)
{
   return amdgpu_bo_slab_alloc(priv, heap, entry_size, group_index, false);
}

void amdgpu_bo_slab_free(struct amdgpu_winsys *ws, struct pb_slab *pslab)
{
   struct amdgpu_slab *slab = amdgpu_slab(pslab);
   unsigned slab_size = slab->buffer->base.size;

   assert(slab->base.num_entries * slab->entry_size <= slab_size);
   if (slab->buffer->base.placement & RADEON_DOMAIN_VRAM)
      ws->slab_wasted_vram -= slab_size - slab->base.num_entries * slab->entry_size;
   else
      ws->slab_wasted_gtt -= slab_size - slab->base.num_entries * slab->entry_size;

   for (unsigned i = 0; i < slab->base.num_entries; ++i) {
      amdgpu_bo_remove_fences(&slab->entries[i]);
      simple_mtx_destroy(&slab->entries[i].lock);
   }

   FREE(slab->entries);
   amdgpu_winsys_bo_reference(ws, &slab->buffer, NULL);
   FREE(slab);
}

#if DEBUG_SPARSE_COMMITS
static void
sparse_dump(struct amdgpu_winsys_bo *bo, const char *func)
{
   fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n"
                   "Commitments:\n",
           __func__, bo, bo->base.size, bo->u.sparse.num_va_pages, func);

   struct amdgpu_sparse_backing *span_backing = NULL;
   uint32_t span_first_backing_page = 0;
   uint32_t span_first_va_page = 0;
   uint32_t va_page = 0;

   for (;;) {
      struct amdgpu_sparse_backing *backing = 0;
      uint32_t backing_page = 0;

      if (va_page < bo->u.sparse.num_va_pages) {
         backing = bo->u.sparse.commitments[va_page].backing;
         backing_page = bo->u.sparse.commitments[va_page].page;
      }

      if (span_backing &&
          (backing != span_backing ||
           backing_page != span_first_backing_page + (va_page - span_first_va_page))) {
         fprintf(stderr, " %u..%u: backing=%p:%u..%u\n",
                 span_first_va_page, va_page - 1, span_backing,
                 span_first_backing_page,
                 span_first_backing_page + (va_page - span_first_va_page) - 1);

         span_backing = NULL;
      }

      if (va_page >= bo->u.sparse.num_va_pages)
         break;

      if (backing && !span_backing) {
         span_backing = backing;
         span_first_backing_page = backing_page;
         span_first_va_page = va_page;
      }

      va_page++;
   }

   fprintf(stderr, "Backing:\n");

   list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
      fprintf(stderr, " %p (size=%"PRIu64")\n", backing, backing->bo->base.size);
      for (unsigned i = 0; i < backing->num_chunks; ++i)
         fprintf(stderr, " %u..%u\n", backing->chunks[i].begin, backing->chunks[i].end);
   }
}
#endif

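/* Sparse buffers reserve a virtual address range up front and commit physical
 * memory page by page: u.sparse.commitments maps each RADEON_SPARSE_PAGE_SIZE
 * virtual page to a (backing buffer, page) pair, and each backing buffer
 * keeps a sorted list of free chunks that it can still hand out.
 */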
/*
 * Attempt to allocate the given number of backing pages. Fewer pages may be
 * allocated (depending on the fragmentation of existing backing buffers),
 * which will be reflected by a change to *pnum_pages.
 */
static struct amdgpu_sparse_backing *
sparse_backing_alloc(struct amdgpu_winsys *ws, struct amdgpu_winsys_bo *bo,
                     uint32_t *pstart_page, uint32_t *pnum_pages)
{
   struct amdgpu_sparse_backing *best_backing;
   unsigned best_idx;
   uint32_t best_num_pages;

   best_backing = NULL;
   best_idx = 0;
   best_num_pages = 0;

   /* This is a very simple and inefficient best-fit algorithm. */
   list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
      for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
         uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
         if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
             (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
            best_backing = backing;
            best_idx = idx;
            best_num_pages = cur_num_pages;
         }
      }
   }

   /* Allocate a new backing buffer if necessary. */
   if (!best_backing) {
      struct pb_buffer *buf;
      uint64_t size;
      uint32_t pages;

      best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
      if (!best_backing)
         return NULL;

      best_backing->max_chunks = 4;
      best_backing->chunks = CALLOC(best_backing->max_chunks,
                                    sizeof(*best_backing->chunks));
      if (!best_backing->chunks) {
         FREE(best_backing);
         return NULL;
      }

      assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE));

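      /* Size the new backing buffer: 1/16 of the sparse buffer, clamped to at
       * most 8 MB and to the still uncommitted remainder, but never less than
       * one sparse page.
       */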
      size = MIN3(bo->base.size / 16,
                  8 * 1024 * 1024,
                  bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
      size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);

      buf = amdgpu_bo_create(ws, size, RADEON_SPARSE_PAGE_SIZE,
                             bo->base.placement,
                             (bo->base.usage & ~RADEON_FLAG_SPARSE &
                              /* Set the interprocess sharing flag to disable pb_cache because
                               * amdgpu_bo_wait doesn't wait for active CS jobs.
                               */
                              ~RADEON_FLAG_NO_INTERPROCESS_SHARING) | RADEON_FLAG_NO_SUBALLOC);
      if (!buf) {
         FREE(best_backing->chunks);
         FREE(best_backing);
         return NULL;
      }

      /* We might have gotten a bigger buffer than requested via caching. */
      pages = buf->size / RADEON_SPARSE_PAGE_SIZE;

      best_backing->bo = amdgpu_winsys_bo(buf);
      best_backing->num_chunks = 1;
      best_backing->chunks[0].begin = 0;
      best_backing->chunks[0].end = pages;

      list_add(&best_backing->list, &bo->u.sparse.backing);
      bo->u.sparse.num_backing_pages += pages;

      best_idx = 0;
      best_num_pages = pages;
   }

   *pnum_pages = MIN2(*pnum_pages, best_num_pages);
   *pstart_page = best_backing->chunks[best_idx].begin;
   best_backing->chunks[best_idx].begin += *pnum_pages;

   if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
      memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
              sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
      best_backing->num_chunks--;
   }

   return best_backing;
}

static void
sparse_free_backing_buffer(struct amdgpu_winsys *ws, struct amdgpu_winsys_bo *bo,
                           struct amdgpu_sparse_backing *backing)
{
   bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE;

   simple_mtx_lock(&ws->bo_fence_lock);
   amdgpu_add_fences(backing->bo, bo->num_fences, bo->fences);
   simple_mtx_unlock(&ws->bo_fence_lock);

   list_del(&backing->list);
   amdgpu_winsys_bo_reference(ws, &backing->bo, NULL);
   FREE(backing->chunks);
   FREE(backing);
}

/*
 * Return a range of pages from the given backing buffer back into the
 * free structure.
 */
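/* The chunk array is kept sorted by page index and adjacent free ranges are
 * merged, so a binary search is enough to find where the returned range goes.
 */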
static bool
sparse_backing_free(struct amdgpu_winsys *ws, struct amdgpu_winsys_bo *bo,
                    struct amdgpu_sparse_backing *backing,
                    uint32_t start_page, uint32_t num_pages)
{
   uint32_t end_page = start_page + num_pages;
   unsigned low = 0;
   unsigned high = backing->num_chunks;

   /* Find the first chunk with begin >= start_page. */
   while (low < high) {
      unsigned mid = low + (high - low) / 2;

      if (backing->chunks[mid].begin >= start_page)
         high = mid;
      else
         low = mid + 1;
   }

   assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
   assert(low == 0 || backing->chunks[low - 1].end <= start_page);

   if (low > 0 && backing->chunks[low - 1].end == start_page) {
      backing->chunks[low - 1].end = end_page;

      if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
         backing->chunks[low - 1].end = backing->chunks[low].end;
         memmove(&backing->chunks[low], &backing->chunks[low + 1],
                 sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
         backing->num_chunks--;
      }
   } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
      backing->chunks[low].begin = start_page;
   } else {
      if (backing->num_chunks >= backing->max_chunks) {
         unsigned new_max_chunks = 2 * backing->max_chunks;
         struct amdgpu_sparse_backing_chunk *new_chunks =
            REALLOC(backing->chunks,
                    sizeof(*backing->chunks) * backing->max_chunks,
                    sizeof(*backing->chunks) * new_max_chunks);
         if (!new_chunks)
            return false;

         backing->max_chunks = new_max_chunks;
         backing->chunks = new_chunks;
      }

      memmove(&backing->chunks[low + 1], &backing->chunks[low],
              sizeof(*backing->chunks) * (backing->num_chunks - low));
      backing->chunks[low].begin = start_page;
      backing->chunks[low].end = end_page;
      backing->num_chunks++;
   }

   if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
       backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE)
      sparse_free_backing_buffer(ws, bo, backing);

   return true;
}

static void amdgpu_bo_sparse_destroy(struct radeon_winsys *rws, struct pb_buffer *_buf)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   int r;

   assert(!bo->bo && bo->base.usage & RADEON_FLAG_SPARSE);

   r = amdgpu_bo_va_op_raw(ws->dev, NULL, 0,
                           (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE,
                           bo->va, 0, AMDGPU_VA_OP_CLEAR);
   if (r) {
      fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
   }

   while (!list_is_empty(&bo->u.sparse.backing)) {
      sparse_free_backing_buffer(ws, bo,
                                 container_of(bo->u.sparse.backing.next,
                                              struct amdgpu_sparse_backing, list));
   }

   amdgpu_va_range_free(bo->u.sparse.va_handle);
   FREE(bo->u.sparse.commitments);
   simple_mtx_destroy(&bo->lock);
   FREE(bo);
}

static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = {
   /* Cast to void* because one of the function parameters is a struct pointer instead of void*. */
   (void*)amdgpu_bo_sparse_destroy
   /* other functions are never called */
};

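/* Create a sparse buffer: only the virtual address range is reserved here and
 * mapped with AMDGPU_VM_PAGE_PRT; physical backing is added and removed later
 * through amdgpu_bo_sparse_commit.
 */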
static struct pb_buffer *
amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
                        enum radeon_bo_domain domain,
                        enum radeon_bo_flag flags)
{
   struct amdgpu_winsys_bo *bo;
   uint64_t map_size;
   uint64_t va_gap_size;
   int r;

   /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
    * that exceed this limit. This is not really a restriction: we don't have
    * that much virtual address space anyway.
    */
   if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
      return NULL;

   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   if (!bo)
      return NULL;

   simple_mtx_init(&bo->lock, mtx_plain);
   pipe_reference_init(&bo->base.reference, 1);
   bo->base.alignment_log2 = util_logbase2(RADEON_SPARSE_PAGE_SIZE);
   bo->base.size = size;
   bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl;
   bo->base.placement = domain;
   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
   bo->base.usage = flags;

   bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
   bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages,
                                     sizeof(*bo->u.sparse.commitments));
   if (!bo->u.sparse.commitments)
      goto error_alloc_commitments;

   list_inithead(&bo->u.sparse.backing);

   /* For simplicity, we always map a multiple of the page size. */
   map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
   va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                             map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
                             0, &bo->va, &bo->u.sparse.va_handle,
                             AMDGPU_VA_RANGE_HIGH);
   if (r)
      goto error_va_alloc;

   r = amdgpu_bo_va_op_raw(ws->dev, NULL, 0, size, bo->va,
                           AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
   if (r)
      goto error_va_map;

   return &bo->base;

error_va_map:
   amdgpu_va_range_free(bo->u.sparse.va_handle);
error_va_alloc:
   FREE(bo->u.sparse.commitments);
error_alloc_commitments:
   simple_mtx_destroy(&bo->lock);
   FREE(bo);
   return NULL;
}

static bool
amdgpu_bo_sparse_commit(struct radeon_winsys *rws, struct pb_buffer *buf,
                        uint64_t offset, uint64_t size, bool commit)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
   struct amdgpu_sparse_commitment *comm;
   uint32_t va_page, end_va_page;
   bool ok = true;
   int r;

   assert(bo->base.usage & RADEON_FLAG_SPARSE);
   assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
   assert(offset <= bo->base.size);
   assert(size <= bo->base.size - offset);
   assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size);

   comm = bo->u.sparse.commitments;
   va_page = offset / RADEON_SPARSE_PAGE_SIZE;
   end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);

   simple_mtx_lock(&bo->lock);

#if DEBUG_SPARSE_COMMITS
   sparse_dump(bo, __func__);
#endif

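   /* Committing walks the page range, fills every uncommitted span with
    * backing pages and maps them with AMDGPU_VA_OP_REPLACE. Uncommitting
    * replaces the whole range with a PRT mapping first and then returns the
    * previously committed pages to their backing buffers.
    */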
   if (commit) {
      while (va_page < end_va_page) {
         uint32_t span_va_page;

         /* Skip pages that are already committed. */
         if (comm[va_page].backing) {
            va_page++;
            continue;
         }

         /* Determine length of uncommitted span. */
         span_va_page = va_page;
         while (va_page < end_va_page && !comm[va_page].backing)
            va_page++;

         /* Fill the uncommitted span with chunks of backing memory. */
         while (span_va_page < va_page) {
            struct amdgpu_sparse_backing *backing;
            uint32_t backing_start, backing_size;

            backing_size = va_page - span_va_page;
            backing = sparse_backing_alloc(ws, bo, &backing_start, &backing_size);
            if (!backing) {
               ok = false;
               goto out;
            }

            r = amdgpu_bo_va_op_raw(ws->dev, backing->bo->bo,
                                    (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
                                    (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
                                    bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
                                    AMDGPU_VM_PAGE_READABLE |
                                    AMDGPU_VM_PAGE_WRITEABLE |
                                    AMDGPU_VM_PAGE_EXECUTABLE,
                                    AMDGPU_VA_OP_REPLACE);
            if (r) {
               ok = sparse_backing_free(ws, bo, backing, backing_start, backing_size);
               assert(ok && "sufficient memory should already be allocated");

               ok = false;
               goto out;
            }

            while (backing_size) {
               comm[span_va_page].backing = backing;
               comm[span_va_page].page = backing_start;
               span_va_page++;
               backing_start++;
               backing_size--;
            }
         }
      }
   } else {
      r = amdgpu_bo_va_op_raw(ws->dev, NULL, 0,
                              (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
                              bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
                              AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
      if (r) {
         ok = false;
         goto out;
      }

      while (va_page < end_va_page) {
         struct amdgpu_sparse_backing *backing;
         uint32_t backing_start;
         uint32_t span_pages;

         /* Skip pages that are already uncommitted. */
         if (!comm[va_page].backing) {
            va_page++;
            continue;
         }

         /* Group contiguous spans of pages. */
         backing = comm[va_page].backing;
         backing_start = comm[va_page].page;
         comm[va_page].backing = NULL;

         span_pages = 1;
         va_page++;

         while (va_page < end_va_page &&
                comm[va_page].backing == backing &&
                comm[va_page].page == backing_start + span_pages) {
            comm[va_page].backing = NULL;
            va_page++;
            span_pages++;
         }

         if (!sparse_backing_free(ws, bo, backing, backing_start, span_pages)) {
            /* Couldn't allocate tracking data structures, so we have to leak */
            fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
            ok = false;
         }
      }
   }
out:

   simple_mtx_unlock(&bo->lock);

   return ok;
}

static void amdgpu_buffer_get_metadata(struct radeon_winsys *rws,
                                       struct pb_buffer *_buf,
                                       struct radeon_bo_metadata *md,
                                       struct radeon_surf *surf)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   struct amdgpu_bo_info info = {0};
   int r;

   assert(bo->bo && "must not be called for slab entries");

   r = amdgpu_bo_query_info(bo->bo, &info);
   if (r)
      return;

   ac_surface_set_bo_metadata(&ws->info, surf, info.metadata.tiling_info,
                              &md->mode);

   md->size_metadata = info.metadata.size_metadata;
   memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
}

static void amdgpu_buffer_set_metadata(struct radeon_winsys *rws,
                                       struct pb_buffer *_buf,
                                       struct radeon_bo_metadata *md,
                                       struct radeon_surf *surf)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
   struct amdgpu_bo_metadata metadata = {0};

   assert(bo->bo && "must not be called for slab entries");

   ac_surface_get_bo_metadata(&ws->info, surf, &metadata.tiling_info);

   metadata.size_metadata = md->size_metadata;
   memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));

   amdgpu_bo_set_metadata(bo->bo, &metadata);
}

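/* Main buffer allocation entry point: small requests are suballocated from
 * slabs, RADEON_FLAG_SPARSE goes to the sparse path, and everything else is
 * served from the pb_cache reusable pool when possible or allocated fresh.
 */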
struct pb_buffer *
amdgpu_bo_create(struct amdgpu_winsys *ws,
                 uint64_t size,
                 unsigned alignment,
                 enum radeon_bo_domain domain,
                 enum radeon_bo_flag flags)
{
   struct amdgpu_winsys_bo *bo;
   int heap = -1;

   if (domain & (RADEON_DOMAIN_GDS | RADEON_DOMAIN_OA))
      flags |= RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_NO_SUBALLOC;

   /* VRAM implies WC. This is not optional. */
   assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);

   /* NO_CPU_ACCESS is not valid with GTT. */
   assert(!(domain & RADEON_DOMAIN_GTT) || !(flags & RADEON_FLAG_NO_CPU_ACCESS));

   /* Sparse buffers must have NO_CPU_ACCESS set. */
   assert(!(flags & RADEON_FLAG_SPARSE) || flags & RADEON_FLAG_NO_CPU_ACCESS);

   struct pb_slabs *slabs = ((flags & RADEON_FLAG_ENCRYPTED) && ws->info.has_tmz_support) ?
      ws->bo_slabs_encrypted : ws->bo_slabs;
   struct pb_slabs *last_slab = &slabs[NUM_SLAB_ALLOCATORS - 1];
   unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1);

   /* Sub-allocate small buffers from slabs. */
   if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
       size <= max_slab_entry_size) {
      struct pb_slab_entry *entry;
      int heap = radeon_get_heap_index(domain, flags);

      if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS)
         goto no_slab;

      unsigned alloc_size = size;

      /* Always use slabs for sizes less than 4 KB because the kernel aligns
       * everything to 4 KB.
       */
      if (size < alignment && alignment <= 4 * 1024)
         alloc_size = alignment;

      if (alignment > get_slab_entry_alignment(ws, alloc_size)) {
         /* 3/4 allocations can return too small alignment. Try again with a power of two
          * allocation size.
          */
         unsigned pot_size = get_slab_pot_entry_size(ws, alloc_size);

         if (alignment <= pot_size) {
            /* This size works but wastes some memory to fulfil the alignment. */
            alloc_size = pot_size;
         } else {
            goto no_slab; /* can't fulfil alignment requirements */
         }
      }

      struct pb_slabs *slabs = get_slabs(ws, alloc_size, flags);
      entry = pb_slab_alloc(slabs, alloc_size, heap);
      if (!entry) {
         /* Clean up buffer managers and try again. */
         amdgpu_clean_up_buffer_managers(ws);

         entry = pb_slab_alloc(slabs, alloc_size, heap);
      }
      if (!entry)
         return NULL;

      bo = container_of(entry, struct amdgpu_winsys_bo, u.slab.entry);
      pipe_reference_init(&bo->base.reference, 1);
      bo->base.size = size;
      assert(alignment <= 1 << bo->base.alignment_log2);

      if (domain & RADEON_DOMAIN_VRAM)
         ws->slab_wasted_vram += get_slab_wasted_size(ws, bo);
      else
         ws->slab_wasted_gtt += get_slab_wasted_size(ws, bo);

      return &bo->base;
   }
no_slab:

   if (flags & RADEON_FLAG_SPARSE) {
      assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);

      return amdgpu_bo_sparse_create(ws, size, domain, flags);
   }

   /* This flag is irrelevant for the cache. */
   flags &= ~RADEON_FLAG_NO_SUBALLOC;

   /* Align size to page size. This is the minimum alignment for normal
    * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
    * like constant/uniform buffers, can benefit from better and more reuse.
    */
   if (domain & RADEON_DOMAIN_VRAM_GTT) {
      size = align64(size, ws->info.gart_page_size);
      alignment = align(alignment, ws->info.gart_page_size);
   }

   bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;

   if (use_reusable_pool) {
      heap = radeon_get_heap_index(domain, flags & ~RADEON_FLAG_ENCRYPTED);
      assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);

      /* Get a buffer from the cache. */
      bo = (struct amdgpu_winsys_bo*)
           pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, 0, heap);
      if (bo)
         return &bo->base;
   }

   /* Create a new one. */
   bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
   if (!bo) {
      /* Clean up buffer managers and try again. */
      amdgpu_clean_up_buffer_managers(ws);

      bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
      if (!bo)
         return NULL;
   }

   return &bo->base;
}

static struct pb_buffer *
amdgpu_buffer_create(struct radeon_winsys *ws,
                     uint64_t size,
                     unsigned alignment,
                     enum radeon_bo_domain domain,
                     enum radeon_bo_flag flags)
{
   struct pb_buffer * res = amdgpu_bo_create(amdgpu_winsys(ws), size, alignment, domain,
                                             flags);
   return res;
}

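/* Import a buffer from a GEM flink name or dma-buf fd. If this process has
 * already imported the same underlying buffer, the existing amdgpu_winsys_bo
 * is returned with its reference count bumped; otherwise a new wrapper with
 * its own VA mapping is created and registered in the export table.
 */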
static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
                                               struct winsys_handle *whandle,
                                               unsigned vm_alignment)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo = NULL;
   enum amdgpu_bo_handle_type type;
   struct amdgpu_bo_import_result result = {0};
   uint64_t va;
   amdgpu_va_handle va_handle = NULL;
   struct amdgpu_bo_info info = {0};
   enum radeon_bo_domain initial = 0;
   enum radeon_bo_flag flags = 0;
   int r;

   switch (whandle->type) {
   case WINSYS_HANDLE_TYPE_SHARED:
      type = amdgpu_bo_handle_type_gem_flink_name;
      break;
   case WINSYS_HANDLE_TYPE_FD:
      type = amdgpu_bo_handle_type_dma_buf_fd;
      break;
   default:
      return NULL;
   }

   r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result);
   if (r)
      return NULL;

   simple_mtx_lock(&ws->bo_export_table_lock);
   bo = util_hash_table_get(ws->bo_export_table, result.buf_handle);

   /* If the amdgpu_winsys_bo instance already exists, bump the reference
    * counter and return it.
    */
   if (bo) {
      p_atomic_inc(&bo->base.reference.count);
      simple_mtx_unlock(&ws->bo_export_table_lock);

      /* Release the buffer handle, because we don't need it anymore.
       * This function is returning an existing buffer, which has its own
       * handle.
       */
      amdgpu_bo_free(result.buf_handle);
      return &bo->base;
   }

   /* Get initial domains. */
   r = amdgpu_bo_query_info(result.buf_handle, &info);
   if (r)
      goto error;

   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                             result.alloc_size,
                             amdgpu_get_optimal_alignment(ws, result.alloc_size,
                                                          vm_alignment),
                             0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH);
   if (r)
      goto error;

   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   if (!bo)
      goto error;

   r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
   if (r)
      goto error;

   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
      initial |= RADEON_DOMAIN_VRAM;
   if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
      initial |= RADEON_DOMAIN_GTT;
   if (info.alloc_flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS)
      flags |= RADEON_FLAG_NO_CPU_ACCESS;
   if (info.alloc_flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
      flags |= RADEON_FLAG_GTT_WC;
   if (info.alloc_flags & AMDGPU_GEM_CREATE_ENCRYPTED) {
      /* Imports are always possible even if the importer isn't using TMZ.
       * For instance libweston needs to import the buffer to be able to determine
       * if it can be used for scanout.
       */
      flags |= RADEON_FLAG_ENCRYPTED;
   }

   /* Initialize the structure. */
   simple_mtx_init(&bo->lock, mtx_plain);
   pipe_reference_init(&bo->base.reference, 1);
   bo->base.alignment_log2 = util_logbase2(info.phys_alignment);
   bo->bo = result.buf_handle;
   bo->base.size = result.alloc_size;
   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
   bo->va = va;
   bo->u.real.va_handle = va_handle;
   bo->base.placement = initial;
   bo->base.usage = flags;
   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
   bo->u.real.is_shared = true;

   if (bo->base.placement & RADEON_DOMAIN_VRAM)
      ws->allocated_vram += align64(bo->base.size, ws->info.gart_page_size);
   else if (bo->base.placement & RADEON_DOMAIN_GTT)
      ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size);

   amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);

   amdgpu_add_buffer_to_global_list(ws, bo);

   _mesa_hash_table_insert(ws->bo_export_table, bo->bo, bo);
   simple_mtx_unlock(&ws->bo_export_table_lock);

   return &bo->base;

error:
   simple_mtx_unlock(&ws->bo_export_table_lock);
   if (bo)
      FREE(bo);
   if (va_handle)
      amdgpu_va_range_free(va_handle);
   amdgpu_bo_free(result.buf_handle);
   return NULL;
}

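/* Export a buffer as a flink name, KMS handle or dma-buf fd. A KMS handle
 * requested for a different DRM file description than the one the buffer was
 * created on is obtained by exporting a dma-buf fd and re-importing it with
 * drmPrimeFDToHandle; the result is cached in that screen's kms_handles table.
 */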
static bool amdgpu_bo_get_handle(struct radeon_winsys *rws,
                                 struct pb_buffer *buffer,
                                 struct winsys_handle *whandle)
{
   struct amdgpu_screen_winsys *sws = amdgpu_screen_winsys(rws);
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
   enum amdgpu_bo_handle_type type;
   struct hash_entry *entry;
   int r;

   /* Don't allow exports of slab entries and sparse buffers. */
   if (!bo->bo)
      return false;

   bo->u.real.use_reusable_pool = false;

   switch (whandle->type) {
   case WINSYS_HANDLE_TYPE_SHARED:
      type = amdgpu_bo_handle_type_gem_flink_name;
      break;
   case WINSYS_HANDLE_TYPE_KMS:
      if (sws->fd == ws->fd) {
         whandle->handle = bo->u.real.kms_handle;

         if (bo->u.real.is_shared)
            return true;

         goto hash_table_set;
      }

      simple_mtx_lock(&ws->sws_list_lock);
      entry = _mesa_hash_table_search(sws->kms_handles, bo);
      simple_mtx_unlock(&ws->sws_list_lock);
      if (entry) {
         whandle->handle = (uintptr_t)entry->data;
         return true;
      }
      FALLTHROUGH;
   case WINSYS_HANDLE_TYPE_FD:
      type = amdgpu_bo_handle_type_dma_buf_fd;
      break;
   default:
      return false;
   }

   r = amdgpu_bo_export(bo->bo, type, &whandle->handle);
   if (r)
      return false;

   if (whandle->type == WINSYS_HANDLE_TYPE_KMS) {
      int dma_fd = whandle->handle;

      r = drmPrimeFDToHandle(sws->fd, dma_fd, &whandle->handle);
      close(dma_fd);

      if (r)
         return false;

      simple_mtx_lock(&ws->sws_list_lock);
      _mesa_hash_table_insert_pre_hashed(sws->kms_handles,
                                         bo->u.real.kms_handle, bo,
                                         (void*)(uintptr_t)whandle->handle);
      simple_mtx_unlock(&ws->sws_list_lock);
   }

hash_table_set:
   simple_mtx_lock(&ws->bo_export_table_lock);
   _mesa_hash_table_insert(ws->bo_export_table, bo->bo, bo);
   simple_mtx_unlock(&ws->bo_export_table_lock);

   bo->u.real.is_shared = true;
   return true;
}

static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
                                            void *pointer, uint64_t size)
{
   struct amdgpu_winsys *ws = amdgpu_winsys(rws);
   amdgpu_bo_handle buf_handle;
   struct amdgpu_winsys_bo *bo;
   uint64_t va;
   amdgpu_va_handle va_handle;
   /* Avoid failure when the size is not page aligned */
   uint64_t aligned_size = align64(size, ws->info.gart_page_size);

   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
   if (!bo)
      return NULL;

   if (amdgpu_create_bo_from_user_mem(ws->dev, pointer,
                                      aligned_size, &buf_handle))
      goto error;

   if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                             aligned_size,
                             amdgpu_get_optimal_alignment(ws, aligned_size,
                                                          ws->info.gart_page_size),
                             0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH))
      goto error_va_alloc;

   if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP))
      goto error_va_map;

   /* Initialize it. */
   bo->u.real.is_user_ptr = true;
   pipe_reference_init(&bo->base.reference, 1);
   simple_mtx_init(&bo->lock, mtx_plain);
   bo->bo = buf_handle;
   bo->base.alignment_log2 = 0;
   bo->base.size = size;
   bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
   bo->u.real.cpu_ptr = pointer;
   bo->va = va;
   bo->u.real.va_handle = va_handle;
   bo->base.placement = RADEON_DOMAIN_GTT;
   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);

   ws->allocated_gtt += aligned_size;

   amdgpu_add_buffer_to_global_list(ws, bo);

   amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);

   return (struct pb_buffer*)bo;

error_va_map:
   amdgpu_va_range_free(va_handle);

error_va_alloc:
   amdgpu_bo_free(buf_handle);

error:
   FREE(bo);
   return NULL;
}

static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
{
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;

   return bo->bo ? bo->u.real.is_user_ptr : false;
}

static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)
{
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;

   return !bo->bo && !(bo->base.usage & RADEON_FLAG_SPARSE);
}

static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
{
   return ((struct amdgpu_winsys_bo*)buf)->va;
}

void amdgpu_bo_init_functions(struct amdgpu_screen_winsys *ws)
{
   ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
   ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata;
   ws->base.buffer_map = amdgpu_bo_map;
   ws->base.buffer_unmap = amdgpu_bo_unmap;
   ws->base.buffer_wait = amdgpu_bo_wait;
   ws->base.buffer_create = amdgpu_buffer_create;
   ws->base.buffer_from_handle = amdgpu_bo_from_handle;
   ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
   ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
   ws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated;
   ws->base.buffer_get_handle = amdgpu_bo_get_handle;
   ws->base.buffer_commit = amdgpu_bo_sparse_commit;
   ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
   ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
   ws->base.buffer_get_flags = amdgpu_bo_get_flags;
}