Path: blob/master/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
// SPDX-License-Identifier: MIT
/*
 * Copyright 2014-2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <linux/dma-buf.h>
#include <linux/list.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <drm/ttm/ttm_tt.h>

#include <drm/drm_exec.h>

#include "amdgpu_object.h"
#include "amdgpu_gem.h"
#include "amdgpu_vm.h"
#include "amdgpu_hmm.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_dma_buf.h"
#include <uapi/linux/kfd_ioctl.h>
#include "amdgpu_xgmi.h"
#include "kfd_priv.h"
#include "kfd_smi_events.h"

/* Userptr restore delay, just long enough to allow consecutive VM
 * changes to accumulate
 */
#define AMDGPU_USERPTR_RESTORE_DELAY_MS 1
#define AMDGPU_RESERVE_MEM_LIMIT (3UL << 29)

/*
 * Align VRAM availability to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB
 * BO chunk
 */
#define VRAM_AVAILABLITY_ALIGN (1 << 21)

/* Impose limit on how much memory KFD can use */
static struct {
	uint64_t max_system_mem_limit;
	uint64_t max_ttm_mem_limit;
	int64_t system_mem_used;
	int64_t ttm_mem_used;
	spinlock_t mem_limit_lock;
} kfd_mem_limit;

static const char * const domain_bit_to_string[] = {
	"CPU",
	"GTT",
	"VRAM",
	"GDS",
	"GWS",
	"OA"
};

#define domain_string(domain) domain_bit_to_string[ffs(domain)-1]

static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work);

static bool kfd_mem_is_attached(struct amdgpu_vm *avm,
				struct kgd_mem *mem)
{
	struct kfd_mem_attachment *entry;

	list_for_each_entry(entry, &mem->attachments, list)
		if (entry->bo_va->base.vm == avm)
			return true;

	return false;
}

/**
 * reuse_dmamap() - Check whether adev can share the original
 * userptr BO
 *
 * If both adev and bo_adev are in direct mapping or
 * in the same iommu group, they can share the original BO.
 *
 * @adev: Device to which can or cannot share the original BO
 * @bo_adev: Device to which allocated BO belongs to
 *
 * Return: returns true if adev can share original userptr BO,
 * false otherwise.
 */
static bool reuse_dmamap(struct amdgpu_device *adev, struct amdgpu_device *bo_adev)
{
	return (adev->ram_is_direct_mapped && bo_adev->ram_is_direct_mapped) ||
		(adev->dev->iommu_group == bo_adev->dev->iommu_group);
}
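/*
 * Illustration only, not part of this driver file: a tiny userspace demo of
 * how the domain_string() macro above resolves a single-bit domain flag to a
 * name via ffs(). The flag values are restated locally so the sketch is
 * self-contained; they mirror the AMDGPU_GEM_DOMAIN_* bits from the amdgpu
 * UAPI (CPU=0x1, GTT=0x2, VRAM=0x4, GDS=0x8, GWS=0x10, OA=0x20).
 */
#if 0
#include <stdio.h>
#include <strings.h>

static const char * const demo_domain_bit_to_string[] = {
	"CPU", "GTT", "VRAM", "GDS", "GWS", "OA"
};

#define demo_domain_string(domain) demo_domain_bit_to_string[ffs(domain) - 1]

int main(void)
{
	/* ffs(0x4) == 3, so index 2 selects "VRAM" */
	printf("0x4 -> %s\n", demo_domain_string(0x4));
	/* ffs(0x2) == 2, so index 1 selects "GTT" */
	printf("0x2 -> %s\n", demo_domain_string(0x2));
	return 0;
}
#endif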
/* Set memory usage limits. Currently, limits are
 *  System (TTM + userptr) memory - 15/16th System RAM
 *  TTM memory - 3/8th System RAM
 */
void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
{
	struct sysinfo si;
	uint64_t mem;

	if (kfd_mem_limit.max_system_mem_limit)
		return;

	si_meminfo(&si);
	mem = si.totalram - si.totalhigh;
	mem *= si.mem_unit;

	spin_lock_init(&kfd_mem_limit.mem_limit_lock);
	kfd_mem_limit.max_system_mem_limit = mem - (mem >> 6);
	if (kfd_mem_limit.max_system_mem_limit < 2 * AMDGPU_RESERVE_MEM_LIMIT)
		kfd_mem_limit.max_system_mem_limit >>= 1;
	else
		kfd_mem_limit.max_system_mem_limit -= AMDGPU_RESERVE_MEM_LIMIT;

	kfd_mem_limit.max_ttm_mem_limit = ttm_tt_pages_limit() << PAGE_SHIFT;
	pr_debug("Kernel memory limit %lluM, TTM limit %lluM\n",
		(kfd_mem_limit.max_system_mem_limit >> 20),
		(kfd_mem_limit.max_ttm_mem_limit >> 20));
}

void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
{
	kfd_mem_limit.system_mem_used += size;
}

/* Estimate page table size needed to represent a given memory size
 *
 * With 4KB pages, we need one 8 byte PTE for each 4KB of memory
 * (factor 512, >> 9). With 2MB pages, we need one 8 byte PTE for 2MB
 * of memory (factor 256K, >> 18). ROCm user mode tries to optimize
 * for 2MB pages for TLB efficiency. However, small allocations and
 * fragmented system memory still need some 4KB pages. We choose a
 * compromise that should work in most cases without reserving too
 * much memory for page tables unnecessarily (factor 16K, >> 14).
 */

#define ESTIMATE_PT_SIZE(mem_size) max(((mem_size) >> 14), AMDGPU_VM_RESERVED_VRAM)
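/*
 * Illustration only, not part of this driver file: worked numbers for the
 * ESTIMATE_PT_SIZE() compromise above. The >> 14 factor reserves 1 byte of
 * page-table space per 16 KiB of mapped memory, i.e. 4 MiB per 64 GiB.
 * AMDGPU_VM_RESERVED_VRAM is a driver constant from amdgpu_vm.h; the 8 MiB
 * floor used below is an assumed placeholder for the sketch, not the
 * authoritative value.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define ASSUMED_AMDGPU_VM_RESERVED_VRAM (8ULL << 20)	/* placeholder floor */

static uint64_t estimate_pt_size(uint64_t mem_size)
{
	uint64_t est = mem_size >> 14;	/* 1 byte of PTE space per 16 KiB */

	return est > ASSUMED_AMDGPU_VM_RESERVED_VRAM ?
	       est : ASSUMED_AMDGPU_VM_RESERVED_VRAM;
}

int main(void)
{
	uint64_t sizes[] = { 8ULL << 30, 64ULL << 30, 1ULL << 40 };

	for (int i = 0; i < 3; i++)
		printf("%5llu GiB mapped -> %3llu MiB reserved for page tables\n",
		       (unsigned long long)(sizes[i] >> 30),
		       (unsigned long long)(estimate_pt_size(sizes[i]) >> 20));
	return 0;
}
#endif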
/**
 * amdgpu_amdkfd_reserve_mem_limit() - Decrease available memory by size
 * of buffer.
 *
 * @adev: Device to which allocated BO belongs to
 * @size: Size of buffer, in bytes, encapsulated by BO. This should be
 * equivalent to amdgpu_bo_size(BO)
 * @alloc_flag: Flag used in allocating a BO as noted above
 * @xcp_id: xcp_id is used to get xcp from xcp manager, one xcp is
 * managed as one compute node in driver for app
 *
 * Return:
 * returns -ENOMEM in case of error, ZERO otherwise
 */
int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
		uint64_t size, u32 alloc_flag, int8_t xcp_id)
{
	uint64_t reserved_for_pt =
		ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
	size_t system_mem_needed, ttm_mem_needed, vram_needed;
	int ret = 0;
	uint64_t vram_size = 0;

	system_mem_needed = 0;
	ttm_mem_needed = 0;
	vram_needed = 0;
	if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
		system_mem_needed = size;
		ttm_mem_needed = size;
	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
		/*
		 * Conservatively round up the allocation requirement to 2 MB
		 * to avoid fragmentation caused by 4K allocations in the tail
		 * 2M BO chunk.
		 */
		vram_needed = size;
		/*
		 * For GFX 9.4.3, get the VRAM size from XCP structs
		 */
		if (WARN_ONCE(xcp_id < 0, "invalid XCP ID %d", xcp_id))
			return -EINVAL;

		vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);
		if (adev->apu_prefer_gtt) {
			system_mem_needed = size;
			ttm_mem_needed = size;
		}
	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
		system_mem_needed = size;
	} else if (!(alloc_flag &
			(KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
			 KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
		pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
		return -ENOMEM;
	}

	spin_lock(&kfd_mem_limit.mem_limit_lock);

	if (kfd_mem_limit.system_mem_used + system_mem_needed >
	    kfd_mem_limit.max_system_mem_limit)
		pr_debug("Set no_system_mem_limit=1 if using shared memory\n");

	if ((kfd_mem_limit.system_mem_used + system_mem_needed >
	     kfd_mem_limit.max_system_mem_limit && !no_system_mem_limit) ||
	    (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
	     kfd_mem_limit.max_ttm_mem_limit) ||
	    (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
	     vram_size - reserved_for_pt - reserved_for_ras - atomic64_read(&adev->vram_pin_size))) {
		ret = -ENOMEM;
		goto release;
	}

	/* Update memory accounting by decreasing available system
	 * memory, TTM memory and GPU memory as computed above
	 */
	WARN_ONCE(vram_needed && !adev,
		  "adev reference can't be null when vram is used");
	if (adev && xcp_id >= 0) {
		adev->kfd.vram_used[xcp_id] += vram_needed;
		adev->kfd.vram_used_aligned[xcp_id] +=
				adev->apu_prefer_gtt ?
				vram_needed :
				ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
	}
	kfd_mem_limit.system_mem_used += system_mem_needed;
	kfd_mem_limit.ttm_mem_used += ttm_mem_needed;

release:
	spin_unlock(&kfd_mem_limit.mem_limit_lock);
	return ret;
}
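/*
 * Illustration only, not part of this driver file: a much-simplified
 * userspace model of the VRAM admission path in
 * amdgpu_amdkfd_reserve_mem_limit() above. It ignores the GTT/userptr cases,
 * the system and TTM limits and the APU special case, and only shows the two
 * counters kept per partition: the exact byte count used for admission and
 * the 2 MiB-aligned count that availability reporting is based on. All names
 * and values here are illustrative, not driver state.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((uint64_t)(a) - 1))
#define VRAM_ALIGN	(1ULL << 21)	/* mirrors VRAM_AVAILABLITY_ALIGN */

struct vram_model {
	uint64_t used;		/* exact bytes, like vram_used[xcp_id] */
	uint64_t used_aligned;	/* 2 MiB granularity, like vram_used_aligned[] */
	uint64_t size;		/* partition size minus PT/RAS/pin reservations */
};

/* Admit a VRAM allocation of @size bytes if it fits, and account for it. */
static bool model_reserve_vram(struct vram_model *m, uint64_t size)
{
	if (m->used + size > m->size)
		return false;
	m->used += size;
	m->used_aligned += ALIGN_UP(size, VRAM_ALIGN);
	return true;
}

int main(void)
{
	struct vram_model m = { .size = 64ULL << 20 };	/* 64 MiB budget */

	/* A 4 KiB BO passes the exact-size check but still consumes a full
	 * 2 MiB of the aligned counter used for reporting availability. */
	model_reserve_vram(&m, 4096);
	printf("used=%llu bytes, used_aligned=%llu bytes\n",
	       (unsigned long long)m.used,
	       (unsigned long long)m.used_aligned);
	return 0;
}
#endif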
xcp: %d", xcp_id);283WARN_ONCE(kfd_mem_limit.ttm_mem_used < 0,284"KFD TTM memory accounting unbalanced");285WARN_ONCE(kfd_mem_limit.system_mem_used < 0,286"KFD system memory accounting unbalanced");287288release:289spin_unlock(&kfd_mem_limit.mem_limit_lock);290}291292void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)293{294struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);295u32 alloc_flags = bo->kfd_bo->alloc_flags;296u64 size = amdgpu_bo_size(bo);297298amdgpu_amdkfd_unreserve_mem_limit(adev, size, alloc_flags,299bo->xcp_id);300301kfree(bo->kfd_bo);302}303304/**305* create_dmamap_sg_bo() - Creates a amdgpu_bo object to reflect information306* about USERPTR or DOOREBELL or MMIO BO.307*308* @adev: Device for which dmamap BO is being created309* @mem: BO of peer device that is being DMA mapped. Provides parameters310* in building the dmamap BO311* @bo_out: Output parameter updated with handle of dmamap BO312*/313static int314create_dmamap_sg_bo(struct amdgpu_device *adev,315struct kgd_mem *mem, struct amdgpu_bo **bo_out)316{317struct drm_gem_object *gem_obj;318int ret;319uint64_t flags = 0;320321ret = amdgpu_bo_reserve(mem->bo, false);322if (ret)323return ret;324325if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR)326flags |= mem->bo->flags & (AMDGPU_GEM_CREATE_COHERENT |327AMDGPU_GEM_CREATE_UNCACHED);328329ret = amdgpu_gem_object_create(adev, mem->bo->tbo.base.size, 1,330AMDGPU_GEM_DOMAIN_CPU, AMDGPU_GEM_CREATE_PREEMPTIBLE | flags,331ttm_bo_type_sg, mem->bo->tbo.base.resv, &gem_obj, 0);332333amdgpu_bo_unreserve(mem->bo);334335if (ret) {336pr_err("Error in creating DMA mappable SG BO on domain: %d\n", ret);337return -EINVAL;338}339340*bo_out = gem_to_amdgpu_bo(gem_obj);341(*bo_out)->parent = amdgpu_bo_ref(mem->bo);342return ret;343}344345/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence from BO's346* reservation object.347*348* @bo: [IN] Remove eviction fence(s) from this BO349* @ef: [IN] This eviction fence is removed if it350* is present in the shared list.351*352* NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held.353*/354static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,355struct amdgpu_amdkfd_fence *ef)356{357struct dma_fence *replacement;358359if (!ef)360return -EINVAL;361362/* TODO: Instead of block before we should use the fence of the page363* table update and TLB flush here directly.364*/365replacement = dma_fence_get_stub();366dma_resv_replace_fences(bo->tbo.base.resv, ef->base.context,367replacement, DMA_RESV_USAGE_BOOKKEEP);368dma_fence_put(replacement);369return 0;370}371372/**373* amdgpu_amdkfd_remove_all_eviction_fences - Remove all eviction fences374* @bo: the BO where to remove the evictions fences from.375*376* This functions should only be used on release when all references to the BO377* are already dropped. 
/**
 * amdgpu_amdkfd_remove_all_eviction_fences - Remove all eviction fences
 * @bo: the BO where to remove the eviction fences from.
 *
 * This function should only be used on release when all references to the BO
 * are already dropped. We remove the eviction fence from the private copy of
 * the dma_resv object here since that is what is used during release to
 * determine whether the BO is idle or not.
 */
void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo)
{
	struct dma_resv *resv = &bo->tbo.base._resv;
	struct dma_fence *fence, *stub;
	struct dma_resv_iter cursor;

	dma_resv_assert_held(resv);

	stub = dma_fence_get_stub();
	dma_resv_for_each_fence(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP, fence) {
		if (!to_amdgpu_amdkfd_fence(fence))
			continue;

		dma_resv_replace_fences(resv, fence->context, stub,
					DMA_RESV_USAGE_BOOKKEEP);
	}
	dma_fence_put(stub);
}

static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
				     bool wait)
{
	struct ttm_operation_ctx ctx = { false, false };
	int ret;

	if (WARN(amdgpu_ttm_tt_get_usermm(bo->tbo.ttm),
		 "Called with userptr BO"))
		return -EINVAL;

	/* bo has been pinned, no need to validate it */
	if (bo->tbo.pin_count)
		return 0;

	amdgpu_bo_placement_from_domain(bo, domain);

	ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
	if (ret)
		goto validate_fail;
	if (wait)
		amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);

validate_fail:
	return ret;
}

int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
					uint32_t domain,
					struct dma_fence *fence)
{
	int ret = amdgpu_bo_reserve(bo, false);

	if (ret)
		return ret;

	ret = amdgpu_amdkfd_bo_validate(bo, domain, true);
	if (ret)
		goto unreserve_out;

	ret = dma_resv_reserve_fences(bo->tbo.base.resv, 1);
	if (ret)
		goto unreserve_out;

	dma_resv_add_fence(bo->tbo.base.resv, fence,
			   DMA_RESV_USAGE_BOOKKEEP);

unreserve_out:
	amdgpu_bo_unreserve(bo);

	return ret;
}

static int amdgpu_amdkfd_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
{
	return amdgpu_amdkfd_bo_validate(bo, bo->allowed_domains, false);
}

/* vm_validate_pt_pd_bos - Validate page table and directory BOs
 *
 * Page directories are not updated here because huge page handling
 * during page table updates can invalidate page directory entries
 * again.
Page directories are only updated after updating page462* tables.463*/464static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm,465struct ww_acquire_ctx *ticket)466{467struct amdgpu_bo *pd = vm->root.bo;468struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);469int ret;470471ret = amdgpu_vm_validate(adev, vm, ticket,472amdgpu_amdkfd_validate_vm_bo, NULL);473if (ret) {474pr_err("failed to validate PT BOs\n");475return ret;476}477478vm->pd_phys_addr = amdgpu_gmc_pd_addr(vm->root.bo);479480return 0;481}482483static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)484{485struct amdgpu_bo *pd = vm->root.bo;486struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);487int ret;488489ret = amdgpu_vm_update_pdes(adev, vm, false);490if (ret)491return ret;492493return amdgpu_sync_fence(sync, vm->last_update, GFP_KERNEL);494}495496static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem)497{498uint32_t mapping_flags = AMDGPU_VM_PAGE_READABLE |499AMDGPU_VM_MTYPE_DEFAULT;500501if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE)502mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE;503if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE)504mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;505506return amdgpu_gem_va_map_flags(adev, mapping_flags);507}508509/**510* create_sg_table() - Create an sg_table for a contiguous DMA addr range511* @addr: The starting address to point to512* @size: Size of memory area in bytes being pointed to513*514* Allocates an instance of sg_table and initializes it to point to memory515* area specified by input parameters. The address used to build is assumed516* to be DMA mapped, if needed.517*518* DOORBELL or MMIO BOs use only one scatterlist node in their sg_table519* because they are physically contiguous.520*521* Return: Initialized instance of SG Table or NULL522*/523static struct sg_table *create_sg_table(uint64_t addr, uint32_t size)524{525struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL);526527if (!sg)528return NULL;529if (sg_alloc_table(sg, 1, GFP_KERNEL)) {530kfree(sg);531return NULL;532}533sg_dma_address(sg->sgl) = addr;534sg->sgl->length = size;535#ifdef CONFIG_NEED_SG_DMA_LENGTH536sg->sgl->dma_length = size;537#endif538return sg;539}540541static int542kfd_mem_dmamap_userptr(struct kgd_mem *mem,543struct kfd_mem_attachment *attachment)544{545enum dma_data_direction direction =546mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?547DMA_BIDIRECTIONAL : DMA_TO_DEVICE;548struct ttm_operation_ctx ctx = {.interruptible = true};549struct amdgpu_bo *bo = attachment->bo_va->base.bo;550struct amdgpu_device *adev = attachment->adev;551struct ttm_tt *src_ttm = mem->bo->tbo.ttm;552struct ttm_tt *ttm = bo->tbo.ttm;553int ret;554555if (WARN_ON(ttm->num_pages != src_ttm->num_pages))556return -EINVAL;557558ttm->sg = kmalloc(sizeof(*ttm->sg), GFP_KERNEL);559if (unlikely(!ttm->sg))560return -ENOMEM;561562/* Same sequence as in amdgpu_ttm_tt_pin_userptr */563ret = sg_alloc_table_from_pages(ttm->sg, src_ttm->pages,564ttm->num_pages, 0,565(u64)ttm->num_pages << PAGE_SHIFT,566GFP_KERNEL);567if (unlikely(ret))568goto free_sg;569570ret = dma_map_sgtable(adev->dev, ttm->sg, direction, 0);571if (unlikely(ret))572goto release_sg;573574amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);575ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);576if (ret)577goto unmap_sg;578579return 0;580581unmap_sg:582dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);583release_sg:584pr_err("DMA map userptr failed: %d\n", 
ret);585sg_free_table(ttm->sg);586free_sg:587kfree(ttm->sg);588ttm->sg = NULL;589return ret;590}591592static int593kfd_mem_dmamap_dmabuf(struct kfd_mem_attachment *attachment)594{595struct ttm_operation_ctx ctx = {.interruptible = true};596struct amdgpu_bo *bo = attachment->bo_va->base.bo;597598amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);599return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);600}601602/**603* kfd_mem_dmamap_sg_bo() - Create DMA mapped sg_table to access DOORBELL or MMIO BO604* @mem: SG BO of the DOORBELL or MMIO resource on the owning device605* @attachment: Virtual address attachment of the BO on accessing device606*607* An access request from the device that owns DOORBELL does not require DMA mapping.608* This is because the request doesn't go through PCIe root complex i.e. it instead609* loops back. The need to DMA map arises only when accessing peer device's DOORBELL610*611* In contrast, all access requests for MMIO need to be DMA mapped without regard to612* device ownership. This is because access requests for MMIO go through PCIe root613* complex.614*615* This is accomplished in two steps:616* - Obtain DMA mapped address of DOORBELL or MMIO memory that could be used617* in updating requesting device's page table618* - Signal TTM to mark memory pointed to by requesting device's BO as GPU619* accessible. This allows an update of requesting device's page table620* with entries associated with DOOREBELL or MMIO memory621*622* This method is invoked in the following contexts:623* - Mapping of DOORBELL or MMIO BO of same or peer device624* - Validating an evicted DOOREBELL or MMIO BO on device seeking access625*626* Return: ZERO if successful, NON-ZERO otherwise627*/628static int629kfd_mem_dmamap_sg_bo(struct kgd_mem *mem,630struct kfd_mem_attachment *attachment)631{632struct ttm_operation_ctx ctx = {.interruptible = true};633struct amdgpu_bo *bo = attachment->bo_va->base.bo;634struct amdgpu_device *adev = attachment->adev;635struct ttm_tt *ttm = bo->tbo.ttm;636enum dma_data_direction dir;637dma_addr_t dma_addr;638bool mmio;639int ret;640641/* Expect SG Table of dmapmap BO to be NULL */642mmio = (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP);643if (unlikely(ttm->sg)) {644pr_err("SG Table of %d BO for peer device is UNEXPECTEDLY NON-NULL", mmio);645return -EINVAL;646}647648dir = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?649DMA_BIDIRECTIONAL : DMA_TO_DEVICE;650dma_addr = mem->bo->tbo.sg->sgl->dma_address;651pr_debug("%d BO size: %d\n", mmio, mem->bo->tbo.sg->sgl->length);652pr_debug("%d BO address before DMA mapping: %llx\n", mmio, dma_addr);653dma_addr = dma_map_resource(adev->dev, dma_addr,654mem->bo->tbo.sg->sgl->length, dir, DMA_ATTR_SKIP_CPU_SYNC);655ret = dma_mapping_error(adev->dev, dma_addr);656if (unlikely(ret))657return ret;658pr_debug("%d BO address after DMA mapping: %llx\n", mmio, dma_addr);659660ttm->sg = create_sg_table(dma_addr, mem->bo->tbo.sg->sgl->length);661if (unlikely(!ttm->sg)) {662ret = -ENOMEM;663goto unmap_sg;664}665666amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);667ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);668if (unlikely(ret))669goto free_sg;670671return ret;672673free_sg:674sg_free_table(ttm->sg);675kfree(ttm->sg);676ttm->sg = NULL;677unmap_sg:678dma_unmap_resource(adev->dev, dma_addr, mem->bo->tbo.sg->sgl->length,679dir, DMA_ATTR_SKIP_CPU_SYNC);680return ret;681}682683static int684kfd_mem_dmamap_attachment(struct kgd_mem *mem,685struct kfd_mem_attachment *attachment)686{687switch 
(attachment->type) {688case KFD_MEM_ATT_SHARED:689return 0;690case KFD_MEM_ATT_USERPTR:691return kfd_mem_dmamap_userptr(mem, attachment);692case KFD_MEM_ATT_DMABUF:693return kfd_mem_dmamap_dmabuf(attachment);694case KFD_MEM_ATT_SG:695return kfd_mem_dmamap_sg_bo(mem, attachment);696default:697WARN_ON_ONCE(1);698}699return -EINVAL;700}701702static void703kfd_mem_dmaunmap_userptr(struct kgd_mem *mem,704struct kfd_mem_attachment *attachment)705{706enum dma_data_direction direction =707mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?708DMA_BIDIRECTIONAL : DMA_TO_DEVICE;709struct ttm_operation_ctx ctx = {.interruptible = false};710struct amdgpu_bo *bo = attachment->bo_va->base.bo;711struct amdgpu_device *adev = attachment->adev;712struct ttm_tt *ttm = bo->tbo.ttm;713714if (unlikely(!ttm->sg))715return;716717amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);718(void)ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);719720dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);721sg_free_table(ttm->sg);722kfree(ttm->sg);723ttm->sg = NULL;724}725726static void727kfd_mem_dmaunmap_dmabuf(struct kfd_mem_attachment *attachment)728{729/* This is a no-op. We don't want to trigger eviction fences when730* unmapping DMABufs. Therefore the invalidation (moving to system731* domain) is done in kfd_mem_dmamap_dmabuf.732*/733}734735/**736* kfd_mem_dmaunmap_sg_bo() - Free DMA mapped sg_table of DOORBELL or MMIO BO737* @mem: SG BO of the DOORBELL or MMIO resource on the owning device738* @attachment: Virtual address attachment of the BO on accessing device739*740* The method performs following steps:741* - Signal TTM to mark memory pointed to by BO as GPU inaccessible742* - Free SG Table that is used to encapsulate DMA mapped memory of743* peer device's DOORBELL or MMIO memory744*745* This method is invoked in the following contexts:746* UNMapping of DOORBELL or MMIO BO on a device having access to its memory747* Eviction of DOOREBELL or MMIO BO on device having access to its memory748*749* Return: void750*/751static void752kfd_mem_dmaunmap_sg_bo(struct kgd_mem *mem,753struct kfd_mem_attachment *attachment)754{755struct ttm_operation_ctx ctx = {.interruptible = true};756struct amdgpu_bo *bo = attachment->bo_va->base.bo;757struct amdgpu_device *adev = attachment->adev;758struct ttm_tt *ttm = bo->tbo.ttm;759enum dma_data_direction dir;760761if (unlikely(!ttm->sg)) {762pr_debug("SG Table of BO is NULL");763return;764}765766amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);767(void)ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);768769dir = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?770DMA_BIDIRECTIONAL : DMA_TO_DEVICE;771dma_unmap_resource(adev->dev, ttm->sg->sgl->dma_address,772ttm->sg->sgl->length, dir, DMA_ATTR_SKIP_CPU_SYNC);773sg_free_table(ttm->sg);774kfree(ttm->sg);775ttm->sg = NULL;776bo->tbo.sg = NULL;777}778779static void780kfd_mem_dmaunmap_attachment(struct kgd_mem *mem,781struct kfd_mem_attachment *attachment)782{783switch (attachment->type) {784case KFD_MEM_ATT_SHARED:785break;786case KFD_MEM_ATT_USERPTR:787kfd_mem_dmaunmap_userptr(mem, attachment);788break;789case KFD_MEM_ATT_DMABUF:790kfd_mem_dmaunmap_dmabuf(attachment);791break;792case KFD_MEM_ATT_SG:793kfd_mem_dmaunmap_sg_bo(mem, attachment);794break;795default:796WARN_ON_ONCE(1);797}798}799800static int kfd_mem_export_dmabuf(struct kgd_mem *mem)801{802if (!mem->dmabuf) {803struct amdgpu_device *bo_adev;804struct dma_buf *dmabuf;805806bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);807dmabuf = 
drm_gem_prime_handle_to_dmabuf(&bo_adev->ddev, bo_adev->kfd.client.file,808mem->gem_handle,809mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?810DRM_RDWR : 0);811if (IS_ERR(dmabuf))812return PTR_ERR(dmabuf);813mem->dmabuf = dmabuf;814}815816return 0;817}818819static int820kfd_mem_attach_dmabuf(struct amdgpu_device *adev, struct kgd_mem *mem,821struct amdgpu_bo **bo)822{823struct drm_gem_object *gobj;824int ret;825826ret = kfd_mem_export_dmabuf(mem);827if (ret)828return ret;829830gobj = amdgpu_gem_prime_import(adev_to_drm(adev), mem->dmabuf);831if (IS_ERR(gobj))832return PTR_ERR(gobj);833834*bo = gem_to_amdgpu_bo(gobj);835(*bo)->flags |= AMDGPU_GEM_CREATE_PREEMPTIBLE;836837return 0;838}839840/* kfd_mem_attach - Add a BO to a VM841*842* Everything that needs to bo done only once when a BO is first added843* to a VM. It can later be mapped and unmapped many times without844* repeating these steps.845*846* 0. Create BO for DMA mapping, if needed847* 1. Allocate and initialize BO VA entry data structure848* 2. Add BO to the VM849* 3. Determine ASIC-specific PTE flags850* 4. Alloc page tables and directories if needed851* 4a. Validate new page tables and directories852*/853static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,854struct amdgpu_vm *vm, bool is_aql)855{856struct amdgpu_device *bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);857unsigned long bo_size = mem->bo->tbo.base.size;858uint64_t va = mem->va;859struct kfd_mem_attachment *attachment[2] = {NULL, NULL};860struct amdgpu_bo *bo[2] = {NULL, NULL};861struct amdgpu_bo_va *bo_va;862bool same_hive = false;863int i, ret;864865if (!va) {866pr_err("Invalid VA when adding BO to VM\n");867return -EINVAL;868}869870/* Determine access to VRAM, MMIO and DOORBELL BOs of peer devices871*872* The access path of MMIO and DOORBELL BOs of is always over PCIe.873* In contrast the access path of VRAM BOs depens upon the type of874* link that connects the peer device. Access over PCIe is allowed875* if peer device has large BAR. 
In contrast, access over xGMI is876* allowed for both small and large BAR configurations of peer device877*/878if ((adev != bo_adev && !adev->apu_prefer_gtt) &&879((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||880(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||881(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {882if (mem->domain == AMDGPU_GEM_DOMAIN_VRAM)883same_hive = amdgpu_xgmi_same_hive(adev, bo_adev);884if (!same_hive && !amdgpu_device_is_peer_accessible(bo_adev, adev))885return -EINVAL;886}887888for (i = 0; i <= is_aql; i++) {889attachment[i] = kzalloc(sizeof(*attachment[i]), GFP_KERNEL);890if (unlikely(!attachment[i])) {891ret = -ENOMEM;892goto unwind;893}894895pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,896va + bo_size, vm);897898if ((adev == bo_adev && !(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) ||899(amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm) && reuse_dmamap(adev, bo_adev)) ||900(mem->domain == AMDGPU_GEM_DOMAIN_GTT && reuse_dmamap(adev, bo_adev)) ||901same_hive) {902/* Mappings on the local GPU, or VRAM mappings in the903* local hive, or userptr, or GTT mapping can reuse dma map904* address space share the original BO905*/906attachment[i]->type = KFD_MEM_ATT_SHARED;907bo[i] = mem->bo;908drm_gem_object_get(&bo[i]->tbo.base);909} else if (i > 0) {910/* Multiple mappings on the same GPU share the BO */911attachment[i]->type = KFD_MEM_ATT_SHARED;912bo[i] = bo[0];913drm_gem_object_get(&bo[i]->tbo.base);914} else if (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) {915/* Create an SG BO to DMA-map userptrs on other GPUs */916attachment[i]->type = KFD_MEM_ATT_USERPTR;917ret = create_dmamap_sg_bo(adev, mem, &bo[i]);918if (ret)919goto unwind;920/* Handle DOORBELL BOs of peer devices and MMIO BOs of local and peer devices */921} else if (mem->bo->tbo.type == ttm_bo_type_sg) {922WARN_ONCE(!(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL ||923mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP),924"Handing invalid SG BO in ATTACH request");925attachment[i]->type = KFD_MEM_ATT_SG;926ret = create_dmamap_sg_bo(adev, mem, &bo[i]);927if (ret)928goto unwind;929/* Enable acces to GTT and VRAM BOs of peer devices */930} else if (mem->domain == AMDGPU_GEM_DOMAIN_GTT ||931mem->domain == AMDGPU_GEM_DOMAIN_VRAM) {932attachment[i]->type = KFD_MEM_ATT_DMABUF;933ret = kfd_mem_attach_dmabuf(adev, mem, &bo[i]);934if (ret)935goto unwind;936pr_debug("Employ DMABUF mechanism to enable peer GPU access\n");937} else {938WARN_ONCE(true, "Handling invalid ATTACH request");939ret = -EINVAL;940goto unwind;941}942943/* Add BO to VM internal data structures */944ret = amdgpu_bo_reserve(bo[i], false);945if (ret) {946pr_debug("Unable to reserve BO during memory attach");947goto unwind;948}949bo_va = amdgpu_vm_bo_find(vm, bo[i]);950if (!bo_va)951bo_va = amdgpu_vm_bo_add(adev, vm, bo[i]);952else953++bo_va->ref_count;954attachment[i]->bo_va = bo_va;955amdgpu_bo_unreserve(bo[i]);956if (unlikely(!attachment[i]->bo_va)) {957ret = -ENOMEM;958pr_err("Failed to add BO object to VM. 
ret == %d\n",959ret);960goto unwind;961}962attachment[i]->va = va;963attachment[i]->pte_flags = get_pte_flags(adev, mem);964attachment[i]->adev = adev;965list_add(&attachment[i]->list, &mem->attachments);966967va += bo_size;968}969970return 0;971972unwind:973for (; i >= 0; i--) {974if (!attachment[i])975continue;976if (attachment[i]->bo_va) {977(void)amdgpu_bo_reserve(bo[i], true);978if (--attachment[i]->bo_va->ref_count == 0)979amdgpu_vm_bo_del(adev, attachment[i]->bo_va);980amdgpu_bo_unreserve(bo[i]);981list_del(&attachment[i]->list);982}983if (bo[i])984drm_gem_object_put(&bo[i]->tbo.base);985kfree(attachment[i]);986}987return ret;988}989990static void kfd_mem_detach(struct kfd_mem_attachment *attachment)991{992struct amdgpu_bo *bo = attachment->bo_va->base.bo;993994pr_debug("\t remove VA 0x%llx in entry %p\n",995attachment->va, attachment);996if (--attachment->bo_va->ref_count == 0)997amdgpu_vm_bo_del(attachment->adev, attachment->bo_va);998drm_gem_object_put(&bo->tbo.base);999list_del(&attachment->list);1000kfree(attachment);1001}10021003static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,1004struct amdkfd_process_info *process_info,1005bool userptr)1006{1007mutex_lock(&process_info->lock);1008if (userptr)1009list_add_tail(&mem->validate_list,1010&process_info->userptr_valid_list);1011else1012list_add_tail(&mem->validate_list, &process_info->kfd_bo_list);1013mutex_unlock(&process_info->lock);1014}10151016static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,1017struct amdkfd_process_info *process_info)1018{1019mutex_lock(&process_info->lock);1020list_del(&mem->validate_list);1021mutex_unlock(&process_info->lock);1022}10231024/* Initializes user pages. It registers the MMU notifier and validates1025* the userptr BO in the GTT domain.1026*1027* The BO must already be on the userptr_valid_list. 
Otherwise an1028* eviction and restore may happen that leaves the new BO unmapped1029* with the user mode queues running.1030*1031* Takes the process_info->lock to protect against concurrent restore1032* workers.1033*1034* Returns 0 for success, negative errno for errors.1035*/1036static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,1037bool criu_resume)1038{1039struct amdkfd_process_info *process_info = mem->process_info;1040struct amdgpu_bo *bo = mem->bo;1041struct ttm_operation_ctx ctx = { true, false };1042struct hmm_range *range;1043int ret = 0;10441045mutex_lock(&process_info->lock);10461047ret = amdgpu_ttm_tt_set_userptr(&bo->tbo, user_addr, 0);1048if (ret) {1049pr_err("%s: Failed to set userptr: %d\n", __func__, ret);1050goto out;1051}10521053ret = amdgpu_hmm_register(bo, user_addr);1054if (ret) {1055pr_err("%s: Failed to register MMU notifier: %d\n",1056__func__, ret);1057goto out;1058}10591060if (criu_resume) {1061/*1062* During a CRIU restore operation, the userptr buffer objects1063* will be validated in the restore_userptr_work worker at a1064* later stage when it is scheduled by another ioctl called by1065* CRIU master process for the target pid for restore.1066*/1067mutex_lock(&process_info->notifier_lock);1068mem->invalid++;1069mutex_unlock(&process_info->notifier_lock);1070mutex_unlock(&process_info->lock);1071return 0;1072}10731074ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);1075if (ret) {1076if (ret == -EAGAIN)1077pr_debug("Failed to get user pages, try again\n");1078else1079pr_err("%s: Failed to get user pages: %d\n", __func__, ret);1080goto unregister_out;1081}10821083ret = amdgpu_bo_reserve(bo, true);1084if (ret) {1085pr_err("%s: Failed to reserve BO\n", __func__);1086goto release_out;1087}1088amdgpu_bo_placement_from_domain(bo, mem->domain);1089ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);1090if (ret)1091pr_err("%s: failed to validate BO\n", __func__);1092amdgpu_bo_unreserve(bo);10931094release_out:1095amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);1096unregister_out:1097if (ret)1098amdgpu_hmm_unregister(bo);1099out:1100mutex_unlock(&process_info->lock);1101return ret;1102}11031104/* Reserving a BO and its page table BOs must happen atomically to1105* avoid deadlocks. Some operations update multiple VMs at once. Track1106* all the reservation info in a context structure. 
Optionally a sync1107* object can track VM updates.1108*/1109struct bo_vm_reservation_context {1110/* DRM execution context for the reservation */1111struct drm_exec exec;1112/* Number of VMs reserved */1113unsigned int n_vms;1114/* Pointer to sync object */1115struct amdgpu_sync *sync;1116};11171118enum bo_vm_match {1119BO_VM_NOT_MAPPED = 0, /* Match VMs where a BO is not mapped */1120BO_VM_MAPPED, /* Match VMs where a BO is mapped */1121BO_VM_ALL, /* Match all VMs a BO was added to */1122};11231124/**1125* reserve_bo_and_vm - reserve a BO and a VM unconditionally.1126* @mem: KFD BO structure.1127* @vm: the VM to reserve.1128* @ctx: the struct that will be used in unreserve_bo_and_vms().1129*/1130static int reserve_bo_and_vm(struct kgd_mem *mem,1131struct amdgpu_vm *vm,1132struct bo_vm_reservation_context *ctx)1133{1134struct amdgpu_bo *bo = mem->bo;1135int ret;11361137WARN_ON(!vm);11381139ctx->n_vms = 1;1140ctx->sync = &mem->sync;1141drm_exec_init(&ctx->exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);1142drm_exec_until_all_locked(&ctx->exec) {1143ret = amdgpu_vm_lock_pd(vm, &ctx->exec, 2);1144drm_exec_retry_on_contention(&ctx->exec);1145if (unlikely(ret))1146goto error;11471148ret = drm_exec_prepare_obj(&ctx->exec, &bo->tbo.base, 1);1149drm_exec_retry_on_contention(&ctx->exec);1150if (unlikely(ret))1151goto error;1152}1153return 0;11541155error:1156pr_err("Failed to reserve buffers in ttm.\n");1157drm_exec_fini(&ctx->exec);1158return ret;1159}11601161/**1162* reserve_bo_and_cond_vms - reserve a BO and some VMs conditionally1163* @mem: KFD BO structure.1164* @vm: the VM to reserve. If NULL, then all VMs associated with the BO1165* is used. Otherwise, a single VM associated with the BO.1166* @map_type: the mapping status that will be used to filter the VMs.1167* @ctx: the struct that will be used in unreserve_bo_and_vms().1168*1169* Returns 0 for success, negative for failure.1170*/1171static int reserve_bo_and_cond_vms(struct kgd_mem *mem,1172struct amdgpu_vm *vm, enum bo_vm_match map_type,1173struct bo_vm_reservation_context *ctx)1174{1175struct kfd_mem_attachment *entry;1176struct amdgpu_bo *bo = mem->bo;1177int ret;11781179ctx->sync = &mem->sync;1180drm_exec_init(&ctx->exec, DRM_EXEC_INTERRUPTIBLE_WAIT |1181DRM_EXEC_IGNORE_DUPLICATES, 0);1182drm_exec_until_all_locked(&ctx->exec) {1183ctx->n_vms = 0;1184list_for_each_entry(entry, &mem->attachments, list) {1185if ((vm && vm != entry->bo_va->base.vm) ||1186(entry->is_mapped != map_type1187&& map_type != BO_VM_ALL))1188continue;11891190ret = amdgpu_vm_lock_pd(entry->bo_va->base.vm,1191&ctx->exec, 2);1192drm_exec_retry_on_contention(&ctx->exec);1193if (unlikely(ret))1194goto error;1195++ctx->n_vms;1196}11971198ret = drm_exec_prepare_obj(&ctx->exec, &bo->tbo.base, 1);1199drm_exec_retry_on_contention(&ctx->exec);1200if (unlikely(ret))1201goto error;1202}1203return 0;12041205error:1206pr_err("Failed to reserve buffers in ttm.\n");1207drm_exec_fini(&ctx->exec);1208return ret;1209}12101211/**1212* unreserve_bo_and_vms - Unreserve BO and VMs from a reservation context1213* @ctx: Reservation context to unreserve1214* @wait: Optionally wait for a sync object representing pending VM updates1215* @intr: Whether the wait is interruptible1216*1217* Also frees any resources allocated in1218* reserve_bo_and_(cond_)vm(s). 
Returns the status from1219* amdgpu_sync_wait.1220*/1221static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx,1222bool wait, bool intr)1223{1224int ret = 0;12251226if (wait)1227ret = amdgpu_sync_wait(ctx->sync, intr);12281229drm_exec_fini(&ctx->exec);1230ctx->sync = NULL;1231return ret;1232}12331234static int unmap_bo_from_gpuvm(struct kgd_mem *mem,1235struct kfd_mem_attachment *entry,1236struct amdgpu_sync *sync)1237{1238struct amdgpu_bo_va *bo_va = entry->bo_va;1239struct amdgpu_device *adev = entry->adev;1240struct amdgpu_vm *vm = bo_va->base.vm;12411242if (bo_va->queue_refcount) {1243pr_debug("bo_va->queue_refcount %d\n", bo_va->queue_refcount);1244return -EBUSY;1245}12461247(void)amdgpu_vm_bo_unmap(adev, bo_va, entry->va);12481249(void)amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);12501251(void)amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);12521253return 0;1254}12551256static int update_gpuvm_pte(struct kgd_mem *mem,1257struct kfd_mem_attachment *entry,1258struct amdgpu_sync *sync)1259{1260struct amdgpu_bo_va *bo_va = entry->bo_va;1261struct amdgpu_device *adev = entry->adev;1262int ret;12631264ret = kfd_mem_dmamap_attachment(mem, entry);1265if (ret)1266return ret;12671268/* Update the page tables */1269ret = amdgpu_vm_bo_update(adev, bo_va, false);1270if (ret) {1271pr_err("amdgpu_vm_bo_update failed\n");1272return ret;1273}12741275return amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);1276}12771278static int map_bo_to_gpuvm(struct kgd_mem *mem,1279struct kfd_mem_attachment *entry,1280struct amdgpu_sync *sync,1281bool no_update_pte)1282{1283int ret;12841285/* Set virtual address for the allocation */1286ret = amdgpu_vm_bo_map(entry->adev, entry->bo_va, entry->va, 0,1287amdgpu_bo_size(entry->bo_va->base.bo),1288entry->pte_flags);1289if (ret) {1290pr_err("Failed to map VA 0x%llx in vm. 
ret %d\n",1291entry->va, ret);1292return ret;1293}12941295if (no_update_pte)1296return 0;12971298ret = update_gpuvm_pte(mem, entry, sync);1299if (ret) {1300pr_err("update_gpuvm_pte() failed\n");1301goto update_gpuvm_pte_failed;1302}13031304return 0;13051306update_gpuvm_pte_failed:1307unmap_bo_from_gpuvm(mem, entry, sync);1308kfd_mem_dmaunmap_attachment(mem, entry);1309return ret;1310}13111312static int process_validate_vms(struct amdkfd_process_info *process_info,1313struct ww_acquire_ctx *ticket)1314{1315struct amdgpu_vm *peer_vm;1316int ret;13171318list_for_each_entry(peer_vm, &process_info->vm_list_head,1319vm_list_node) {1320ret = vm_validate_pt_pd_bos(peer_vm, ticket);1321if (ret)1322return ret;1323}13241325return 0;1326}13271328static int process_sync_pds_resv(struct amdkfd_process_info *process_info,1329struct amdgpu_sync *sync)1330{1331struct amdgpu_vm *peer_vm;1332int ret;13331334list_for_each_entry(peer_vm, &process_info->vm_list_head,1335vm_list_node) {1336struct amdgpu_bo *pd = peer_vm->root.bo;13371338ret = amdgpu_sync_resv(NULL, sync, pd->tbo.base.resv,1339AMDGPU_SYNC_NE_OWNER,1340AMDGPU_FENCE_OWNER_KFD);1341if (ret)1342return ret;1343}13441345return 0;1346}13471348static int process_update_pds(struct amdkfd_process_info *process_info,1349struct amdgpu_sync *sync)1350{1351struct amdgpu_vm *peer_vm;1352int ret;13531354list_for_each_entry(peer_vm, &process_info->vm_list_head,1355vm_list_node) {1356ret = vm_update_pds(peer_vm, sync);1357if (ret)1358return ret;1359}13601361return 0;1362}13631364static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,1365struct dma_fence **ef)1366{1367struct amdkfd_process_info *info = NULL;1368int ret;13691370if (!*process_info) {1371info = kzalloc(sizeof(*info), GFP_KERNEL);1372if (!info)1373return -ENOMEM;13741375mutex_init(&info->lock);1376mutex_init(&info->notifier_lock);1377INIT_LIST_HEAD(&info->vm_list_head);1378INIT_LIST_HEAD(&info->kfd_bo_list);1379INIT_LIST_HEAD(&info->userptr_valid_list);1380INIT_LIST_HEAD(&info->userptr_inval_list);13811382info->eviction_fence =1383amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),1384current->mm,1385NULL);1386if (!info->eviction_fence) {1387pr_err("Failed to create eviction fence\n");1388ret = -ENOMEM;1389goto create_evict_fence_fail;1390}13911392info->pid = get_task_pid(current->group_leader, PIDTYPE_PID);1393INIT_DELAYED_WORK(&info->restore_userptr_work,1394amdgpu_amdkfd_restore_userptr_worker);13951396*process_info = info;1397}13981399vm->process_info = *process_info;14001401/* Validate page directory and attach eviction fence */1402ret = amdgpu_bo_reserve(vm->root.bo, true);1403if (ret)1404goto reserve_pd_fail;1405ret = vm_validate_pt_pd_bos(vm, NULL);1406if (ret) {1407pr_err("validate_pt_pd_bos() failed\n");1408goto validate_pd_fail;1409}1410ret = amdgpu_bo_sync_wait(vm->root.bo,1411AMDGPU_FENCE_OWNER_KFD, false);1412if (ret)1413goto wait_pd_fail;1414ret = dma_resv_reserve_fences(vm->root.bo->tbo.base.resv, 1);1415if (ret)1416goto reserve_shared_fail;1417dma_resv_add_fence(vm->root.bo->tbo.base.resv,1418&vm->process_info->eviction_fence->base,1419DMA_RESV_USAGE_BOOKKEEP);1420amdgpu_bo_unreserve(vm->root.bo);14211422/* Update process info */1423mutex_lock(&vm->process_info->lock);1424list_add_tail(&vm->vm_list_node,1425&(vm->process_info->vm_list_head));1426vm->process_info->n_vms++;1427if (ef)1428*ef = dma_fence_get(&vm->process_info->eviction_fence->base);1429mutex_unlock(&vm->process_info->lock);14301431return 
0;14321433reserve_shared_fail:1434wait_pd_fail:1435validate_pd_fail:1436amdgpu_bo_unreserve(vm->root.bo);1437reserve_pd_fail:1438vm->process_info = NULL;1439if (info) {1440dma_fence_put(&info->eviction_fence->base);1441*process_info = NULL;1442put_pid(info->pid);1443create_evict_fence_fail:1444mutex_destroy(&info->lock);1445mutex_destroy(&info->notifier_lock);1446kfree(info);1447}1448return ret;1449}14501451/**1452* amdgpu_amdkfd_gpuvm_pin_bo() - Pins a BO using following criteria1453* @bo: Handle of buffer object being pinned1454* @domain: Domain into which BO should be pinned1455*1456* - USERPTR BOs are UNPINNABLE and will return error1457* - All other BO types (GTT, VRAM, MMIO and DOORBELL) will have their1458* PIN count incremented. It is valid to PIN a BO multiple times1459*1460* Return: ZERO if successful in pinning, Non-Zero in case of error.1461*/1462static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)1463{1464int ret = 0;14651466ret = amdgpu_bo_reserve(bo, false);1467if (unlikely(ret))1468return ret;14691470if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {1471/*1472* If bo is not contiguous on VRAM, move to system memory first to ensure1473* we can get contiguous VRAM space after evicting other BOs.1474*/1475if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {1476struct ttm_operation_ctx ctx = { true, false };14771478amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);1479ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);1480if (unlikely(ret)) {1481pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret);1482goto out;1483}1484}1485}14861487ret = amdgpu_bo_pin(bo, domain);1488if (ret)1489pr_err("Error in Pinning BO to domain: %d\n", domain);14901491amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);1492out:1493amdgpu_bo_unreserve(bo);1494return ret;1495}14961497/**1498* amdgpu_amdkfd_gpuvm_unpin_bo() - Unpins BO using following criteria1499* @bo: Handle of buffer object being unpinned1500*1501* - Is a illegal request for USERPTR BOs and is ignored1502* - All other BO types (GTT, VRAM, MMIO and DOORBELL) will have their1503* PIN count decremented. Calls to UNPIN must balance calls to PIN1504*/1505static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)1506{1507int ret = 0;15081509ret = amdgpu_bo_reserve(bo, false);1510if (unlikely(ret))1511return;15121513amdgpu_bo_unpin(bo);1514amdgpu_bo_unreserve(bo);1515}15161517int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,1518struct amdgpu_vm *avm,1519void **process_info,1520struct dma_fence **ef)1521{1522int ret;15231524/* Already a compute VM? 
*/1525if (avm->process_info)1526return -EINVAL;15271528/* Convert VM into a compute VM */1529ret = amdgpu_vm_make_compute(adev, avm);1530if (ret)1531return ret;15321533/* Initialize KFD part of the VM and process info */1534ret = init_kfd_vm(avm, process_info, ef);1535if (ret)1536return ret;15371538amdgpu_vm_set_task_info(avm);15391540return 0;1541}15421543void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,1544struct amdgpu_vm *vm)1545{1546struct amdkfd_process_info *process_info = vm->process_info;15471548if (!process_info)1549return;15501551/* Update process info */1552mutex_lock(&process_info->lock);1553process_info->n_vms--;1554list_del(&vm->vm_list_node);1555mutex_unlock(&process_info->lock);15561557vm->process_info = NULL;15581559/* Release per-process resources when last compute VM is destroyed */1560if (!process_info->n_vms) {1561WARN_ON(!list_empty(&process_info->kfd_bo_list));1562WARN_ON(!list_empty(&process_info->userptr_valid_list));1563WARN_ON(!list_empty(&process_info->userptr_inval_list));15641565dma_fence_put(&process_info->eviction_fence->base);1566cancel_delayed_work_sync(&process_info->restore_userptr_work);1567put_pid(process_info->pid);1568mutex_destroy(&process_info->lock);1569mutex_destroy(&process_info->notifier_lock);1570kfree(process_info);1571}1572}15731574uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv)1575{1576struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);1577struct amdgpu_bo *pd = avm->root.bo;1578struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);15791580if (adev->asic_type < CHIP_VEGA10)1581return avm->pd_phys_addr >> AMDGPU_GPU_PAGE_SHIFT;1582return avm->pd_phys_addr;1583}15841585void amdgpu_amdkfd_block_mmu_notifications(void *p)1586{1587struct amdkfd_process_info *pinfo = (struct amdkfd_process_info *)p;15881589mutex_lock(&pinfo->lock);1590WRITE_ONCE(pinfo->block_mmu_notifications, true);1591mutex_unlock(&pinfo->lock);1592}15931594int amdgpu_amdkfd_criu_resume(void *p)1595{1596int ret = 0;1597struct amdkfd_process_info *pinfo = (struct amdkfd_process_info *)p;15981599mutex_lock(&pinfo->lock);1600pr_debug("scheduling work\n");1601mutex_lock(&pinfo->notifier_lock);1602pinfo->evicted_bos++;1603mutex_unlock(&pinfo->notifier_lock);1604if (!READ_ONCE(pinfo->block_mmu_notifications)) {1605ret = -EINVAL;1606goto out_unlock;1607}1608WRITE_ONCE(pinfo->block_mmu_notifications, false);1609queue_delayed_work(system_freezable_wq,1610&pinfo->restore_userptr_work, 0);16111612out_unlock:1613mutex_unlock(&pinfo->lock);1614return ret;1615}16161617size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,1618uint8_t xcp_id)1619{1620uint64_t reserved_for_pt =1621ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);1622struct amdgpu_ras *con = amdgpu_ras_get_context(adev);1623uint64_t reserved_for_ras = (con ? 
con->reserved_pages_in_bytes : 0);1624ssize_t available;1625uint64_t vram_available, system_mem_available, ttm_mem_available;16261627spin_lock(&kfd_mem_limit.mem_limit_lock);1628vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id)1629- adev->kfd.vram_used_aligned[xcp_id]1630- atomic64_read(&adev->vram_pin_size)1631- reserved_for_pt1632- reserved_for_ras;16331634if (adev->apu_prefer_gtt) {1635system_mem_available = no_system_mem_limit ?1636kfd_mem_limit.max_system_mem_limit :1637kfd_mem_limit.max_system_mem_limit -1638kfd_mem_limit.system_mem_used;16391640ttm_mem_available = kfd_mem_limit.max_ttm_mem_limit -1641kfd_mem_limit.ttm_mem_used;16421643available = min3(system_mem_available, ttm_mem_available,1644vram_available);1645available = ALIGN_DOWN(available, PAGE_SIZE);1646} else {1647available = ALIGN_DOWN(vram_available, VRAM_AVAILABLITY_ALIGN);1648}16491650spin_unlock(&kfd_mem_limit.mem_limit_lock);16511652if (available < 0)1653available = 0;16541655return available;1656}16571658int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(1659struct amdgpu_device *adev, uint64_t va, uint64_t size,1660void *drm_priv, struct kgd_mem **mem,1661uint64_t *offset, uint32_t flags, bool criu_resume)1662{1663struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);1664struct amdgpu_fpriv *fpriv = container_of(avm, struct amdgpu_fpriv, vm);1665enum ttm_bo_type bo_type = ttm_bo_type_device;1666struct sg_table *sg = NULL;1667uint64_t user_addr = 0;1668struct amdgpu_bo *bo;1669struct drm_gem_object *gobj = NULL;1670u32 domain, alloc_domain;1671uint64_t aligned_size;1672int8_t xcp_id = -1;1673u64 alloc_flags;1674int ret;16751676/*1677* Check on which domain to allocate BO1678*/1679if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {1680domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM;16811682if (adev->apu_prefer_gtt) {1683domain = AMDGPU_GEM_DOMAIN_GTT;1684alloc_domain = AMDGPU_GEM_DOMAIN_GTT;1685alloc_flags = 0;1686} else {1687alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;1688alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ?1689AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;16901691/* For contiguous VRAM allocation */1692if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS)1693alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;1694}1695xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?16960 : fpriv->xcp_id;1697} else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {1698domain = alloc_domain = AMDGPU_GEM_DOMAIN_GTT;1699alloc_flags = 0;1700} else {1701domain = AMDGPU_GEM_DOMAIN_GTT;1702alloc_domain = AMDGPU_GEM_DOMAIN_CPU;1703alloc_flags = AMDGPU_GEM_CREATE_PREEMPTIBLE;17041705if (flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {1706if (!offset || !*offset)1707return -EINVAL;1708user_addr = untagged_addr(*offset);1709} else if (flags & (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |1710KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {1711bo_type = ttm_bo_type_sg;1712if (size > UINT_MAX)1713return -EINVAL;1714sg = create_sg_table(*offset, size);1715if (!sg)1716return -ENOMEM;1717} else {1718return -EINVAL;1719}1720}17211722if (flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)1723alloc_flags |= AMDGPU_GEM_CREATE_COHERENT;1724if (flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT)1725alloc_flags |= AMDGPU_GEM_CREATE_EXT_COHERENT;1726if (flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED)1727alloc_flags |= AMDGPU_GEM_CREATE_UNCACHED;17281729*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);1730if (!*mem) {1731ret = -ENOMEM;1732goto err;1733}1734INIT_LIST_HEAD(&(*mem)->attachments);1735mutex_init(&(*mem)->lock);1736(*mem)->aql_queue = !!(flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM);17371738/* 
Workaround for AQL queue wraparound bug. Map the same1739* memory twice. That means we only actually allocate half1740* the memory.1741*/1742if ((*mem)->aql_queue)1743size >>= 1;1744aligned_size = PAGE_ALIGN(size);17451746(*mem)->alloc_flags = flags;17471748amdgpu_sync_create(&(*mem)->sync);17491750ret = amdgpu_amdkfd_reserve_mem_limit(adev, aligned_size, flags,1751xcp_id);1752if (ret) {1753pr_debug("Insufficient memory\n");1754goto err_reserve_limit;1755}17561757pr_debug("\tcreate BO VA 0x%llx size 0x%llx domain %s xcp_id %d\n",1758va, (*mem)->aql_queue ? size << 1 : size,1759domain_string(alloc_domain), xcp_id);17601761ret = amdgpu_gem_object_create(adev, aligned_size, 1, alloc_domain, alloc_flags,1762bo_type, NULL, &gobj, xcp_id + 1);1763if (ret) {1764pr_debug("Failed to create BO on domain %s. ret %d\n",1765domain_string(alloc_domain), ret);1766goto err_bo_create;1767}1768ret = drm_vma_node_allow(&gobj->vma_node, drm_priv);1769if (ret) {1770pr_debug("Failed to allow vma node access. ret %d\n", ret);1771goto err_node_allow;1772}1773ret = drm_gem_handle_create(adev->kfd.client.file, gobj, &(*mem)->gem_handle);1774if (ret)1775goto err_gem_handle_create;1776bo = gem_to_amdgpu_bo(gobj);1777if (bo_type == ttm_bo_type_sg) {1778bo->tbo.sg = sg;1779bo->tbo.ttm->sg = sg;1780}1781bo->kfd_bo = *mem;1782(*mem)->bo = bo;1783if (user_addr)1784bo->flags |= AMDGPU_AMDKFD_CREATE_USERPTR_BO;17851786(*mem)->va = va;1787(*mem)->domain = domain;1788(*mem)->mapped_to_gpu_memory = 0;1789(*mem)->process_info = avm->process_info;17901791add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr);17921793if (user_addr) {1794pr_debug("creating userptr BO for user_addr = %llx\n", user_addr);1795ret = init_user_pages(*mem, user_addr, criu_resume);1796if (ret)1797goto allocate_init_user_pages_failed;1798} else if (flags & (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |1799KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {1800ret = amdgpu_amdkfd_gpuvm_pin_bo(bo, AMDGPU_GEM_DOMAIN_GTT);1801if (ret) {1802pr_err("Pinning MMIO/DOORBELL BO during ALLOC FAILED\n");1803goto err_pin_bo;1804}1805bo->allowed_domains = AMDGPU_GEM_DOMAIN_GTT;1806bo->preferred_domains = AMDGPU_GEM_DOMAIN_GTT;1807} else {1808mutex_lock(&avm->process_info->lock);1809if (avm->process_info->eviction_fence &&1810!dma_fence_is_signaled(&avm->process_info->eviction_fence->base))1811ret = amdgpu_amdkfd_bo_validate_and_fence(bo, domain,1812&avm->process_info->eviction_fence->base);1813mutex_unlock(&avm->process_info->lock);1814if (ret)1815goto err_validate_bo;1816}18171818if (offset)1819*offset = amdgpu_bo_mmap_offset(bo);18201821return 0;18221823allocate_init_user_pages_failed:1824err_pin_bo:1825err_validate_bo:1826remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);1827drm_gem_handle_delete(adev->kfd.client.file, (*mem)->gem_handle);1828err_gem_handle_create:1829drm_vma_node_revoke(&gobj->vma_node, drm_priv);1830err_node_allow:1831/* Don't unreserve system mem limit twice */1832goto err_reserve_limit;1833err_bo_create:1834amdgpu_amdkfd_unreserve_mem_limit(adev, aligned_size, flags, xcp_id);1835err_reserve_limit:1836amdgpu_sync_free(&(*mem)->sync);1837mutex_destroy(&(*mem)->lock);1838if (gobj)1839drm_gem_object_put(gobj);1840else1841kfree(*mem);1842err:1843if (sg) {1844sg_free_table(sg);1845kfree(sg);1846}1847return ret;1848}18491850int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(1851struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv,1852uint64_t *size)1853{1854struct amdkfd_process_info *process_info = mem->process_info;1855unsigned long bo_size = 
mem->bo->tbo.base.size;1856bool use_release_notifier = (mem->bo->kfd_bo == mem);1857struct kfd_mem_attachment *entry, *tmp;1858struct bo_vm_reservation_context ctx;1859unsigned int mapped_to_gpu_memory;1860int ret;1861bool is_imported = false;18621863mutex_lock(&mem->lock);18641865/* Unpin MMIO/DOORBELL BO's that were pinned during allocation */1866if (mem->alloc_flags &1867(KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |1868KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {1869amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo);1870}18711872mapped_to_gpu_memory = mem->mapped_to_gpu_memory;1873is_imported = mem->is_imported;1874mutex_unlock(&mem->lock);1875/* lock is not needed after this, since mem is unused and will1876* be freed anyway1877*/18781879if (mapped_to_gpu_memory > 0) {1880pr_debug("BO VA 0x%llx size 0x%lx is still mapped.\n",1881mem->va, bo_size);1882return -EBUSY;1883}18841885/* Make sure restore workers don't access the BO any more */1886mutex_lock(&process_info->lock);1887list_del(&mem->validate_list);1888mutex_unlock(&process_info->lock);18891890/* Cleanup user pages and MMU notifiers */1891if (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) {1892amdgpu_hmm_unregister(mem->bo);1893mutex_lock(&process_info->notifier_lock);1894amdgpu_ttm_tt_discard_user_pages(mem->bo->tbo.ttm, mem->range);1895mutex_unlock(&process_info->notifier_lock);1896}18971898ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx);1899if (unlikely(ret))1900return ret;19011902amdgpu_amdkfd_remove_eviction_fence(mem->bo,1903process_info->eviction_fence);1904pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,1905mem->va + bo_size * (1 + mem->aql_queue));19061907/* Remove from VM internal data structures */1908list_for_each_entry_safe(entry, tmp, &mem->attachments, list) {1909kfd_mem_dmaunmap_attachment(mem, entry);1910kfd_mem_detach(entry);1911}19121913ret = unreserve_bo_and_vms(&ctx, false, false);19141915/* Free the sync object */1916amdgpu_sync_free(&mem->sync);19171918/* If the SG is not NULL, it's one we created for a doorbell or mmio1919* remap BO. We need to free it.1920*/1921if (mem->bo->tbo.sg) {1922sg_free_table(mem->bo->tbo.sg);1923kfree(mem->bo->tbo.sg);1924}19251926/* Update the size of the BO being freed if it was allocated from1927* VRAM and is not imported. For APP APU VRAM allocations are done1928* in GTT domain1929*/1930if (size) {1931if (!is_imported &&1932(mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM ||1933(adev->apu_prefer_gtt &&1934mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT)))1935*size = bo_size;1936else1937*size = 0;1938}19391940/* Free the BO*/1941drm_vma_node_revoke(&mem->bo->tbo.base.vma_node, drm_priv);1942drm_gem_handle_delete(adev->kfd.client.file, mem->gem_handle);1943if (mem->dmabuf) {1944dma_buf_put(mem->dmabuf);1945mem->dmabuf = NULL;1946}1947mutex_destroy(&mem->lock);19481949/* If this releases the last reference, it will end up calling1950* amdgpu_amdkfd_release_notify and kfree the mem struct. 
That's why1951* this needs to be the last call here.1952*/1953drm_gem_object_put(&mem->bo->tbo.base);19541955/*1956* For kgd_mem allocated in amdgpu_amdkfd_gpuvm_import_dmabuf(),1957* explicitly free it here.1958*/1959if (!use_release_notifier)1960kfree(mem);19611962return ret;1963}19641965int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(1966struct amdgpu_device *adev, struct kgd_mem *mem,1967void *drm_priv)1968{1969struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);1970int ret;1971struct amdgpu_bo *bo;1972uint32_t domain;1973struct kfd_mem_attachment *entry;1974struct bo_vm_reservation_context ctx;1975unsigned long bo_size;1976bool is_invalid_userptr = false;19771978bo = mem->bo;1979if (!bo) {1980pr_err("Invalid BO when mapping memory to GPU\n");1981return -EINVAL;1982}19831984/* Make sure restore is not running concurrently. Since we1985* don't map invalid userptr BOs, we rely on the next restore1986* worker to do the mapping1987*/1988mutex_lock(&mem->process_info->lock);19891990/* Lock notifier lock. If we find an invalid userptr BO, we can be1991* sure that the MMU notifier is no longer running1992* concurrently and the queues are actually stopped1993*/1994if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {1995mutex_lock(&mem->process_info->notifier_lock);1996is_invalid_userptr = !!mem->invalid;1997mutex_unlock(&mem->process_info->notifier_lock);1998}19992000mutex_lock(&mem->lock);20012002domain = mem->domain;2003bo_size = bo->tbo.base.size;20042005pr_debug("Map VA 0x%llx - 0x%llx to vm %p domain %s\n",2006mem->va,2007mem->va + bo_size * (1 + mem->aql_queue),2008avm, domain_string(domain));20092010if (!kfd_mem_is_attached(avm, mem)) {2011ret = kfd_mem_attach(adev, mem, avm, mem->aql_queue);2012if (ret)2013goto out;2014}20152016ret = reserve_bo_and_vm(mem, avm, &ctx);2017if (unlikely(ret))2018goto out;20192020/* Userptr can be marked as "not invalid", but not actually be2021* validated yet (still in the system domain). 
In that case2022* the queues are still stopped and we can leave mapping for2023* the next restore worker2024*/2025if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) &&2026bo->tbo.resource->mem_type == TTM_PL_SYSTEM)2027is_invalid_userptr = true;20282029ret = vm_validate_pt_pd_bos(avm, NULL);2030if (unlikely(ret))2031goto out_unreserve;20322033list_for_each_entry(entry, &mem->attachments, list) {2034if (entry->bo_va->base.vm != avm || entry->is_mapped)2035continue;20362037pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n",2038entry->va, entry->va + bo_size, entry);20392040ret = map_bo_to_gpuvm(mem, entry, ctx.sync,2041is_invalid_userptr);2042if (ret) {2043pr_err("Failed to map bo to gpuvm\n");2044goto out_unreserve;2045}20462047ret = vm_update_pds(avm, ctx.sync);2048if (ret) {2049pr_err("Failed to update page directories\n");2050goto out_unreserve;2051}20522053entry->is_mapped = true;2054mem->mapped_to_gpu_memory++;2055pr_debug("\t INC mapping count %d\n",2056mem->mapped_to_gpu_memory);2057}20582059ret = unreserve_bo_and_vms(&ctx, false, false);20602061goto out;20622063out_unreserve:2064unreserve_bo_and_vms(&ctx, false, false);2065out:2066mutex_unlock(&mem->process_info->lock);2067mutex_unlock(&mem->lock);2068return ret;2069}20702071int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)2072{2073struct kfd_mem_attachment *entry;2074struct amdgpu_vm *vm;2075int ret;20762077vm = drm_priv_to_vm(drm_priv);20782079mutex_lock(&mem->lock);20802081ret = amdgpu_bo_reserve(mem->bo, true);2082if (ret)2083goto out;20842085list_for_each_entry(entry, &mem->attachments, list) {2086if (entry->bo_va->base.vm != vm)2087continue;2088if (entry->bo_va->base.bo->tbo.ttm &&2089!entry->bo_va->base.bo->tbo.ttm->sg)2090continue;20912092kfd_mem_dmaunmap_attachment(mem, entry);2093}20942095amdgpu_bo_unreserve(mem->bo);2096out:2097mutex_unlock(&mem->lock);20982099return ret;2100}21012102int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(2103struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv)2104{2105struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);2106unsigned long bo_size = mem->bo->tbo.base.size;2107struct kfd_mem_attachment *entry;2108struct bo_vm_reservation_context ctx;2109int ret;21102111mutex_lock(&mem->lock);21122113ret = reserve_bo_and_cond_vms(mem, avm, BO_VM_MAPPED, &ctx);2114if (unlikely(ret))2115goto out;2116/* If no VMs were reserved, it means the BO wasn't actually mapped */2117if (ctx.n_vms == 0) {2118ret = -EINVAL;2119goto unreserve_out;2120}21212122ret = vm_validate_pt_pd_bos(avm, NULL);2123if (unlikely(ret))2124goto unreserve_out;21252126pr_debug("Unmap VA 0x%llx - 0x%llx from vm %p\n",2127mem->va,2128mem->va + bo_size * (1 + mem->aql_queue),2129avm);21302131list_for_each_entry(entry, &mem->attachments, list) {2132if (entry->bo_va->base.vm != avm || !entry->is_mapped)2133continue;21342135pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",2136entry->va, entry->va + bo_size, entry);21372138ret = unmap_bo_from_gpuvm(mem, entry, ctx.sync);2139if (ret)2140goto unreserve_out;21412142entry->is_mapped = false;21432144mem->mapped_to_gpu_memory--;2145pr_debug("\t DEC mapping count %d\n",2146mem->mapped_to_gpu_memory);2147}21482149unreserve_out:2150unreserve_bo_and_vms(&ctx, false, false);2151out:2152mutex_unlock(&mem->lock);2153return ret;2154}21552156int amdgpu_amdkfd_gpuvm_sync_memory(2157struct amdgpu_device *adev, struct kgd_mem *mem, bool intr)2158{2159struct amdgpu_sync sync;2160int ret;21612162amdgpu_sync_create(&sync);21632164mutex_lock(&mem->lock);2165amdgpu_sync_clone(&mem->sync, 
&sync);
	mutex_unlock(&mem->lock);

	ret = amdgpu_sync_wait(&sync, intr);
	amdgpu_sync_free(&sync);
	return ret;
}

/**
 * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference count
 * @bo: Buffer object to be mapped
 * @bo_gart: Returned BO reference
 *
 * Before return, the BO reference count is incremented. To release the
 * reference and unpin/unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
 */
int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart)
{
	int ret;

	ret = amdgpu_bo_reserve(bo, true);
	if (ret) {
		pr_err("Failed to reserve bo. ret %d\n", ret);
		goto err_reserve_bo_failed;
	}

	ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
	if (ret) {
		pr_err("Failed to pin bo. ret %d\n", ret);
		goto err_pin_bo_failed;
	}

	ret = amdgpu_ttm_alloc_gart(&bo->tbo);
	if (ret) {
		pr_err("Failed to bind bo to GART. ret %d\n", ret);
		goto err_map_bo_gart_failed;
	}

	amdgpu_amdkfd_remove_eviction_fence(
		bo, bo->vm_bo->vm->process_info->eviction_fence);

	amdgpu_bo_unreserve(bo);

	*bo_gart = amdgpu_bo_ref(bo);

	return 0;

err_map_bo_gart_failed:
	amdgpu_bo_unpin(bo);
err_pin_bo_failed:
	amdgpu_bo_unreserve(bo);
err_reserve_bo_failed:

	return ret;
}

/** amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel() - Map a GTT BO for kernel CPU access
 *
 * @mem: Buffer object to be mapped for CPU access
 * @kptr: [out] pointer in kernel CPU address space
 * @size: [out] size of the buffer
 *
 * Pins the BO and maps it for kernel CPU access. The eviction fence is removed
 * from the BO, since pinned BOs cannot be evicted. The BO must remain on the
 * validate_list, so the GPU mapping can be restored after a page table was
 * evicted.
 *
 * Return: 0 on success, error code on failure
 */
int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
					     void **kptr, uint64_t *size)
{
	int ret;
	struct amdgpu_bo *bo = mem->bo;

	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
		pr_err("userptr can't be mapped to kernel\n");
		return -EINVAL;
	}

	mutex_lock(&mem->process_info->lock);

	ret = amdgpu_bo_reserve(bo, true);
	if (ret) {
		pr_err("Failed to reserve bo. ret %d\n", ret);
		goto bo_reserve_failed;
	}

	ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
	if (ret) {
		pr_err("Failed to pin bo. ret %d\n", ret);
		goto pin_failed;
	}

	ret = amdgpu_bo_kmap(bo, kptr);
	if (ret) {
		pr_err("Failed to map bo to kernel. ret %d\n", ret);
		goto kmap_failed;
	}

	amdgpu_amdkfd_remove_eviction_fence(
		bo, mem->process_info->eviction_fence);

	if (size)
		*size = amdgpu_bo_size(bo);

	amdgpu_bo_unreserve(bo);

	mutex_unlock(&mem->process_info->lock);
	return 0;

kmap_failed:
	amdgpu_bo_unpin(bo);
pin_failed:
	amdgpu_bo_unreserve(bo);
bo_reserve_failed:
	mutex_unlock(&mem->process_info->lock);

	return ret;
}

/** amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel() - Unmap a GTT BO for kernel CPU access
 *
 * @mem: Buffer object to be unmapped for CPU access
 *
 * Removes the kernel CPU mapping and unpins the BO.
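 *
 * A minimal usage sketch (illustrative only, not lifted from an in-tree
 * caller), pairing this helper with amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel()
 * above. The kgd_mem handle "mem" is assumed to come from the usual KFD
 * allocation path; kptr and size are plain locals:
 *
 *	void *kptr;
 *	uint64_t size;
 *
 *	if (!amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(mem, &kptr, &size)) {
 *		memset(kptr, 0, size);
 *		amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(mem);
 *	}
 *
 * CPU access through kptr is only valid between the two calls.
 *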
It does not restore the2291* eviction fence, so this function should only be used for cleanup before the2292* BO is destroyed.2293*/2294void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem)2295{2296struct amdgpu_bo *bo = mem->bo;22972298(void)amdgpu_bo_reserve(bo, true);2299amdgpu_bo_kunmap(bo);2300amdgpu_bo_unpin(bo);2301amdgpu_bo_unreserve(bo);2302}23032304int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,2305struct kfd_vm_fault_info *mem)2306{2307if (atomic_read(&adev->gmc.vm_fault_info_updated) == 1) {2308*mem = *adev->gmc.vm_fault_info;2309mb(); /* make sure read happened */2310atomic_set(&adev->gmc.vm_fault_info_updated, 0);2311}2312return 0;2313}23142315static int import_obj_create(struct amdgpu_device *adev,2316struct dma_buf *dma_buf,2317struct drm_gem_object *obj,2318uint64_t va, void *drm_priv,2319struct kgd_mem **mem, uint64_t *size,2320uint64_t *mmap_offset)2321{2322struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);2323struct amdgpu_bo *bo;2324int ret;23252326bo = gem_to_amdgpu_bo(obj);2327if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |2328AMDGPU_GEM_DOMAIN_GTT)))2329/* Only VRAM and GTT BOs are supported */2330return -EINVAL;23312332*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);2333if (!*mem)2334return -ENOMEM;23352336ret = drm_vma_node_allow(&obj->vma_node, drm_priv);2337if (ret)2338goto err_free_mem;23392340if (size)2341*size = amdgpu_bo_size(bo);23422343if (mmap_offset)2344*mmap_offset = amdgpu_bo_mmap_offset(bo);23452346INIT_LIST_HEAD(&(*mem)->attachments);2347mutex_init(&(*mem)->lock);23482349(*mem)->alloc_flags =2350((bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?2351KFD_IOC_ALLOC_MEM_FLAGS_VRAM : KFD_IOC_ALLOC_MEM_FLAGS_GTT)2352| KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE2353| KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE;23542355get_dma_buf(dma_buf);2356(*mem)->dmabuf = dma_buf;2357(*mem)->bo = bo;2358(*mem)->va = va;2359(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) &&2360!adev->apu_prefer_gtt ?2361AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;23622363(*mem)->mapped_to_gpu_memory = 0;2364(*mem)->process_info = avm->process_info;2365add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, false);2366amdgpu_sync_create(&(*mem)->sync);2367(*mem)->is_imported = true;23682369mutex_lock(&avm->process_info->lock);2370if (avm->process_info->eviction_fence &&2371!dma_fence_is_signaled(&avm->process_info->eviction_fence->base))2372ret = amdgpu_amdkfd_bo_validate_and_fence(bo, (*mem)->domain,2373&avm->process_info->eviction_fence->base);2374mutex_unlock(&avm->process_info->lock);2375if (ret)2376goto err_remove_mem;23772378return 0;23792380err_remove_mem:2381remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);2382drm_vma_node_revoke(&obj->vma_node, drm_priv);2383err_free_mem:2384kfree(*mem);2385return ret;2386}23872388int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd,2389uint64_t va, void *drm_priv,2390struct kgd_mem **mem, uint64_t *size,2391uint64_t *mmap_offset)2392{2393struct drm_gem_object *obj;2394uint32_t handle;2395int ret;23962397ret = drm_gem_prime_fd_to_handle(&adev->ddev, adev->kfd.client.file, fd,2398&handle);2399if (ret)2400return ret;2401obj = drm_gem_object_lookup(adev->kfd.client.file, handle);2402if (!obj) {2403ret = -EINVAL;2404goto err_release_handle;2405}24062407ret = import_obj_create(adev, obj->dma_buf, obj, va, drm_priv, mem, size,2408mmap_offset);2409if (ret)2410goto err_put_obj;24112412(*mem)->gem_handle = handle;24132414return 
0;24152416err_put_obj:2417drm_gem_object_put(obj);2418err_release_handle:2419drm_gem_handle_delete(adev->kfd.client.file, handle);2420return ret;2421}24222423int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem,2424struct dma_buf **dma_buf)2425{2426int ret;24272428mutex_lock(&mem->lock);2429ret = kfd_mem_export_dmabuf(mem);2430if (ret)2431goto out;24322433get_dma_buf(mem->dmabuf);2434*dma_buf = mem->dmabuf;2435out:2436mutex_unlock(&mem->lock);2437return ret;2438}24392440/* Evict a userptr BO by stopping the queues if necessary2441*2442* Runs in MMU notifier, may be in RECLAIM_FS context. This means it2443* cannot do any memory allocations, and cannot take any locks that2444* are held elsewhere while allocating memory.2445*2446* It doesn't do anything to the BO itself. The real work happens in2447* restore, where we get updated page addresses. This function only2448* ensures that GPU access to the BO is stopped.2449*/2450int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,2451unsigned long cur_seq, struct kgd_mem *mem)2452{2453struct amdkfd_process_info *process_info = mem->process_info;2454int r = 0;24552456/* Do not process MMU notifications during CRIU restore until2457* KFD_CRIU_OP_RESUME IOCTL is received2458*/2459if (READ_ONCE(process_info->block_mmu_notifications))2460return 0;24612462mutex_lock(&process_info->notifier_lock);2463mmu_interval_set_seq(mni, cur_seq);24642465mem->invalid++;2466if (++process_info->evicted_bos == 1) {2467/* First eviction, stop the queues */2468r = kgd2kfd_quiesce_mm(mni->mm,2469KFD_QUEUE_EVICTION_TRIGGER_USERPTR);24702471if (r && r != -ESRCH)2472pr_err("Failed to quiesce KFD\n");24732474if (r != -ESRCH)2475queue_delayed_work(system_freezable_wq,2476&process_info->restore_userptr_work,2477msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));2478}2479mutex_unlock(&process_info->notifier_lock);24802481return r;2482}24832484/* Update invalid userptr BOs2485*2486* Moves invalidated (evicted) userptr BOs from userptr_valid_list to2487* userptr_inval_list and updates user pages for all BOs that have2488* been invalidated since their last update.2489*/2490static int update_invalid_user_pages(struct amdkfd_process_info *process_info,2491struct mm_struct *mm)2492{2493struct kgd_mem *mem, *tmp_mem;2494struct amdgpu_bo *bo;2495struct ttm_operation_ctx ctx = { false, false };2496uint32_t invalid;2497int ret = 0;24982499mutex_lock(&process_info->notifier_lock);25002501/* Move all invalidated BOs to the userptr_inval_list */2502list_for_each_entry_safe(mem, tmp_mem,2503&process_info->userptr_valid_list,2504validate_list)2505if (mem->invalid)2506list_move_tail(&mem->validate_list,2507&process_info->userptr_inval_list);25082509/* Go through userptr_inval_list and update any invalid user_pages */2510list_for_each_entry(mem, &process_info->userptr_inval_list,2511validate_list) {2512invalid = mem->invalid;2513if (!invalid)2514/* BO hasn't been invalidated since the last2515* revalidation attempt. 
Keep its page list.2516*/2517continue;25182519bo = mem->bo;25202521amdgpu_ttm_tt_discard_user_pages(bo->tbo.ttm, mem->range);2522mem->range = NULL;25232524/* BO reservations and getting user pages (hmm_range_fault)2525* must happen outside the notifier lock2526*/2527mutex_unlock(&process_info->notifier_lock);25282529/* Move the BO to system (CPU) domain if necessary to unmap2530* and free the SG table2531*/2532if (bo->tbo.resource->mem_type != TTM_PL_SYSTEM) {2533if (amdgpu_bo_reserve(bo, true))2534return -EAGAIN;2535amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);2536ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);2537amdgpu_bo_unreserve(bo);2538if (ret) {2539pr_err("%s: Failed to invalidate userptr BO\n",2540__func__);2541return -EAGAIN;2542}2543}25442545/* Get updated user pages */2546ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,2547&mem->range);2548if (ret) {2549pr_debug("Failed %d to get user pages\n", ret);25502551/* Return -EFAULT bad address error as success. It will2552* fail later with a VM fault if the GPU tries to access2553* it. Better than hanging indefinitely with stalled2554* user mode queues.2555*2556* Return other error -EBUSY or -ENOMEM to retry restore2557*/2558if (ret != -EFAULT)2559return ret;25602561/* If applications unmap memory before destroying the userptr2562* from the KFD, trigger a segmentation fault in VM debug mode.2563*/2564if (amdgpu_ttm_adev(bo->tbo.bdev)->debug_vm_userptr) {2565pr_err("Pid %d unmapped memory before destroying userptr at GPU addr 0x%llx\n",2566pid_nr(process_info->pid), mem->va);25672568// Send GPU VM fault to user space2569kfd_signal_vm_fault_event_with_userptr(kfd_lookup_process_by_pid(process_info->pid),2570mem->va);2571}25722573ret = 0;2574}25752576mutex_lock(&process_info->notifier_lock);25772578/* Mark the BO as valid unless it was invalidated2579* again concurrently.2580*/2581if (mem->invalid != invalid) {2582ret = -EAGAIN;2583goto unlock_out;2584}2585/* set mem valid if mem has hmm range associated */2586if (mem->range)2587mem->invalid = 0;2588}25892590unlock_out:2591mutex_unlock(&process_info->notifier_lock);25922593return ret;2594}25952596/* Validate invalid userptr BOs2597*2598* Validates BOs on the userptr_inval_list. 
Also updates GPUVM page tables
 * with new page addresses and waits for the page table updates to complete.
 */
static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
{
	struct ttm_operation_ctx ctx = { false, false };
	struct amdgpu_sync sync;
	struct drm_exec exec;

	struct amdgpu_vm *peer_vm;
	struct kgd_mem *mem, *tmp_mem;
	struct amdgpu_bo *bo;
	int ret;

	amdgpu_sync_create(&sync);

	drm_exec_init(&exec, 0, 0);
	/* Reserve all BOs and page tables for validation */
	drm_exec_until_all_locked(&exec) {
		/* Reserve all the page directories */
		list_for_each_entry(peer_vm, &process_info->vm_list_head,
				    vm_list_node) {
			ret = amdgpu_vm_lock_pd(peer_vm, &exec, 2);
			drm_exec_retry_on_contention(&exec);
			if (unlikely(ret))
				goto unreserve_out;
		}

		/* Reserve the userptr_inval_list entries to resv_list */
		list_for_each_entry(mem, &process_info->userptr_inval_list,
				    validate_list) {
			struct drm_gem_object *gobj;

			gobj = &mem->bo->tbo.base;
			ret = drm_exec_prepare_obj(&exec, gobj, 1);
			drm_exec_retry_on_contention(&exec);
			if (unlikely(ret))
				goto unreserve_out;
		}
	}

	ret = process_validate_vms(process_info, NULL);
	if (ret)
		goto unreserve_out;

	/* Validate BOs and update GPUVM page tables */
	list_for_each_entry_safe(mem, tmp_mem,
				 &process_info->userptr_inval_list,
				 validate_list) {
		struct kfd_mem_attachment *attachment;

		bo = mem->bo;

		/* Validate the BO if we got user pages */
		if (bo->tbo.ttm->pages[0]) {
			amdgpu_bo_placement_from_domain(bo, mem->domain);
			ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
			if (ret) {
				pr_err("%s: failed to validate BO\n", __func__);
				goto unreserve_out;
			}
		}

		/* Update mapping. If the BO was not validated
		 * (because we couldn't get user pages), this will
		 * clear the page table entries, which will result in
		 * VM faults if the GPU tries to access the invalid
		 * memory.
		 */
		list_for_each_entry(attachment, &mem->attachments, list) {
			if (!attachment->is_mapped)
				continue;

			kfd_mem_dmaunmap_attachment(mem, attachment);
			ret = update_gpuvm_pte(mem, attachment, &sync);
			if (ret) {
				pr_err("%s: update PTE failed\n", __func__);
				/* make sure this gets validated again */
				mutex_lock(&process_info->notifier_lock);
				mem->invalid++;
				mutex_unlock(&process_info->notifier_lock);
				goto unreserve_out;
			}
		}
	}

	/* Update page directories */
	ret = process_update_pds(process_info, &sync);

unreserve_out:
	drm_exec_fini(&exec);
	amdgpu_sync_wait(&sync, false);
	amdgpu_sync_free(&sync);

	return ret;
}

/* Confirm that all user pages are valid while holding the notifier lock
 *
 * Moves valid BOs from the userptr_inval_list back to the userptr_valid_list.
 */
static int confirm_valid_user_pages_locked(struct amdkfd_process_info *process_info)
{
	struct kgd_mem *mem, *tmp_mem;
	int ret = 0;

	list_for_each_entry_safe(mem, tmp_mem,
				 &process_info->userptr_inval_list,
				 validate_list) {
		bool valid;

		/* keep mem without hmm range at userptr_inval_list */
		if (!mem->range)
			continue;

		/* Only check mem with hmm range associated */
		valid = amdgpu_ttm_tt_get_user_pages_done(
					mem->bo->tbo.ttm, mem->range);

		mem->range = NULL;
		if (!valid) {
			WARN(!mem->invalid, "Invalid BO not marked invalid");
			ret = -EAGAIN;
			continue;
		}

		if (mem->invalid) {
			WARN(1, "Valid BO is marked invalid");
			ret = -EAGAIN;
			continue;
		}

		list_move_tail(&mem->validate_list,
			       &process_info->userptr_valid_list);
	}

	return ret;
}

/* Worker callback to restore evicted userptr BOs
 *
 * Tries to update and validate all userptr BOs. If successful and no
 * concurrent evictions happened, the queues are restarted. Otherwise,
 * reschedule for another attempt later.
 */
static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct amdkfd_process_info *process_info =
		container_of(dwork, struct amdkfd_process_info,
			     restore_userptr_work);
	struct task_struct *usertask;
	struct mm_struct *mm;
	uint32_t evicted_bos;

	mutex_lock(&process_info->notifier_lock);
	evicted_bos = process_info->evicted_bos;
	mutex_unlock(&process_info->notifier_lock);
	if (!evicted_bos)
		return;

	/* Reference task and mm in case of concurrent process termination */
	usertask = get_pid_task(process_info->pid, PIDTYPE_PID);
	if (!usertask)
		return;
	mm = get_task_mm(usertask);
	if (!mm) {
		put_task_struct(usertask);
		return;
	}

	mutex_lock(&process_info->lock);

	if (update_invalid_user_pages(process_info, mm))
		goto unlock_out;
	/* userptr_inval_list can be empty if all evicted userptr BOs
	 * have been freed. In that case there is nothing to validate
	 * and we can just restart the queues.
	 */
	if (!list_empty(&process_info->userptr_inval_list)) {
		if (validate_invalid_user_pages(process_info))
			goto unlock_out;
	}
	/* Final check for concurrent eviction and atomic update. If
	 * another eviction happens after successful update, it will
	 * be a first eviction that calls quiesce_mm.
The eviction2784* reference counting inside KFD will handle this case.2785*/2786mutex_lock(&process_info->notifier_lock);2787if (process_info->evicted_bos != evicted_bos)2788goto unlock_notifier_out;27892790if (confirm_valid_user_pages_locked(process_info)) {2791WARN(1, "User pages unexpectedly invalid");2792goto unlock_notifier_out;2793}27942795process_info->evicted_bos = evicted_bos = 0;27962797if (kgd2kfd_resume_mm(mm)) {2798pr_err("%s: Failed to resume KFD\n", __func__);2799/* No recovery from this failure. Probably the CP is2800* hanging. No point trying again.2801*/2802}28032804unlock_notifier_out:2805mutex_unlock(&process_info->notifier_lock);2806unlock_out:2807mutex_unlock(&process_info->lock);28082809/* If validation failed, reschedule another attempt */2810if (evicted_bos) {2811queue_delayed_work(system_freezable_wq,2812&process_info->restore_userptr_work,2813msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));28142815kfd_smi_event_queue_restore_rescheduled(mm);2816}2817mmput(mm);2818put_task_struct(usertask);2819}28202821static void replace_eviction_fence(struct dma_fence __rcu **ef,2822struct dma_fence *new_ef)2823{2824struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true2825/* protected by process_info->lock */);28262827/* If we're replacing an unsignaled eviction fence, that fence will2828* never be signaled, and if anyone is still waiting on that fence,2829* they will hang forever. This should never happen. We should only2830* replace the fence in restore_work that only gets scheduled after2831* eviction work signaled the fence.2832*/2833WARN_ONCE(!dma_fence_is_signaled(old_ef),2834"Replacing unsignaled eviction fence");2835dma_fence_put(old_ef);2836}28372838/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given2839* KFD process identified by process_info2840*2841* @process_info: amdkfd_process_info of the KFD process2842*2843* After memory eviction, restore thread calls this function. The function2844* should be called when the Process is still valid. BO restore involves -2845*2846* 1. Release old eviction fence and create new one2847* 2. Get two copies of PD BO list from all the VMs. Keep one copy as pd_list.2848* 3 Use the second PD list and kfd_bo_list to create a list (ctx.list) of2849* BOs that need to be reserved.2850* 4. Reserve all the BOs2851* 5. Validate of PD and PT BOs.2852* 6. Validate all KFD BOs using kfd_bo_list and Map them and add new fence2853* 7. Add fence to all PD and PT BOs.2854* 8. Unreserve all BOs2855*/2856int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu **ef)2857{2858struct amdkfd_process_info *process_info = info;2859struct amdgpu_vm *peer_vm;2860struct kgd_mem *mem;2861struct list_head duplicate_save;2862struct amdgpu_sync sync_obj;2863unsigned long failed_size = 0;2864unsigned long total_size = 0;2865struct drm_exec exec;2866int ret;28672868INIT_LIST_HEAD(&duplicate_save);28692870mutex_lock(&process_info->lock);28712872drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);2873drm_exec_until_all_locked(&exec) {2874list_for_each_entry(peer_vm, &process_info->vm_list_head,2875vm_list_node) {2876ret = amdgpu_vm_lock_pd(peer_vm, &exec, 2);2877drm_exec_retry_on_contention(&exec);2878if (unlikely(ret)) {2879pr_err("Locking VM PD failed, ret: %d\n", ret);2880goto ttm_reserve_fail;2881}2882}28832884/* Reserve all BOs and page tables/directory. 
Add all BOs from2885* kfd_bo_list to ctx.list2886*/2887list_for_each_entry(mem, &process_info->kfd_bo_list,2888validate_list) {2889struct drm_gem_object *gobj;28902891gobj = &mem->bo->tbo.base;2892ret = drm_exec_prepare_obj(&exec, gobj, 1);2893drm_exec_retry_on_contention(&exec);2894if (unlikely(ret)) {2895pr_err("drm_exec_prepare_obj failed, ret: %d\n", ret);2896goto ttm_reserve_fail;2897}2898}2899}29002901amdgpu_sync_create(&sync_obj);29022903/* Validate BOs managed by KFD */2904list_for_each_entry(mem, &process_info->kfd_bo_list,2905validate_list) {29062907struct amdgpu_bo *bo = mem->bo;2908uint32_t domain = mem->domain;2909struct dma_resv_iter cursor;2910struct dma_fence *fence;29112912total_size += amdgpu_bo_size(bo);29132914ret = amdgpu_amdkfd_bo_validate(bo, domain, false);2915if (ret) {2916pr_debug("Memory eviction: Validate BOs failed\n");2917failed_size += amdgpu_bo_size(bo);2918ret = amdgpu_amdkfd_bo_validate(bo,2919AMDGPU_GEM_DOMAIN_GTT, false);2920if (ret) {2921pr_debug("Memory eviction: Try again\n");2922goto validate_map_fail;2923}2924}2925dma_resv_for_each_fence(&cursor, bo->tbo.base.resv,2926DMA_RESV_USAGE_KERNEL, fence) {2927ret = amdgpu_sync_fence(&sync_obj, fence, GFP_KERNEL);2928if (ret) {2929pr_debug("Memory eviction: Sync BO fence failed. Try again\n");2930goto validate_map_fail;2931}2932}2933}29342935if (failed_size)2936pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size);29372938/* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO2939* validations above would invalidate DMABuf imports again.2940*/2941ret = process_validate_vms(process_info, &exec.ticket);2942if (ret) {2943pr_debug("Validating VMs failed, ret: %d\n", ret);2944goto validate_map_fail;2945}29462947/* Update mappings managed by KFD. */2948list_for_each_entry(mem, &process_info->kfd_bo_list,2949validate_list) {2950struct kfd_mem_attachment *attachment;29512952list_for_each_entry(attachment, &mem->attachments, list) {2953if (!attachment->is_mapped)2954continue;29552956kfd_mem_dmaunmap_attachment(mem, attachment);2957ret = update_gpuvm_pte(mem, attachment, &sync_obj);2958if (ret) {2959pr_debug("Memory eviction: update PTE failed. Try again\n");2960goto validate_map_fail;2961}2962}2963}29642965/* Update mappings not managed by KFD */2966list_for_each_entry(peer_vm, &process_info->vm_list_head,2967vm_list_node) {2968struct amdgpu_device *adev = amdgpu_ttm_adev(2969peer_vm->root.bo->tbo.bdev);29702971ret = amdgpu_vm_handle_moved(adev, peer_vm, &exec.ticket);2972if (ret) {2973pr_debug("Memory eviction: handle moved failed. Try again\n");2974goto validate_map_fail;2975}2976}29772978/* Update page directories */2979ret = process_update_pds(process_info, &sync_obj);2980if (ret) {2981pr_debug("Memory eviction: update PDs failed. Try again\n");2982goto validate_map_fail;2983}29842985/* Sync with fences on all the page tables. They implicitly depend on any2986* move fences from amdgpu_vm_handle_moved above.2987*/2988ret = process_sync_pds_resv(process_info, &sync_obj);2989if (ret) {2990pr_debug("Memory eviction: Failed to sync to PD BO moving fence. Try again\n");2991goto validate_map_fail;2992}29932994/* Wait for validate and PT updates to finish */2995amdgpu_sync_wait(&sync_obj, false);29962997/* The old eviction fence may be unsignaled if restore happens2998* after a GPU reset or suspend/resume. Keep the old fence in that2999* case. Otherwise release the old eviction fence and create new3000* one, because fence only goes from unsignaled to signaled once3001* and cannot be reused. 
Use context and mm from the old fence.3002*3003* If an old eviction fence signals after this check, that's OK.3004* Anyone signaling an eviction fence must stop the queues first3005* and schedule another restore worker.3006*/3007if (dma_fence_is_signaled(&process_info->eviction_fence->base)) {3008struct amdgpu_amdkfd_fence *new_fence =3009amdgpu_amdkfd_fence_create(3010process_info->eviction_fence->base.context,3011process_info->eviction_fence->mm,3012NULL);30133014if (!new_fence) {3015pr_err("Failed to create eviction fence\n");3016ret = -ENOMEM;3017goto validate_map_fail;3018}3019dma_fence_put(&process_info->eviction_fence->base);3020process_info->eviction_fence = new_fence;3021replace_eviction_fence(ef, dma_fence_get(&new_fence->base));3022} else {3023WARN_ONCE(*ef != &process_info->eviction_fence->base,3024"KFD eviction fence doesn't match KGD process_info");3025}30263027/* Attach new eviction fence to all BOs except pinned ones */3028list_for_each_entry(mem, &process_info->kfd_bo_list, validate_list) {3029if (mem->bo->tbo.pin_count)3030continue;30313032dma_resv_add_fence(mem->bo->tbo.base.resv,3033&process_info->eviction_fence->base,3034DMA_RESV_USAGE_BOOKKEEP);3035}3036/* Attach eviction fence to PD / PT BOs and DMABuf imports */3037list_for_each_entry(peer_vm, &process_info->vm_list_head,3038vm_list_node) {3039struct amdgpu_bo *bo = peer_vm->root.bo;30403041dma_resv_add_fence(bo->tbo.base.resv,3042&process_info->eviction_fence->base,3043DMA_RESV_USAGE_BOOKKEEP);3044}30453046validate_map_fail:3047amdgpu_sync_free(&sync_obj);3048ttm_reserve_fail:3049drm_exec_fini(&exec);3050mutex_unlock(&process_info->lock);3051return ret;3052}30533054int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem)3055{3056struct amdkfd_process_info *process_info = (struct amdkfd_process_info *)info;3057struct amdgpu_bo *gws_bo = (struct amdgpu_bo *)gws;3058int ret;30593060if (!info || !gws)3061return -EINVAL;30623063*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);3064if (!*mem)3065return -ENOMEM;30663067mutex_init(&(*mem)->lock);3068INIT_LIST_HEAD(&(*mem)->attachments);3069(*mem)->bo = amdgpu_bo_ref(gws_bo);3070(*mem)->domain = AMDGPU_GEM_DOMAIN_GWS;3071(*mem)->process_info = process_info;3072add_kgd_mem_to_kfd_bo_list(*mem, process_info, false);3073amdgpu_sync_create(&(*mem)->sync);307430753076/* Validate gws bo the first time it is added to process */3077mutex_lock(&(*mem)->process_info->lock);3078ret = amdgpu_bo_reserve(gws_bo, false);3079if (unlikely(ret)) {3080pr_err("Reserve gws bo failed %d\n", ret);3081goto bo_reservation_failure;3082}30833084ret = amdgpu_amdkfd_bo_validate(gws_bo, AMDGPU_GEM_DOMAIN_GWS, true);3085if (ret) {3086pr_err("GWS BO validate failed %d\n", ret);3087goto bo_validation_failure;3088}3089/* GWS resource is shared b/t amdgpu and amdkfd3090* Add process eviction fence to bo so they can3091* evict each other.3092*/3093ret = dma_resv_reserve_fences(gws_bo->tbo.base.resv, 1);3094if (ret)3095goto reserve_shared_fail;3096dma_resv_add_fence(gws_bo->tbo.base.resv,3097&process_info->eviction_fence->base,3098DMA_RESV_USAGE_BOOKKEEP);3099amdgpu_bo_unreserve(gws_bo);3100mutex_unlock(&(*mem)->process_info->lock);31013102return ret;31033104reserve_shared_fail:3105bo_validation_failure:3106amdgpu_bo_unreserve(gws_bo);3107bo_reservation_failure:3108mutex_unlock(&(*mem)->process_info->lock);3109amdgpu_sync_free(&(*mem)->sync);3110remove_kgd_mem_from_kfd_bo_list(*mem, process_info);3111amdgpu_bo_unref(&gws_bo);3112mutex_destroy(&(*mem)->lock);3113kfree(*mem);3114*mem = 
NULL;
	return ret;
}

int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem)
{
	int ret;
	struct amdkfd_process_info *process_info = (struct amdkfd_process_info *)info;
	struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
	struct amdgpu_bo *gws_bo = kgd_mem->bo;

	/* Remove BO from process's validate list so restore worker won't touch
	 * it anymore
	 */
	remove_kgd_mem_from_kfd_bo_list(kgd_mem, process_info);

	ret = amdgpu_bo_reserve(gws_bo, false);
	if (unlikely(ret)) {
		pr_err("Reserve gws bo failed %d\n", ret);
		//TODO add BO back to validate_list?
		return ret;
	}
	amdgpu_amdkfd_remove_eviction_fence(gws_bo,
			process_info->eviction_fence);
	amdgpu_bo_unreserve(gws_bo);
	amdgpu_sync_free(&kgd_mem->sync);
	amdgpu_bo_unref(&gws_bo);
	mutex_destroy(&kgd_mem->lock);
	kfree(mem);
	return 0;
}

/* Returns GPU-specific tiling mode information */
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
				  struct tile_config *config)
{
	config->gb_addr_config = adev->gfx.config.gb_addr_config;
	config->tile_config_ptr = adev->gfx.config.tile_mode_array;
	config->num_tile_configs =
			ARRAY_SIZE(adev->gfx.config.tile_mode_array);
	config->macro_tile_config_ptr =
			adev->gfx.config.macrotile_mode_array;
	config->num_macro_tile_configs =
			ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);

	/* Those values are not set from GFX9 onwards */
	config->num_banks = adev->gfx.config.num_banks;
	config->num_ranks = adev->gfx.config.num_ranks;

	return 0;
}

bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem)
{
	struct amdgpu_vm *vm = drm_priv_to_vm(drm_priv);
	struct kfd_mem_attachment *entry;

	list_for_each_entry(entry, &mem->attachments, list) {
		if (entry->is_mapped && entry->bo_va->base.vm == vm)
			return true;
	}
	return false;
}

#if defined(CONFIG_DEBUG_FS)

int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data)
{
	spin_lock(&kfd_mem_limit.mem_limit_lock);
	seq_printf(m, "System mem used %lldM out of %lluM\n",
		   (kfd_mem_limit.system_mem_used >> 20),
		   (kfd_mem_limit.max_system_mem_limit >> 20));
	seq_printf(m, "TTM mem used %lldM out of %lluM\n",
		   (kfd_mem_limit.ttm_mem_used >> 20),
		   (kfd_mem_limit.max_ttm_mem_limit >> 20));
	spin_unlock(&kfd_mem_limit.mem_limit_lock);

	return 0;
}

#endif
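
/*
 * Illustrative pairing of the GWS helpers above (a hedged sketch, not code
 * taken from the KFD sources). "pinfo" stands for the opaque
 * amdkfd_process_info pointer that KFD already holds, and "gws_bo" for the
 * device's GWS buffer object; both names are placeholders:
 *
 *	struct kgd_mem *gws_mem;
 *
 *	if (!amdgpu_amdkfd_add_gws_to_process(pinfo, gws_bo, &gws_mem)) {
 *		queues may now be created with GWS access, and once the
 *		process is done:
 *		amdgpu_amdkfd_remove_gws_from_process(pinfo, gws_mem);
 *	}
 */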
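
/*
 * Hedged usage sketch for amdgpu_amdkfd_bo_mapped_to_dev() above: a
 * hypothetical caller that only issues a heavyweight TLB flush on devices
 * where the BO is actually mapped. "pdd" (a kfd_process_device) and the
 * flush helper are assumed from the wider KFD code, not defined in this file:
 *
 *	if (amdgpu_amdkfd_bo_mapped_to_dev(pdd->drm_priv, mem))
 *		kfd_flush_tlb(pdd, TLB_FLUSH_HEAVYWEIGHT);
 */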