Path: blob/master/drivers/accel/habanalabs/common/memory.c
// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2022 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include <uapi/drm/habanalabs_accel.h>
#include "habanalabs.h"
#include "../include/hw_ip/mmu/mmu_general.h"

#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/pci-p2pdma.h>

MODULE_IMPORT_NS("DMA_BUF");

#define HL_MMU_DEBUG	0

/* use small pages for supporting non-pow2 (32M/40M/48M) DRAM phys page sizes */
#define DRAM_POOL_PAGE_SIZE	SZ_8M

#define MEM_HANDLE_INVALID	ULONG_MAX

static int allocate_timestamps_buffers(struct hl_fpriv *hpriv,
			struct hl_mem_in *args, u64 *handle);

static int set_alloc_page_size(struct hl_device *hdev, struct hl_mem_in *args, u32 *page_size)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u64 psize;

	/*
	 * for an ASIC that supports setting the allocation page size by the user, we use the
	 * user's choice only if it is not 0 (as 0 means taking the default page size)
	 */
	if (prop->supports_user_set_page_size && args->alloc.page_size) {
		psize = args->alloc.page_size;

		if (!is_power_of_2(psize)) {
			dev_err(hdev->dev, "user page size (%#llx) is not power of 2\n", psize);
			return -EINVAL;
		}
	} else {
		psize = prop->device_mem_alloc_default_page_size;
	}

	*page_size = psize;

	return 0;
}

/*
 * The va ranges in context object contain a list with the available chunks of
 * device virtual memory.
 * There is one range for host allocations and one for DRAM allocations.
 *
 * On initialization each range contains one chunk of all of its available
 * virtual range which is a half of the total device virtual range.
 *
 * On each mapping of physical pages, a suitable virtual range chunk (with a
 * minimum size) is selected from the list. If the chunk size equals the
 * requested size, the chunk is returned. Otherwise, the chunk is split into
 * two chunks - one to return as result and a remainder to stay in the list.
 *
 * On each unmapping of a virtual address, the relevant virtual chunk is
 * returned to the list. The chunk is added to the list and if its edges match
 * the edges of the adjacent chunks (means a contiguous chunk can be created),
 * the chunks are merged.
 *
 * On finish, the list is checked to have only one chunk of all the relevant
 * virtual range (which is a half of the device total virtual range).
 * If not (means not all mappings were unmapped), a warning is printed.
 */
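
/*
 * Illustrative example (hypothetical numbers, not tied to any specific ASIC):
 * suppose a range initially holds one chunk [0x1000000, 0x1ffffff]. Mapping
 * 0x200000 bytes at the start splits it into the returned block
 * [0x1000000, 0x11fffff] and a remainder chunk [0x1200000, 0x1ffffff] that
 * stays in the list. Unmapping the block later re-adds [0x1000000, 0x11fffff];
 * since its end + 1 equals the start of the remainder chunk, the two are
 * merged back into the single chunk [0x1000000, 0x1ffffff].
 */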

/*
 * alloc_device_memory() - allocate device memory.
 * @ctx: pointer to the context structure.
 * @args: host parameters containing the requested size.
 * @ret_handle: result handle.
 *
 * This function does the following:
 * - Allocate the requested size rounded up to 'dram_page_size' pages.
 * - Return unique handle for later map/unmap/free.
 */
static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
				u32 *ret_handle)
{
	struct hl_device *hdev = ctx->hdev;
	struct hl_vm *vm = &hdev->vm;
	struct hl_vm_phys_pg_pack *phys_pg_pack;
	u64 paddr = 0, total_size, num_pgs, i;
	u32 num_curr_pgs, page_size;
	bool contiguous;
	int handle, rc;

	num_curr_pgs = 0;

	rc = set_alloc_page_size(hdev, args, &page_size);
	if (rc)
		return rc;

	num_pgs = DIV_ROUND_UP_ULL(args->alloc.mem_size, page_size);
	total_size = num_pgs * page_size;

	if (!total_size) {
		dev_err(hdev->dev, "Cannot allocate 0 bytes\n");
		return -EINVAL;
	}

	contiguous = args->flags & HL_MEM_CONTIGUOUS;

	if (contiguous) {
		if (is_power_of_2(page_size))
			paddr = (uintptr_t) gen_pool_dma_alloc_align(vm->dram_pg_pool,
								total_size, NULL, page_size);
		else
			paddr = gen_pool_alloc(vm->dram_pg_pool, total_size);
		if (!paddr) {
			dev_err(hdev->dev,
				"Cannot allocate %llu contiguous pages with total size of %llu\n",
				num_pgs, total_size);
			return -ENOMEM;
		}
	}

	phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
	if (!phys_pg_pack) {
		rc = -ENOMEM;
		goto pages_pack_err;
	}

	phys_pg_pack->vm_type = VM_TYPE_PHYS_PACK;
	phys_pg_pack->asid = ctx->asid;
	phys_pg_pack->npages = num_pgs;
	phys_pg_pack->page_size = page_size;
	phys_pg_pack->total_size = total_size;
	phys_pg_pack->flags = args->flags;
	phys_pg_pack->contiguous = contiguous;

	phys_pg_pack->pages = kvmalloc_array(num_pgs, sizeof(u64), GFP_KERNEL);
	if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
		rc = -ENOMEM;
		goto pages_arr_err;
	}

	if (phys_pg_pack->contiguous) {
		for (i = 0 ; i < num_pgs ; i++)
			phys_pg_pack->pages[i] = paddr + i * page_size;
	} else {
		for (i = 0 ; i < num_pgs ; i++) {
			if (is_power_of_2(page_size))
				phys_pg_pack->pages[i] =
					(uintptr_t)gen_pool_dma_alloc_align(vm->dram_pg_pool,
									page_size, NULL,
									page_size);
			else
				phys_pg_pack->pages[i] = gen_pool_alloc(vm->dram_pg_pool,
									page_size);

			if (!phys_pg_pack->pages[i]) {
				dev_err(hdev->dev,
					"Cannot allocate device memory (out of memory)\n");
				rc = -ENOMEM;
				goto page_err;
			}

			num_curr_pgs++;
		}
	}

	spin_lock(&vm->idr_lock);
	handle = idr_alloc(&vm->phys_pg_pack_handles, phys_pg_pack, 1, 0,
				GFP_ATOMIC);
	spin_unlock(&vm->idr_lock);

	if (handle < 0) {
		dev_err(hdev->dev, "Failed to get handle for page\n");
		rc = -EFAULT;
		goto idr_err;
	}

	for (i = 0 ; i < num_pgs ; i++)
		kref_get(&vm->dram_pg_pool_refcount);

	phys_pg_pack->handle = handle;

	atomic64_add(phys_pg_pack->total_size, &ctx->dram_phys_mem);
	atomic64_add(phys_pg_pack->total_size, &hdev->dram_used_mem);

	*ret_handle = handle;

	return 0;

idr_err:
page_err:
	if (!phys_pg_pack->contiguous)
		for (i = 0 ; i < num_curr_pgs ; i++)
			gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[i],
					page_size);

	kvfree(phys_pg_pack->pages);
pages_arr_err:
	kfree(phys_pg_pack);
pages_pack_err:
	if (contiguous)
		gen_pool_free(vm->dram_pg_pool, paddr, total_size);

	return rc;
}
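
/*
 * Illustrative example (hypothetical values): with a default DRAM allocation
 * page size of 2MB, a request of args->alloc.mem_size = 5MB gives
 * num_pgs = DIV_ROUND_UP_ULL(5MB, 2MB) = 3 and total_size = 3 * 2MB = 6MB,
 * i.e. the allocation is rounded up to whole device pages before pages are
 * taken from dram_pg_pool.
 */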

/**
 * dma_map_host_va() - DMA mapping of the given host virtual address.
 * @hdev: habanalabs device structure.
 * @addr: the host virtual address of the memory area.
 * @size: the size of the memory area.
 * @p_userptr: pointer to result userptr structure.
 *
 * This function does the following:
 * - Allocate userptr structure.
 * - Pin the given host memory using the userptr structure.
 * - Perform DMA mapping to have the DMA addresses of the pages.
 */
static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size,
				struct hl_userptr **p_userptr)
{
	struct hl_userptr *userptr;
	int rc;

	userptr = kzalloc(sizeof(*userptr), GFP_KERNEL);
	if (!userptr) {
		rc = -ENOMEM;
		goto userptr_err;
	}

	rc = hl_pin_host_memory(hdev, addr, size, userptr);
	if (rc)
		goto pin_err;

	userptr->dma_mapped = true;
	userptr->dir = DMA_BIDIRECTIONAL;
	userptr->vm_type = VM_TYPE_USERPTR;

	*p_userptr = userptr;

	rc = hl_dma_map_sgtable(hdev, userptr->sgt, DMA_BIDIRECTIONAL);
	if (rc) {
		dev_err(hdev->dev, "failed to map sgt with DMA region\n");
		goto dma_map_err;
	}

	return 0;

dma_map_err:
	hl_unpin_host_memory(hdev, userptr);
pin_err:
	kfree(userptr);
userptr_err:

	return rc;
}

/**
 * dma_unmap_host_va() - DMA unmapping of the given host virtual address.
 * @hdev: habanalabs device structure.
 * @userptr: userptr to free.
 *
 * This function does the following:
 * - Unpins the physical pages.
 * - Frees the userptr structure.
 */
static void dma_unmap_host_va(struct hl_device *hdev,
				struct hl_userptr *userptr)
{
	hl_unpin_host_memory(hdev, userptr);
	kfree(userptr);
}

/**
 * dram_pg_pool_do_release() - free DRAM pages pool
 * @ref: pointer to reference object.
 *
 * This function does the following:
 * - Frees the idr structure of physical pages handles.
 * - Frees the generic pool of DRAM physical pages.
 */
static void dram_pg_pool_do_release(struct kref *ref)
{
	struct hl_vm *vm = container_of(ref, struct hl_vm,
			dram_pg_pool_refcount);

	/*
	 * free the idr here as only here we know for sure that there are no
	 * allocated physical pages and hence there are no handles in use
	 */
	idr_destroy(&vm->phys_pg_pack_handles);
	gen_pool_destroy(vm->dram_pg_pool);
}

/**
 * free_phys_pg_pack() - free physical page pack.
 * @hdev: habanalabs device structure.
 * @phys_pg_pack: physical page pack to free.
 *
 * This function does the following:
 * - For DRAM memory only
 *   - iterate over the pack, free each physical block structure by
 *     returning it to the general pool.
 * - Free the hl_vm_phys_pg_pack structure.
 */
static void free_phys_pg_pack(struct hl_device *hdev,
		struct hl_vm_phys_pg_pack *phys_pg_pack)
{
	struct hl_vm *vm = &hdev->vm;
	u64 i;

	if (phys_pg_pack->created_from_userptr)
		goto end;

	if (phys_pg_pack->contiguous) {
		gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[0],
			phys_pg_pack->total_size);

		for (i = 0; i < phys_pg_pack->npages ; i++)
			kref_put(&vm->dram_pg_pool_refcount,
				dram_pg_pool_do_release);
	} else {
		for (i = 0 ; i < phys_pg_pack->npages ; i++) {
			gen_pool_free(vm->dram_pg_pool,
				phys_pg_pack->pages[i],
				phys_pg_pack->page_size);
			kref_put(&vm->dram_pg_pool_refcount,
				dram_pg_pool_do_release);
		}
	}

end:
	kvfree(phys_pg_pack->pages);
	kfree(phys_pg_pack);

	return;
}

/**
 * free_device_memory() - free device memory.
 * @ctx: pointer to the context structure.
 * @args: host parameters containing the handle of the device memory to free.
 *
 * This function does the following:
 * - Free the device memory related to the given handle.
 */
static int free_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args)
{
	struct hl_device *hdev = ctx->hdev;
	struct hl_vm *vm = &hdev->vm;
	struct hl_vm_phys_pg_pack *phys_pg_pack;
	u32 handle = args->free.handle;

	spin_lock(&vm->idr_lock);
	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
	if (!phys_pg_pack) {
		spin_unlock(&vm->idr_lock);
		dev_err(hdev->dev, "free device memory failed, no match for handle %u\n", handle);
		return -EINVAL;
	}

	if (atomic_read(&phys_pg_pack->mapping_cnt) > 0) {
		spin_unlock(&vm->idr_lock);
		dev_err(hdev->dev, "handle %u is mapped, cannot free\n", handle);
		return -EINVAL;
	}

	/* must remove from idr before the freeing of the physical pages as the refcount of the pool
	 * is also the trigger of the idr destroy
	 */
	idr_remove(&vm->phys_pg_pack_handles, handle);
	spin_unlock(&vm->idr_lock);

	atomic64_sub(phys_pg_pack->total_size, &ctx->dram_phys_mem);
	atomic64_sub(phys_pg_pack->total_size, &hdev->dram_used_mem);

	free_phys_pg_pack(hdev, phys_pg_pack);

	return 0;
}

/**
 * clear_va_list_locked() - free virtual addresses list.
 * @hdev: habanalabs device structure.
 * @va_list: list of virtual addresses to free.
 *
 * This function does the following:
 * - Iterate over the list and free each virtual addresses block.
 *
 * This function should be called only when va_list lock is taken.
 */
static void clear_va_list_locked(struct hl_device *hdev,
		struct list_head *va_list)
{
	struct hl_vm_va_block *va_block, *tmp;

	list_for_each_entry_safe(va_block, tmp, va_list, node) {
		list_del(&va_block->node);
		kfree(va_block);
	}
}

/**
 * print_va_list_locked() - print virtual addresses list.
 * @hdev: habanalabs device structure.
 * @va_list: list of virtual addresses to print.
 *
 * This function does the following:
 * - Iterate over the list and print each virtual addresses block.
 *
 * This function should be called only when va_list lock is taken.
 */
static void print_va_list_locked(struct hl_device *hdev,
		struct list_head *va_list)
{
#if HL_MMU_DEBUG
	struct hl_vm_va_block *va_block;

	dev_dbg(hdev->dev, "print va list:\n");

	list_for_each_entry(va_block, va_list, node)
		dev_dbg(hdev->dev,
			"va block, start: 0x%llx, end: 0x%llx, size: %llu\n",
			va_block->start, va_block->end, va_block->size);
#endif
}

/**
 * merge_va_blocks_locked() - merge a virtual block if possible.
 * @hdev: pointer to the habanalabs device structure.
 * @va_list: pointer to the virtual addresses block list.
 * @va_block: virtual block to merge with adjacent blocks.
 *
 * This function does the following:
 * - Merge the given block with the adjacent blocks if their virtual ranges
 *   create a contiguous virtual range.
 *
 * This function should be called only when va_list lock is taken.
 */
static void merge_va_blocks_locked(struct hl_device *hdev,
		struct list_head *va_list, struct hl_vm_va_block *va_block)
{
	struct hl_vm_va_block *prev, *next;

	prev = list_prev_entry(va_block, node);
	if (&prev->node != va_list && prev->end + 1 == va_block->start) {
		prev->end = va_block->end;
		prev->size = prev->end - prev->start + 1;
		list_del(&va_block->node);
		kfree(va_block);
		va_block = prev;
	}

	next = list_next_entry(va_block, node);
	if (&next->node != va_list && va_block->end + 1 == next->start) {
		next->start = va_block->start;
		next->size = next->end - next->start + 1;
		list_del(&va_block->node);
		kfree(va_block);
	}
}

/**
 * add_va_block_locked() - add a virtual block to the virtual addresses list.
 * @hdev: pointer to the habanalabs device structure.
 * @va_list: pointer to the virtual addresses block list.
 * @start: start virtual address.
 * @end: end virtual address.
 *
 * This function does the following:
 * - Add the given block to the virtual blocks list and merge with other blocks
 *   if a contiguous virtual block can be created.
 *
 * This function should be called only when va_list lock is taken.
 */
static int add_va_block_locked(struct hl_device *hdev,
		struct list_head *va_list, u64 start, u64 end)
{
	struct hl_vm_va_block *va_block, *res = NULL;
	u64 size = end - start + 1;

	print_va_list_locked(hdev, va_list);

	list_for_each_entry(va_block, va_list, node) {
		/* TODO: remove once the code is mature */
		if (hl_mem_area_crosses_range(start, size, va_block->start,
				va_block->end)) {
			dev_err(hdev->dev,
				"block crossing ranges at start 0x%llx, end 0x%llx\n",
				va_block->start, va_block->end);
			return -EINVAL;
		}

		if (va_block->end < start)
			res = va_block;
	}

	va_block = kmalloc(sizeof(*va_block), GFP_KERNEL);
	if (!va_block)
		return -ENOMEM;

	va_block->start = start;
	va_block->end = end;
	va_block->size = size;

	if (!res)
		list_add(&va_block->node, va_list);
	else
		list_add(&va_block->node, &res->node);

	merge_va_blocks_locked(hdev, va_list, va_block);

	print_va_list_locked(hdev, va_list);

	return 0;
}

/**
 * add_va_block() - wrapper for add_va_block_locked.
 * @hdev: pointer to the habanalabs device structure.
 * @va_range: pointer to the virtual addresses range object.
 * @start: start virtual address.
 * @end: end virtual address.
 *
 * This function does the following:
 * - Takes the list lock and calls add_va_block_locked.
 */
static inline int add_va_block(struct hl_device *hdev,
		struct hl_va_range *va_range, u64 start, u64 end)
{
	int rc;

	mutex_lock(&va_range->lock);
	rc = add_va_block_locked(hdev, &va_range->list, start, end);
	mutex_unlock(&va_range->lock);

	return rc;
}

/**
 * is_hint_crossing_range() - check if a hint address crosses the specified reserved range.
 * @range_type: virtual space range type.
 * @start_addr: start virtual address.
 * @size: block size.
 * @prop: asic properties structure to retrieve reserved ranges from.
 */
static inline bool is_hint_crossing_range(enum hl_va_range_type range_type,
		u64 start_addr, u32 size, struct asic_fixed_properties *prop) {
	bool range_cross;

	if (range_type == HL_VA_RANGE_TYPE_DRAM)
		range_cross =
			hl_mem_area_crosses_range(start_addr, size,
			prop->hints_dram_reserved_va_range.start_addr,
			prop->hints_dram_reserved_va_range.end_addr);
	else if (range_type == HL_VA_RANGE_TYPE_HOST)
		range_cross =
			hl_mem_area_crosses_range(start_addr, size,
			prop->hints_host_reserved_va_range.start_addr,
			prop->hints_host_reserved_va_range.end_addr);
	else
		range_cross =
			hl_mem_area_crosses_range(start_addr, size,
			prop->hints_host_hpage_reserved_va_range.start_addr,
			prop->hints_host_hpage_reserved_va_range.end_addr);

	return range_cross;
}

/**
 * get_va_block() - get a virtual block for the given size and alignment.
 *
 * @hdev: pointer to the habanalabs device structure.
 * @va_range: pointer to the virtual addresses range.
 * @size: requested block size.
 * @hint_addr: hint for requested address by the user.
 * @va_block_align: required alignment of the virtual block start address.
 * @range_type: va range type (host, dram)
 * @flags: additional memory flags, currently only uses HL_MEM_FORCE_HINT
 *
 * This function does the following:
 * - Iterate on the virtual block list to find a suitable virtual block for the
 *   given size, hint address and alignment.
 * - Reserve the requested block and update the list.
 * - Return the start address of the virtual block.
 */
static u64 get_va_block(struct hl_device *hdev,
				struct hl_va_range *va_range,
				u64 size, u64 hint_addr, u32 va_block_align,
				enum hl_va_range_type range_type,
				u32 flags)
{
	struct hl_vm_va_block *va_block, *new_va_block = NULL;
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u64 tmp_hint_addr, valid_start, valid_size, prev_start, prev_end,
		align_mask, reserved_valid_start = 0, reserved_valid_size = 0,
		dram_hint_mask = prop->dram_hints_align_mask;
	bool add_prev = false;
	bool is_align_pow_2 = is_power_of_2(va_range->page_size);
	bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr);
	bool force_hint = flags & HL_MEM_FORCE_HINT;
	int rc;

	if (is_align_pow_2)
		align_mask = ~((u64)va_block_align - 1);
	else
		/*
		 * with non-power-of-2 range we work only with page granularity
		 * and the start address is page aligned,
		 * so no need for alignment checking.
		 */
		size = DIV_ROUND_UP_ULL(size, va_range->page_size) *
							va_range->page_size;

	tmp_hint_addr = hint_addr & ~dram_hint_mask;

	/* Check if we need to ignore hint address */
	if ((is_align_pow_2 && (hint_addr & (va_block_align - 1))) ||
			(!is_align_pow_2 && is_hint_dram_addr &&
			do_div(tmp_hint_addr, va_range->page_size))) {

		if (force_hint) {
			/* Hint must be respected, so here we just fail */
			dev_err(hdev->dev,
				"Hint address 0x%llx is not page aligned - cannot be respected\n",
				hint_addr);
			return 0;
		}

		dev_dbg(hdev->dev,
			"Hint address 0x%llx will be ignored because it is not aligned\n",
			hint_addr);
		hint_addr = 0;
	}

	mutex_lock(&va_range->lock);

	print_va_list_locked(hdev, &va_range->list);

	list_for_each_entry(va_block, &va_range->list, node) {
		/* Calc the first possible aligned addr */
		valid_start = va_block->start;

		if (is_align_pow_2 && (valid_start & (va_block_align - 1))) {
			valid_start &= align_mask;
			valid_start += va_block_align;
			if (valid_start > va_block->end)
				continue;
		}

		valid_size = va_block->end - valid_start + 1;
		if (valid_size < size)
			continue;

		/*
		 * In case hint address is 0, and hints_range_reservation
		 * property enabled, then avoid allocating va blocks from the
		 * range reserved for hint addresses
		 */
		if (prop->hints_range_reservation && !hint_addr)
			if (is_hint_crossing_range(range_type, valid_start,
					size, prop))
				continue;

		/* Pick the minimal length block which has the required size */
		if (!new_va_block || (valid_size < reserved_valid_size)) {
			new_va_block = va_block;
			reserved_valid_start = valid_start;
			reserved_valid_size = valid_size;
		}

		if (hint_addr && hint_addr >= valid_start &&
					(hint_addr + size) <= va_block->end) {
			new_va_block = va_block;
			reserved_valid_start = hint_addr;
			reserved_valid_size = valid_size;
			break;
		}
	}

	if (!new_va_block) {
		dev_err(hdev->dev, "no available va block for size %llu\n",
								size);
		goto out;
	}

	if (force_hint && reserved_valid_start != hint_addr) {
		/* Hint address must be respected. If we are here - this means
		 * we could not respect it.
		 */
		dev_err(hdev->dev,
			"Hint address 0x%llx could not be respected\n",
			hint_addr);
		reserved_valid_start = 0;
		goto out;
	}

	/*
	 * Check if there is some leftover range due to reserving the new
	 * va block, then return it to the main virtual addresses list.
	 */
	if (reserved_valid_start > new_va_block->start) {
		prev_start = new_va_block->start;
		prev_end = reserved_valid_start - 1;

		new_va_block->start = reserved_valid_start;
		new_va_block->size = reserved_valid_size;

		add_prev = true;
	}

	if (new_va_block->size > size) {
		new_va_block->start += size;
		new_va_block->size = new_va_block->end - new_va_block->start + 1;
	} else {
		list_del(&new_va_block->node);
		kfree(new_va_block);
	}

	if (add_prev) {
		rc = add_va_block_locked(hdev, &va_range->list, prev_start, prev_end);
		if (rc) {
			reserved_valid_start = 0;
			goto out;
		}
	}

	print_va_list_locked(hdev, &va_range->list);
out:
	mutex_unlock(&va_range->lock);

	return reserved_valid_start;
}

/*
 * hl_reserve_va_block() - reserve a virtual block of a given size.
 * @hdev: pointer to the habanalabs device structure.
 * @ctx: current context
 * @type: virtual addresses range type.
 * @size: requested block size.
 * @alignment: required alignment in bytes of the virtual block start address,
 *             0 means no alignment.
 *
 * This function does the following:
 * - Iterate on the virtual block list to find a suitable virtual block for the
 *   given size and alignment.
 * - Reserve the requested block and update the list.
 * - Return the start address of the virtual block.
 */
u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
		enum hl_va_range_type type, u64 size, u32 alignment)
{
	return get_va_block(hdev, ctx->va_range[type], size, 0,
			max(alignment, ctx->va_range[type]->page_size),
			type, 0);
}

/**
 * hl_get_va_range_type() - get va_range type for the given address and size.
 * @ctx: context to fetch va_range from.
 * @address: the start address of the area we want to validate.
 * @size: the size in bytes of the area we want to validate.
 * @type: returned va_range type.
 *
 * Return: 0 if the area is inside a valid range, -EINVAL otherwise.
 */
static int hl_get_va_range_type(struct hl_ctx *ctx, u64 address, u64 size,
			enum hl_va_range_type *type)
{
	int i;

	for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX; i++) {
		if (hl_mem_area_inside_range(address, size,
				ctx->va_range[i]->start_addr,
				ctx->va_range[i]->end_addr)) {
			*type = i;
			return 0;
		}
	}

	return -EINVAL;
}

/**
 * hl_unreserve_va_block() - wrapper for add_va_block to unreserve a va block.
 * @hdev: pointer to the habanalabs device structure
 * @ctx: pointer to the context structure.
 * @start_addr: start virtual address.
 * @size: number of bytes to unreserve.
 *
 * This function does the following:
 * - Takes the list lock and calls add_va_block_locked.
 */
int hl_unreserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
		u64 start_addr, u64 size)
{
	enum hl_va_range_type type;
	int rc;

	rc = hl_get_va_range_type(ctx, start_addr, size, &type);
	if (rc) {
		dev_err(hdev->dev,
			"cannot find va_range for va %#llx size %llu",
			start_addr, size);
		return rc;
	}

	rc = add_va_block(hdev, ctx->va_range[type], start_addr,
		start_addr + size - 1);
	if (rc)
		dev_warn(hdev->dev,
			"add va block failed for vaddr: 0x%llx\n", start_addr);

	return rc;
}

/**
 * init_phys_pg_pack_from_userptr() - initialize physical page pack from host
 *                                    memory
 * @ctx: pointer to the context structure.
 * @userptr: userptr to initialize from.
 * @pphys_pg_pack: result pointer.
 * @force_regular_page: tell the function to ignore huge page optimization,
 *                      even if possible. Needed for cases where the device VA
 *                      is allocated before we know the composition of the
 *                      physical pages
 *
 * This function does the following:
 * - Create a physical page pack from the physical pages related to the given
 *   virtual block.
 */
static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
				struct hl_userptr *userptr,
				struct hl_vm_phys_pg_pack **pphys_pg_pack,
				bool force_regular_page)
{
	u32 npages, page_size = PAGE_SIZE,
		huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size;
	u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);
	struct hl_vm_phys_pg_pack *phys_pg_pack;
	bool first = true, is_huge_page_opt;
	u64 page_mask, total_npages;
	struct scatterlist *sg;
	dma_addr_t dma_addr;
	int rc, i, j;

	phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
	if (!phys_pg_pack)
		return -ENOMEM;

	phys_pg_pack->vm_type = userptr->vm_type;
	phys_pg_pack->created_from_userptr = true;
	phys_pg_pack->asid = ctx->asid;
	atomic_set(&phys_pg_pack->mapping_cnt, 1);

	is_huge_page_opt = (force_regular_page ? false : true);

	/* Only if all dma_addrs are aligned to 2MB and their
	 * sizes are at least 2MB, we can use huge page mapping.
	 * We limit the 2MB optimization to this condition,
	 * since later on we acquire the related VA range as one
	 * consecutive block.
	 */
	total_npages = 0;
	for_each_sgtable_dma_sg(userptr->sgt, sg, i) {
		npages = hl_get_sg_info(sg, &dma_addr);

		total_npages += npages;

		if ((npages % pgs_in_huge_page) ||
				(dma_addr & (huge_page_size - 1)))
			is_huge_page_opt = false;
	}

	if (is_huge_page_opt) {
		page_size = huge_page_size;
		do_div(total_npages, pgs_in_huge_page);
	}

	page_mask = ~(((u64) page_size) - 1);

	phys_pg_pack->pages = kvmalloc_array(total_npages, sizeof(u64),
						GFP_KERNEL);
	if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
		rc = -ENOMEM;
		goto page_pack_arr_mem_err;
	}

	phys_pg_pack->npages = total_npages;
	phys_pg_pack->page_size = page_size;
	phys_pg_pack->total_size = total_npages * page_size;

	j = 0;
	for_each_sgtable_dma_sg(userptr->sgt, sg, i) {
		npages = hl_get_sg_info(sg, &dma_addr);

		/* align down to physical page size and save the offset */
		if (first) {
			first = false;
			phys_pg_pack->offset = dma_addr & (page_size - 1);
			dma_addr &= page_mask;
		}

		while (npages) {
			phys_pg_pack->pages[j++] = dma_addr;
			dma_addr += page_size;

			if (is_huge_page_opt)
				npages -= pgs_in_huge_page;
			else
				npages--;
		}
	}

	*pphys_pg_pack = phys_pg_pack;

	return 0;

page_pack_arr_mem_err:
	kfree(phys_pg_pack);

	return rc;
}
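
/*
 * Illustrative example (hypothetical values): with PAGE_SIZE = 4KB and a
 * pmmu_huge page size of 2MB, pgs_in_huge_page = 512. A userptr whose SG
 * entries are all 2MB-aligned and whose page counts are multiples of 512 is
 * packed as 2MB pages (total_npages is divided by 512); a single entry that
 * is only 4KB-aligned clears is_huge_page_opt and the pack falls back to
 * regular 4KB pages.
 */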

/**
 * map_phys_pg_pack() - maps the physical page pack.
 * @ctx: pointer to the context structure.
 * @vaddr: start address of the virtual area to map from.
 * @phys_pg_pack: the pack of physical pages to map to.
 *
 * This function does the following:
 * - Maps each chunk of virtual memory to matching physical chunk.
 * - Stores number of successful mappings in the given argument.
 * - Returns 0 on success, error code otherwise.
 */
static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
				struct hl_vm_phys_pg_pack *phys_pg_pack)
{
	struct hl_device *hdev = ctx->hdev;
	u64 next_vaddr = vaddr, paddr, mapped_pg_cnt = 0, i;
	u32 page_size = phys_pg_pack->page_size;
	int rc = 0;
	bool is_host_addr;

	for (i = 0 ; i < phys_pg_pack->npages ; i++) {
		paddr = phys_pg_pack->pages[i];

		rc = hl_mmu_map_page(ctx, next_vaddr, paddr, page_size,
				(i + 1) == phys_pg_pack->npages);
		if (rc) {
			dev_err(hdev->dev,
				"map failed (%d) for handle %u, npages: %llu, mapped: %llu\n",
				rc, phys_pg_pack->handle, phys_pg_pack->npages,
				mapped_pg_cnt);
			goto err;
		}

		mapped_pg_cnt++;
		next_vaddr += page_size;
	}

	return 0;

err:
	is_host_addr = !hl_is_dram_va(hdev, vaddr);

	next_vaddr = vaddr;
	for (i = 0 ; i < mapped_pg_cnt ; i++) {
		if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,
					(i + 1) == mapped_pg_cnt))
			dev_warn_ratelimited(hdev->dev,
				"failed to unmap handle %u, va: 0x%llx, pa: 0x%llx, page size: %u\n",
					phys_pg_pack->handle, next_vaddr,
					phys_pg_pack->pages[i], page_size);

		next_vaddr += page_size;

		/*
		 * unmapping on Palladium can be really long, so avoid a CPU
		 * soft lockup bug by sleeping a little between unmapping pages
		 *
		 * In addition, on host num of pages could be huge,
		 * because page size could be 4KB, so when unmapping host
		 * pages sleep every 32K pages to avoid soft lockup
		 */
		if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))
			usleep_range(50, 200);
	}

	return rc;
}

/**
 * unmap_phys_pg_pack() - unmaps the physical page pack.
 * @ctx: pointer to the context structure.
 * @vaddr: start address of the virtual area to unmap.
 * @phys_pg_pack: the pack of physical pages to unmap.
 */
static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
				struct hl_vm_phys_pg_pack *phys_pg_pack)
{
	struct hl_device *hdev = ctx->hdev;
	u64 next_vaddr, i;
	bool is_host_addr;
	u32 page_size;

	is_host_addr = !hl_is_dram_va(hdev, vaddr);
	page_size = phys_pg_pack->page_size;
	next_vaddr = vaddr;

	for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) {
		if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,
				(i + 1) == phys_pg_pack->npages))
			dev_warn_ratelimited(hdev->dev,
			"unmap failed for vaddr: 0x%llx\n", next_vaddr);

		/*
		 * unmapping on Palladium can be really long, so avoid a CPU
		 * soft lockup bug by sleeping a little between unmapping pages
		 *
		 * In addition, on host num of pages could be huge,
		 * because page size could be 4KB, so when unmapping host
		 * pages sleep every 32K pages to avoid soft lockup
		 */
		if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))
			usleep_range(50, 200);
	}
}

/**
 * map_device_va() - map the given memory.
 * @ctx: pointer to the context structure.
 * @args: host parameters with handle/host virtual address.
 * @device_addr: pointer to result device virtual address.
 *
 * This function does the following:
 * - If given a physical device memory handle, map to a device virtual block
 *   and return the start address of this block.
 * - If given a host virtual address and size, find the related physical pages,
 *   map a device virtual block to these pages and return the start address of
 *   this block.
 */
static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device_addr)
{
	struct hl_vm_phys_pg_pack *phys_pg_pack;
	enum hl_va_range_type va_range_type = 0;
	struct hl_device *hdev = ctx->hdev;
	struct hl_userptr *userptr = NULL;
	u32 handle = 0, va_block_align;
	struct hl_vm_hash_node *hnode;
	struct hl_vm *vm = &hdev->vm;
	struct hl_va_range *va_range;
	bool is_userptr, do_prefetch;
	u64 ret_vaddr, hint_addr;
	enum vm_type *vm_type;
	int rc;

	/* set map flags */
	is_userptr = args->flags & HL_MEM_USERPTR;
	do_prefetch = hdev->supports_mmu_prefetch && (args->flags & HL_MEM_PREFETCH);

	/* Assume failure */
	*device_addr = 0;

	if (is_userptr) {
		u64 addr = args->map_host.host_virt_addr,
			size = args->map_host.mem_size;
		u32 page_size = hdev->asic_prop.pmmu.page_size,
			huge_page_size = hdev->asic_prop.pmmu_huge.page_size;

		rc = dma_map_host_va(hdev, addr, size, &userptr);
		if (rc)
			return rc;

		rc = init_phys_pg_pack_from_userptr(ctx, userptr,
				&phys_pg_pack, false);
		if (rc) {
			dev_err(hdev->dev,
				"unable to init page pack for vaddr 0x%llx\n",
				addr);
			goto init_page_pack_err;
		}

		vm_type = (enum vm_type *) userptr;
		hint_addr = args->map_host.hint_addr;
		handle = phys_pg_pack->handle;

		/* get required alignment */
		if (phys_pg_pack->page_size == page_size) {
			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
			va_range_type = HL_VA_RANGE_TYPE_HOST;
			/*
			 * huge page alignment may be needed in case of regular
			 * page mapping, depending on the host VA alignment
			 */
			if (addr & (huge_page_size - 1))
				va_block_align = page_size;
			else
				va_block_align = huge_page_size;
		} else {
			/*
			 * huge page alignment is needed in case of huge page
			 * mapping
			 */
			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
			va_range_type = HL_VA_RANGE_TYPE_HOST_HUGE;
			va_block_align = huge_page_size;
		}
	} else {
		handle = lower_32_bits(args->map_device.handle);

		spin_lock(&vm->idr_lock);
		phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
		if (!phys_pg_pack) {
			spin_unlock(&vm->idr_lock);
			dev_err(hdev->dev,
				"no match for handle %u\n", handle);
			return -EINVAL;
		}

		/* increment now to avoid freeing device memory while mapping */
		atomic_inc(&phys_pg_pack->mapping_cnt);

		spin_unlock(&vm->idr_lock);

		vm_type = (enum vm_type *) phys_pg_pack;

		hint_addr = args->map_device.hint_addr;

		/* DRAM VA alignment is the same as the MMU page size */
		va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
		va_range_type = HL_VA_RANGE_TYPE_DRAM;
		va_block_align = hdev->asic_prop.dmmu.page_size;
	}

	/*
	 * relevant for mapping device physical memory only, as host memory is
	 * implicitly shared
	 */
	if (!is_userptr && !(phys_pg_pack->flags & HL_MEM_SHARED) &&
			phys_pg_pack->asid != ctx->asid) {
		dev_err(hdev->dev,
			"Failed to map memory, handle %u is not shared\n",
			handle);
		rc = -EPERM;
		goto shared_err;
	}

	hnode = kzalloc(sizeof(*hnode), GFP_KERNEL);
	if (!hnode) {
		rc = -ENOMEM;
		goto hnode_err;
	}

	if (hint_addr && phys_pg_pack->offset) {
		if (args->flags & HL_MEM_FORCE_HINT) {
			/* Fail if hint must be respected but it can't be */
			dev_err(hdev->dev,
				"Hint address 0x%llx cannot be respected because source memory is not aligned 0x%x\n",
				hint_addr, phys_pg_pack->offset);
			rc = -EINVAL;
			goto va_block_err;
		}
		dev_dbg(hdev->dev,
			"Hint address 0x%llx will be ignored because source memory is not aligned 0x%x\n",
			hint_addr, phys_pg_pack->offset);
	}

	ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,
					hint_addr, va_block_align,
					va_range_type, args->flags);
	if (!ret_vaddr) {
		dev_err(hdev->dev, "no available va block for handle %u\n",
				handle);
		rc = -ENOMEM;
		goto va_block_err;
	}

	mutex_lock(&hdev->mmu_lock);

	rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack);
	if (rc) {
		dev_err(hdev->dev, "mapping page pack failed (%d) for handle %u\n",
			rc, handle);
		mutex_unlock(&hdev->mmu_lock);
		goto map_err;
	}

	rc = hl_mmu_invalidate_cache_range(hdev, false, *vm_type | MMU_OP_SKIP_LOW_CACHE_INV,
				ctx->asid, ret_vaddr, phys_pg_pack->total_size);
	mutex_unlock(&hdev->mmu_lock);
	if (rc)
		goto map_err;

	/*
	 * prefetch is done upon user's request. it is performed in a WQ and so can
	 * be outside the MMU lock. the operation itself is already protected by the MMU lock
	 */
	if (do_prefetch) {
		rc = hl_mmu_prefetch_cache_range(ctx, *vm_type, ctx->asid, ret_vaddr,
							phys_pg_pack->total_size);
		if (rc)
			goto map_err;
	}

	ret_vaddr += phys_pg_pack->offset;

	hnode->ptr = vm_type;
	hnode->vaddr = ret_vaddr;
	hnode->handle = is_userptr ? MEM_HANDLE_INVALID : handle;

	mutex_lock(&ctx->mem_hash_lock);
	hash_add(ctx->mem_hash, &hnode->node, ret_vaddr);
	mutex_unlock(&ctx->mem_hash_lock);

	*device_addr = ret_vaddr;

	if (is_userptr)
		free_phys_pg_pack(hdev, phys_pg_pack);

	return rc;

map_err:
	if (add_va_block(hdev, va_range, ret_vaddr,
				ret_vaddr + phys_pg_pack->total_size - 1))
		dev_warn(hdev->dev,
			"release va block failed for handle 0x%x, vaddr: 0x%llx\n",
				handle, ret_vaddr);

va_block_err:
	kfree(hnode);
hnode_err:
shared_err:
	atomic_dec(&phys_pg_pack->mapping_cnt);
	if (is_userptr)
		free_phys_pg_pack(hdev, phys_pg_pack);
init_page_pack_err:
	if (is_userptr)
		dma_unmap_host_va(hdev, userptr);

	return rc;
}

/* Should be called while the context's mem_hash_lock is taken */
static struct hl_vm_hash_node *get_vm_hash_node_locked(struct hl_ctx *ctx, u64 vaddr)
{
	struct hl_vm_hash_node *hnode;

	hash_for_each_possible(ctx->mem_hash, hnode, node, vaddr)
		if (vaddr == hnode->vaddr)
			return hnode;

	return NULL;
}

/**
 * unmap_device_va() - unmap the given device virtual address.
 * @ctx: pointer to the context structure.
 * @args: host parameters with device virtual address to unmap.
 * @ctx_free: true if in context free flow, false otherwise.
 *
 * This function does the following:
 * - unmap the physical pages related to the given virtual address.
 * - return the device virtual block to the virtual block list.
 */
static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
				bool ctx_free)
{
	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
	u64 vaddr = args->unmap.device_virt_addr;
	struct asic_fixed_properties *prop;
	struct hl_device *hdev = ctx->hdev;
	struct hl_userptr *userptr = NULL;
	struct hl_vm_hash_node *hnode;
	struct hl_va_range *va_range;
	enum vm_type *vm_type;
	bool is_userptr;
	int rc = 0;

	prop = &hdev->asic_prop;

	/* protect from double entrance */
	mutex_lock(&ctx->mem_hash_lock);
	hnode = get_vm_hash_node_locked(ctx, vaddr);
	if (!hnode) {
		mutex_unlock(&ctx->mem_hash_lock);
		dev_err(hdev->dev, "unmap failed, no mem hnode for vaddr 0x%llx\n", vaddr);
		return -EINVAL;
	}

	if (hnode->export_cnt) {
		mutex_unlock(&ctx->mem_hash_lock);
		dev_err(hdev->dev, "failed to unmap %#llx, memory is exported\n", vaddr);
		return -EINVAL;
	}

	hash_del(&hnode->node);
	mutex_unlock(&ctx->mem_hash_lock);

	vm_type = hnode->ptr;

	if (*vm_type == VM_TYPE_USERPTR) {
		is_userptr = true;
		userptr = hnode->ptr;

		rc = init_phys_pg_pack_from_userptr(ctx, userptr, &phys_pg_pack,
							false);
		if (rc) {
			dev_err(hdev->dev,
				"unable to init page pack for vaddr 0x%llx\n",
				vaddr);
			goto vm_type_err;
		}

		if (phys_pg_pack->page_size ==
					hdev->asic_prop.pmmu.page_size)
			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
		else
			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
	} else if (*vm_type == VM_TYPE_PHYS_PACK) {
		is_userptr = false;
		va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
		phys_pg_pack = hnode->ptr;
	} else {
		dev_warn(hdev->dev,
			"unmap failed, unknown vm desc for vaddr 0x%llx\n",
				vaddr);
		rc = -EFAULT;
		goto vm_type_err;
	}

	if (atomic_read(&phys_pg_pack->mapping_cnt) == 0) {
		dev_err(hdev->dev, "vaddr 0x%llx is not mapped\n", vaddr);
		rc = -EINVAL;
		goto mapping_cnt_err;
	}

	if (!is_userptr && !is_power_of_2(phys_pg_pack->page_size))
		vaddr = prop->dram_base_address +
			DIV_ROUND_DOWN_ULL(vaddr - prop->dram_base_address,
						phys_pg_pack->page_size) *
							phys_pg_pack->page_size;
	else
		vaddr &= ~(((u64) phys_pg_pack->page_size) - 1);

	mutex_lock(&hdev->mmu_lock);

	unmap_phys_pg_pack(ctx, vaddr, phys_pg_pack);

	/*
	 * During context free this function is called in a loop to clean all
	 * the context mappings. Hence the cache invalidation can be called once
	 * at the loop end rather than for each iteration
	 */
	if (!ctx_free)
		rc = hl_mmu_invalidate_cache_range(hdev, true, *vm_type, ctx->asid, vaddr,
							phys_pg_pack->total_size);

	mutex_unlock(&hdev->mmu_lock);

	/*
	 * If the context is closing we don't need to check for the MMU cache
	 * invalidation return code and update the VA free list as in this flow
	 * we invalidate the MMU cache outside of this unmap function and the VA
	 * free list will be freed anyway.
	 */
	if (!ctx_free) {
		int tmp_rc;

		tmp_rc = add_va_block(hdev, va_range, vaddr,
					vaddr + phys_pg_pack->total_size - 1);
		if (tmp_rc) {
			dev_warn(hdev->dev,
					"add va block failed for vaddr: 0x%llx\n",
					vaddr);
			if (!rc)
				rc = tmp_rc;
		}
	}

	atomic_dec(&phys_pg_pack->mapping_cnt);
	kfree(hnode);

	if (is_userptr) {
		free_phys_pg_pack(hdev, phys_pg_pack);
		dma_unmap_host_va(hdev, userptr);
	}

	return rc;

mapping_cnt_err:
	if (is_userptr)
		free_phys_pg_pack(hdev, phys_pg_pack);
vm_type_err:
	mutex_lock(&ctx->mem_hash_lock);
	hash_add(ctx->mem_hash, &hnode->node, vaddr);
	mutex_unlock(&ctx->mem_hash_lock);

	return rc;
}

static int map_block(struct hl_device *hdev, u64 address, u64 *handle, u32 *size)
{
	u32 block_id;
	int rc;

	*handle = 0;
	if (size)
		*size = 0;

	rc = hdev->asic_funcs->get_hw_block_id(hdev, address, size, &block_id);
	if (rc)
		return rc;

	*handle = block_id | HL_MMAP_TYPE_BLOCK;
	*handle <<= PAGE_SHIFT;

	return 0;
}

static void hw_block_vm_close(struct vm_area_struct *vma)
{
	struct hl_vm_hw_block_list_node *lnode =
		(struct hl_vm_hw_block_list_node *) vma->vm_private_data;
	struct hl_ctx *ctx = lnode->ctx;
	long new_mmap_size;

	new_mmap_size = lnode->mapped_size - (vma->vm_end - vma->vm_start);
	if (new_mmap_size > 0) {
		lnode->mapped_size = new_mmap_size;
		return;
	}

	mutex_lock(&ctx->hw_block_list_lock);
	list_del(&lnode->node);
	mutex_unlock(&ctx->hw_block_list_lock);
	hl_ctx_put(ctx);
	kfree(lnode);
	vma->vm_private_data = NULL;
}

static const struct vm_operations_struct hw_block_vm_ops = {
	.close = hw_block_vm_close
};

/**
 * hl_hw_block_mmap() - mmap a hw block to user.
 * @hpriv: pointer to the private data of the fd
 * @vma: pointer to vm_area_struct of the process
 *
 * Driver increments context reference for every HW block mapped in order
 * to prevent user from closing FD without unmapping first
 */
int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
{
	struct hl_vm_hw_block_list_node *lnode;
	struct hl_device *hdev = hpriv->hdev;
	struct hl_ctx *ctx = hpriv->ctx;
	u32 block_id, block_size;
	int rc;

	/* We use the page offset to hold the block id and thus we need to clear
	 * it before doing the mmap itself
	 */
	block_id = vma->vm_pgoff;
	vma->vm_pgoff = 0;

	/* Driver only allows mapping of a complete HW block */
	block_size = vma->vm_end - vma->vm_start;

	if (!access_ok((void __user *) (uintptr_t) vma->vm_start, block_size)) {
		dev_err(hdev->dev,
			"user pointer is invalid - 0x%lx\n",
			vma->vm_start);

		return -EINVAL;
	}

	lnode = kzalloc(sizeof(*lnode), GFP_KERNEL);
	if (!lnode)
		return -ENOMEM;

	rc = hdev->asic_funcs->hw_block_mmap(hdev, vma, block_id, block_size);
	if (rc) {
		kfree(lnode);
		return rc;
	}

	hl_ctx_get(ctx);

	lnode->ctx = ctx;
	lnode->vaddr = vma->vm_start;
	lnode->block_size = block_size;
	lnode->mapped_size = lnode->block_size;
	lnode->id = block_id;

	vma->vm_private_data = lnode;
	vma->vm_ops = &hw_block_vm_ops;

	mutex_lock(&ctx->hw_block_list_lock);
	list_add_tail(&lnode->node, &ctx->hw_block_mem_list);
	mutex_unlock(&ctx->hw_block_list_lock);

	vma->vm_pgoff = block_id;

	return 0;
}

static int set_dma_sg(struct scatterlist *sg, u64 bar_address, u64 chunk_size,
			struct device *dev, enum dma_data_direction dir)
{
	dma_addr_t addr;
	int rc;

	addr = dma_map_resource(dev, bar_address, chunk_size, dir,
				DMA_ATTR_SKIP_CPU_SYNC);
	rc = dma_mapping_error(dev, addr);
	if (rc)
		return rc;

	sg_set_page(sg, NULL, chunk_size, 0);
	sg_dma_address(sg) = addr;
	sg_dma_len(sg) = chunk_size;

	return 0;
}

static struct sg_table *alloc_sgt_from_device_pages(struct hl_device *hdev, u64 *pages, u64 npages,
						u64 page_size, u64 exported_size, u64 offset,
						struct device *dev, enum dma_data_direction dir)
{
	u64 dma_max_seg_size, curr_page, size, chunk_size, left_size_to_export, left_size_in_page,
		left_size_in_dma_seg, device_address, bar_address, start_page;
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	struct scatterlist *sg;
	unsigned int nents, i;
	struct sg_table *sgt;
	bool next_sg_entry;
	int rc;

	/* Align max segment size to PAGE_SIZE to fit the minimal IOMMU mapping granularity */
	dma_max_seg_size = ALIGN_DOWN(dma_get_max_seg_size(dev), PAGE_SIZE);
	if (dma_max_seg_size < PAGE_SIZE) {
		dev_err_ratelimited(hdev->dev,
				"dma_max_seg_size %llu can't be smaller than PAGE_SIZE\n",
				dma_max_seg_size);
		return ERR_PTR(-EINVAL);
	}

	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
	if (!sgt)
		return ERR_PTR(-ENOMEM);

	/* Use the offset to move to the actual first page that is exported */
	for (start_page = 0 ; start_page < npages ; ++start_page) {
		if (offset < page_size)
			break;

		/* The offset value was validated so there can't be an underflow */
		offset -= page_size;
	}

	/* Calculate the required number of entries for the SG table */
	curr_page = start_page;
	nents = 1;
	left_size_to_export = exported_size;
	left_size_in_page = page_size - offset;
	left_size_in_dma_seg = dma_max_seg_size;
	next_sg_entry = false;

	while (true) {
		size = min3(left_size_to_export, left_size_in_page, left_size_in_dma_seg);
		left_size_to_export -= size;
		left_size_in_page -= size;
		left_size_in_dma_seg -= size;

		if (!left_size_to_export)
			break;

		if (!left_size_in_page) {
			/* left_size_to_export is not zero so there must be another page */
			if (pages[curr_page] + page_size != pages[curr_page + 1])
				next_sg_entry = true;

			++curr_page;
			left_size_in_page = page_size;
		}

		if (!left_size_in_dma_seg) {
			next_sg_entry = true;
			left_size_in_dma_seg = dma_max_seg_size;
		}

		if (next_sg_entry) {
			++nents;
			next_sg_entry = false;
		}
	}

	rc = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
	if (rc)
		goto err_free_sgt;

	/* Prepare the SG table entries */
	curr_page = start_page;
	device_address = pages[curr_page] + offset;
	left_size_to_export = exported_size;
	left_size_in_page = page_size - offset;
	left_size_in_dma_seg = dma_max_seg_size;
	next_sg_entry = false;

	for_each_sgtable_dma_sg(sgt, sg, i) {
		bar_address = hdev->dram_pci_bar_start + (device_address - prop->dram_base_address);
		chunk_size = 0;

		for ( ; curr_page < npages ; ++curr_page) {
			size = min3(left_size_to_export, left_size_in_page, left_size_in_dma_seg);
			chunk_size += size;
			left_size_to_export -= size;
			left_size_in_page -= size;
			left_size_in_dma_seg -= size;

			if (!left_size_to_export)
				break;

			if (!left_size_in_page) {
				/* left_size_to_export is not zero so there must be another page */
				if (pages[curr_page] + page_size != pages[curr_page + 1]) {
					device_address = pages[curr_page + 1];
					next_sg_entry = true;
				}

				left_size_in_page = page_size;
			}

			if (!left_size_in_dma_seg) {
				/*
				 * Skip setting a new device address if already moving to a page
				 * which is not contiguous with the current page.
				 */
				if (!next_sg_entry) {
					device_address += chunk_size;
					next_sg_entry = true;
				}

				left_size_in_dma_seg = dma_max_seg_size;
			}

			if (next_sg_entry) {
				next_sg_entry = false;
				break;
			}
		}

		rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
		if (rc)
			goto err_unmap;
	}

	/* There should be nothing left to export exactly after looping over all SG elements */
	if (left_size_to_export) {
		dev_err(hdev->dev,
			"left size to export %#llx after initializing %u SG elements\n",
			left_size_to_export, sgt->nents);
		rc = -ENOMEM;
		goto err_unmap;
	}

	/*
	 * Because we are not going to include a CPU list, we want to have some chance that other
	 * users will detect this when going over SG table, by setting the orig_nents to 0 and using
	 * only nents (length of DMA list).
	 */
	sgt->orig_nents = 0;

	dev_dbg(hdev->dev, "prepared SG table with %u entries for importer %s\n",
		nents, dev_name(dev));
	for_each_sgtable_dma_sg(sgt, sg, i)
		dev_dbg(hdev->dev,
			"SG entry %d: address %#llx, length %#x\n",
			i, sg_dma_address(sg), sg_dma_len(sg));

	return sgt;

err_unmap:
	for_each_sgtable_dma_sg(sgt, sg, i) {
		if (!sg_dma_len(sg))
			continue;

		dma_unmap_resource(dev, sg_dma_address(sg), sg_dma_len(sg), dir,
					DMA_ATTR_SKIP_CPU_SYNC);
	}

	sg_free_table(sgt);

err_free_sgt:
	kfree(sgt);
	return ERR_PTR(rc);
}
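
/*
 * Illustrative example (hypothetical values): exporting 6MB backed by three
 * contiguous 2MB device pages to an importer whose dma_max_seg_size is 4MB
 * yields two SG entries - a 4MB chunk capped by the DMA segment limit,
 * followed by a 2MB chunk. A discontinuity between neighbouring device pages
 * (the pages[curr_page] check above) likewise starts a new SG entry.
 */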

static int hl_dmabuf_attach(struct dma_buf *dmabuf,
				struct dma_buf_attachment *attachment)
{
	struct hl_dmabuf_priv *hl_dmabuf;
	struct hl_device *hdev;
	int rc;

	hl_dmabuf = dmabuf->priv;
	hdev = hl_dmabuf->ctx->hdev;

	rc = pci_p2pdma_distance(hdev->pdev, attachment->dev, true);

	if (rc < 0)
		attachment->peer2peer = false;
	return 0;
}

static struct sg_table *hl_map_dmabuf(struct dma_buf_attachment *attachment,
					enum dma_data_direction dir)
{
	u64 *pages, npages, page_size, exported_size, offset;
	struct dma_buf *dma_buf = attachment->dmabuf;
	struct hl_vm_phys_pg_pack *phys_pg_pack;
	struct hl_dmabuf_priv *hl_dmabuf;
	struct hl_device *hdev;
	struct sg_table *sgt;

	hl_dmabuf = dma_buf->priv;
	hdev = hl_dmabuf->ctx->hdev;

	if (!attachment->peer2peer) {
		dev_dbg(hdev->dev, "Failed to map dmabuf because p2p is disabled\n");
		return ERR_PTR(-EPERM);
	}

	exported_size = hl_dmabuf->dmabuf->size;
	offset = hl_dmabuf->offset;
	phys_pg_pack = hl_dmabuf->phys_pg_pack;

	if (phys_pg_pack) {
		pages = phys_pg_pack->pages;
		npages = phys_pg_pack->npages;
		page_size = phys_pg_pack->page_size;
	} else {
		pages = &hl_dmabuf->device_phys_addr;
		npages = 1;
		page_size = hl_dmabuf->dmabuf->size;
	}

	sgt = alloc_sgt_from_device_pages(hdev, pages, npages, page_size, exported_size, offset,
						attachment->dev, dir);
	if (IS_ERR(sgt))
		dev_err(hdev->dev, "failed (%ld) to initialize sgt for dmabuf\n", PTR_ERR(sgt));

	return sgt;
}

static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,
				struct sg_table *sgt,
				enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	/* The memory behind the dma-buf has *always* resided on the device itself, i.e. it lives
	 * only in the 'device' domain (after all, it maps a PCI bar address which points to the
	 * device memory).
	 *
	 * Therefore, it was never in the 'CPU' domain and hence, there is no need to perform
	 * a sync of the memory to the CPU's cache, as it never resided inside that cache.
	 */
	for_each_sgtable_dma_sg(sgt, sg, i)
		dma_unmap_resource(attachment->dev, sg_dma_address(sg),
					sg_dma_len(sg), dir,
					DMA_ATTR_SKIP_CPU_SYNC);

	/* Need to restore orig_nents because sg_free_table uses that field */
	sgt->orig_nents = sgt->nents;
	sg_free_table(sgt);
	kfree(sgt);
}

static struct hl_vm_hash_node *memhash_node_export_get(struct hl_ctx *ctx, u64 addr)
{
	struct hl_device *hdev = ctx->hdev;
	struct hl_vm_hash_node *hnode;

	/* get the memory handle */
	mutex_lock(&ctx->mem_hash_lock);
	hnode = get_vm_hash_node_locked(ctx, addr);
	if (!hnode) {
		mutex_unlock(&ctx->mem_hash_lock);
		dev_dbg(hdev->dev, "map address %#llx not found\n", addr);
		return ERR_PTR(-EINVAL);
	}

	if (upper_32_bits(hnode->handle)) {
		mutex_unlock(&ctx->mem_hash_lock);
		dev_dbg(hdev->dev, "invalid handle %#llx for map address %#llx\n",
				hnode->handle, addr);
		return ERR_PTR(-EINVAL);
	}

	/*
	 * node found, increase export count so this memory cannot be unmapped
	 * and the hash node cannot be deleted.
	 */
	hnode->export_cnt++;
	mutex_unlock(&ctx->mem_hash_lock);

	return hnode;
}

static void memhash_node_export_put(struct hl_ctx *ctx, struct hl_vm_hash_node *hnode)
{
	mutex_lock(&ctx->mem_hash_lock);
	hnode->export_cnt--;
	mutex_unlock(&ctx->mem_hash_lock);
}

static void hl_release_dmabuf(struct dma_buf *dmabuf)
{
	struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
	struct hl_ctx *ctx;

	ctx = hl_dmabuf->ctx;

	if (hl_dmabuf->memhash_hnode)
		memhash_node_export_put(ctx, hl_dmabuf->memhash_hnode);

	atomic_dec(&ctx->hdev->dmabuf_export_cnt);
	hl_ctx_put(ctx);

	/* Paired with get_file() in export_dmabuf() */
	fput(ctx->hpriv->file_priv->filp);

	kfree(hl_dmabuf);
}

static const struct dma_buf_ops habanalabs_dmabuf_ops = {
	.attach = hl_dmabuf_attach,
	.map_dma_buf = hl_map_dmabuf,
	.unmap_dma_buf = hl_unmap_dmabuf,
	.release = hl_release_dmabuf,
};

static int export_dmabuf(struct hl_ctx *ctx,
			struct hl_dmabuf_priv *hl_dmabuf,
			u64 total_size, int flags, int *dmabuf_fd)
{
	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
	struct hl_device *hdev = ctx->hdev;
	CLASS(get_unused_fd, fd)(flags);

	if (fd < 0) {
		dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
		return fd;
	}

	exp_info.ops = &habanalabs_dmabuf_ops;
	exp_info.size = total_size;
	exp_info.flags = flags;
	exp_info.priv = hl_dmabuf;

	hl_dmabuf->dmabuf = dma_buf_export(&exp_info);
	if (IS_ERR(hl_dmabuf->dmabuf)) {
		dev_err(hdev->dev, "failed to export dma-buf\n");
		return PTR_ERR(hl_dmabuf->dmabuf);
	}

	hl_dmabuf->ctx = ctx;
	hl_ctx_get(hl_dmabuf->ctx);
	atomic_inc(&ctx->hdev->dmabuf_export_cnt);

	/* Get compute device file to enforce release order, such that all exported dma-buf will be
	 * released first and only then the compute device.
	 * Paired with fput() in hl_release_dmabuf().
	 */
	get_file(ctx->hpriv->file_priv->filp);

	*dmabuf_fd = fd;
	fd_install(take_fd(fd), hl_dmabuf->dmabuf->file);

	return 0;
}

static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset)
{
	if (!PAGE_ALIGNED(addr)) {
		dev_dbg(hdev->dev,
			"exported device memory address 0x%llx should be aligned to PAGE_SIZE 0x%lx\n",
			addr, PAGE_SIZE);
		return -EINVAL;
	}

	if (!size || !PAGE_ALIGNED(size)) {
		dev_dbg(hdev->dev,
			"exported device memory size %llu should be a multiple of PAGE_SIZE %lu\n",
			size, PAGE_SIZE);
		return -EINVAL;
	}

	if (!PAGE_ALIGNED(offset)) {
		dev_dbg(hdev->dev,
			"exported device memory offset %llu should be a multiple of PAGE_SIZE %lu\n",
			offset, PAGE_SIZE);
		return -EINVAL;
	}

	return 0;
}

static int validate_export_params_no_mmu(struct hl_device *hdev, u64 device_addr, u64 size)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u64 bar_address;
	int rc;

	rc = validate_export_params_common(hdev, device_addr, size, 0);
	if (rc)
		return rc;

	if (device_addr < prop->dram_user_base_address ||
			(device_addr + size) > prop->dram_end_address ||
			(device_addr + size) < device_addr) {
		dev_dbg(hdev->dev,
			"DRAM memory range 0x%llx (+0x%llx) is outside of DRAM boundaries\n",
			device_addr, size);
		return -EINVAL;
	}

	bar_address = hdev->dram_pci_bar_start + (device_addr - prop->dram_base_address);

	if ((bar_address + size) > (hdev->dram_pci_bar_start + prop->dram_pci_bar_size) ||
			(bar_address + size) < bar_address) {
		dev_dbg(hdev->dev,
			"DRAM memory range 0x%llx (+0x%llx) is outside of PCI BAR boundaries\n",
			device_addr, size);
		return -EINVAL;
	}

	return 0;
}

static int validate_export_params(struct hl_device *hdev, u64 device_addr, u64 size, u64 offset,
					struct hl_vm_phys_pg_pack *phys_pg_pack)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u64 bar_address;
	int i, rc;

	rc = validate_export_params_common(hdev, device_addr, size, offset);
	if (rc)
		return rc;

	if ((offset + size) > phys_pg_pack->total_size) {
		dev_dbg(hdev->dev, "offset %#llx and size %#llx exceed total map size %#llx\n",
				offset, size, phys_pg_pack->total_size);
		return -EINVAL;
	}

	for (i = 0 ; i < phys_pg_pack->npages ; i++) {
		bar_address = hdev->dram_pci_bar_start +
				(phys_pg_pack->pages[i] - prop->dram_base_address);

		if ((bar_address + phys_pg_pack->page_size) >
				(hdev->dram_pci_bar_start + prop->dram_pci_bar_size) ||
				(bar_address + phys_pg_pack->page_size) < bar_address) {
			dev_dbg(hdev->dev,
				"DRAM memory range 0x%llx (+0x%x) is outside of PCI BAR boundaries\n",
					phys_pg_pack->pages[i], phys_pg_pack->page_size);
			return -EINVAL;
		}
	}

	return 0;
}

static struct hl_vm_phys_pg_pack *get_phys_pg_pack_from_hash_node(struct hl_device *hdev,
							struct hl_vm_hash_node *hnode)
{
	struct hl_vm_phys_pg_pack *phys_pg_pack;
	struct hl_vm *vm = &hdev->vm;

	spin_lock(&vm->idr_lock);
	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, (u32) hnode->handle);
static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset)
{
	if (!PAGE_ALIGNED(addr)) {
		dev_dbg(hdev->dev,
			"exported device memory address 0x%llx should be aligned to PAGE_SIZE 0x%lx\n",
			addr, PAGE_SIZE);
		return -EINVAL;
	}

	if (!size || !PAGE_ALIGNED(size)) {
		dev_dbg(hdev->dev,
			"exported device memory size %llu should be a multiple of PAGE_SIZE %lu\n",
			size, PAGE_SIZE);
		return -EINVAL;
	}

	if (!PAGE_ALIGNED(offset)) {
		dev_dbg(hdev->dev,
			"exported device memory offset %llu should be a multiple of PAGE_SIZE %lu\n",
			offset, PAGE_SIZE);
		return -EINVAL;
	}

	return 0;
}

static int validate_export_params_no_mmu(struct hl_device *hdev, u64 device_addr, u64 size)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u64 bar_address;
	int rc;

	rc = validate_export_params_common(hdev, device_addr, size, 0);
	if (rc)
		return rc;

	if (device_addr < prop->dram_user_base_address ||
			(device_addr + size) > prop->dram_end_address ||
			(device_addr + size) < device_addr) {
		dev_dbg(hdev->dev,
			"DRAM memory range 0x%llx (+0x%llx) is outside of DRAM boundaries\n",
			device_addr, size);
		return -EINVAL;
	}

	bar_address = hdev->dram_pci_bar_start + (device_addr - prop->dram_base_address);

	if ((bar_address + size) > (hdev->dram_pci_bar_start + prop->dram_pci_bar_size) ||
			(bar_address + size) < bar_address) {
		dev_dbg(hdev->dev,
			"DRAM memory range 0x%llx (+0x%llx) is outside of PCI BAR boundaries\n",
			device_addr, size);
		return -EINVAL;
	}

	return 0;
}

static int validate_export_params(struct hl_device *hdev, u64 device_addr, u64 size, u64 offset,
					struct hl_vm_phys_pg_pack *phys_pg_pack)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	u64 bar_address;
	int i, rc;

	rc = validate_export_params_common(hdev, device_addr, size, offset);
	if (rc)
		return rc;

	if ((offset + size) > phys_pg_pack->total_size) {
		dev_dbg(hdev->dev, "offset %#llx and size %#llx exceed total map size %#llx\n",
			offset, size, phys_pg_pack->total_size);
		return -EINVAL;
	}

	for (i = 0 ; i < phys_pg_pack->npages ; i++) {
		bar_address = hdev->dram_pci_bar_start +
				(phys_pg_pack->pages[i] - prop->dram_base_address);

		if ((bar_address + phys_pg_pack->page_size) >
				(hdev->dram_pci_bar_start + prop->dram_pci_bar_size) ||
				(bar_address + phys_pg_pack->page_size) < bar_address) {
			dev_dbg(hdev->dev,
				"DRAM memory range 0x%llx (+0x%x) is outside of PCI BAR boundaries\n",
				phys_pg_pack->pages[i], phys_pg_pack->page_size);
			return -EINVAL;
		}
	}

	return 0;
}

static struct hl_vm_phys_pg_pack *get_phys_pg_pack_from_hash_node(struct hl_device *hdev,
							struct hl_vm_hash_node *hnode)
{
	struct hl_vm_phys_pg_pack *phys_pg_pack;
	struct hl_vm *vm = &hdev->vm;

	spin_lock(&vm->idr_lock);
	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, (u32) hnode->handle);
	if (!phys_pg_pack) {
		spin_unlock(&vm->idr_lock);
		dev_dbg(hdev->dev, "no match for handle 0x%x\n", (u32) hnode->handle);
		return ERR_PTR(-EINVAL);
	}

	spin_unlock(&vm->idr_lock);

	if (phys_pg_pack->vm_type != VM_TYPE_PHYS_PACK) {
		dev_dbg(hdev->dev, "handle 0x%llx does not represent DRAM memory\n", hnode->handle);
		return ERR_PTR(-EINVAL);
	}

	return phys_pg_pack;
}

/**
 * export_dmabuf_from_addr() - export a dma-buf object for the given memory
 * address and size.
 * @ctx: pointer to the context structure.
 * @addr: device address.
 * @size: size of device memory to export.
 * @offset: the offset into the buffer from which to start exporting
 * @flags: DMA-BUF file/FD flags.
 * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
 *
 * Create and export a dma-buf object for an existing memory allocation inside
 * the device memory, and return a FD which is associated with the dma-buf
 * object.
 *
 * Return: 0 on success, non-zero for failure.
 */
static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 addr, u64 size, u64 offset,
					int flags, int *dmabuf_fd)
{
	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
	struct hl_vm_hash_node *hnode = NULL;
	struct asic_fixed_properties *prop;
	struct hl_dmabuf_priv *hl_dmabuf;
	struct hl_device *hdev;
	int rc;

	hdev = ctx->hdev;
	prop = &hdev->asic_prop;

	/* offset must be 0 in devices without virtual memory support */
	if (!prop->dram_supports_virtual_memory && offset) {
		dev_dbg(hdev->dev, "offset is not allowed in device without virtual memory\n");
		return -EINVAL;
	}

	hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL);
	if (!hl_dmabuf)
		return -ENOMEM;

	if (prop->dram_supports_virtual_memory) {
		hnode = memhash_node_export_get(ctx, addr);
		if (IS_ERR(hnode)) {
			rc = PTR_ERR(hnode);
			goto err_free_dmabuf_wrapper;
		}
		phys_pg_pack = get_phys_pg_pack_from_hash_node(hdev, hnode);
		if (IS_ERR(phys_pg_pack)) {
			rc = PTR_ERR(phys_pg_pack);
			goto dec_memhash_export_cnt;
		}
		rc = validate_export_params(hdev, addr, size, offset, phys_pg_pack);
		if (rc)
			goto dec_memhash_export_cnt;

		hl_dmabuf->phys_pg_pack = phys_pg_pack;
		hl_dmabuf->memhash_hnode = hnode;
		hl_dmabuf->offset = offset;
	} else {
		rc = validate_export_params_no_mmu(hdev, addr, size);
		if (rc)
			goto err_free_dmabuf_wrapper;

		hl_dmabuf->device_phys_addr = addr;
	}

	rc = export_dmabuf(ctx, hl_dmabuf, size, flags, dmabuf_fd);
	if (rc)
		goto dec_memhash_export_cnt;

	return 0;

dec_memhash_export_cnt:
	if (prop->dram_supports_virtual_memory)
		memhash_node_export_put(ctx, hnode);
err_free_dmabuf_wrapper:
	kfree(hl_dmabuf);
	return rc;
}

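/*
 * From user space, the export path above is reached via the memory IOCTL with
 * the HL_MEM_OP_EXPORT_DMABUF_FD opcode. A rough sketch (illustrative only;
 * 'hl_fd', 'device_va' and 'size' are placeholders and the ioctl request name
 * is assumed from the uAPI header):
 *
 *	union hl_mem_args args = {0};
 *
 *	args.in.op = HL_MEM_OP_EXPORT_DMABUF_FD;
 *	args.in.export_dmabuf_fd.addr = device_va;
 *	args.in.export_dmabuf_fd.mem_size = size;
 *	args.in.export_dmabuf_fd.offset = 0;
 *	args.in.flags = O_CLOEXEC;
 *	ioctl(hl_fd, DRM_IOCTL_HL_MEMORY, &args);
 *	dmabuf_fd = args.out.fd;
 */
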
static void ts_buff_release(struct hl_mmap_mem_buf *buf)
{
	struct hl_ts_buff *ts_buff = buf->private;

	vfree(ts_buff->kernel_buff_address);
	vfree(ts_buff->user_buff_address);
	kfree(ts_buff);
}

static int hl_ts_mmap(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, void *args)
{
	struct hl_ts_buff *ts_buff = buf->private;

	vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE);
	return remap_vmalloc_range(vma, ts_buff->user_buff_address, 0);
}

static int hl_ts_alloc_buf(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args)
{
	struct hl_ts_buff *ts_buff = NULL;
	u32 num_elements;
	size_t size;
	void *p;

	num_elements = *(u32 *)args;

	ts_buff = kzalloc(sizeof(*ts_buff), gfp);
	if (!ts_buff)
		return -ENOMEM;

	/* Allocate the user buffer */
	size = num_elements * sizeof(u64);
	p = vmalloc_user(size);
	if (!p)
		goto free_mem;

	ts_buff->user_buff_address = p;
	buf->mappable_size = size;

	/* Allocate the internal kernel buffer */
	size = num_elements * sizeof(struct hl_user_pending_interrupt);
	p = vzalloc(size);
	if (!p)
		goto free_user_buff;

	ts_buff->kernel_buff_address = p;
	ts_buff->kernel_buff_size = size;

	buf->private = ts_buff;

	return 0;

free_user_buff:
	vfree(ts_buff->user_buff_address);
free_mem:
	kfree(ts_buff);
	return -ENOMEM;
}

static struct hl_mmap_mem_buf_behavior hl_ts_behavior = {
	.topic = "TS",
	.mem_id = HL_MMAP_TYPE_TS_BUFF,
	.mmap = hl_ts_mmap,
	.alloc = hl_ts_alloc_buf,
	.release = ts_buff_release,
};

/**
 * allocate_timestamps_buffers() - allocate timestamps buffers
 * @hpriv: pointer to the private data of the fd
 * @args: ioctl input
 * @handle: user timestamp buffer handle as an output
 *
 * This function allocates a timestamps buffer that will later be mapped to the
 * user in order to be able to read the timestamps.
 * In addition, it allocates an extra buffer for registration management:
 * because registration is not allowed to fail on an out-of-memory condition,
 * a pool of user interrupt nodes is prepared up front and nodes are taken from
 * this pool during registration instead of being allocated dynamically. It
 * also adds a node to the mapping hash, which is used to map the user
 * timestamps buffer to the internal kernel timestamps buffer.
 */
static int allocate_timestamps_buffers(struct hl_fpriv *hpriv, struct hl_mem_in *args, u64 *handle)
{
	struct hl_mem_mgr *mmg = &hpriv->mem_mgr;
	struct hl_mmap_mem_buf *buf;

	if (args->num_of_elements > TS_MAX_ELEMENTS_NUM) {
		dev_err(mmg->dev, "Num of elements exceeds Max allowed number (0x%x > 0x%x)\n",
				args->num_of_elements, TS_MAX_ELEMENTS_NUM);
		return -EINVAL;
	}

	buf = hl_mmap_mem_buf_alloc(mmg, &hl_ts_behavior, GFP_KERNEL, &args->num_of_elements);
	if (!buf)
		return -ENOMEM;

	*handle = buf->handle;

	return 0;
}

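/*
 * The timestamps buffer is requested through the memory IOCTL below with the
 * HL_MEM_OP_TS_ALLOC opcode; the handle returned in args.out.handle is what
 * user space is expected to use later when mmap()-ing the u64 timestamps
 * array allocated in hl_ts_alloc_buf(). A rough sketch (illustrative only;
 * the ioctl request name is assumed from the uAPI header):
 *
 *	union hl_mem_args args = {0};
 *
 *	args.in.op = HL_MEM_OP_TS_ALLOC;
 *	args.in.num_of_elements = 64;
 *	ioctl(hl_fd, DRM_IOCTL_HL_MEMORY, &args);
 *	ts_handle = args.out.handle;
 */
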
int hl_mem_ioctl(struct drm_device *ddev, void *data, struct drm_file *file_priv)
{
	struct hl_fpriv *hpriv = file_priv->driver_priv;
	enum hl_device_status status;
	union hl_mem_args *args = data;
	struct hl_device *hdev = hpriv->hdev;
	struct hl_ctx *ctx = hpriv->ctx;
	u64 block_handle, device_addr = 0;
	u32 handle = 0, block_size;
	int rc, dmabuf_fd = -EBADF;

	if (!hl_device_operational(hdev, &status)) {
		dev_dbg_ratelimited(hdev->dev,
			"Device is %s. Can't execute MEMORY IOCTL\n",
			hdev->status[status]);
		return -EBUSY;
	}

	switch (args->in.op) {
	case HL_MEM_OP_ALLOC:
		if (args->in.alloc.mem_size == 0) {
			dev_err(hdev->dev,
				"alloc size must be larger than 0\n");
			rc = -EINVAL;
			goto out;
		}

		/* If DRAM does not support virtual memory the driver won't
		 * handle the allocation/freeing of that memory. However, for
		 * system administration/monitoring purposes, the driver will
		 * keep track of the amount of DRAM memory that is allocated
		 * and freed by the user. Because this code totally relies on
		 * the user's input, the driver can't ensure the validity
		 * of this accounting.
		 */
		if (!hdev->asic_prop.dram_supports_virtual_memory) {
			atomic64_add(args->in.alloc.mem_size,
					&ctx->dram_phys_mem);
			atomic64_add(args->in.alloc.mem_size,
					&hdev->dram_used_mem);

			dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
			rc = 0;

			memset(args, 0, sizeof(*args));
			args->out.handle = 0;
			goto out;
		}

		rc = alloc_device_memory(ctx, &args->in, &handle);

		memset(args, 0, sizeof(*args));
		args->out.handle = (__u64) handle;
		break;

	case HL_MEM_OP_FREE:
		/* If DRAM does not support virtual memory the driver won't
		 * handle the allocation/freeing of that memory. However, for
		 * system administration/monitoring purposes, the driver will
		 * keep track of the amount of DRAM memory that is allocated
		 * and freed by the user. Because this code totally relies on
		 * the user's input, the driver can't ensure the validity
		 * of this accounting.
		 */
		if (!hdev->asic_prop.dram_supports_virtual_memory) {
			atomic64_sub(args->in.alloc.mem_size,
					&ctx->dram_phys_mem);
			atomic64_sub(args->in.alloc.mem_size,
					&hdev->dram_used_mem);

			dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
			rc = 0;

			goto out;
		}

		rc = free_device_memory(ctx, &args->in);
		break;

	case HL_MEM_OP_MAP:
		rc = map_device_va(ctx, &args->in, &device_addr);

		memset(args, 0, sizeof(*args));
		args->out.device_virt_addr = device_addr;
		break;

	case HL_MEM_OP_UNMAP:
		rc = unmap_device_va(ctx, &args->in, false);
		break;

	case HL_MEM_OP_MAP_BLOCK:
		rc = map_block(hdev, args->in.map_block.block_addr,
				&block_handle, &block_size);
		args->out.block_handle = block_handle;
		args->out.block_size = block_size;
		break;

	case HL_MEM_OP_EXPORT_DMABUF_FD:
		rc = export_dmabuf_from_addr(ctx,
				args->in.export_dmabuf_fd.addr,
				args->in.export_dmabuf_fd.mem_size,
				args->in.export_dmabuf_fd.offset,
				args->in.flags,
				&dmabuf_fd);
		memset(args, 0, sizeof(*args));
		args->out.fd = dmabuf_fd;
		break;

	case HL_MEM_OP_TS_ALLOC:
		rc = allocate_timestamps_buffers(hpriv, &args->in, &args->out.handle);
		break;
	default:
		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
		rc = -EINVAL;
		break;
	}

out:
	return rc;
}

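/*
 * Typical user-space sequence for the DRAM opcodes handled above (a hedged
 * sketch; the ioctl request name is assumed from the uAPI header):
 *
 *	union hl_mem_args args = {0};
 *
 *	args.in.op = HL_MEM_OP_ALLOC;
 *	args.in.alloc.mem_size = 32ull * 1024 * 1024;
 *	args.in.flags = HL_MEM_CONTIGUOUS;
 *	ioctl(hl_fd, DRM_IOCTL_HL_MEMORY, &args);
 *	handle = args.out.handle;
 *
 * The handle is then passed back with HL_MEM_OP_MAP to receive a device
 * virtual address in args.out.device_virt_addr, and the mapping is undone
 * with HL_MEM_OP_UNMAP followed by HL_MEM_OP_FREE.
 */
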
static int get_user_memory(struct hl_device *hdev, u64 addr, u64 size,
				u32 npages, u64 start, u32 offset,
				struct hl_userptr *userptr)
{
	int rc;

	if (!access_ok((void __user *) (uintptr_t) addr, size)) {
		dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", addr);
		return -EFAULT;
	}

	userptr->pages = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
	if (!userptr->pages)
		return -ENOMEM;

	rc = pin_user_pages_fast(start, npages, FOLL_WRITE | FOLL_LONGTERM,
				userptr->pages);

	if (rc != npages) {
		dev_err(hdev->dev,
			"Failed (%d) to pin host memory with user ptr 0x%llx, size 0x%llx, npages %d\n",
			rc, addr, size, npages);
		if (rc < 0)
			goto destroy_pages;
		npages = rc;
		rc = -EFAULT;
		goto put_pages;
	}
	userptr->npages = npages;

	rc = sg_alloc_table_from_pages(userptr->sgt,
					userptr->pages,
					npages, offset, size, GFP_KERNEL);
	if (rc < 0) {
		dev_err(hdev->dev, "failed to create SG table from pages\n");
		goto put_pages;
	}

	return 0;

put_pages:
	unpin_user_pages(userptr->pages, npages);
destroy_pages:
	kvfree(userptr->pages);
	return rc;
}

/**
 * hl_pin_host_memory() - pins a chunk of host memory.
 * @hdev: pointer to the habanalabs device structure.
 * @addr: the host virtual address of the memory area.
 * @size: the size of the memory area.
 * @userptr: pointer to hl_userptr structure.
 *
 * This function does the following:
 * - Pins the physical pages.
 * - Creates an SG list from those pages.
 */
int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
			struct hl_userptr *userptr)
{
	u64 start, end;
	u32 npages, offset;
	int rc;

	if (!size) {
		dev_err(hdev->dev, "size to pin is invalid - %llu\n", size);
		return -EINVAL;
	}

	/*
	 * If the combination of the address and size requested for this memory
	 * region causes an integer overflow, return error.
	 */
	if (((addr + size) < addr) ||
			PAGE_ALIGN(addr + size) < (addr + size)) {
		dev_err(hdev->dev,
			"user pointer 0x%llx + %llu causes integer overflow\n",
			addr, size);
		return -EINVAL;
	}

	userptr->pid = current->pid;
	userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_KERNEL);
	if (!userptr->sgt)
		return -ENOMEM;

	start = addr & PAGE_MASK;
	offset = addr & ~PAGE_MASK;
	end = PAGE_ALIGN(addr + size);
	npages = (end - start) >> PAGE_SHIFT;

	userptr->size = size;
	userptr->addr = addr;
	userptr->dma_mapped = false;
	INIT_LIST_HEAD(&userptr->job_node);

	rc = get_user_memory(hdev, addr, size, npages, start, offset,
				userptr);
	if (rc) {
		dev_err(hdev->dev,
			"failed to get user memory for address 0x%llx\n",
			addr);
		goto free_sgt;
	}

	hl_debugfs_add_userptr(hdev, userptr);

	return 0;

free_sgt:
	kfree(userptr->sgt);
	return rc;
}

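/*
 * Worked example of the pinning arithmetic above, assuming 4 KiB pages:
 * for addr = 0x10000123 and size = 0x2000, start = 0x10000000,
 * offset = 0x123 and end = PAGE_ALIGN(0x10002123) = 0x10003000, so
 * npages = 3 pages are pinned and the SG table is built with a 0x123 byte
 * offset into the first page.
 */
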
/*
 * hl_unpin_host_memory - unpins a chunk of host memory.
 * @hdev: pointer to the habanalabs device structure
 * @userptr: pointer to hl_userptr structure
 *
 * This function does the following:
 * - Unpins the physical pages related to the host memory
 * - Frees the SG list
 */
void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
{
	hl_debugfs_remove_userptr(hdev, userptr);

	if (userptr->dma_mapped)
		hl_dma_unmap_sgtable(hdev, userptr->sgt, userptr->dir);

	unpin_user_pages_dirty_lock(userptr->pages, userptr->npages, true);
	kvfree(userptr->pages);

	list_del(&userptr->job_node);

	sg_free_table(userptr->sgt);
	kfree(userptr->sgt);
}

/**
 * hl_userptr_delete_list() - clear userptr list.
 * @hdev: pointer to the habanalabs device structure.
 * @userptr_list: pointer to the list to clear.
 *
 * This function does the following:
 * - Iterates over the list and unpins the host memory and frees the userptr
 *   structure.
 */
void hl_userptr_delete_list(struct hl_device *hdev,
			struct list_head *userptr_list)
{
	struct hl_userptr *userptr, *tmp;

	list_for_each_entry_safe(userptr, tmp, userptr_list, job_node) {
		hl_unpin_host_memory(hdev, userptr);
		kfree(userptr);
	}

	INIT_LIST_HEAD(userptr_list);
}

/**
 * hl_userptr_is_pinned() - returns whether the given userptr is pinned.
 * @hdev: pointer to the habanalabs device structure.
 * @addr: user address to check.
 * @size: user block size to check.
 * @userptr_list: pointer to the list to search in.
 * @userptr: pointer through which the found userptr is returned.
 *
 * This function does the following:
 * - Iterates over the list and checks whether a userptr with the given address
 *   and size is in it, which means it is pinned. If so, it is returned through
 *   @userptr and the function returns true; otherwise it returns false.
 */
bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
				u32 size, struct list_head *userptr_list,
				struct hl_userptr **userptr)
{
	list_for_each_entry((*userptr), userptr_list, job_node) {
		if ((addr == (*userptr)->addr) && (size == (*userptr)->size))
			return true;
	}

	return false;
}

/**
 * va_range_init() - initialize virtual addresses range.
 * @hdev: pointer to the habanalabs device structure.
 * @va_ranges: pointer to va_ranges array.
 * @range_type: virtual address range type.
 * @start: range start address, inclusive.
 * @end: range end address, inclusive.
 * @page_size: page size for this va_range.
 *
 * This function does the following:
 * - Initializes the virtual addresses list of the given range with the given
 *   addresses.
 */
static int va_range_init(struct hl_device *hdev, struct hl_va_range **va_ranges,
				enum hl_va_range_type range_type, u64 start,
				u64 end, u32 page_size)
{
	struct hl_va_range *va_range = va_ranges[range_type];
	int rc;

	INIT_LIST_HEAD(&va_range->list);

	/*
	 * PAGE_SIZE alignment
	 * it is the caller's responsibility to align the addresses if the
	 * page size is not a power of 2
	 */

	if (is_power_of_2(page_size)) {
		start = round_up(start, page_size);

		/*
		 * The end of the range is inclusive, hence we need to align it
		 * to the end of the last full page in the range. For example if
		 * end = 0x3ff5 with page size 0x1000, we need to align it to
		 * 0x2fff. The remaining 0xff5 bytes do not form a full page.
		 */
		end = round_down(end + 1, page_size) - 1;
	}

	if (start >= end) {
		dev_err(hdev->dev, "too small vm range for va list\n");
		return -EFAULT;
	}

	rc = add_va_block(hdev, va_range, start, end);

	if (rc) {
		dev_err(hdev->dev, "Failed to init host va list\n");
		return rc;
	}

	va_range->start_addr = start;
	va_range->end_addr = end;
	va_range->page_size = page_size;

	return 0;
}

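/*
 * Example of the inclusive-range alignment in va_range_init() for a
 * power-of-2 page size of 0x1000: start = 0x1234 is rounded up to 0x2000 and
 * end = 0x5ff5 is rounded down to 0x4fff, so only whole pages remain inside
 * the [start, end] range handed to add_va_block().
 */
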
/**
 * va_range_fini() - clear a virtual addresses range.
 * @hdev: pointer to the habanalabs structure.
 * @va_range: pointer to virtual addresses range.
 *
 * This function does the following:
 * - Frees the virtual addresses block list and its lock.
 */
static void va_range_fini(struct hl_device *hdev, struct hl_va_range *va_range)
{
	mutex_lock(&va_range->lock);
	clear_va_list_locked(hdev, &va_range->list);
	mutex_unlock(&va_range->lock);

	mutex_destroy(&va_range->lock);
	kfree(va_range);
}

/**
 * vm_ctx_init_with_ranges() - initialize virtual memory for context.
 * @ctx: pointer to the habanalabs context structure.
 * @host_range_start: host virtual addresses range start.
 * @host_range_end: host virtual addresses range end.
 * @host_page_size: host page size.
 * @host_huge_range_start: host virtual addresses range start for memory
 *                         allocated with huge pages.
 * @host_huge_range_end: host virtual addresses range end for memory allocated
 *                       with huge pages.
 * @host_huge_page_size: host huge page size.
 * @dram_range_start: dram virtual addresses range start.
 * @dram_range_end: dram virtual addresses range end.
 * @dram_page_size: dram page size.
 *
 * This function initializes the following:
 * - MMU for context.
 * - Virtual address to area descriptor hashtable.
 * - Virtual block list of available virtual memory.
 */
static int vm_ctx_init_with_ranges(struct hl_ctx *ctx,
					u64 host_range_start,
					u64 host_range_end,
					u32 host_page_size,
					u64 host_huge_range_start,
					u64 host_huge_range_end,
					u32 host_huge_page_size,
					u64 dram_range_start,
					u64 dram_range_end,
					u32 dram_page_size)
{
	struct hl_device *hdev = ctx->hdev;
	int i, rc;

	for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++) {
		ctx->va_range[i] =
			kzalloc(sizeof(struct hl_va_range), GFP_KERNEL);
		if (!ctx->va_range[i]) {
			rc = -ENOMEM;
			goto free_va_range;
		}
	}

	rc = hl_mmu_ctx_init(ctx);
	if (rc) {
		dev_err(hdev->dev, "failed to init context %d\n", ctx->asid);
		goto free_va_range;
	}

	mutex_init(&ctx->mem_hash_lock);
	hash_init(ctx->mem_hash);

	mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);

	rc = va_range_init(hdev, ctx->va_range, HL_VA_RANGE_TYPE_HOST,
			host_range_start, host_range_end, host_page_size);
	if (rc) {
		dev_err(hdev->dev, "failed to init host vm range\n");
		goto mmu_ctx_fini;
	}

	if (hdev->pmmu_huge_range) {
		mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);

		rc = va_range_init(hdev,
			ctx->va_range, HL_VA_RANGE_TYPE_HOST_HUGE,
			host_huge_range_start, host_huge_range_end,
			host_huge_page_size);
		if (rc) {
			dev_err(hdev->dev,
				"failed to init host huge vm range\n");
			goto clear_host_va_range;
		}
	} else {
		kfree(ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);
		ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE] =
				ctx->va_range[HL_VA_RANGE_TYPE_HOST];
	}

	mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);

	rc = va_range_init(hdev, ctx->va_range, HL_VA_RANGE_TYPE_DRAM,
			dram_range_start, dram_range_end, dram_page_size);
	if (rc) {
		dev_err(hdev->dev, "failed to init dram vm range\n");
		goto clear_host_huge_va_range;
	}

	hl_debugfs_add_ctx_mem_hash(hdev, ctx);

	return 0;

clear_host_huge_va_range:
	mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);

	if (hdev->pmmu_huge_range) {
		mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
		clear_va_list_locked(hdev,
			&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->list);
		mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
	}
clear_host_va_range:
	if (hdev->pmmu_huge_range)
		mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
	mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
	clear_va_list_locked(hdev, &ctx->va_range[HL_VA_RANGE_TYPE_HOST]->list);
	mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
mmu_ctx_fini:
	mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
	mutex_destroy(&ctx->mem_hash_lock);
	hl_mmu_ctx_fini(ctx);
free_va_range:
	for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++)
		kfree(ctx->va_range[i]);

	return rc;
}

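/*
 * Note that when the ASIC has no dedicated huge-page PMMU range
 * (hdev->pmmu_huge_range is false), the HL_VA_RANGE_TYPE_HOST_HUGE entry is
 * made an alias of the regular host range above, which is why hl_vm_ctx_fini()
 * only releases it separately when pmmu_huge_range is set.
 */
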
int hl_vm_ctx_init(struct hl_ctx *ctx)
{
	struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
	u64 host_range_start, host_range_end, host_huge_range_start,
		host_huge_range_end, dram_range_start, dram_range_end;
	u32 host_page_size, host_huge_page_size, dram_page_size;

	atomic64_set(&ctx->dram_phys_mem, 0);

	/*
	 * In case of DRAM mapping, the returned address is the physical
	 * address of the memory related to the given handle.
	 */
	if (ctx->hdev->mmu_disable)
		return 0;

	dram_range_start = prop->dmmu.start_addr;
	dram_range_end = prop->dmmu.end_addr - 1;
	dram_page_size = prop->dram_page_size ?
				prop->dram_page_size : prop->dmmu.page_size;
	host_range_start = prop->pmmu.start_addr;
	host_range_end = prop->pmmu.end_addr - 1;
	host_page_size = prop->pmmu.page_size;
	host_huge_range_start = prop->pmmu_huge.start_addr;
	host_huge_range_end = prop->pmmu_huge.end_addr - 1;
	host_huge_page_size = prop->pmmu_huge.page_size;

	return vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end,
					host_page_size, host_huge_range_start,
					host_huge_range_end, host_huge_page_size,
					dram_range_start, dram_range_end, dram_page_size);
}

/**
 * hl_vm_ctx_fini() - virtual memory teardown of context.
 * @ctx: pointer to the habanalabs context structure.
 *
 * This function performs teardown of the following:
 * - Virtual block list of available virtual memory.
 * - Virtual address to area descriptor hashtable.
 * - MMU for context.
 *
 * In addition this function does the following:
 * - Unmaps the existing hashtable nodes if the hashtable is not empty. The
 *   hashtable should be empty as no valid mappings should exist at this
 *   point.
 * - Frees any existing physical page list from the idr which relates to the
 *   current context asid.
 * - This function checks the virtual block list for correctness. At this point
 *   the list should contain one element which describes the whole virtual
 *   memory range of the context. Otherwise, a warning is printed.
 */
void hl_vm_ctx_fini(struct hl_ctx *ctx)
{
	struct hl_vm_phys_pg_pack *phys_pg_list, *tmp_phys_node;
	struct hl_device *hdev = ctx->hdev;
	struct hl_vm_hash_node *hnode;
	struct hl_vm *vm = &hdev->vm;
	struct hlist_node *tmp_node;
	struct list_head free_list;
	struct hl_mem_in args;
	int i;

	if (hdev->mmu_disable)
		return;

	hl_debugfs_remove_ctx_mem_hash(hdev, ctx);

	/*
	 * Clearly something went wrong on hard reset so no point in printing
	 * another side effect error
	 */
	if (!hdev->reset_info.hard_reset_pending && !hash_empty(ctx->mem_hash))
		dev_dbg(hdev->dev,
			"user released device without removing its memory mappings\n");

	hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {
		dev_dbg(hdev->dev,
			"hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n",
			hnode->vaddr, ctx->asid);
		args.unmap.device_virt_addr = hnode->vaddr;
		unmap_device_va(ctx, &args, true);
	}

	mutex_lock(&hdev->mmu_lock);

	/* invalidate the cache once after the unmapping loop */
	hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR);
	hl_mmu_invalidate_cache(hdev, true, MMU_OP_PHYS_PACK);

	mutex_unlock(&hdev->mmu_lock);

	INIT_LIST_HEAD(&free_list);

	spin_lock(&vm->idr_lock);
	idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i)
		if (phys_pg_list->asid == ctx->asid) {
			dev_dbg(hdev->dev,
				"page list 0x%px of asid %d is still alive\n",
				phys_pg_list, ctx->asid);

			atomic64_sub(phys_pg_list->total_size, &hdev->dram_used_mem);
			idr_remove(&vm->phys_pg_pack_handles, i);
			list_add(&phys_pg_list->node, &free_list);
		}
	spin_unlock(&vm->idr_lock);

	list_for_each_entry_safe(phys_pg_list, tmp_phys_node, &free_list, node)
		free_phys_pg_pack(hdev, phys_pg_list);

	va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_DRAM]);
	va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST]);

	if (hdev->pmmu_huge_range)
		va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);

	mutex_destroy(&ctx->mem_hash_lock);
	hl_mmu_ctx_fini(ctx);

	/* In this case we need to clear the global accounting of DRAM usage
	 * because the user notifies us on allocations. If the user is no more,
	 * all DRAM is available
	 */
	if (ctx->asid != HL_KERNEL_ASID_ID &&
			!hdev->asic_prop.dram_supports_virtual_memory)
		atomic64_set(&hdev->dram_used_mem, 0);
}

/**
 * hl_vm_init() - initialize virtual memory module.
 * @hdev: pointer to the habanalabs device structure.
 *
 * This function initializes the following:
 * - MMU module.
 * - DRAM physical pages pool, with a chunk granularity of the DRAM page size
 *   (or DRAM_POOL_PAGE_SIZE for non-power-of-2 DRAM page sizes).
 * - Idr for device memory allocation handles.
 */
int hl_vm_init(struct hl_device *hdev)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	struct hl_vm *vm = &hdev->vm;
	int rc;

	if (is_power_of_2(prop->dram_page_size))
		vm->dram_pg_pool =
			gen_pool_create(__ffs(prop->dram_page_size), -1);
	else
		vm->dram_pg_pool =
			gen_pool_create(__ffs(DRAM_POOL_PAGE_SIZE), -1);

	if (!vm->dram_pg_pool) {
		dev_err(hdev->dev, "Failed to create dram page pool\n");
		return -ENOMEM;
	}

	kref_init(&vm->dram_pg_pool_refcount);

	rc = gen_pool_add(vm->dram_pg_pool, prop->dram_user_base_address,
			prop->dram_end_address - prop->dram_user_base_address,
			-1);

	if (rc) {
		dev_err(hdev->dev,
			"Failed to add memory to dram page pool %d\n", rc);
		goto pool_add_err;
	}

	spin_lock_init(&vm->idr_lock);
	idr_init(&vm->phys_pg_pack_handles);

	atomic64_set(&hdev->dram_used_mem, 0);

	vm->init_done = true;

	return 0;

pool_add_err:
	gen_pool_destroy(vm->dram_pg_pool);

	return rc;
}

/**
 * hl_vm_fini() - virtual memory module teardown.
 * @hdev: pointer to the habanalabs device structure.
 *
 * This function performs teardown of the following:
 * - Idr for device memory allocation handles.
 * - DRAM physical pages pool.
 * - MMU module.
 */
void hl_vm_fini(struct hl_device *hdev)
{
	struct hl_vm *vm = &hdev->vm;

	if (!vm->init_done)
		return;

	/*
	 * At this point all the contexts should be freed and hence no DRAM
	 * memory should be in use. Hence the DRAM pool should be freed here.
	 */
	if (kref_put(&vm->dram_pg_pool_refcount, dram_pg_pool_do_release) != 1)
		dev_warn(hdev->dev, "dram_pg_pool was not destroyed on %s\n",
				__func__);

	vm->init_done = false;
}

/**
 * hl_hw_block_mem_init() - HW block memory initialization.
 * @ctx: pointer to the habanalabs context structure.
 *
 * This function initializes the HW block virtual mapped addresses list and
 * its lock.
 */
void hl_hw_block_mem_init(struct hl_ctx *ctx)
{
	mutex_init(&ctx->hw_block_list_lock);
	INIT_LIST_HEAD(&ctx->hw_block_mem_list);
}

/**
 * hl_hw_block_mem_fini() - HW block memory teardown.
 * @ctx: pointer to the habanalabs context structure.
 *
 * This function clears the HW block virtual mapped addresses list and destroys
 * its lock.
 */
void hl_hw_block_mem_fini(struct hl_ctx *ctx)
{
	struct hl_vm_hw_block_list_node *lnode, *tmp;

	if (!list_empty(&ctx->hw_block_mem_list))
		dev_crit(ctx->hdev->dev, "HW block mem list isn't empty\n");

	list_for_each_entry_safe(lnode, tmp, &ctx->hw_block_mem_list, node) {
		list_del(&lnode->node);
		kfree(lnode);
	}

	mutex_destroy(&ctx->hw_block_list_lock);
}