Path: drivers/gpu/drm/amd/amdkfd/kfd_debug.c
/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>
#include <uapi/linux/kfd_sysfs.h>

#define MAX_WATCH_ADDRESSES	4

int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
				 unsigned int *queue_id,
				 unsigned int *gpu_id,
				 uint64_t exception_clear_mask,
				 uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}
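
/*
 * The query above is the polling half of the event interface.  A debugger
 * typically poll()s the event file descriptor it handed to trap enable
 * (the kernel pokes it with a single '.', see
 * debug_event_write_work_handler() below) and then drains events until
 * -EAGAIN.  Illustrative user-space sketch only; the helper names are
 * hypothetical and the real entry point is the AMDKFD_IOC_DBG_TRAP ioctl
 * in uapi/linux/kfd_ioctl.h:
 *
 *	struct pollfd pfd = { .fd = dbg_ev_fd, .events = POLLIN };
 *
 *	while (poll(&pfd, 1, -1) > 0) {
 *		char c;
 *
 *		while (read(dbg_ev_fd, &c, 1) > 0)
 *			;	// drain the one-byte wake-ups
 *		// query_debug_event() wraps the QUERY_DEBUG_EVENT trap op
 *		while (query_debug_event(&queue_id, &gpu_id, &status) == 0)
 *			handle_exception(queue_id, gpu_id, status);
 *	}
 */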

void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	if (process->debug_trap_enabled && process->dbg_ev_file)
		kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* update process/device/queue exception status, write to descriptor
 * only if exception_status is enabled.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
		      struct kfd_process *process, struct kfd_node *dev,
		      unsigned int source_id, bool use_worker,
		      void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
					pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

/* set pending event queue entry from ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	struct kfd_process_device *pdd = NULL;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid, &pdd);

	if (!pdd)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		    p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
					process_queue_list) {

				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;

				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_evict_process_device(pdd);
			kfd_signal_vm_fault_event(pdd, NULL, exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}
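
/*
 * kfd_set_dbg_ev_from_interrupt() is the interrupt-path entry point: the
 * exception is offered to an attached debugger first (kfd_dbg_ev_raise()
 * with use_worker == true, so the event-fd write happens in process
 * context), and only falls through to the HSA runtime or to a VM-fault
 * eviction when no debugger has subscribed.  A simplified, hypothetical
 * caller fragment (the actual callers live in the per-GPU interrupt
 * handlers, e.g. kfd_int_process_v9.c):
 *
 *	if (!kfd_set_dbg_ev_from_interrupt(dev, pasid, doorbell_id,
 *					   KFD_EC_MASK(EC_QUEUE_WAVE_TRAP),
 *					   NULL, 0))
 *		;	// neither a debugger nor the runtime consumed it
 */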

int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
				      unsigned int dev_id,
				      unsigned int queue_id,
				      uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
				pdd->vm_fault_exc_data;

		kfd_evict_process_device(pdd);
		kfd_signal_vm_fault_event(pdd, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * block should only happen after the debugger receives runtime
		 * enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (!kfd_dbg_has_cwsr_workaround(q->device))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;
	struct amdgpu_device *adev = pdd->dev->adev;
	int r;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	if (!pdd->proc_ctx_cpu_ptr) {
		r = amdgpu_amdkfd_alloc_gtt_mem(adev,
				AMDGPU_MES_PROC_CTX_SIZE,
				&pdd->proc_ctx_bo,
				&pdd->proc_ctx_gpu_addr,
				&pdd->proc_ctx_cpu_ptr,
				false);
		if (r) {
			dev_err(adev->dev,
				"failed to allocate process context bo\n");
			return r;
		}
		memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
	}

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
					      pdd->watch_points, flags, sq_trap_en);
}
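
/*
 * Two scheduler back-ends recur throughout this file: without MES, debug
 * state reaches the hardware scheduler by rewriting the runlist
 * (debug_refresh_runlist()/debug_lock_and_unmap()); with MES it is handed
 * to firmware through amdgpu_mes_set_shader_debugger() above.  Hence the
 * pattern that closes almost every per-device state change below:
 *
 *	if (!pdd->dev->kfd->shared_resources.enable_mes)
 *		r = debug_refresh_runlist(pdd->dev->dqm);
 *	else
 *		r = kfd_dbg_set_mes_debug_mode(pdd, true);
 */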

#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->watch_points_lock);

	return -ENOMEM;
}

static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	spin_lock(&pdd->dev->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->watch_points_lock);
	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->watch_points_lock);

	return owns_watch_id;
}

int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					 uint32_t watch_id)
{
	int r;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
							pdd->dev->adev,
							watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}

int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
				       uint64_t watch_address,
				       uint32_t watch_address_mask,
				       uint32_t *watch_id,
				       uint32_t watch_mode)
{
	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
	uint32_t xcc_mask = pdd->dev->xcc_mask;

	if (r)
		return r;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	for_each_inst(xcc_id, xcc_mask)
		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd,
				xcc_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	/* HWS is broken so no point in HW rollback but release the watchpoint anyway */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}
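
/*
 * Watch points are accounted as per-device and per-process bitmasks over
 * the MAX_WATCH_ADDRESSES hardware slots, both guarded by
 * watch_points_lock.  Worked example: with dev->alloc_watch_ids == 0x5
 * (slots 0 and 2 taken), kfd_dbg_get_dev_watch_id() hands out slot 1:
 *
 *	(0x5 >> 0) & 0x1 == 1	// slot 0 in use, skip
 *	(0x5 >> 1) & 0x1 == 0	// slot 1 free: both masks |= (0x1 << 1)
 */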

int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_topology_device *topo_dev =
				kfd_topology_device_by_id(target->pdds[i]->dev->id);
		uint32_t caps = topo_dev->node_props.capability;

		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
		    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}

		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&
		    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd, true);
		}
	}

	return r;
}
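
/*
 * Note that kfd_dbg_trap_set_flags() swaps rather than sets: on return
 * *flags holds the previous flag word (this is also how the deactivate
 * path below clears the flags, by passing in 0).  That makes save/restore
 * around a debug operation a natural two-call sequence (illustrative):
 *
 *	uint32_t flags = KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
 *
 *	r = kfd_dbg_trap_set_flags(target, &flags);	// flags = old value
 *	if (!r)
 *		kfd_dbg_trap_set_flags(target, &flags);	// restore old value
 */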

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *		to unwind
 *		else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX off is already disabled by debug activate if RLC restore is not supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
	}

	kfd_dbg_set_workaround(target, false);
}

static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}

int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Defer deactivation to runtime enable if the runtime is not enabled;
	 * otherwise reset the attached running target's runtime state so a
	 * later re-attach starts from a clean state.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	cancel_work_sync(&target->debug_event_workarea);
	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}
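
/*
 * Disable defers to the runtime-enable state machine.  Summarized from the
 * checks in kfd_dbg_trap_disable() above:
 *
 *	runtime_state == ENABLED	-> tear down hardware state now
 *	runtime_state == DISABLED	-> nothing armed; deactivation is
 *					   deferred to runtime enable
 *	ENABLED_BUSY or ENABLED_ERROR	-> reset to ENABLED so that a later
 *					   re-attach starts clean
 */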

int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
							     pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}
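
/*
 * Illustrative debugger-side enable sequence (hypothetical helper names;
 * user space reaches kfd_dbg_trap_enable() through the AMDKFD_IOC_DBG_TRAP
 * ioctl defined in uapi/linux/kfd_ioctl.h).  The fd can be any writable
 * file, for instance a pipe's write end, which the kernel pokes with '.'
 * when a subscribed exception is raised:
 *
 *	int pipefd[2];
 *	struct kfd_runtime_info rinfo;
 *	uint32_t rinfo_size = sizeof(rinfo);
 *
 *	pipe(pipefd);
 *	// enable_debug_trap() wraps the ioctl and passes pipefd[1] as 'fd';
 *	// on return rinfo_size holds the kernel's sizeof(runtime_info),
 *	// which may differ from the caller's if the uapi versions differ.
 *	r = enable_debug_trap(target_pid, pipefd[1], &rinfo, &rinfo_size);
 */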

static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						  uint32_t trap_override,
						  uint32_t trap_mask_request,
						  uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
				pdd->dev->adev,
				trap_override,
				trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}

int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					  uint32_t trap_override,
					  uint32_t trap_mask_bits,
					  uint32_t trap_mask_request,
					  uint32_t *trap_mask_prev,
					  uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						   trap_override,
						   trap_mask_request,
						   trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
				      uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}
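
/*
 * Example: a debugger that wants newly launched waves halted selects HALT
 * mode and restores NORMAL when done; any value outside the three modes
 * checked above is rejected with -EINVAL (illustrative):
 *
 *	r = kfd_dbg_trap_set_wave_launch_mode(target,
 *			KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT);
 *	// ...inspect target state...
 *	kfd_dbg_trap_set_wave_launch_mode(target,
 *			KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL);
 */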

int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
				      uint32_t source_id,
				      uint32_t exception_code,
				      bool clear_exception,
				      void __user *info,
				      uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
				 uint64_t exception_clear_mask,
				 void __user *user_info,
				 uint32_t *number_of_device_infos,
				 uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_entry_size = *entry_size;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Run over all pdd of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}
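
/*
 * The snapshot follows the usual two-call size-negotiation pattern: a
 * first call with *number_of_device_infos == 0 (user_info must still be
 * non-NULL to pass the argument check) returns the device count and clamps
 * *entry_size to what the kernel can fill; a second call copies
 * min(user, kernel) bytes per entry while stepping the destination by the
 * caller's original entry size.  Illustrative:
 *
 *	uint32_t n = 0, esize = sizeof(struct kfd_dbg_device_info_entry);
 *
 *	kfd_dbg_trap_device_snapshot(target, 0, buf, &n, &esize);
 *	// ...ensure buf holds n * esize bytes, then fetch and clear...
 *	kfd_dbg_trap_device_snapshot(target, clear_mask, buf, &n, &esize);
 */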

void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					      uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}
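
/*
 * The found_mask scan above keeps exceptions raised before the debugger
 * subscribed from being lost: if any already-pending status bit intersects
 * the new enable mask, the event fd is poked immediately.  For example:
 *
 *	pending (queue)	= KFD_EC_MASK(EC_QUEUE_NEW)
 *	new enable mask	= KFD_EC_MASK(EC_QUEUE_NEW) |
 *			  KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)
 *	-> '.' is written and the debugger wakes to query EC_QUEUE_NEW
 */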