Path: blob/master/drivers/gpu/drm/amd/amdgpu/aldebaran.c
49603 views
/*1* Copyright 2021 Advanced Micro Devices, Inc.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice shall be included in11* all copies or substantial portions of the Software.12*13* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR14* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,15* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL16* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR17* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,18* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR19* OTHER DEALINGS IN THE SOFTWARE.20*21*/2223#include "aldebaran.h"24#include "amdgpu_reset.h"25#include "amdgpu_amdkfd.h"26#include "amdgpu_dpm.h"27#include "amdgpu_job.h"28#include "amdgpu_ring.h"29#include "amdgpu_ras.h"30#include "amdgpu_psp.h"31#include "amdgpu_xgmi.h"3233static bool aldebaran_is_mode2_default(struct amdgpu_reset_control *reset_ctl)34{35struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;3637if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&38adev->gmc.xgmi.connected_to_cpu))39return true;4041return false;42}4344static struct amdgpu_reset_handler *45aldebaran_get_reset_handler(struct amdgpu_reset_control *reset_ctl,46struct amdgpu_reset_context *reset_context)47{48struct amdgpu_reset_handler *handler;49struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;50int i;5152if (reset_context->method == AMD_RESET_METHOD_NONE) {53if (aldebaran_is_mode2_default(reset_ctl))54reset_context->method = AMD_RESET_METHOD_MODE2;55else56reset_context->method = amdgpu_asic_reset_method(adev);57}5859if (reset_context->method != AMD_RESET_METHOD_NONE) {60dev_dbg(adev->dev, "Getting reset handler for method %d\n",61reset_context->method);62for_each_handler(i, handler, reset_ctl) {63if (handler->reset_method == reset_context->method)64return handler;65}66}6768dev_dbg(adev->dev, "Reset handler not found!\n");6970return NULL;71}7273static inline uint32_t aldebaran_get_ip_block_mask(struct amdgpu_device *adev)74{75uint32_t ip_block_mask = BIT(AMD_IP_BLOCK_TYPE_GFX) |76BIT(AMD_IP_BLOCK_TYPE_SDMA);7778if (adev->aid_mask)79ip_block_mask |= BIT(AMD_IP_BLOCK_TYPE_IH);8081return ip_block_mask;82}8384static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)85{86uint32_t ip_block_mask = aldebaran_get_ip_block_mask(adev);87uint32_t ip_block;88int r, i;8990/* Skip suspend of SDMA IP versions >= 4.4.2. They are multi-aid */91if (adev->aid_mask)92ip_block_mask &= ~BIT(AMD_IP_BLOCK_TYPE_SDMA);9394amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);95amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);9697for (i = adev->num_ip_blocks - 1; i >= 0; i--) {98ip_block = BIT(adev->ip_blocks[i].version->type);99if (!(ip_block_mask & ip_block))100continue;101102r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);103if (r)104return r;105}106107return 0;108}109110static int111aldebaran_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,112struct amdgpu_reset_context *reset_context)113{114int r = 0;115struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;116117dev_dbg(adev->dev, "Aldebaran prepare hw context\n");118/* Don't suspend on bare metal if we are not going to HW reset the ASIC */119if (!amdgpu_sriov_vf(adev))120r = aldebaran_mode2_suspend_ip(adev);121122return r;123}124125static void aldebaran_async_reset(struct work_struct *work)126{127struct amdgpu_reset_handler *handler;128struct amdgpu_reset_control *reset_ctl =129container_of(work, struct amdgpu_reset_control, reset_work);130struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;131int i;132133for_each_handler(i, handler, reset_ctl) {134if (handler->reset_method == reset_ctl->active_reset) {135dev_dbg(adev->dev, "Resetting device\n");136handler->do_reset(adev);137break;138}139}140}141142static int aldebaran_mode2_reset(struct amdgpu_device *adev)143{144/* disable BM */145pci_clear_master(adev->pdev);146adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);147return adev->asic_reset_res;148}149150static int151aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,152struct amdgpu_reset_context *reset_context)153{154struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;155struct list_head *reset_device_list = reset_context->reset_device_list;156struct amdgpu_device *tmp_adev = NULL;157int r = 0;158159dev_dbg(adev->dev, "aldebaran perform hw reset\n");160161if (reset_device_list == NULL)162return -EINVAL;163164if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&165reset_context->hive == NULL) {166/* Wrong context, return error */167return -EINVAL;168}169170list_for_each_entry(tmp_adev, reset_device_list, reset_list) {171mutex_lock(&tmp_adev->reset_cntl->reset_lock);172tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;173}174/*175* Mode2 reset doesn't need any sync between nodes in XGMI hive, instead launch176* them together so that they can be completed asynchronously on multiple nodes177*/178list_for_each_entry(tmp_adev, reset_device_list, reset_list) {179/* For XGMI run all resets in parallel to speed up the process */180if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {181if (!queue_work(system_unbound_wq,182&tmp_adev->reset_cntl->reset_work))183r = -EALREADY;184} else185r = aldebaran_mode2_reset(tmp_adev);186if (r) {187dev_err(tmp_adev->dev,188"ASIC reset failed with error, %d for drm dev, %s",189r, adev_to_drm(tmp_adev)->unique);190break;191}192}193194/* For XGMI wait for all resets to complete before proceed */195if (!r) {196list_for_each_entry(tmp_adev, reset_device_list, reset_list) {197if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {198flush_work(&tmp_adev->reset_cntl->reset_work);199r = tmp_adev->asic_reset_res;200if (r)201break;202}203}204}205206list_for_each_entry(tmp_adev, reset_device_list, reset_list) {207mutex_unlock(&tmp_adev->reset_cntl->reset_lock);208tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;209}210211return r;212}213214static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)215{216struct amdgpu_firmware_info *ucode_list[AMDGPU_UCODE_ID_MAXIMUM];217uint32_t ip_block_mask = aldebaran_get_ip_block_mask(adev);218struct amdgpu_firmware_info *ucode;219struct amdgpu_ip_block *cmn_block;220struct amdgpu_ip_block *ih_block;221int ucode_count = 0;222int i, r;223224dev_dbg(adev->dev, "Reloading ucodes after reset\n");225for (i = 0; i < adev->firmware.max_ucodes; i++) {226ucode = &adev->firmware.ucode[i];227if (!ucode->fw)228continue;229switch (ucode->ucode_id) {230case AMDGPU_UCODE_ID_SDMA0:231case AMDGPU_UCODE_ID_SDMA1:232case AMDGPU_UCODE_ID_SDMA2:233case AMDGPU_UCODE_ID_SDMA3:234case AMDGPU_UCODE_ID_SDMA4:235case AMDGPU_UCODE_ID_SDMA5:236case AMDGPU_UCODE_ID_SDMA6:237case AMDGPU_UCODE_ID_SDMA7:238case AMDGPU_UCODE_ID_CP_MEC1:239case AMDGPU_UCODE_ID_CP_MEC1_JT:240case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL:241case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM:242case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM:243case AMDGPU_UCODE_ID_RLC_G:244ucode_list[ucode_count++] = ucode;245break;246default:247break;248}249}250251/* Reinit NBIF block */252cmn_block =253amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_COMMON);254if (unlikely(!cmn_block)) {255dev_err(adev->dev, "Failed to get BIF handle\n");256return -EINVAL;257}258r = amdgpu_ip_block_resume(cmn_block);259if (r)260return r;261262if (ip_block_mask & BIT(AMD_IP_BLOCK_TYPE_IH)) {263ih_block = amdgpu_device_ip_get_ip_block(adev,264AMD_IP_BLOCK_TYPE_IH);265if (unlikely(!ih_block)) {266dev_err(adev->dev, "Failed to get IH handle\n");267return -EINVAL;268}269r = amdgpu_ip_block_resume(ih_block);270if (r)271return r;272}273274/* Reinit GFXHUB */275adev->gfxhub.funcs->init(adev);276r = adev->gfxhub.funcs->gart_enable(adev);277if (r) {278dev_err(adev->dev, "GFXHUB gart reenable failed after reset\n");279return r;280}281282/* Reload GFX firmware */283r = psp_load_fw_list(&adev->psp, ucode_list, ucode_count);284if (r) {285dev_err(adev->dev, "GFX ucode load failed after reset\n");286return r;287}288289/* Resume RLC, FW needs RLC alive to complete reset process */290adev->gfx.rlc.funcs->resume(adev);291292/* Wait for FW reset event complete */293r = amdgpu_dpm_wait_for_event(adev, SMU_EVENT_RESET_COMPLETE, 0);294if (r) {295dev_err(adev->dev,296"Failed to get response from firmware after reset\n");297return r;298}299300for (i = 0; i < adev->num_ip_blocks; i++) {301if (!(adev->ip_blocks[i].version->type ==302AMD_IP_BLOCK_TYPE_GFX ||303adev->ip_blocks[i].version->type ==304AMD_IP_BLOCK_TYPE_SDMA))305continue;306307r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);308if (r)309return r;310}311312for (i = 0; i < adev->num_ip_blocks; i++) {313if (!(adev->ip_blocks[i].version->type ==314AMD_IP_BLOCK_TYPE_GFX ||315adev->ip_blocks[i].version->type ==316AMD_IP_BLOCK_TYPE_SDMA ||317adev->ip_blocks[i].version->type ==318AMD_IP_BLOCK_TYPE_COMMON))319continue;320321if (adev->ip_blocks[i].version->funcs->late_init) {322r = adev->ip_blocks[i].version->funcs->late_init(323&adev->ip_blocks[i]);324if (r) {325dev_err(adev->dev,326"late_init of IP block <%s> failed %d after reset\n",327adev->ip_blocks[i].version->funcs->name,328r);329return r;330}331}332adev->ip_blocks[i].status.late_initialized = true;333}334335amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);336amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);337338return r;339}340341static int342aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,343struct amdgpu_reset_context *reset_context)344{345struct list_head *reset_device_list = reset_context->reset_device_list;346struct amdgpu_device *tmp_adev = NULL;347struct amdgpu_ras *con;348int r;349350if (reset_device_list == NULL)351return -EINVAL;352353if (amdgpu_ip_version(reset_context->reset_req_dev, MP1_HWIP, 0) ==354IP_VERSION(13, 0, 2) &&355reset_context->hive == NULL) {356/* Wrong context, return error */357return -EINVAL;358}359360list_for_each_entry(tmp_adev, reset_device_list, reset_list) {361amdgpu_set_init_level(tmp_adev,362AMDGPU_INIT_LEVEL_RESET_RECOVERY);363dev_info(tmp_adev->dev,364"GPU reset succeeded, trying to resume\n");365/*TBD: Ideally should clear only GFX, SDMA blocks*/366amdgpu_ras_clear_err_state(tmp_adev);367r = aldebaran_mode2_restore_ip(tmp_adev);368if (r)369goto end;370371/*372* Add this ASIC as tracked as reset was already373* complete successfully.374*/375amdgpu_register_gpu_instance(tmp_adev);376377/* Resume RAS, ecc_irq */378con = amdgpu_ras_get_context(tmp_adev);379if (!amdgpu_sriov_vf(tmp_adev) && con) {380if (tmp_adev->sdma.ras &&381tmp_adev->sdma.ras->ras_block.ras_late_init) {382r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,383&tmp_adev->sdma.ras->ras_block.ras_comm);384if (r) {385dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);386goto end;387}388}389390if (tmp_adev->gfx.ras &&391tmp_adev->gfx.ras->ras_block.ras_late_init) {392r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,393&tmp_adev->gfx.ras->ras_block.ras_comm);394if (r) {395dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r);396goto end;397}398}399}400401amdgpu_ras_resume(tmp_adev);402403/* Update PSP FW topology after reset */404if (reset_context->hive &&405tmp_adev->gmc.xgmi.num_physical_nodes > 1)406r = amdgpu_xgmi_update_topology(reset_context->hive,407tmp_adev);408409if (!r) {410amdgpu_set_init_level(tmp_adev,411AMDGPU_INIT_LEVEL_DEFAULT);412amdgpu_irq_gpu_reset_resume_helper(tmp_adev);413414r = amdgpu_ib_ring_tests(tmp_adev);415if (r) {416dev_err(tmp_adev->dev,417"ib ring test failed (%d).\n", r);418r = -EAGAIN;419tmp_adev->asic_reset_res = r;420goto end;421}422}423}424425end:426return r;427}428429static struct amdgpu_reset_handler aldebaran_mode2_handler = {430.reset_method = AMD_RESET_METHOD_MODE2,431.prepare_env = NULL,432.prepare_hwcontext = aldebaran_mode2_prepare_hwcontext,433.perform_reset = aldebaran_mode2_perform_reset,434.restore_hwcontext = aldebaran_mode2_restore_hwcontext,435.restore_env = NULL,436.do_reset = aldebaran_mode2_reset,437};438439static struct amdgpu_reset_handler440*aldebaran_rst_handlers[AMDGPU_RESET_MAX_HANDLERS] = {441&aldebaran_mode2_handler,442&xgmi_reset_on_init_handler,443};444445int aldebaran_reset_init(struct amdgpu_device *adev)446{447struct amdgpu_reset_control *reset_ctl;448449reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);450if (!reset_ctl)451return -ENOMEM;452453reset_ctl->handle = adev;454reset_ctl->async_reset = aldebaran_async_reset;455reset_ctl->active_reset = AMD_RESET_METHOD_NONE;456reset_ctl->get_reset_handler = aldebaran_get_reset_handler;457458INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);459/* Only mode2 is handled through reset control now */460reset_ctl->reset_handlers = &aldebaran_rst_handlers;461462adev->reset_cntl = reset_ctl;463464return 0;465}466467int aldebaran_reset_fini(struct amdgpu_device *adev)468{469kfree(adev->reset_cntl);470adev->reset_cntl = NULL;471return 0;472}473474475