/* Source: drivers/gpu/drm/amd/amdgpu/aldebaran.c */
/*1* Copyright 2021 Advanced Micro Devices, Inc.2*3* Permission is hereby granted, free of charge, to any person obtaining a4* copy of this software and associated documentation files (the "Software"),5* to deal in the Software without restriction, including without limitation6* the rights to use, copy, modify, merge, publish, distribute, sublicense,7* and/or sell copies of the Software, and to permit persons to whom the8* Software is furnished to do so, subject to the following conditions:9*10* The above copyright notice and this permission notice shall be included in11* all copies or substantial portions of the Software.12*13* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR14* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,15* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL16* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR17* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,18* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR19* OTHER DEALINGS IN THE SOFTWARE.20*21*/2223#include "aldebaran.h"24#include "amdgpu_reset.h"25#include "amdgpu_amdkfd.h"26#include "amdgpu_dpm.h"27#include "amdgpu_job.h"28#include "amdgpu_ring.h"29#include "amdgpu_ras.h"30#include "amdgpu_psp.h"31#include "amdgpu_xgmi.h"3233static bool aldebaran_is_mode2_default(struct amdgpu_reset_control *reset_ctl)34{35struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;3637if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&38adev->gmc.xgmi.connected_to_cpu))39return true;4041return false;42}4344static struct amdgpu_reset_handler *45aldebaran_get_reset_handler(struct amdgpu_reset_control *reset_ctl,46struct amdgpu_reset_context *reset_context)47{48struct amdgpu_reset_handler *handler;49struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;50int i;5152if (reset_context->method == AMD_RESET_METHOD_NONE) {53if 
(aldebaran_is_mode2_default(reset_ctl))54reset_context->method = AMD_RESET_METHOD_MODE2;55else56reset_context->method = amdgpu_asic_reset_method(adev);57}5859if (reset_context->method != AMD_RESET_METHOD_NONE) {60dev_dbg(adev->dev, "Getting reset handler for method %d\n",61reset_context->method);62for_each_handler(i, handler, reset_ctl) {63if (handler->reset_method == reset_context->method)64return handler;65}66}6768dev_dbg(adev->dev, "Reset handler not found!\n");6970return NULL;71}7273static inline uint32_t aldebaran_get_ip_block_mask(struct amdgpu_device *adev)74{75uint32_t ip_block_mask = BIT(AMD_IP_BLOCK_TYPE_GFX) |76BIT(AMD_IP_BLOCK_TYPE_SDMA);7778if (adev->aid_mask)79ip_block_mask |= BIT(AMD_IP_BLOCK_TYPE_IH);8081return ip_block_mask;82}8384static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)85{86uint32_t ip_block_mask = aldebaran_get_ip_block_mask(adev);87uint32_t ip_block;88int r, i;8990amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);91amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);9293for (i = adev->num_ip_blocks - 1; i >= 0; i--) {94ip_block = BIT(adev->ip_blocks[i].version->type);95if (!(ip_block_mask & ip_block))96continue;9798r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);99if (r)100return r;101}102103return 0;104}105106static int107aldebaran_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,108struct amdgpu_reset_context *reset_context)109{110int r = 0;111struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;112113dev_dbg(adev->dev, "Aldebaran prepare hw context\n");114/* Don't suspend on bare metal if we are not going to HW reset the ASIC */115if (!amdgpu_sriov_vf(adev))116r = aldebaran_mode2_suspend_ip(adev);117118return r;119}120121static void aldebaran_async_reset(struct work_struct *work)122{123struct amdgpu_reset_handler *handler;124struct amdgpu_reset_control *reset_ctl =125container_of(work, struct amdgpu_reset_control, reset_work);126struct amdgpu_device *adev = (struct amdgpu_device 
*)reset_ctl->handle;127int i;128129for_each_handler(i, handler, reset_ctl) {130if (handler->reset_method == reset_ctl->active_reset) {131dev_dbg(adev->dev, "Resetting device\n");132handler->do_reset(adev);133break;134}135}136}137138static int aldebaran_mode2_reset(struct amdgpu_device *adev)139{140/* disable BM */141pci_clear_master(adev->pdev);142adev->asic_reset_res = amdgpu_dpm_mode2_reset(adev);143return adev->asic_reset_res;144}145146static int147aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,148struct amdgpu_reset_context *reset_context)149{150struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;151struct list_head *reset_device_list = reset_context->reset_device_list;152struct amdgpu_device *tmp_adev = NULL;153int r = 0;154155dev_dbg(adev->dev, "aldebaran perform hw reset\n");156157if (reset_device_list == NULL)158return -EINVAL;159160if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&161reset_context->hive == NULL) {162/* Wrong context, return error */163return -EINVAL;164}165166list_for_each_entry(tmp_adev, reset_device_list, reset_list) {167mutex_lock(&tmp_adev->reset_cntl->reset_lock);168tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;169}170/*171* Mode2 reset doesn't need any sync between nodes in XGMI hive, instead launch172* them together so that they can be completed asynchronously on multiple nodes173*/174list_for_each_entry(tmp_adev, reset_device_list, reset_list) {175/* For XGMI run all resets in parallel to speed up the process */176if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {177if (!queue_work(system_unbound_wq,178&tmp_adev->reset_cntl->reset_work))179r = -EALREADY;180} else181r = aldebaran_mode2_reset(tmp_adev);182if (r) {183dev_err(tmp_adev->dev,184"ASIC reset failed with error, %d for drm dev, %s",185r, adev_to_drm(tmp_adev)->unique);186break;187}188}189190/* For XGMI wait for all resets to complete before proceed */191if (!r) {192list_for_each_entry(tmp_adev, 
reset_device_list, reset_list) {193if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {194flush_work(&tmp_adev->reset_cntl->reset_work);195r = tmp_adev->asic_reset_res;196if (r)197break;198}199}200}201202list_for_each_entry(tmp_adev, reset_device_list, reset_list) {203mutex_unlock(&tmp_adev->reset_cntl->reset_lock);204tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;205}206207return r;208}209210static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)211{212struct amdgpu_firmware_info *ucode_list[AMDGPU_UCODE_ID_MAXIMUM];213uint32_t ip_block_mask = aldebaran_get_ip_block_mask(adev);214struct amdgpu_firmware_info *ucode;215struct amdgpu_ip_block *cmn_block;216struct amdgpu_ip_block *ih_block;217int ucode_count = 0;218int i, r;219220dev_dbg(adev->dev, "Reloading ucodes after reset\n");221for (i = 0; i < adev->firmware.max_ucodes; i++) {222ucode = &adev->firmware.ucode[i];223if (!ucode->fw)224continue;225switch (ucode->ucode_id) {226case AMDGPU_UCODE_ID_SDMA0:227case AMDGPU_UCODE_ID_SDMA1:228case AMDGPU_UCODE_ID_SDMA2:229case AMDGPU_UCODE_ID_SDMA3:230case AMDGPU_UCODE_ID_SDMA4:231case AMDGPU_UCODE_ID_SDMA5:232case AMDGPU_UCODE_ID_SDMA6:233case AMDGPU_UCODE_ID_SDMA7:234case AMDGPU_UCODE_ID_CP_MEC1:235case AMDGPU_UCODE_ID_CP_MEC1_JT:236case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL:237case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM:238case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM:239case AMDGPU_UCODE_ID_RLC_G:240ucode_list[ucode_count++] = ucode;241break;242default:243break;244}245}246247/* Reinit NBIF block */248cmn_block =249amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_COMMON);250if (unlikely(!cmn_block)) {251dev_err(adev->dev, "Failed to get BIF handle\n");252return -EINVAL;253}254r = amdgpu_ip_block_resume(cmn_block);255if (r)256return r;257258if (ip_block_mask & BIT(AMD_IP_BLOCK_TYPE_IH)) {259ih_block = amdgpu_device_ip_get_ip_block(adev,260AMD_IP_BLOCK_TYPE_IH);261if (unlikely(!ih_block)) {262dev_err(adev->dev, "Failed to get IH 
handle\n");263return -EINVAL;264}265r = amdgpu_ip_block_resume(ih_block);266if (r)267return r;268}269270/* Reinit GFXHUB */271adev->gfxhub.funcs->init(adev);272r = adev->gfxhub.funcs->gart_enable(adev);273if (r) {274dev_err(adev->dev, "GFXHUB gart reenable failed after reset\n");275return r;276}277278/* Reload GFX firmware */279r = psp_load_fw_list(&adev->psp, ucode_list, ucode_count);280if (r) {281dev_err(adev->dev, "GFX ucode load failed after reset\n");282return r;283}284285/* Resume RLC, FW needs RLC alive to complete reset process */286adev->gfx.rlc.funcs->resume(adev);287288/* Wait for FW reset event complete */289r = amdgpu_dpm_wait_for_event(adev, SMU_EVENT_RESET_COMPLETE, 0);290if (r) {291dev_err(adev->dev,292"Failed to get response from firmware after reset\n");293return r;294}295296for (i = 0; i < adev->num_ip_blocks; i++) {297if (!(adev->ip_blocks[i].version->type ==298AMD_IP_BLOCK_TYPE_GFX ||299adev->ip_blocks[i].version->type ==300AMD_IP_BLOCK_TYPE_SDMA))301continue;302303r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);304if (r)305return r;306}307308for (i = 0; i < adev->num_ip_blocks; i++) {309if (!(adev->ip_blocks[i].version->type ==310AMD_IP_BLOCK_TYPE_GFX ||311adev->ip_blocks[i].version->type ==312AMD_IP_BLOCK_TYPE_SDMA ||313adev->ip_blocks[i].version->type ==314AMD_IP_BLOCK_TYPE_COMMON))315continue;316317if (adev->ip_blocks[i].version->funcs->late_init) {318r = adev->ip_blocks[i].version->funcs->late_init(319&adev->ip_blocks[i]);320if (r) {321dev_err(adev->dev,322"late_init of IP block <%s> failed %d after reset\n",323adev->ip_blocks[i].version->funcs->name,324r);325return r;326}327}328adev->ip_blocks[i].status.late_initialized = true;329}330331amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);332amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);333334return r;335}336337static int338aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,339struct amdgpu_reset_context *reset_context)340{341struct list_head *reset_device_list 
= reset_context->reset_device_list;342struct amdgpu_device *tmp_adev = NULL;343struct amdgpu_ras *con;344int r;345346if (reset_device_list == NULL)347return -EINVAL;348349if (amdgpu_ip_version(reset_context->reset_req_dev, MP1_HWIP, 0) ==350IP_VERSION(13, 0, 2) &&351reset_context->hive == NULL) {352/* Wrong context, return error */353return -EINVAL;354}355356list_for_each_entry(tmp_adev, reset_device_list, reset_list) {357amdgpu_set_init_level(tmp_adev,358AMDGPU_INIT_LEVEL_RESET_RECOVERY);359dev_info(tmp_adev->dev,360"GPU reset succeeded, trying to resume\n");361/*TBD: Ideally should clear only GFX, SDMA blocks*/362amdgpu_ras_clear_err_state(tmp_adev);363r = aldebaran_mode2_restore_ip(tmp_adev);364if (r)365goto end;366367/*368* Add this ASIC as tracked as reset was already369* complete successfully.370*/371amdgpu_register_gpu_instance(tmp_adev);372373/* Resume RAS, ecc_irq */374con = amdgpu_ras_get_context(tmp_adev);375if (!amdgpu_sriov_vf(tmp_adev) && con) {376if (tmp_adev->sdma.ras &&377tmp_adev->sdma.ras->ras_block.ras_late_init) {378r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,379&tmp_adev->sdma.ras->ras_block.ras_comm);380if (r) {381dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);382goto end;383}384}385386if (tmp_adev->gfx.ras &&387tmp_adev->gfx.ras->ras_block.ras_late_init) {388r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,389&tmp_adev->gfx.ras->ras_block.ras_comm);390if (r) {391dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! 
ret:%d\n", r);392goto end;393}394}395}396397amdgpu_ras_resume(tmp_adev);398399/* Update PSP FW topology after reset */400if (reset_context->hive &&401tmp_adev->gmc.xgmi.num_physical_nodes > 1)402r = amdgpu_xgmi_update_topology(reset_context->hive,403tmp_adev);404405if (!r) {406amdgpu_set_init_level(tmp_adev,407AMDGPU_INIT_LEVEL_DEFAULT);408amdgpu_irq_gpu_reset_resume_helper(tmp_adev);409410r = amdgpu_ib_ring_tests(tmp_adev);411if (r) {412dev_err(tmp_adev->dev,413"ib ring test failed (%d).\n", r);414r = -EAGAIN;415tmp_adev->asic_reset_res = r;416goto end;417}418}419}420421end:422return r;423}424425static struct amdgpu_reset_handler aldebaran_mode2_handler = {426.reset_method = AMD_RESET_METHOD_MODE2,427.prepare_env = NULL,428.prepare_hwcontext = aldebaran_mode2_prepare_hwcontext,429.perform_reset = aldebaran_mode2_perform_reset,430.restore_hwcontext = aldebaran_mode2_restore_hwcontext,431.restore_env = NULL,432.do_reset = aldebaran_mode2_reset,433};434435static struct amdgpu_reset_handler436*aldebaran_rst_handlers[AMDGPU_RESET_MAX_HANDLERS] = {437&aldebaran_mode2_handler,438&xgmi_reset_on_init_handler,439};440441int aldebaran_reset_init(struct amdgpu_device *adev)442{443struct amdgpu_reset_control *reset_ctl;444445reset_ctl = kzalloc(sizeof(*reset_ctl), GFP_KERNEL);446if (!reset_ctl)447return -ENOMEM;448449reset_ctl->handle = adev;450reset_ctl->async_reset = aldebaran_async_reset;451reset_ctl->active_reset = AMD_RESET_METHOD_NONE;452reset_ctl->get_reset_handler = aldebaran_get_reset_handler;453454INIT_WORK(&reset_ctl->reset_work, reset_ctl->async_reset);455/* Only mode2 is handled through reset control now */456reset_ctl->reset_handlers = &aldebaran_rst_handlers;457458adev->reset_cntl = reset_ctl;459460return 0;461}462463int aldebaran_reset_fini(struct amdgpu_device *adev)464{465kfree(adev->reset_cntl);466adev->reset_cntl = NULL;467return 0;468}469470471