Path: drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <linux/list.h>
#include "amdgpu.h"

static const guid_t MCE		= CPER_NOTIFY_MCE;
static const guid_t CMC		= CPER_NOTIFY_CMC;
static const guid_t BOOT	= BOOT_TYPE;

static const guid_t CRASHDUMP	= AMD_CRASHDUMP;
static const guid_t RUNTIME	= AMD_GPU_NONSTANDARD_ERROR;

static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size)
{
	hdr->record_length += size;
}

static void amdgpu_cper_get_timestamp(struct cper_timestamp *timestamp)
{
	struct tm tm;
	time64_t now = ktime_get_real_seconds();

	time64_to_tm(now, 0, &tm);
	timestamp->seconds = tm.tm_sec;
	timestamp->minutes = tm.tm_min;
	timestamp->hours = tm.tm_hour;
	timestamp->flag = 0;
	timestamp->day = tm.tm_mday;
	timestamp->month = 1 + tm.tm_mon;
	timestamp->year = (1900 + tm.tm_year) % 100;
	timestamp->century = (1900 + tm.tm_year) / 100;
}

void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
				struct cper_hdr *hdr,
				enum amdgpu_cper_type type,
				enum cper_error_severity sev)
{
	char record_id[16];

	hdr->signature[0]	= 'C';
	hdr->signature[1]	= 'P';
	hdr->signature[2]	= 'E';
	hdr->signature[3]	= 'R';
	hdr->revision		= CPER_HDR_REV_1;
	hdr->signature_end	= 0xFFFFFFFF;
	hdr->error_severity	= sev;

	hdr->valid_bits.platform_id	= 1;
	hdr->valid_bits.partition_id	= 1;
	hdr->valid_bits.timestamp	= 1;

	amdgpu_cper_get_timestamp(&hdr->timestamp);

	snprintf(record_id, 9, "%d:%X",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) : 0,
		 atomic_inc_return(&adev->cper.unique_id));
	memcpy(hdr->record_id, record_id, 8);

	snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
		 adev->pdev->vendor, adev->pdev->device);
	/* pmfw version should be part of creator_id according to CPER spec */
	snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID_AMDGPU);

	switch (type) {
	case AMDGPU_CPER_TYPE_BOOT:
		hdr->notify_type = BOOT;
		break;
	case AMDGPU_CPER_TYPE_FATAL:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		hdr->notify_type = MCE;
		break;
	case AMDGPU_CPER_TYPE_RUNTIME:
		if (sev == CPER_SEV_NON_FATAL_CORRECTED)
			hdr->notify_type = CMC;
		else
			hdr->notify_type = MCE;
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type\n");
		break;
	}

	__inc_entry_length(hdr, HDR_LEN);
}
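/*
 * Worked example for the header fill above (illustrative, not driver
 * output): socket 0, cper.unique_id incremented to 0x1A, PCI vendor
 * 0x1002 (AMD) with an assumed placeholder device ID of 0x74A1, filled
 * at 2025-03-07 14:05:09 UTC:
 *
 *	record_id   = "0:1A"          (socket:counter, first 8 bytes kept)
 *	platform_id = "0x1002:0x74A1" (PCI vendor:device)
 *	timestamp   = { .century = 20, .year = 25, .month = 3, .day = 7,
 *			.hours = 14, .minutes = 5, .seconds = 9, .flag = 0 }
 */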
static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev,
					       struct cper_sec_desc *section_desc,
					       bool bp_threshold,
					       bool poison,
					       enum cper_error_severity sev,
					       guid_t sec_type,
					       uint32_t section_length,
					       uint32_t section_offset)
{
	section_desc->revision_minor		= CPER_SEC_MINOR_REV_1;
	section_desc->revision_major		= CPER_SEC_MAJOR_REV_22;
	section_desc->sec_offset		= section_offset;
	section_desc->sec_length		= section_length;
	section_desc->valid_bits.fru_text	= 1;
	section_desc->flag_bits.primary		= 1;
	section_desc->severity			= sev;
	section_desc->sec_type			= sec_type;

	snprintf(section_desc->fru_text, 20, "OAM%d",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) : 0);

	if (bp_threshold)
		section_desc->flag_bits.exceed_err_threshold = 1;
	if (poison)
		section_desc->flag_bits.latent_err = 1;

	return 0;
}

int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev,
					 struct cper_hdr *hdr,
					 uint32_t idx,
					 struct cper_sec_crashdump_reg_data reg_data)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_crashdump_fatal *section;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_crashdump_fatal *)((uint8_t *)hdr +
		  FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, false,
					    CPER_SEV_FATAL, CRASHDUMP, FATAL_SEC_LEN,
					    FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	section->body.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->body.reg_arr_size = sizeof(reg_data);
	section->body.data = reg_data;

	__inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN);

	return 0;
}
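/*
 * Record layout implied by the SEC_DESC_OFFSET()/FATAL_SEC_OFFSET()
 * macros used above (a sketch of one plausible reading, for a record
 * with sec_cnt == N):
 *
 *	+---------------------+ 0
 *	| cper_hdr            | HDR_LEN bytes
 *	+---------------------+ SEC_DESC_OFFSET(0)
 *	| descriptors 0..N-1  | N * SEC_DESC_LEN bytes
 *	+---------------------+ FATAL_SEC_OFFSET(N, 0)
 *	| sections 0..N-1     | N * FATAL_SEC_LEN bytes
 *	+---------------------+
 */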
int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev,
					   struct cper_hdr *hdr,
					   uint32_t idx,
					   enum cper_error_severity sev,
					   uint32_t *reg_dump,
					   uint32_t reg_count)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;
	bool poison;

	poison = (sev != CPER_SEV_NON_FATAL_CORRECTED);
	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
		  NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, poison,
					    sev, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	reg_count = umin(reg_count, CPER_ACA_REG_COUNT);

	section->hdr.valid_bits.err_info_cnt	= 1;
	section->hdr.valid_bits.err_context_cnt	= 1;

	section->info.error_type		= RUNTIME;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->ctx.reg_ctx_type		= CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size		= sizeof(section->ctx.reg_dump);

	memcpy(section->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t));

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev,
						      struct cper_hdr *hdr,
						      uint32_t idx)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
		  NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false,
					    CPER_SEV_FATAL, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	section->hdr.valid_bits.err_info_cnt	= 1;
	section->hdr.valid_bits.err_context_cnt	= 1;

	section->info.error_type		= RUNTIME;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->ctx.reg_ctx_type		= CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size		= sizeof(section->ctx.reg_dump);

	/* Hardcoded reg dump for bad page threshold CPER */
	section->ctx.reg_dump[CPER_ACA_REG_CTL_LO]	= 0x1;
	section->ctx.reg_dump[CPER_ACA_REG_CTL_HI]	= 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO]	= 0x137;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_HI]	= 0xB0000000;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_LO]	= 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_HI]	= 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_LO]	= 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI]	= 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO]	= 0x2;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI]	= 0x1ff;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_LO]	= 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_HI]	= 0x96;
	section->ctx.reg_dump[CPER_ACA_REG_SYND_LO]	= 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_SYND_HI]	= 0x0;

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
					 enum amdgpu_cper_type type,
					 uint16_t section_count)
{
	struct cper_hdr *hdr;
	uint32_t size = 0;

	size += HDR_LEN;
	size += (SEC_DESC_LEN * section_count);

	switch (type) {
	case AMDGPU_CPER_TYPE_RUNTIME:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		size += (NONSTD_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_FATAL:
		size += (FATAL_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_BOOT:
		size += (BOOT_SEC_LEN * section_count);
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type!\n");
		return NULL;
	}

	hdr = kzalloc(size, GFP_KERNEL);
	if (!hdr)
		return NULL;

	/* Save this early */
	hdr->sec_cnt = section_count;

	return hdr;
}
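/*
 * Sizing sketch for the allocation above (illustrative): a RUNTIME
 * record wrapping two ACA banks would be allocated as
 *
 *	size = HDR_LEN + 2 * SEC_DESC_LEN + 2 * NONSTD_SEC_LEN
 *
 * i.e. one shared header, then one section descriptor and one
 * non-standard error section per bank, matching the offsets used by
 * the fill helpers above.
 */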
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
				   struct aca_bank *bank)
{
	struct cper_hdr *fatal = NULL;
	struct cper_sec_crashdump_reg_data reg_data = { 0 };
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
	if (!fatal) {
		dev_err(adev->dev, "failed to alloc cper entry for ue record\n");
		return -ENOMEM;
	}

	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.addr_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.addr_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.ipid_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.ipid_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.synd_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
	reg_data.synd_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
	if (ret) {
		kfree(fatal);
		return ret;
	}

	amdgpu_cper_ring_write(ring, fatal, fatal->record_length);
	kfree(fatal);

	return 0;
}

int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)
{
	struct cper_hdr *bp_threshold = NULL;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	bp_threshold = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_BP_THRESHOLD, 1);
	if (!bp_threshold) {
		dev_err(adev->dev, "failed to alloc cper entry for bad page threshold record\n");
		return -ENOMEM;
	}

	amdgpu_cper_entry_fill_hdr(adev, bp_threshold,
				   AMDGPU_CPER_TYPE_BP_THRESHOLD,
				   CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_bad_page_threshold_section(adev, bp_threshold, 0);
	if (ret) {
		kfree(bp_threshold);
		return ret;
	}

	amdgpu_cper_ring_write(ring, bp_threshold, bp_threshold->record_length);
	kfree(bp_threshold);

	return 0;
}

static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
								enum aca_error_type aca_err_type)
{
	switch (aca_err_type) {
	case ACA_ERROR_TYPE_UE:
		return CPER_SEV_FATAL;
	case ACA_ERROR_TYPE_CE:
		return CPER_SEV_NON_FATAL_CORRECTED;
	case ACA_ERROR_TYPE_DEFERRED:
		return CPER_SEV_NON_FATAL_UNCORRECTED;
	default:
		dev_err(adev->dev, "Unknown ACA error type!\n");
		return CPER_SEV_FATAL;
	}
}
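/*
 * Quick reference for the mapping above, together with the notify type
 * that amdgpu_cper_entry_fill_hdr() derives from the record severity:
 *
 *	ACA_ERROR_TYPE_UE       -> CPER_SEV_FATAL                 (MCE)
 *	ACA_ERROR_TYPE_CE       -> CPER_SEV_NON_FATAL_CORRECTED   (CMC)
 *	ACA_ERROR_TYPE_DEFERRED -> CPER_SEV_NON_FATAL_UNCORRECTED (MCE)
 *
 * For runtime records, only an all-CE record keeps the CMC notify
 * type; a single deferred bank raises the record severity, and with it
 * the notify type, to MCE.
 */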
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
				    struct aca_banks *banks,
				    uint16_t bank_count)
{
	struct cper_hdr *corrected = NULL;
	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
	struct aca_bank_node *node;
	struct aca_bank *bank;
	uint32_t i = 0;
	int ret;

	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
	if (!corrected) {
		dev_err(adev->dev, "failed to allocate cper entry for ce records\n");
		return -ENOMEM;
	}

	/* Raise severity if any DE is detected in the ACA bank list */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
			break;
		}
	}

	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);

	/* Combine CE and DE in cper record */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		reg_data[CPER_ACA_REG_CTL_LO]	 = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_CTL_HI]	 = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_ADDR_LO]	 = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_ADDR_HI]	 = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_MISC0_LO]	 = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_MISC0_HI]	 = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_IPID_LO]	 = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_IPID_HI]	 = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_SYND_LO]	 = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
		reg_data[CPER_ACA_REG_SYND_HI]	 = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
				amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
				reg_data, CPER_ACA_REG_COUNT);
		if (ret) {
			kfree(corrected);
			return ret;
		}
	}

	amdgpu_cper_ring_write(ring, corrected, corrected->record_length);
	kfree(corrected);

	return 0;
}

static bool amdgpu_cper_is_hdr(struct amdgpu_ring *ring, u64 pos)
{
	struct cper_hdr *chdr;

	chdr = (struct cper_hdr *)&(ring->ring[pos]);
	return strcmp(chdr->signature, "CPER") ? false : true;
}

static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)
{
	struct cper_hdr *chdr;
	u64 p;
	u32 chunk, rec_len = 0;

	chdr = (struct cper_hdr *)&(ring->ring[pos]);
	chunk = ring->ring_size - (pos << 2);

	if (!strcmp(chdr->signature, "CPER")) {
		rec_len = chdr->record_length;
		goto calc;
	}

	/* Ring buffer is not full; there is no CPER data after ring->wptr */
	if (ring->count_dw)
		goto calc;

	for (p = pos + 1; p <= ring->buf_mask; p++) {
		chdr = (struct cper_hdr *)&(ring->ring[p]);
		if (!strcmp(chdr->signature, "CPER")) {
			rec_len = (p - pos) << 2;
			goto calc;
		}
	}

calc:
	if (!rec_len)
		return chunk;
	else
		return umin(rec_len, chunk);
}

void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
{
	u64 pos, wptr_old, rptr;
	int rec_cnt_dw = count >> 2;
	u32 chunk, ent_sz;
	u8 *s = (u8 *)src;

	if (count >= ring->ring_size - 4) {
		dev_err(ring->adev->dev,
			"CPER data size(%d) is larger than ring size(%d)\n",
			count, ring->ring_size - 4);
		return;
	}

	mutex_lock(&ring->adev->cper.ring_lock);

	wptr_old = ring->wptr;
	rptr = *ring->rptr_cpu_addr & ring->ptr_mask;

	while (count) {
		ent_sz = amdgpu_cper_ring_get_ent_sz(ring, ring->wptr);
		chunk = umin(ent_sz, count);

		memcpy(&ring->ring[ring->wptr], s, chunk);

		ring->wptr += (chunk >> 2);
		ring->wptr &= ring->ptr_mask;
		count -= chunk;
		s += chunk;
	}

	if (ring->count_dw < rec_cnt_dw)
		ring->count_dw = 0;

	/* The buffer has overflowed; adjust rptr */
	if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||
	    ((ring->wptr < wptr_old) && (wptr_old < rptr)) ||
	    ((rptr <= ring->wptr) && (ring->wptr < wptr_old))) {
		pos = (ring->wptr + 1) & ring->ptr_mask;

		do {
			ent_sz = amdgpu_cper_ring_get_ent_sz(ring, pos);

			rptr += (ent_sz >> 2);
			rptr &= ring->ptr_mask;
			*ring->rptr_cpu_addr = rptr;

			pos = rptr;
		} while (!amdgpu_cper_is_hdr(ring, rptr));
	}

	if (ring->count_dw >= rec_cnt_dw)
		ring->count_dw -= rec_cnt_dw;
	mutex_unlock(&ring->adev->cper.ring_lock);
}
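/*
 * Overflow sketch for the write path above (illustrative): the three
 * OR'd comparisons cover the wrap cases in which the new write pushed
 * wptr past an unread rptr on the circular buffer. When that happens,
 * rptr is stepped forward entry by entry, starting just past the new
 * wptr, until it lands on an intact "CPER" signature again, so readers
 * resume at the oldest record that survived being overwritten.
 */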
static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)
{
	return *(ring->rptr_cpu_addr);
}

static u64 amdgpu_cper_ring_get_wptr(struct amdgpu_ring *ring)
{
	return ring->wptr;
}

static const struct amdgpu_ring_funcs cper_ring_funcs = {
	.type = AMDGPU_RING_TYPE_CPER,
	.align_mask = 0xff,
	.support_64bit_ptrs = false,
	.get_rptr = amdgpu_cper_ring_get_rptr,
	.get_wptr = amdgpu_cper_ring_get_wptr,
};

static int amdgpu_cper_ring_init(struct amdgpu_device *adev)
{
	struct amdgpu_ring *ring = &(adev->cper.ring_buf);

	mutex_init(&adev->cper.ring_lock);

	ring->adev = NULL;
	ring->ring_obj = NULL;
	ring->use_doorbell = false;
	ring->no_scheduler = true;
	ring->funcs = &cper_ring_funcs;

	sprintf(ring->name, "cper");
	return amdgpu_ring_init(adev, ring, CPER_MAX_RING_SIZE, NULL, 0,
				AMDGPU_RING_PRIO_DEFAULT, NULL);
}

int amdgpu_cper_init(struct amdgpu_device *adev)
{
	int r;

	if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev))
		return 0;

	r = amdgpu_cper_ring_init(adev);
	if (r) {
		dev_err(adev->dev, "failed to initialize cper ring, r = %d\n", r);
		return r;
	}

	mutex_init(&adev->cper.cper_lock);

	adev->cper.enabled = true;
	adev->cper.max_count = CPER_MAX_ALLOWED_COUNT;

	return 0;
}

int amdgpu_cper_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev))
		return 0;

	adev->cper.enabled = false;

	amdgpu_ring_fini(&(adev->cper.ring_buf));
	adev->cper.count = 0;
	adev->cper.wptr = 0;

	return 0;
}
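/*
 * Minimal consumer sketch (illustrative only, not part of the driver,
 * hence compiled out): drain the ring record by record via the
 * rptr/wptr accessors above. It assumes the caller holds
 * adev->cper.ring_lock and that each record is contiguous; the write
 * path may split a record across the wrap point, which a real reader
 * would have to stitch back together.
 */
#if 0
static void example_drain_cper_ring(struct amdgpu_ring *ring)
{
	u64 rptr = amdgpu_cper_ring_get_rptr(ring) & ring->ptr_mask;

	while (rptr != amdgpu_cper_ring_get_wptr(ring)) {
		struct cper_hdr *hdr = (struct cper_hdr *)&ring->ring[rptr];

		/* hand hdr->record_length bytes to the consumer here */

		rptr = (rptr + (hdr->record_length >> 2)) & ring->ptr_mask;
	}
}
#endif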