Path: blob/master/arch/powerpc/platforms/powernv/opal-hmi.c
26481 views
// SPDX-License-Identifier: GPL-2.0-or-later1/*2* OPAL hypervisor Maintenance interrupt handling support in PowerNV.3*4* Copyright 2014 IBM Corporation5* Author: Mahesh Salgaonkar <[email protected]>6*/78#undef DEBUG910#include <linux/kernel.h>11#include <linux/init.h>12#include <linux/of.h>13#include <linux/mm.h>14#include <linux/slab.h>1516#include <asm/opal.h>17#include <asm/cputable.h>18#include <asm/machdep.h>1920#include "powernv.h"2122static int opal_hmi_handler_nb_init;23struct OpalHmiEvtNode {24struct list_head list;25struct OpalHMIEvent hmi_evt;26};2728struct xstop_reason {29uint32_t xstop_reason;30const char *unit_failed;31const char *description;32};3334static LIST_HEAD(opal_hmi_evt_list);35static DEFINE_SPINLOCK(opal_hmi_evt_lock);3637static void print_core_checkstop_reason(const char *level,38struct OpalHMIEvent *hmi_evt)39{40int i;41static const struct xstop_reason xstop_reason[] = {42{ CORE_CHECKSTOP_IFU_REGFILE, "IFU",43"RegFile core check stop" },44{ CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },45{ CORE_CHECKSTOP_PC_DURING_RECOV, "PC",46"Core checkstop during recovery" },47{ CORE_CHECKSTOP_ISU_REGFILE, "ISU",48"RegFile core check stop (mapper error)" },49{ CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },50{ CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },51{ CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },52{ CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",53"Recovery in maintenance mode" },54{ CORE_CHECKSTOP_LSU_REGFILE, "LSU",55"RegFile core check stop" },56{ CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",57"Forward Progress Error" },58{ CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },59{ CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },60{ CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",61"Hypervisor Resource error - core check stop" },62{ CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",63"Hang Recovery Failed (core check stop)" },64{ CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",65"Ambiguous Hang Detected (unknown source)" },66{ CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",67"Debug Trigger Error inject" },68{ CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",69"Hypervisor check stop via SPRC/SPRD" },70};7172/* Validity check */73if (!hmi_evt->u.xstop_error.xstop_reason) {74printk("%s Unknown Core check stop.\n", level);75return;76}7778printk("%s CPU PIR: %08x\n", level,79be32_to_cpu(hmi_evt->u.xstop_error.u.pir));80for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)81if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &82xstop_reason[i].xstop_reason)83printk("%s [Unit: %-3s] %s\n", level,84xstop_reason[i].unit_failed,85xstop_reason[i].description);86}8788static void print_nx_checkstop_reason(const char *level,89struct OpalHMIEvent *hmi_evt)90{91int i;92static const struct xstop_reason xstop_reason[] = {93{ NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",94"SHM invalid state error" },95{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",96"DMA invalid state error bit 15" },97{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",98"DMA invalid state error bit 16" },99{ NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",100"Channel 0 invalid state error" },101{ NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",102"Channel 1 invalid state error" },103{ NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",104"Channel 2 invalid state error" },105{ NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",106"Channel 3 invalid state error" },107{ NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",108"Channel 4 invalid state error" },109{ NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",110"Channel 5 invalid state error" },111{ NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",112"Channel 6 invalid state error" },113{ NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",114"Channel 7 invalid state error" },115{ NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",116"UE error on CRB(CSB address, CCB)" },117{ NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",118"SUE error on CRB(CSB address, CCB)" },119{ NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",120"CRB Kill ISN received while holding ISN with UE error" },121};122123/* Validity check */124if (!hmi_evt->u.xstop_error.xstop_reason) {125printk("%s Unknown NX check stop.\n", level);126return;127}128129printk("%s NX checkstop on CHIP ID: %x\n", level,130be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));131for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)132if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &133xstop_reason[i].xstop_reason)134printk("%s [Unit: %-3s] %s\n", level,135xstop_reason[i].unit_failed,136xstop_reason[i].description);137}138139static void print_npu_checkstop_reason(const char *level,140struct OpalHMIEvent *hmi_evt)141{142uint8_t reason, reason_count, i;143144/*145* We may not have a checkstop reason on some combination of146* hardware and/or skiboot version147*/148if (!hmi_evt->u.xstop_error.xstop_reason) {149printk("%s NPU checkstop on chip %x\n", level,150be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));151return;152}153154/*155* NPU2 has 3 FIRs. Reason encoded on a byte as:156* 2 bits for the FIR number157* 6 bits for the bit number158* It may be possible to find several reasons.159*160* We don't display a specific message per FIR bit as there161* are too many and most are meaningless without the workbook162* and/or hw team help anyway.163*/164reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /165sizeof(reason);166for (i = 0; i < reason_count; i++) {167reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;168if (reason)169printk("%s NPU checkstop on chip %x: FIR%d bit %d is set\n",170level,171be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),172reason >> 6, reason & 0x3F);173}174}175176static void print_checkstop_reason(const char *level,177struct OpalHMIEvent *hmi_evt)178{179uint8_t type = hmi_evt->u.xstop_error.xstop_type;180switch (type) {181case CHECKSTOP_TYPE_CORE:182print_core_checkstop_reason(level, hmi_evt);183break;184case CHECKSTOP_TYPE_NX:185print_nx_checkstop_reason(level, hmi_evt);186break;187case CHECKSTOP_TYPE_NPU:188print_npu_checkstop_reason(level, hmi_evt);189break;190default:191printk("%s Unknown Malfunction Alert of type %d\n",192level, type);193break;194}195}196197static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)198{199const char *level, *sevstr, *error_info;200static const char *hmi_error_types[] = {201"Malfunction Alert",202"Processor Recovery done",203"Processor recovery occurred again",204"Processor recovery occurred for masked error",205"Timer facility experienced an error",206"TFMR SPR is corrupted",207"UPS (Uninterrupted Power System) Overflow indication",208"An XSCOM operation failure",209"An XSCOM operation completed",210"SCOM has set a reserved FIR bit to cause recovery",211"Debug trigger has set a reserved FIR bit to cause recovery",212"A hypervisor resource error occurred",213"CAPP recovery process is in progress",214};215static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,216DEFAULT_RATELIMIT_BURST);217218/* Print things out */219if (hmi_evt->version < OpalHMIEvt_V1) {220pr_err("HMI Interrupt, Unknown event version %d !\n",221hmi_evt->version);222return;223}224switch (hmi_evt->severity) {225case OpalHMI_SEV_NO_ERROR:226level = KERN_INFO;227sevstr = "Harmless";228break;229case OpalHMI_SEV_WARNING:230level = KERN_WARNING;231sevstr = "";232break;233case OpalHMI_SEV_ERROR_SYNC:234level = KERN_ERR;235sevstr = "Severe";236break;237case OpalHMI_SEV_FATAL:238default:239level = KERN_ERR;240sevstr = "Fatal";241break;242}243244if (hmi_evt->severity != OpalHMI_SEV_NO_ERROR || __ratelimit(&rs)) {245printk("%s%s Hypervisor Maintenance interrupt [%s]\n",246level, sevstr,247hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?248"Recovered" : "Not recovered");249error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?250hmi_error_types[hmi_evt->type]251: "Unknown";252printk("%s Error detail: %s\n", level, error_info);253printk("%s HMER: %016llx\n", level,254be64_to_cpu(hmi_evt->hmer));255if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||256(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))257printk("%s TFMR: %016llx\n", level,258be64_to_cpu(hmi_evt->tfmr));259}260261if (hmi_evt->version < OpalHMIEvt_V2)262return;263264/* OpalHMIEvt_V2 and above provides reason for malfunction alert. */265if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)266print_checkstop_reason(level, hmi_evt);267}268269static void hmi_event_handler(struct work_struct *work)270{271unsigned long flags;272struct OpalHMIEvent *hmi_evt;273struct OpalHmiEvtNode *msg_node;274uint8_t disposition;275struct opal_msg msg;276int unrecoverable = 0;277278spin_lock_irqsave(&opal_hmi_evt_lock, flags);279while (!list_empty(&opal_hmi_evt_list)) {280msg_node = list_entry(opal_hmi_evt_list.next,281struct OpalHmiEvtNode, list);282list_del(&msg_node->list);283spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);284285hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;286print_hmi_event_info(hmi_evt);287disposition = hmi_evt->disposition;288kfree(msg_node);289290/*291* Check if HMI event has been recovered or not. If not292* then kernel can't continue, we need to panic.293* But before we do that, display all the HMI event294* available on the list and set unrecoverable flag to 1.295*/296if (disposition != OpalHMI_DISPOSITION_RECOVERED)297unrecoverable = 1;298299spin_lock_irqsave(&opal_hmi_evt_lock, flags);300}301spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);302303if (unrecoverable) {304/* Pull all HMI events from OPAL before we panic. */305while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {306u32 type;307308type = be32_to_cpu(msg.msg_type);309310/* skip if not HMI event */311if (type != OPAL_MSG_HMI_EVT)312continue;313314/* HMI event info starts from param[0] */315hmi_evt = (struct OpalHMIEvent *)&msg.params[0];316print_hmi_event_info(hmi_evt);317}318319pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");320}321}322323static DECLARE_WORK(hmi_event_work, hmi_event_handler);324/*325* opal_handle_hmi_event - notifier handler that queues up HMI events326* to be preocessed later.327*/328static int opal_handle_hmi_event(struct notifier_block *nb,329unsigned long msg_type, void *msg)330{331unsigned long flags;332struct OpalHMIEvent *hmi_evt;333struct opal_msg *hmi_msg = msg;334struct OpalHmiEvtNode *msg_node;335336/* Sanity Checks */337if (msg_type != OPAL_MSG_HMI_EVT)338return 0;339340/* HMI event info starts from param[0] */341hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];342343/* Delay the logging of HMI events to workqueue. */344msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);345if (!msg_node) {346pr_err("HMI: out of memory, Opal message event not handled\n");347return -ENOMEM;348}349memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));350351spin_lock_irqsave(&opal_hmi_evt_lock, flags);352list_add(&msg_node->list, &opal_hmi_evt_list);353spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);354355schedule_work(&hmi_event_work);356return 0;357}358359static struct notifier_block opal_hmi_handler_nb = {360.notifier_call = opal_handle_hmi_event,361.next = NULL,362.priority = 0,363};364365int __init opal_hmi_handler_init(void)366{367int ret;368369if (!opal_hmi_handler_nb_init) {370ret = opal_message_notifier_register(371OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);372if (ret) {373pr_err("%s: Can't register OPAL event notifier (%d)\n",374__func__, ret);375return ret;376}377opal_hmi_handler_nb_init = 1;378}379return 0;380}381382383