Path: blob/master/arch/x86/kernel/cpu/mcheck/mce.c
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/edac_mce.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

static DEFINE_MUTEX(mce_read_mutex);

#define rcu_dereference_check_mce(p) \
	rcu_dereference_index_check((p), \
				    rcu_read_lock_sched_held() || \
				    lockdep_is_held(&mce_read_mutex))

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant		__read_mostly = 1;
static int banks		__read_mostly;
static int rip_msr		__read_mostly;
static int mce_bootlog		__read_mostly = -1;
static int monarch_timeout	__read_mostly = -1;
static int mce_panic_timeout	__read_mostly;
static int mce_dont_log_ce	__read_mostly;
int mce_cmci_disabled		__read_mostly;
int mce_ignore_ce		__read_mostly;
int mce_ser			__read_mostly;

struct mce_bank *mce_banks	__read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);

/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	/* Emit the trace record: */
	trace_mce_record(mce);

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference_check_mce(mcelog.next);
		for (;;) {
			/*
			 * If edac_mce is enabled, it will check the error type
			 * and will process it, if it is a known error.
			 * Otherwise, the error will be sent through mcelog
			 * interface
			 */
			if (edac_mce_parse(mce))
				return;

			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

static void print_mce(struct mce *m)
{
	int ret = 0;

	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
		 m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
			 m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
		 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);

	/*
	 * Print out human-readable details about the MCE error,
	 * (if the CPU has an implementation for that)
	 */
	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
	if (ret == NOTIFY_STOP)
		return;

	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

static int fake_panic;
static atomic_t mce_fake_paniced;
/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicing machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i, apei_err = 0;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_paniced) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_paniced) > 1)
			return;
	}
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce))) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (cpu_missing)
		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mce_panic_timeout;
		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}
/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{

	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when a MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyways, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	percpu_inc(mce_poll_count);

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
			mce_log(&m);
			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPUs). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * We must also let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if ((m->misc & 0x3f) > PAGE_SHIFT)
		return 0;
	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
		return 0;
	return 1;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	percpu_inc(mce_exception_count);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	final = &__get_cpu_var(mces_seen);
	*final = m;

	no_way_out = mce_no_way_out(&m, &msg);

	barrier();

	/*
	 * When no restart IP must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected or non signaled errors are handled by
		 * machine_check_poll. Leave them alone, unless this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When machine check was for corrected handler don't touch,
		 * unless we're panicing.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_get_rip(&m, regs);
		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */

	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

/* dummy to break dependency. actual code is in mm/memory-failure.c */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	mce_notify_irq();
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}

static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mce_start_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(__this_cpu_ptr(&cpu_info))) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(mce_next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer_on(t, smp_processor_id());
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			pr_info(HW_ERR "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int __cpuinit __mcheck_cpu_mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __cpuinit __mcheck_cpu_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	if (!banks)
		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!mce_banks) {
		int err = __mcheck_cpu_mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}

static void __mcheck_cpu_init_generic(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			mce_banks[0].ctl = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;

		/*
		 * There are also broken BIOSes on some Pentium M and
		 * earlier systems:
		 */
		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
			mce_bootlog = 0;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;

	return 0;
}

static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(mce_next_interval);

	setup_timer(t, mce_start_timer, smp_processor_id());

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	t->expires = round_jiffies(jiffies + *n);
	add_timer_on(t, smp_processor_id());
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	__mcheck_cpu_ancient_init(c);

	if (!mce_available(c))
		return;

	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		mce_disabled = 1;
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);

}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;		/* #times opened */
static int open_exclu;		/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * In fact, we should have cleared the record after that has
	 * been flushed to the disk or sent to network in
	 * /sbin/mcelog, but we have no interface to support that now,
	 * so just clear it to avoid duplication.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	*ubuf += sizeof(struct mce);

	return 0;
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	next = rcu_dereference_check_mce(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_access_index(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
	.llseek			= no_llseek,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off Disables machine check
 * mce=no_cmci Disables CMCI
 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

int __init mcheck_init(void)
{
	mcheck_intel_therm_init();

	return 0;
}

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_suspend(void)
{
	return mce_disable_error_reporting();
}

static void mce_shutdown(void)
{
	mce_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_resume(void)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
}

static struct syscore_ops mce_syscore_ops = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
};

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_ce(void *all)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	if (all)
		del_timer_sync(&__get_cpu_var(mce_timer));
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		__mcheck_cpu_init_timer();
}

static struct sysdev_class mce_sysclass = {
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id	= cpu;
	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--j >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuinit mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void __cpuinit mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		if (!mce_ignore_ce && check_interval) {
			t->expires = round_jiffies(jiffies +
					__get_cpu_var(mce_next_interval));
			add_timer_on(t, cpu);
		}
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct sysdev_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	mce_init_banks();

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_syscore_ops(&mce_syscore_ops);
	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif