/*
 * Source: GitHub repository awilliam/linux-vfio,
 * blob/master/arch/x86/kernel/cpu/mcheck/mce.c
 */
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/edac_mce.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"
/* Serializes readers of the MCE log buffer (/dev/mcelog). */
static DEFINE_MUTEX(mce_read_mutex);

/*
 * Dereference mcelog.next under either the RCU sched read lock or
 * mce_read_mutex; lockdep checks that one of the two is held.
 */
#define rcu_dereference_check_mce(p) \
	rcu_dereference_index_check((p), \
			      rcu_read_lock_sched_held() || \
			      lockdep_is_held(&mce_read_mutex))

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
62
int mce_disabled __read_mostly;
63
64
#define MISC_MCELOG_MINOR 227
65
66
#define SPINUNIT 100 /* 100ns */
67
68
atomic_t mce_entry;
69
70
DEFINE_PER_CPU(unsigned, mce_exception_count);
71
72
/*
73
* Tolerant levels:
74
* 0: always panic on uncorrected errors, log corrected errors
75
* 1: panic or SIGBUS on uncorrected errors, log corrected errors
76
* 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
77
* 3: never panic or SIGBUS, log all errors (for testing only)
78
*/
79
static int tolerant __read_mostly = 1;
80
static int banks __read_mostly;
81
static int rip_msr __read_mostly;
82
static int mce_bootlog __read_mostly = -1;
83
static int monarch_timeout __read_mostly = -1;
84
static int mce_panic_timeout __read_mostly;
85
static int mce_dont_log_ce __read_mostly;
86
int mce_cmci_disabled __read_mostly;
87
int mce_ignore_ce __read_mostly;
88
int mce_ser __read_mostly;
89
90
struct mce_bank *mce_banks __read_mostly;
91
92
/* User mode helper program triggered by machine check event */
93
static unsigned long mce_need_notify;
94
static char mce_helper[128];
95
static char *mce_helper_argv[2] = { mce_helper, NULL };
96
97
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
98
static DEFINE_PER_CPU(struct mce, mces_seen);
99
static int cpu_missing;
100
101
/*
102
* CPU/chipset specific EDAC code can register a notifier call here to print
103
* MCE errors in a human-readable form.
104
*/
105
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
106
EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
107
108
/* MCA banks polled by the period polling timer for corrected events */
109
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
110
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
111
};
112
113
static DEFINE_PER_CPU(struct work_struct, mce_work);
114
115
/* Do initial initialization of a struct mce */
116
void mce_setup(struct mce *m)
117
{
118
memset(m, 0, sizeof(struct mce));
119
m->cpu = m->extcpu = smp_processor_id();
120
rdtscll(m->tsc);
121
/* We hope get_seconds stays lockless */
122
m->time = get_seconds();
123
m->cpuvendor = boot_cpu_data.x86_vendor;
124
m->cpuid = cpuid_eax(1);
125
#ifdef CONFIG_SMP
126
m->socketid = cpu_data(m->extcpu).phys_proc_id;
127
#endif
128
m->apicid = cpu_data(m->extcpu).initial_apicid;
129
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
130
}
131
132
DEFINE_PER_CPU(struct mce, injectm);
133
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
134
135
/*
136
* Lockless MCE logging infrastructure.
137
* This avoids deadlocks on printk locks without having to break locks. Also
138
* separate MCEs from kernel messages to avoid bogus bug reports.
139
*/
140
141
static struct mce_log mcelog = {
142
.signature = MCE_LOG_SIGNATURE,
143
.len = MCE_LOG_LEN,
144
.recordlen = sizeof(struct mce),
145
};
146
147
void mce_log(struct mce *mce)
148
{
149
unsigned next, entry;
150
151
/* Emit the trace record: */
152
trace_mce_record(mce);
153
154
mce->finished = 0;
155
wmb();
156
for (;;) {
157
entry = rcu_dereference_check_mce(mcelog.next);
158
for (;;) {
159
/*
160
* If edac_mce is enabled, it will check the error type
161
* and will process it, if it is a known error.
162
* Otherwise, the error will be sent through mcelog
163
* interface
164
*/
165
if (edac_mce_parse(mce))
166
return;
167
168
/*
169
* When the buffer fills up discard new entries.
170
* Assume that the earlier errors are the more
171
* interesting ones:
172
*/
173
if (entry >= MCE_LOG_LEN) {
174
set_bit(MCE_OVERFLOW,
175
(unsigned long *)&mcelog.flags);
176
return;
177
}
178
/* Old left over entry. Skip: */
179
if (mcelog.entry[entry].finished) {
180
entry++;
181
continue;
182
}
183
break;
184
}
185
smp_rmb();
186
next = entry + 1;
187
if (cmpxchg(&mcelog.next, entry, next) == entry)
188
break;
189
}
190
memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
191
wmb();
192
mcelog.entry[entry].finished = 1;
193
wmb();
194
195
mce->finished = 1;
196
set_bit(0, &mce_need_notify);
197
}
198
199
static void print_mce(struct mce *m)
200
{
201
int ret = 0;
202
203
pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
204
m->extcpu, m->mcgstatus, m->bank, m->status);
205
206
if (m->ip) {
207
pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
208
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
209
m->cs, m->ip);
210
211
if (m->cs == __KERNEL_CS)
212
print_symbol("{%s}", m->ip);
213
pr_cont("\n");
214
}
215
216
pr_emerg(HW_ERR "TSC %llx ", m->tsc);
217
if (m->addr)
218
pr_cont("ADDR %llx ", m->addr);
219
if (m->misc)
220
pr_cont("MISC %llx ", m->misc);
221
222
pr_cont("\n");
223
pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
224
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
225
226
/*
227
* Print out human-readable details about the MCE error,
228
* (if the CPU has an implementation for that)
229
*/
230
ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
231
if (ret == NOTIFY_STOP)
232
return;
233
234
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
235
}
236
237
#define PANIC_TIMEOUT 5 /* 5 seconds */
238
239
static atomic_t mce_paniced;
240
241
static int fake_panic;
242
static atomic_t mce_fake_paniced;
243
244
/* Panic in progress. Enable interrupts and wait for final IPI */
245
static void wait_for_panic(void)
246
{
247
long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
248
249
preempt_disable();
250
local_irq_enable();
251
while (timeout-- > 0)
252
udelay(1);
253
if (panic_timeout == 0)
254
panic_timeout = mce_panic_timeout;
255
panic("Panicing machine check CPU died");
256
}
257
258
static void mce_panic(char *msg, struct mce *final, char *exp)
259
{
260
int i, apei_err = 0;
261
262
if (!fake_panic) {
263
/*
264
* Make sure only one CPU runs in machine check panic
265
*/
266
if (atomic_inc_return(&mce_paniced) > 1)
267
wait_for_panic();
268
barrier();
269
270
bust_spinlocks(1);
271
console_verbose();
272
} else {
273
/* Don't log too much for fake panic */
274
if (atomic_inc_return(&mce_fake_paniced) > 1)
275
return;
276
}
277
/* First print corrected ones that are still unlogged */
278
for (i = 0; i < MCE_LOG_LEN; i++) {
279
struct mce *m = &mcelog.entry[i];
280
if (!(m->status & MCI_STATUS_VAL))
281
continue;
282
if (!(m->status & MCI_STATUS_UC)) {
283
print_mce(m);
284
if (!apei_err)
285
apei_err = apei_write_mce(m);
286
}
287
}
288
/* Now print uncorrected but with the final one last */
289
for (i = 0; i < MCE_LOG_LEN; i++) {
290
struct mce *m = &mcelog.entry[i];
291
if (!(m->status & MCI_STATUS_VAL))
292
continue;
293
if (!(m->status & MCI_STATUS_UC))
294
continue;
295
if (!final || memcmp(m, final, sizeof(struct mce))) {
296
print_mce(m);
297
if (!apei_err)
298
apei_err = apei_write_mce(m);
299
}
300
}
301
if (final) {
302
print_mce(final);
303
if (!apei_err)
304
apei_err = apei_write_mce(final);
305
}
306
if (cpu_missing)
307
pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
308
if (exp)
309
pr_emerg(HW_ERR "Machine check: %s\n", exp);
310
if (!fake_panic) {
311
if (panic_timeout == 0)
312
panic_timeout = mce_panic_timeout;
313
panic(msg);
314
} else
315
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
316
}
317
318
/* Support code for software error injection */
319
320
static int msr_to_offset(u32 msr)
321
{
322
unsigned bank = __this_cpu_read(injectm.bank);
323
324
if (msr == rip_msr)
325
return offsetof(struct mce, ip);
326
if (msr == MSR_IA32_MCx_STATUS(bank))
327
return offsetof(struct mce, status);
328
if (msr == MSR_IA32_MCx_ADDR(bank))
329
return offsetof(struct mce, addr);
330
if (msr == MSR_IA32_MCx_MISC(bank))
331
return offsetof(struct mce, misc);
332
if (msr == MSR_IA32_MCG_STATUS)
333
return offsetof(struct mce, mcgstatus);
334
return -1;
335
}
336
337
/* MSR access wrappers used for error injection */
338
static u64 mce_rdmsrl(u32 msr)
339
{
340
u64 v;
341
342
if (__this_cpu_read(injectm.finished)) {
343
int offset = msr_to_offset(msr);
344
345
if (offset < 0)
346
return 0;
347
return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
348
}
349
350
if (rdmsrl_safe(msr, &v)) {
351
WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
352
/*
353
* Return zero in case the access faulted. This should
354
* not happen normally but can happen if the CPU does
355
* something weird, or if the code is buggy.
356
*/
357
v = 0;
358
}
359
360
return v;
361
}
362
363
static void mce_wrmsrl(u32 msr, u64 v)
364
{
365
if (__this_cpu_read(injectm.finished)) {
366
int offset = msr_to_offset(msr);
367
368
if (offset >= 0)
369
*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
370
return;
371
}
372
wrmsrl(msr, v);
373
}
374
375
/*
376
* Simple lockless ring to communicate PFNs from the exception handler with the
377
* process context work function. This is vastly simplified because there's
378
* only a single reader and a single writer.
379
*/
380
#define MCE_RING_SIZE 16 /* we use one entry less */
381
382
struct mce_ring {
383
unsigned short start;
384
unsigned short end;
385
unsigned long ring[MCE_RING_SIZE];
386
};
387
static DEFINE_PER_CPU(struct mce_ring, mce_ring);
388
389
/* Runs with CPU affinity in workqueue */
390
static int mce_ring_empty(void)
391
{
392
struct mce_ring *r = &__get_cpu_var(mce_ring);
393
394
return r->start == r->end;
395
}
396
397
static int mce_ring_get(unsigned long *pfn)
398
{
399
struct mce_ring *r;
400
int ret = 0;
401
402
*pfn = 0;
403
get_cpu();
404
r = &__get_cpu_var(mce_ring);
405
if (r->start == r->end)
406
goto out;
407
*pfn = r->ring[r->start];
408
r->start = (r->start + 1) % MCE_RING_SIZE;
409
ret = 1;
410
out:
411
put_cpu();
412
return ret;
413
}
414
415
/* Always runs in MCE context with preempt off */
416
static int mce_ring_add(unsigned long pfn)
417
{
418
struct mce_ring *r = &__get_cpu_var(mce_ring);
419
unsigned next;
420
421
next = (r->end + 1) % MCE_RING_SIZE;
422
if (next == r->start)
423
return -1;
424
r->ring[r->end] = pfn;
425
wmb();
426
r->end = next;
427
return 0;
428
}
429
430
int mce_available(struct cpuinfo_x86 *c)
431
{
432
if (mce_disabled)
433
return 0;
434
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
435
}
436
437
static void mce_schedule_work(void)
438
{
439
if (!mce_ring_empty()) {
440
struct work_struct *work = &__get_cpu_var(mce_work);
441
if (!work_pending(work))
442
schedule_work(work);
443
}
444
}
445
446
/*
447
* Get the address of the instruction at the time of the machine check
448
* error.
449
*/
450
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
451
{
452
453
if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
454
m->ip = regs->ip;
455
m->cs = regs->cs;
456
} else {
457
m->ip = 0;
458
m->cs = 0;
459
}
460
if (rip_msr)
461
m->ip = mce_rdmsrl(rip_msr);
462
}
463
#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when a MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif
481
static void mce_report_event(struct pt_regs *regs)
482
{
483
if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
484
mce_notify_irq();
485
/*
486
* Triggering the work queue here is just an insurance
487
* policy in case the syscall exit notify handler
488
* doesn't run soon enough or ends up running on the
489
* wrong CPU (can happen when audit sleeps)
490
*/
491
mce_schedule_work();
492
return;
493
}
494
495
#ifdef CONFIG_X86_LOCAL_APIC
496
/*
497
* Without APIC do not notify. The event will be picked
498
* up eventually.
499
*/
500
if (!cpu_has_apic)
501
return;
502
503
/*
504
* When interrupts are disabled we cannot use
505
* kernel services safely. Trigger an self interrupt
506
* through the APIC to instead do the notification
507
* after interrupts are reenabled again.
508
*/
509
apic->send_IPI_self(MCE_SELF_VECTOR);
510
511
/*
512
* Wait for idle afterwards again so that we don't leave the
513
* APIC in a non idle state because the normal APIC writes
514
* cannot exclude us.
515
*/
516
apic_wait_icr_idle();
517
#endif
518
}
519
520
DEFINE_PER_CPU(unsigned, mce_poll_count);
521
522
/*
523
* Poll for corrected events or events that happened before reset.
524
* Those are just logged through /dev/mcelog.
525
*
526
* This is executed in standard interrupt context.
527
*
528
* Note: spec recommends to panic for fatal unsignalled
529
* errors here. However this would be quite problematic --
530
* we would need to reimplement the Monarch handling and
531
* it would mess up the exclusion between exception handler
532
* and poll hander -- * so we skip this for now.
533
* These cases should not happen anyways, or only when the CPU
534
* is already totally * confused. In this case it's likely it will
535
* not fully execute the machine check handler either.
536
*/
537
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
538
{
539
struct mce m;
540
int i;
541
542
percpu_inc(mce_poll_count);
543
544
mce_setup(&m);
545
546
m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
547
for (i = 0; i < banks; i++) {
548
if (!mce_banks[i].ctl || !test_bit(i, *b))
549
continue;
550
551
m.misc = 0;
552
m.addr = 0;
553
m.bank = i;
554
m.tsc = 0;
555
556
barrier();
557
m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
558
if (!(m.status & MCI_STATUS_VAL))
559
continue;
560
561
/*
562
* Uncorrected or signalled events are handled by the exception
563
* handler when it is enabled, so don't process those here.
564
*
565
* TBD do the same check for MCI_STATUS_EN here?
566
*/
567
if (!(flags & MCP_UC) &&
568
(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
569
continue;
570
571
if (m.status & MCI_STATUS_MISCV)
572
m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
573
if (m.status & MCI_STATUS_ADDRV)
574
m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
575
576
if (!(flags & MCP_TIMESTAMP))
577
m.tsc = 0;
578
/*
579
* Don't get the IP here because it's unlikely to
580
* have anything to do with the actual error location.
581
*/
582
if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
583
mce_log(&m);
584
atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
585
}
586
587
/*
588
* Clear state for this bank.
589
*/
590
mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
591
}
592
593
/*
594
* Don't clear MCG_STATUS here because it's only defined for
595
* exceptions.
596
*/
597
598
sync_core();
599
}
600
EXPORT_SYMBOL_GPL(machine_check_poll);
601
602
/*
603
* Do a quick check if any of the events requires a panic.
604
* This decides if we keep the events around or clear them.
605
*/
606
static int mce_no_way_out(struct mce *m, char **msg)
607
{
608
int i;
609
610
for (i = 0; i < banks; i++) {
611
m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
612
if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
613
return 1;
614
}
615
return 0;
616
}
617
618
/*
619
* Variable to establish order between CPUs while scanning.
620
* Each CPU spins initially until executing is equal its number.
621
*/
622
static atomic_t mce_executing;
623
624
/*
625
* Defines order of CPUs on entry. First CPU becomes Monarch.
626
*/
627
static atomic_t mce_callin;
628
629
/*
630
* Check if a timeout waiting for other CPUs happened.
631
*/
632
static int mce_timed_out(u64 *t)
633
{
634
/*
635
* The others already did panic for some reason.
636
* Bail out like in a timeout.
637
* rmb() to tell the compiler that system_state
638
* might have been modified by someone else.
639
*/
640
rmb();
641
if (atomic_read(&mce_paniced))
642
wait_for_panic();
643
if (!monarch_timeout)
644
goto out;
645
if ((s64)*t < SPINUNIT) {
646
/* CHECKME: Make panic default for 1 too? */
647
if (tolerant < 1)
648
mce_panic("Timeout synchronizing machine check over CPUs",
649
NULL, NULL);
650
cpu_missing = 1;
651
return 1;
652
}
653
*t -= SPINUNIT;
654
out:
655
touch_nmi_watchdog();
656
return 0;
657
}
658
659
/*
660
* The Monarch's reign. The Monarch is the CPU who entered
661
* the machine check handler first. It waits for the others to
662
* raise the exception too and then grades them. When any
663
* error is fatal panic. Only then let the others continue.
664
*
665
* The other CPUs entering the MCE handler will be controlled by the
666
* Monarch. They are called Subjects.
667
*
668
* This way we prevent any potential data corruption in a unrecoverable case
669
* and also makes sure always all CPU's errors are examined.
670
*
671
* Also this detects the case of a machine check event coming from outer
672
* space (not detected by any CPUs) In this case some external agent wants
673
* us to shut down, so panic too.
674
*
675
* The other CPUs might still decide to panic if the handler happens
676
* in a unrecoverable place, but in this case the system is in a semi-stable
677
* state and won't corrupt anything by itself. It's ok to let the others
678
* continue for a bit first.
679
*
680
* All the spin loops have timeouts; when a timeout happens a CPU
681
* typically elects itself to be Monarch.
682
*/
683
static void mce_reign(void)
684
{
685
int cpu;
686
struct mce *m = NULL;
687
int global_worst = 0;
688
char *msg = NULL;
689
char *nmsg = NULL;
690
691
/*
692
* This CPU is the Monarch and the other CPUs have run
693
* through their handlers.
694
* Grade the severity of the errors of all the CPUs.
695
*/
696
for_each_possible_cpu(cpu) {
697
int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
698
&nmsg);
699
if (severity > global_worst) {
700
msg = nmsg;
701
global_worst = severity;
702
m = &per_cpu(mces_seen, cpu);
703
}
704
}
705
706
/*
707
* Cannot recover? Panic here then.
708
* This dumps all the mces in the log buffer and stops the
709
* other CPUs.
710
*/
711
if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
712
mce_panic("Fatal Machine check", m, msg);
713
714
/*
715
* For UC somewhere we let the CPU who detects it handle it.
716
* Also must let continue the others, otherwise the handling
717
* CPU could deadlock on a lock.
718
*/
719
720
/*
721
* No machine check event found. Must be some external
722
* source or one CPU is hung. Panic.
723
*/
724
if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
725
mce_panic("Machine check from unknown source", NULL, NULL);
726
727
/*
728
* Now clear all the mces_seen so that they don't reappear on
729
* the next mce.
730
*/
731
for_each_possible_cpu(cpu)
732
memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
733
}
734
735
static atomic_t global_nwo;
736
737
/*
738
* Start of Monarch synchronization. This waits until all CPUs have
739
* entered the exception handler and then determines if any of them
740
* saw a fatal event that requires panic. Then it executes them
741
* in the entry order.
742
* TBD double check parallel CPU hotunplug
743
*/
744
static int mce_start(int *no_way_out)
745
{
746
int order;
747
int cpus = num_online_cpus();
748
u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
749
750
if (!timeout)
751
return -1;
752
753
atomic_add(*no_way_out, &global_nwo);
754
/*
755
* global_nwo should be updated before mce_callin
756
*/
757
smp_wmb();
758
order = atomic_inc_return(&mce_callin);
759
760
/*
761
* Wait for everyone.
762
*/
763
while (atomic_read(&mce_callin) != cpus) {
764
if (mce_timed_out(&timeout)) {
765
atomic_set(&global_nwo, 0);
766
return -1;
767
}
768
ndelay(SPINUNIT);
769
}
770
771
/*
772
* mce_callin should be read before global_nwo
773
*/
774
smp_rmb();
775
776
if (order == 1) {
777
/*
778
* Monarch: Starts executing now, the others wait.
779
*/
780
atomic_set(&mce_executing, 1);
781
} else {
782
/*
783
* Subject: Now start the scanning loop one by one in
784
* the original callin order.
785
* This way when there are any shared banks it will be
786
* only seen by one CPU before cleared, avoiding duplicates.
787
*/
788
while (atomic_read(&mce_executing) < order) {
789
if (mce_timed_out(&timeout)) {
790
atomic_set(&global_nwo, 0);
791
return -1;
792
}
793
ndelay(SPINUNIT);
794
}
795
}
796
797
/*
798
* Cache the global no_way_out state.
799
*/
800
*no_way_out = atomic_read(&global_nwo);
801
802
return order;
803
}
804
805
/*
806
* Synchronize between CPUs after main scanning loop.
807
* This invokes the bulk of the Monarch processing.
808
*/
809
static int mce_end(int order)
810
{
811
int ret = -1;
812
u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
813
814
if (!timeout)
815
goto reset;
816
if (order < 0)
817
goto reset;
818
819
/*
820
* Allow others to run.
821
*/
822
atomic_inc(&mce_executing);
823
824
if (order == 1) {
825
/* CHECKME: Can this race with a parallel hotplug? */
826
int cpus = num_online_cpus();
827
828
/*
829
* Monarch: Wait for everyone to go through their scanning
830
* loops.
831
*/
832
while (atomic_read(&mce_executing) <= cpus) {
833
if (mce_timed_out(&timeout))
834
goto reset;
835
ndelay(SPINUNIT);
836
}
837
838
mce_reign();
839
barrier();
840
ret = 0;
841
} else {
842
/*
843
* Subject: Wait for Monarch to finish.
844
*/
845
while (atomic_read(&mce_executing) != 0) {
846
if (mce_timed_out(&timeout))
847
goto reset;
848
ndelay(SPINUNIT);
849
}
850
851
/*
852
* Don't reset anything. That's done by the Monarch.
853
*/
854
return 0;
855
}
856
857
/*
858
* Reset all global state.
859
*/
860
reset:
861
atomic_set(&global_nwo, 0);
862
atomic_set(&mce_callin, 0);
863
barrier();
864
865
/*
866
* Let others run again.
867
*/
868
atomic_set(&mce_executing, 0);
869
return ret;
870
}
871
872
/*
873
* Check if the address reported by the CPU is in a format we can parse.
874
* It would be possible to add code for most other cases, but all would
875
* be somewhat complicated (e.g. segment offset would require an instruction
876
* parser). So only support physical addresses up to page granuality for now.
877
*/
878
static int mce_usable_address(struct mce *m)
879
{
880
if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
881
return 0;
882
if ((m->misc & 0x3f) > PAGE_SHIFT)
883
return 0;
884
if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
885
return 0;
886
return 1;
887
}
888
889
static void mce_clear_state(unsigned long *toclear)
890
{
891
int i;
892
893
for (i = 0; i < banks; i++) {
894
if (test_bit(i, toclear))
895
mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
896
}
897
}
898
899
/*
900
* The actual machine check handler. This only handles real
901
* exceptions when something got corrupted coming in through int 18.
902
*
903
* This is executed in NMI context not subject to normal locking rules. This
904
* implies that most kernel services cannot be safely used. Don't even
905
* think about putting a printk in there!
906
*
907
* On Intel systems this is entered on all CPUs in parallel through
908
* MCE broadcast. However some CPUs might be broken beyond repair,
909
* so be always careful when synchronizing with others.
910
*/
911
void do_machine_check(struct pt_regs *regs, long error_code)
912
{
913
struct mce m, *final;
914
int i;
915
int worst = 0;
916
int severity;
917
/*
918
* Establish sequential order between the CPUs entering the machine
919
* check handler.
920
*/
921
int order;
922
/*
923
* If no_way_out gets set, there is no safe way to recover from this
924
* MCE. If tolerant is cranked up, we'll try anyway.
925
*/
926
int no_way_out = 0;
927
/*
928
* If kill_it gets set, there might be a way to recover from this
929
* error.
930
*/
931
int kill_it = 0;
932
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
933
char *msg = "Unknown";
934
935
atomic_inc(&mce_entry);
936
937
percpu_inc(mce_exception_count);
938
939
if (notify_die(DIE_NMI, "machine check", regs, error_code,
940
18, SIGKILL) == NOTIFY_STOP)
941
goto out;
942
if (!banks)
943
goto out;
944
945
mce_setup(&m);
946
947
m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
948
final = &__get_cpu_var(mces_seen);
949
*final = m;
950
951
no_way_out = mce_no_way_out(&m, &msg);
952
953
barrier();
954
955
/*
956
* When no restart IP must always kill or panic.
957
*/
958
if (!(m.mcgstatus & MCG_STATUS_RIPV))
959
kill_it = 1;
960
961
/*
962
* Go through all the banks in exclusion of the other CPUs.
963
* This way we don't report duplicated events on shared banks
964
* because the first one to see it will clear it.
965
*/
966
order = mce_start(&no_way_out);
967
for (i = 0; i < banks; i++) {
968
__clear_bit(i, toclear);
969
if (!mce_banks[i].ctl)
970
continue;
971
972
m.misc = 0;
973
m.addr = 0;
974
m.bank = i;
975
976
m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
977
if ((m.status & MCI_STATUS_VAL) == 0)
978
continue;
979
980
/*
981
* Non uncorrected or non signaled errors are handled by
982
* machine_check_poll. Leave them alone, unless this panics.
983
*/
984
if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
985
!no_way_out)
986
continue;
987
988
/*
989
* Set taint even when machine check was not enabled.
990
*/
991
add_taint(TAINT_MACHINE_CHECK);
992
993
severity = mce_severity(&m, tolerant, NULL);
994
995
/*
996
* When machine check was for corrected handler don't touch,
997
* unless we're panicing.
998
*/
999
if (severity == MCE_KEEP_SEVERITY && !no_way_out)
1000
continue;
1001
__set_bit(i, toclear);
1002
if (severity == MCE_NO_SEVERITY) {
1003
/*
1004
* Machine check event was not enabled. Clear, but
1005
* ignore.
1006
*/
1007
continue;
1008
}
1009
1010
/*
1011
* Kill on action required.
1012
*/
1013
if (severity == MCE_AR_SEVERITY)
1014
kill_it = 1;
1015
1016
if (m.status & MCI_STATUS_MISCV)
1017
m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
1018
if (m.status & MCI_STATUS_ADDRV)
1019
m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
1020
1021
/*
1022
* Action optional error. Queue address for later processing.
1023
* When the ring overflows we just ignore the AO error.
1024
* RED-PEN add some logging mechanism when
1025
* usable_address or mce_add_ring fails.
1026
* RED-PEN don't ignore overflow for tolerant == 0
1027
*/
1028
if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1029
mce_ring_add(m.addr >> PAGE_SHIFT);
1030
1031
mce_get_rip(&m, regs);
1032
mce_log(&m);
1033
1034
if (severity > worst) {
1035
*final = m;
1036
worst = severity;
1037
}
1038
}
1039
1040
if (!no_way_out)
1041
mce_clear_state(toclear);
1042
1043
/*
1044
* Do most of the synchronization with other CPUs.
1045
* When there's any problem use only local no_way_out state.
1046
*/
1047
if (mce_end(order) < 0)
1048
no_way_out = worst >= MCE_PANIC_SEVERITY;
1049
1050
/*
1051
* If we have decided that we just CAN'T continue, and the user
1052
* has not set tolerant to an insane level, give up and die.
1053
*
1054
* This is mainly used in the case when the system doesn't
1055
* support MCE broadcasting or it has been disabled.
1056
*/
1057
if (no_way_out && tolerant < 3)
1058
mce_panic("Fatal machine check on current CPU", final, msg);
1059
1060
/*
1061
* If the error seems to be unrecoverable, something should be
1062
* done. Try to kill as little as possible. If we can kill just
1063
* one task, do that. If the user has set the tolerance very
1064
* high, don't try to do anything at all.
1065
*/
1066
1067
if (kill_it && tolerant < 3)
1068
force_sig(SIGBUS, current);
1069
1070
/* notify userspace ASAP */
1071
set_thread_flag(TIF_MCE_NOTIFY);
1072
1073
if (worst > 0)
1074
mce_report_event(regs);
1075
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1076
out:
1077
atomic_dec(&mce_entry);
1078
sync_core();
1079
}
1080
EXPORT_SYMBOL_GPL(do_machine_check);
1081
1082
/* dummy to break dependency. actual code is in mm/memory-failure.c */
1083
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1084
{
1085
printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1086
}
1087
1088
/*
1089
* Called after mce notification in process context. This code
1090
* is allowed to sleep. Call the high level VM handler to process
1091
* any corrupted pages.
1092
* Assume that the work queue code only calls this one at a time
1093
* per CPU.
1094
* Note we don't disable preemption, so this code might run on the wrong
1095
* CPU. In this case the event is picked up by the scheduled work queue.
1096
* This is merely a fast path to expedite processing in some common
1097
* cases.
1098
*/
1099
void mce_notify_process(void)
1100
{
1101
unsigned long pfn;
1102
mce_notify_irq();
1103
while (mce_ring_get(&pfn))
1104
memory_failure(pfn, MCE_VECTOR);
1105
}
1106
/* Workqueue entry point: just forwards to mce_notify_process(). */
static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
1137
/*
1138
* Periodic polling timer for "silent" machine check errors. If the
1139
* poller finds an MCE, poll 2x faster. When the poller finds no more
1140
* errors, poll 2x slower (up to check_interval seconds).
1141
*/
1142
static int check_interval = 5 * 60; /* 5 minutes */
1143
1144
static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1145
static DEFINE_PER_CPU(struct timer_list, mce_timer);
1146
1147
static void mce_start_timer(unsigned long data)
1148
{
1149
struct timer_list *t = &per_cpu(mce_timer, data);
1150
int *n;
1151
1152
WARN_ON(smp_processor_id() != data);
1153
1154
if (mce_available(__this_cpu_ptr(&cpu_info))) {
1155
machine_check_poll(MCP_TIMESTAMP,
1156
&__get_cpu_var(mce_poll_banks));
1157
}
1158
1159
/*
1160
* Alert userspace if needed. If we logged an MCE, reduce the
1161
* polling interval, otherwise increase the polling interval.
1162
*/
1163
n = &__get_cpu_var(mce_next_interval);
1164
if (mce_notify_irq())
1165
*n = max(*n/2, HZ/100);
1166
else
1167
*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1168
1169
t->expires = jiffies + *n;
1170
add_timer_on(t, smp_processor_id());
1171
}
1172
1173
static void mce_do_trigger(struct work_struct *work)
1174
{
1175
call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1176
}
1177
1178
static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1179
1180
/*
1181
* Notify the user(s) about new machine check events.
1182
* Can be called from interrupt context, but not from machine check/NMI
1183
* context.
1184
*/
1185
int mce_notify_irq(void)
1186
{
1187
/* Not more than two messages every minute */
1188
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1189
1190
clear_thread_flag(TIF_MCE_NOTIFY);
1191
1192
if (test_and_clear_bit(0, &mce_need_notify)) {
1193
wake_up_interruptible(&mce_wait);
1194
1195
/*
1196
* There is no risk of missing notifications because
1197
* work_pending is always cleared before the function is
1198
* executed.
1199
*/
1200
if (mce_helper[0] && !work_pending(&mce_trigger_work))
1201
schedule_work(&mce_trigger_work);
1202
1203
if (__ratelimit(&ratelimit))
1204
pr_info(HW_ERR "Machine check events logged\n");
1205
1206
return 1;
1207
}
1208
return 0;
1209
}
1210
EXPORT_SYMBOL_GPL(mce_notify_irq);
1211
1212
/*
 * Allocate the global mce_banks array and initialize every bank as
 * fully enabled (all control bits set).  Returns 0 or -ENOMEM.
 */
static int __cpuinit __mcheck_cpu_mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
1229
* Initialize Machine Checks for a CPU.
1230
*/
1231
static int __cpuinit __mcheck_cpu_cap_init(void)
1232
{
1233
unsigned b;
1234
u64 cap;
1235
1236
rdmsrl(MSR_IA32_MCG_CAP, cap);
1237
1238
b = cap & MCG_BANKCNT_MASK;
1239
if (!banks)
1240
printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1241
1242
if (b > MAX_NR_BANKS) {
1243
printk(KERN_WARNING
1244
"MCE: Using only %u machine check banks out of %u\n",
1245
MAX_NR_BANKS, b);
1246
b = MAX_NR_BANKS;
1247
}
1248
1249
/* Don't support asymmetric configurations today */
1250
WARN_ON(banks != 0 && b != banks);
1251
banks = b;
1252
if (!mce_banks) {
1253
int err = __mcheck_cpu_mce_banks_init();
1254
1255
if (err)
1256
return err;
1257
}
1258
1259
/* Use accurate RIP reporting if available. */
1260
if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1261
rip_msr = MSR_IA32_MCG_EIP;
1262
1263
if (cap & MCG_SER_P)
1264
mce_ser = 1;
1265
1266
return 0;
1267
}
1268
1269
static void __mcheck_cpu_init_generic(void)
1270
{
1271
mce_banks_t all_banks;
1272
u64 cap;
1273
int i;
1274
1275
/*
1276
* Log the machine checks left over from the previous reset.
1277
*/
1278
bitmap_fill(all_banks, MAX_NR_BANKS);
1279
machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1280
1281
set_in_cr4(X86_CR4_MCE);
1282
1283
rdmsrl(MSR_IA32_MCG_CAP, cap);
1284
if (cap & MCG_CTL_P)
1285
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1286
1287
for (i = 0; i < banks; i++) {
1288
struct mce_bank *b = &mce_banks[i];
1289
1290
if (!b->init)
1291
continue;
1292
wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1293
wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1294
}
1295
}
1296
1297
/* Add per CPU specific workarounds here */
1298
static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1299
{
1300
if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1301
pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
1302
return -EOPNOTSUPP;
1303
}
1304
1305
/* This should be disabled by the BIOS, but isn't always */
1306
if (c->x86_vendor == X86_VENDOR_AMD) {
1307
if (c->x86 == 15 && banks > 4) {
1308
/*
1309
* disable GART TBL walk error reporting, which
1310
* trips off incorrectly with the IOMMU & 3ware
1311
* & Cerberus:
1312
*/
1313
clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1314
}
1315
if (c->x86 <= 17 && mce_bootlog < 0) {
1316
/*
1317
* Lots of broken BIOS around that don't clear them
1318
* by default and leave crap in there. Don't log:
1319
*/
1320
mce_bootlog = 0;
1321
}
1322
/*
1323
* Various K7s with broken bank 0 around. Always disable
1324
* by default.
1325
*/
1326
if (c->x86 == 6 && banks > 0)
1327
mce_banks[0].ctl = 0;
1328
}
1329
1330
if (c->x86_vendor == X86_VENDOR_INTEL) {
1331
/*
1332
* SDM documents that on family 6 bank 0 should not be written
1333
* because it aliases to another special BIOS controlled
1334
* register.
1335
* But it's not aliased anymore on model 0x1a+
1336
* Don't ignore bank 0 completely because there could be a
1337
* valid event later, merely don't write CTL0.
1338
*/
1339
1340
if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1341
mce_banks[0].init = 0;
1342
1343
/*
1344
* All newer Intel systems support MCE broadcasting. Enable
1345
* synchronization with a one second timeout.
1346
*/
1347
if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1348
monarch_timeout < 0)
1349
monarch_timeout = USEC_PER_SEC;
1350
1351
/*
1352
* There are also broken BIOSes on some Pentium M and
1353
* earlier systems:
1354
*/
1355
if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1356
mce_bootlog = 0;
1357
}
1358
if (monarch_timeout < 0)
1359
monarch_timeout = 0;
1360
if (mce_bootlog != 0)
1361
mce_panic_timeout = 30;
1362
1363
return 0;
1364
}
1365
1366
/* Set up the old-style (family 5) machine check handlers, if applicable. */
static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1381
{
1382
switch (c->x86_vendor) {
1383
case X86_VENDOR_INTEL:
1384
mce_intel_feature_init(c);
1385
break;
1386
case X86_VENDOR_AMD:
1387
mce_amd_feature_init(c);
1388
break;
1389
default:
1390
break;
1391
}
1392
}
1393
1394
static void __mcheck_cpu_init_timer(void)
1395
{
1396
struct timer_list *t = &__get_cpu_var(mce_timer);
1397
int *n = &__get_cpu_var(mce_next_interval);
1398
1399
setup_timer(t, mce_start_timer, smp_processor_id());
1400
1401
if (mce_ignore_ce)
1402
return;
1403
1404
*n = check_interval * HZ;
1405
if (!*n)
1406
return;
1407
t->expires = round_jiffies(jiffies + *n);
1408
add_timer_on(t, smp_processor_id());
1409
}
1410
1411
/* Handle unconfigured int18 (should never happen) */
1412
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1413
{
1414
printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1415
smp_processor_id());
1416
}
1417
1418
/* Call the installed machine check handler for this CPU setup. */
1419
void (*machine_check_vector)(struct pt_regs *, long error_code) =
1420
unexpected_machine_check;
1421
1422
/*
1423
* Called for each booted CPU to set up machine checks.
1424
* Must be called with preempt off:
1425
*/
1426
void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1427
{
1428
if (mce_disabled)
1429
return;
1430
1431
__mcheck_cpu_ancient_init(c);
1432
1433
if (!mce_available(c))
1434
return;
1435
1436
if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1437
mce_disabled = 1;
1438
return;
1439
}
1440
1441
machine_check_vector = do_machine_check;
1442
1443
__mcheck_cpu_init_generic();
1444
__mcheck_cpu_init_vendor(c);
1445
__mcheck_cpu_init_timer();
1446
INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1447
1448
}
1449
1450
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;		/* #times opened */
static int open_exclu;		/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
1459
{
1460
spin_lock(&mce_state_lock);
1461
1462
if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1463
spin_unlock(&mce_state_lock);
1464
1465
return -EBUSY;
1466
}
1467
1468
if (file->f_flags & O_EXCL)
1469
open_exclu = 1;
1470
open_count++;
1471
1472
spin_unlock(&mce_state_lock);
1473
1474
return nonseekable_open(inode, file);
1475
}
1476
1477
static int mce_release(struct inode *inode, struct file *file)
1478
{
1479
spin_lock(&mce_state_lock);
1480
1481
open_count--;
1482
open_exclu = 0;
1483
1484
spin_unlock(&mce_state_lock);
1485
1486
return 0;
1487
}
1488
1489
/* IPI callback: record this CPU's TSC into the shared array. */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;
1497
1498
/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1499
static int __mce_read_apei(char __user **ubuf, size_t usize)
1500
{
1501
int rc;
1502
u64 record_id;
1503
struct mce m;
1504
1505
if (usize < sizeof(struct mce))
1506
return -EINVAL;
1507
1508
rc = apei_read_mce(&m, &record_id);
1509
/* Error or no more MCE record */
1510
if (rc <= 0) {
1511
mce_apei_read_done = 1;
1512
return rc;
1513
}
1514
rc = -EFAULT;
1515
if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1516
return rc;
1517
/*
1518
* In fact, we should have cleared the record after that has
1519
* been flushed to the disk or sent to network in
1520
* /sbin/mcelog, but we have no interface to support that now,
1521
* so just clear it to avoid duplication.
1522
*/
1523
rc = apei_clear_mce(record_id);
1524
if (rc) {
1525
mce_apei_read_done = 1;
1526
return rc;
1527
}
1528
*ubuf += sizeof(struct mce);
1529
1530
return 0;
1531
}
1532
1533
/*
 * Read the in-kernel MCE log to userspace, clearing consumed entries.
 * Only full reads of the whole log buffer are supported.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	next = rcu_dereference_check_mce(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			/* Wait briefly for a writer to finish the entry. */
			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}

/* poll(2) support: readable when the log or APEI ERST has records. */
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_access_index(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1628
{
1629
int __user *p = (int __user *)arg;
1630
1631
if (!capable(CAP_SYS_ADMIN))
1632
return -EPERM;
1633
1634
switch (cmd) {
1635
case MCE_GET_RECORD_LEN:
1636
return put_user(sizeof(struct mce), p);
1637
case MCE_GET_LOG_LEN:
1638
return put_user(MCE_LOG_LEN, p);
1639
case MCE_GETCLEAR_FLAGS: {
1640
unsigned flags;
1641
1642
do {
1643
flags = mcelog.flags;
1644
} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1645
1646
return put_user(flags, p);
1647
}
1648
default:
1649
return -ENOTTY;
1650
}
1651
}
1652
1653
/* Modified in mce-inject.c, so not static or const */
1654
struct file_operations mce_chrdev_ops = {
1655
.open = mce_open,
1656
.release = mce_release,
1657
.read = mce_read,
1658
.poll = mce_poll,
1659
.unlocked_ioctl = mce_ioctl,
1660
.llseek = no_llseek,
1661
};
1662
EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1663
1664
static struct miscdevice mce_log_device = {
1665
MISC_MCELOG_MINOR,
1666
"mcelog",
1667
&mce_chrdev_ops,
1668
};
1669
1670
/*
1671
* mce=off Disables machine check
1672
* mce=no_cmci Disables CMCI
1673
* mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1674
* mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1675
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1676
* monarchtimeout is how long to wait for other CPUs on machine
1677
* check, or 0 to not wait
1678
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1679
* mce=nobootlog Don't log MCEs from before booting.
1680
*/
1681
static int __init mcheck_enable(char *str)
1682
{
1683
if (*str == 0) {
1684
enable_p5_mce();
1685
return 1;
1686
}
1687
if (*str == '=')
1688
str++;
1689
if (!strcmp(str, "off"))
1690
mce_disabled = 1;
1691
else if (!strcmp(str, "no_cmci"))
1692
mce_cmci_disabled = 1;
1693
else if (!strcmp(str, "dont_log_ce"))
1694
mce_dont_log_ce = 1;
1695
else if (!strcmp(str, "ignore_ce"))
1696
mce_ignore_ce = 1;
1697
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1698
mce_bootlog = (str[0] == 'b');
1699
else if (isdigit(str[0])) {
1700
get_option(&str, &tolerant);
1701
if (*str == ',') {
1702
++str;
1703
get_option(&str, &monarch_timeout);
1704
}
1705
} else {
1706
printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1707
str);
1708
return 0;
1709
}
1710
return 1;
1711
}
1712
__setup("mce", mcheck_enable);
1713
1714
/* Early init hook: set up the Intel thermal interrupt vector. */
int __init mcheck_init(void)
{
	mcheck_intel_therm_init();

	return 0;
}

/*
1722
* Sysfs support
1723
*/
1724
1725
/*
1726
* Disable machine checks on suspend and shutdown. We can't really handle
1727
* them later.
1728
*/
1729
static int mce_disable_error_reporting(void)
1730
{
1731
int i;
1732
1733
for (i = 0; i < banks; i++) {
1734
struct mce_bank *b = &mce_banks[i];
1735
1736
if (b->init)
1737
wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1738
}
1739
return 0;
1740
}
1741
1742
/* syscore suspend hook: silence all MCE banks. */
static int mce_suspend(void)
{
	return mce_disable_error_reporting();
}

/* syscore shutdown hook: silence all MCE banks. */
static void mce_shutdown(void)
{
	mce_disable_error_reporting();
}

/*
1753
* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1754
* Only one CPU is active at this time, the others get re-added later using
1755
* CPU hotplug:
1756
*/
1757
static void mce_resume(void)
1758
{
1759
__mcheck_cpu_init_generic();
1760
__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1761
}
1762
1763
static struct syscore_ops mce_syscore_ops = {
1764
.suspend = mce_suspend,
1765
.shutdown = mce_shutdown,
1766
.resume = mce_resume,
1767
};
1768
1769
static void mce_cpu_restart(void *data)
1770
{
1771
del_timer_sync(&__get_cpu_var(mce_timer));
1772
if (!mce_available(__this_cpu_ptr(&cpu_info)))
1773
return;
1774
__mcheck_cpu_init_generic();
1775
__mcheck_cpu_init_timer();
1776
}
1777
1778
/* Reinit MCEs after user configuration changes */
1779
static void mce_restart(void)
1780
{
1781
on_each_cpu(mce_cpu_restart, NULL, 1);
1782
}
1783
1784
/* Toggle features for corrected errors */
1785
static void mce_disable_ce(void *all)
1786
{
1787
if (!mce_available(__this_cpu_ptr(&cpu_info)))
1788
return;
1789
if (all)
1790
del_timer_sync(&__get_cpu_var(mce_timer));
1791
cmci_clear();
1792
}
1793
1794
static void mce_enable_ce(void *all)
1795
{
1796
if (!mce_available(__this_cpu_ptr(&cpu_info)))
1797
return;
1798
cmci_reenable();
1799
cmci_recheck();
1800
if (all)
1801
__mcheck_cpu_init_timer();
1802
}
1803
1804
static struct sysdev_class mce_sysclass = {
1805
.name = "machinecheck",
1806
};
1807
1808
DEFINE_PER_CPU(struct sys_device, mce_dev);
1809
1810
__cpuinitdata
1811
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1812
1813
static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
1814
{
1815
return container_of(attr, struct mce_bank, attr);
1816
}
1817
1818
static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1819
char *buf)
1820
{
1821
return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1822
}
1823
1824
/* sysfs store: update a bank control word and reinit MCE on all CPUs. */
static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
1839
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1840
{
1841
strcpy(buf, mce_helper);
1842
strcat(buf, "\n");
1843
return strlen(mce_helper) + 1;
1844
}
1845
1846
/* sysfs store: set the trigger program path, stripping a trailing newline. */
static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

/* sysfs store: toggle handling of corrected errors (polling + CMCI). */
static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	/* Only act when the value actually changes. */
	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

/* sysfs store: toggle CMCI without touching the poll timer. */
static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	/* Only act when the value actually changes. */
	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

/* sysfs store for int attributes that require an MCE reinit afterwards. */
static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1917
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1918
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1919
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
1920
1921
static struct sysdev_ext_attribute attr_check_interval = {
1922
_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1923
store_int_with_restart),
1924
&check_interval
1925
};
1926
1927
static struct sysdev_ext_attribute attr_ignore_ce = {
1928
_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
1929
&mce_ignore_ce
1930
};
1931
1932
static struct sysdev_ext_attribute attr_cmci_disabled = {
1933
_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
1934
&mce_cmci_disabled
1935
};
1936
1937
static struct sysdev_attribute *mce_attrs[] = {
1938
&attr_tolerant.attr,
1939
&attr_check_interval.attr,
1940
&attr_trigger,
1941
&attr_monarch_timeout.attr,
1942
&attr_dont_log_ce.attr,
1943
&attr_ignore_ce.attr,
1944
&attr_cmci_disabled.attr,
1945
NULL
1946
};
1947
1948
static cpumask_var_t mce_dev_initialized;
1949
1950
/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1951
static __cpuinit int mce_create_device(unsigned int cpu)
1952
{
1953
int err;
1954
int i, j;
1955
1956
if (!mce_available(&boot_cpu_data))
1957
return -EIO;
1958
1959
memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1960
per_cpu(mce_dev, cpu).id = cpu;
1961
per_cpu(mce_dev, cpu).cls = &mce_sysclass;
1962
1963
err = sysdev_register(&per_cpu(mce_dev, cpu));
1964
if (err)
1965
return err;
1966
1967
for (i = 0; mce_attrs[i]; i++) {
1968
err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1969
if (err)
1970
goto error;
1971
}
1972
for (j = 0; j < banks; j++) {
1973
err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1974
&mce_banks[j].attr);
1975
if (err)
1976
goto error2;
1977
}
1978
cpumask_set_cpu(cpu, mce_dev_initialized);
1979
1980
return 0;
1981
error2:
1982
while (--j >= 0)
1983
sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1984
error:
1985
while (--i >= 0)
1986
sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1987
1988
sysdev_unregister(&per_cpu(mce_dev, cpu));
1989
1990
return err;
1991
}
1992
1993
static __cpuinit void mce_remove_device(unsigned int cpu)
1994
{
1995
int i;
1996
1997
if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1998
return;
1999
2000
for (i = 0; mce_attrs[i]; i++)
2001
sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
2002
2003
for (i = 0; i < banks; i++)
2004
sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
2005
2006
sysdev_unregister(&per_cpu(mce_dev, cpu));
2007
cpumask_clear_cpu(cpu, mce_dev_initialized);
2008
}
2009
2010
/* Make sure there are no machine checks on offlined CPUs. */
2011
static void __cpuinit mce_disable_cpu(void *h)
2012
{
2013
unsigned long action = *(unsigned long *)h;
2014
int i;
2015
2016
if (!mce_available(__this_cpu_ptr(&cpu_info)))
2017
return;
2018
2019
if (!(action & CPU_TASKS_FROZEN))
2020
cmci_clear();
2021
for (i = 0; i < banks; i++) {
2022
struct mce_bank *b = &mce_banks[i];
2023
2024
if (b->init)
2025
wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2026
}
2027
}
2028
2029
/* Re-enable machine checks on a CPU after a failed offline. */
static void __cpuinit mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
2048
static int __cpuinit
2049
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2050
{
2051
unsigned int cpu = (unsigned long)hcpu;
2052
struct timer_list *t = &per_cpu(mce_timer, cpu);
2053
2054
switch (action) {
2055
case CPU_ONLINE:
2056
case CPU_ONLINE_FROZEN:
2057
mce_create_device(cpu);
2058
if (threshold_cpu_callback)
2059
threshold_cpu_callback(action, cpu);
2060
break;
2061
case CPU_DEAD:
2062
case CPU_DEAD_FROZEN:
2063
if (threshold_cpu_callback)
2064
threshold_cpu_callback(action, cpu);
2065
mce_remove_device(cpu);
2066
break;
2067
case CPU_DOWN_PREPARE:
2068
case CPU_DOWN_PREPARE_FROZEN:
2069
del_timer_sync(t);
2070
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2071
break;
2072
case CPU_DOWN_FAILED:
2073
case CPU_DOWN_FAILED_FROZEN:
2074
if (!mce_ignore_ce && check_interval) {
2075
t->expires = round_jiffies(jiffies +
2076
__get_cpu_var(mce_next_interval));
2077
add_timer_on(t, cpu);
2078
}
2079
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2080
break;
2081
case CPU_POST_DEAD:
2082
/* intentionally ignoring frozen here */
2083
cmci_rediscover(cpu);
2084
break;
2085
}
2086
return NOTIFY_OK;
2087
}
2088
2089
static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
2094
{
2095
int i;
2096
2097
for (i = 0; i < banks; i++) {
2098
struct mce_bank *b = &mce_banks[i];
2099
struct sysdev_attribute *a = &b->attr;
2100
2101
sysfs_attr_init(&a->attr);
2102
a->attr.name = b->attrname;
2103
snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2104
2105
a->attr.mode = 0644;
2106
a->show = show_bank;
2107
a->store = set_bank;
2108
}
2109
}
2110
2111
static __init int mcheck_init_device(void)
2112
{
2113
int err;
2114
int i = 0;
2115
2116
if (!mce_available(&boot_cpu_data))
2117
return -EIO;
2118
2119
zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
2120
2121
mce_init_banks();
2122
2123
err = sysdev_class_register(&mce_sysclass);
2124
if (err)
2125
return err;
2126
2127
for_each_online_cpu(i) {
2128
err = mce_create_device(i);
2129
if (err)
2130
return err;
2131
}
2132
2133
register_syscore_ops(&mce_syscore_ops);
2134
register_hotcpu_notifier(&mce_cpu_notifier);
2135
misc_register(&mce_log_device);
2136
2137
return err;
2138
}
2139
2140
device_initcall(mcheck_init_device);
2141
2142
/*
2143
* Old style boot options parsing. Only for compatibility.
2144
*/
2145
static int __init mcheck_disable(char *str)
2146
{
2147
mce_disabled = 1;
2148
return 1;
2149
}
2150
__setup("nomce", mcheck_disable);
2151
2152
#ifdef CONFIG_DEBUG_FS
/* Lazily create and return the shared "mce" debugfs directory. */
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
2164
{
2165
cpu_missing = 0;
2166
atomic_set(&mce_fake_paniced, 0);
2167
atomic_set(&mce_executing, 0);
2168
atomic_set(&mce_callin, 0);
2169
atomic_set(&global_nwo, 0);
2170
}
2171
2172
/* debugfs read accessor for the fake_panic flag. */
static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

/* debugfs write accessor: reset MCE state and update fake_panic. */
static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2186
fake_panic_set, "%llu\n");
2187
2188
/* Create the "fake_panic" debugfs file under the mce directory. */
static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif