Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kernel/cpu/mce/threshold.c
49639 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Common corrected MCE threshold handler code:
4
*/
5
#include <linux/interrupt.h>
6
#include <linux/kernel.h>
7
8
#include <asm/irq_vectors.h>
9
#include <asm/traps.h>
10
#include <asm/apic.h>
11
#include <asm/mce.h>
12
#include <asm/trace/irq_vectors.h>
13
14
#include "internal.h"
15
16
/*
 * Corrected error threshold limit; written by mce_save_apei_thr_limit()
 * and returned by mce_get_apei_thr_limit().
 */
static u32 mce_apei_thr_limit;
17
18
/*
 * Remember the corrected error threshold limit handed in by the caller and
 * report the value in the kernel log.
 *
 * @thr_limit: threshold limit to store.
 */
void mce_save_apei_thr_limit(u32 thr_limit)
{
	mce_apei_thr_limit = thr_limit;

	pr_info("HEST corrected error threshold limit: %u\n", thr_limit);
}
23
24
u32 mce_get_apei_thr_limit(void)
25
{
26
return mce_apei_thr_limit;
27
}
28
29
static void default_threshold_interrupt(void)
30
{
31
pr_err("Unexpected threshold interrupt at vector %x\n",
32
THRESHOLD_APIC_VECTOR);
33
}
34
35
/*
 * Handler called from the threshold sysvec entry point. It starts out as
 * default_threshold_interrupt(), which only logs an error; the pointer is
 * non-static, so presumably other code installs the real handler.
 */
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
36
37
DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
38
{
39
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
40
inc_irq_stat(irq_threshold_count);
41
mce_threshold_vector();
42
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
43
apic_eoi();
44
}
45
46
/*
 * Per-CPU storm tracking state. The code in this file uses its per-bank
 * entries (history, timestamp, in_storm_mode, poll_only), the count of
 * banks currently in storm mode, and the poll_mode flag.
 */
DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);
47
48
void mce_inherit_storm(unsigned int bank)
49
{
50
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
51
52
/*
53
* Previous CPU owning this bank had put it into storm mode,
54
* but the precise history of that storm is unknown. Assume
55
* the worst (all recent polls of the bank found a valid error
56
* logged). This will avoid the new owner prematurely declaring
57
* the storm has ended.
58
*/
59
storm->banks[bank].history = ~0ull;
60
storm->banks[bank].timestamp = jiffies;
61
}
62
63
bool mce_get_storm_mode(void)
64
{
65
return __this_cpu_read(storm_desc.poll_mode);
66
}
67
68
/*
 * Set this CPU's storm_desc.poll_mode flag.
 *
 * @storm: new value of the flag.
 */
void mce_set_storm_mode(bool storm)
{
	__this_cpu_write(storm_desc.poll_mode, storm);
}
72
73
/*
 * Forward a storm state change for @bank to the vendor-specific handler
 * selected by the boot CPU's vendor.
 *
 * @bank: bank whose storm state changes.
 * @on:   true when a storm begins, false when it ends.
 */
static void mce_handle_storm(unsigned int bank, bool on)
{
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
		mce_intel_handle_storm(bank, on);
	else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		mce_amd_handle_storm(bank, on);
	/* Any other vendor: nothing to do. */
}
84
85
void cmci_storm_begin(unsigned int bank)
86
{
87
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
88
89
__set_bit(bank, this_cpu_ptr(mce_poll_banks));
90
storm->banks[bank].in_storm_mode = true;
91
92
/*
93
* If this is the first bank on this CPU to enter storm mode
94
* start polling.
95
*/
96
if (++storm->stormy_bank_count == 1)
97
mce_timer_kick(true);
98
}
99
100
void cmci_storm_end(unsigned int bank)
101
{
102
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
103
104
if (!mce_flags.amd_threshold)
105
__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
106
storm->banks[bank].history = 0;
107
storm->banks[bank].in_storm_mode = false;
108
109
/* If no banks left in storm mode, stop polling. */
110
if (!--storm->stormy_bank_count)
111
mce_timer_kick(false);
112
}
113
114
void mce_track_storm(struct mce *mce)
115
{
116
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
117
unsigned long now = jiffies, delta;
118
unsigned int shift = 1;
119
u64 history = 0;
120
121
/* No tracking needed for banks that do not support CMCI */
122
if (storm->banks[mce->bank].poll_only)
123
return;
124
125
/*
126
* When a bank is in storm mode it is polled once per second and
127
* the history mask will record about the last minute of poll results.
128
* If it is not in storm mode, then the bank is only checked when
129
* there is a CMCI interrupt. Check how long it has been since
130
* this bank was last checked, and adjust the amount of "shift"
131
* to apply to history.
132
*/
133
if (!storm->banks[mce->bank].in_storm_mode) {
134
delta = now - storm->banks[mce->bank].timestamp;
135
shift = (delta + HZ) / HZ;
136
}
137
138
/* If it has been a long time since the last poll, clear history. */
139
if (shift < NUM_HISTORY_BITS)
140
history = storm->banks[mce->bank].history << shift;
141
142
storm->banks[mce->bank].timestamp = now;
143
144
/* History keeps track of corrected errors. VAL=1 && UC=0 */
145
if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
146
history |= 1;
147
148
storm->banks[mce->bank].history = history;
149
150
if (storm->banks[mce->bank].in_storm_mode) {
151
if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
152
return;
153
printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
154
mce_handle_storm(mce->bank, false);
155
cmci_storm_end(mce->bank);
156
} else {
157
if (hweight64(history) < STORM_BEGIN_THRESHOLD)
158
return;
159
printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
160
mce_handle_storm(mce->bank, true);
161
cmci_storm_begin(mce->bank);
162
}
163
}
164
165