Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/drivers/edac/mce_amd.c
26278 views
1
// SPDX-License-Identifier: GPL-2.0-only
2
#include <linux/module.h>
3
#include <linux/slab.h>
4
5
#include <asm/cpu.h>
6
#include <asm/msr.h>
7
8
#include "mce_amd.h"
9
10
/* Per-family bank-decoder callbacks; filled in by mce_amd_init(). */
static struct amd_decoder_ops fam_ops;

/* Mask applied to the extended error code; widened for SMCA and F15h M60h. */
static u8 xec_mask = 0xf;

/* Optional DRAM ECC decoder hook, registered by an EDAC memory-controller driver. */
static void (*decode_dram_ecc)(int node_id, struct mce *m);
16
/*
 * Register a DRAM ECC decoder callback.
 * Note: any previously registered callback is silently replaced.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
22
/*
 * Unregister the DRAM ECC decoder. Warns if @f is not the currently
 * registered callback but clears the hook either way.
 */
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
	if (decode_dram_ecc) {
		WARN_ON(decode_dram_ecc != f);

		decode_dram_ecc = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
32
/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 *
 * These tables are indexed by the bit fields extracted via the TT()/LL()/
 * R4()/PP()/TO()/II()/UU() helpers (see mce_amd.h) and printed through the
 * corresponding *_MSG() macros below.
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
61
/*
 * F15h MC1 (instruction fetch) extended error descriptions.
 *
 * NOTE: the table is dense but the xec space has holes, so f15h_mc1_mce()
 * indexes it with offsets: xec 0x0-0xa map directly, xec 0xd uses [xec-2],
 * and xec 0x10-0x15 use [xec-4].
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error", /* xec = 0xd */
	"Microcode Patch Buffer", /* xec = 010 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};
82
/*
 * F15h MC2 (combined unit/L2) extended error descriptions.
 *
 * Indexed by f15h_mc2_mce() with offsets: xec 0x4-0xc use [xec - 0x4],
 * xec 0x10-0x14 use [xec - 0x7].
 */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills", /* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error", /* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};
99
/*
 * MC4 (northbridge) extended error descriptions.
 *
 * Entries 0x0 and 0x8 are both DRAM ECC and get special handling (handed to
 * the registered decode_dram_ecc hook).  decode_mc4_mce() indexes xec
 * 0x1c-0x1f with an offset of 13 to reach the last four entries.
 */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error", /* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};
121
/*
 * MC5 (execution unit) extended error descriptions, indexed directly by xec
 * (0x0-0xd) in decode_mc5_mce().
 */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};
138
/*
 * MC6 (floating point unit) extended error descriptions, indexed directly
 * by xec (0x0-0x5) in decode_mc6_mce().
 */
static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};
147
/*
 * F12h MC0 (data cache) decoder: only memory errors at L1 or L2 are
 * recognized.  Returns true if a message was printed.
 */
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
	u8 cache_lvl;

	if (!MEM_ERROR(ec))
		return false;

	cache_lvl = LL(ec);

	if (cache_lvl == LL_L2) {
		pr_cont("during L1 linefill from L2.\n");
		return true;
	}

	if (cache_lvl == LL_L1) {
		pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
		return true;
	}

	return false;
}
165
/*
 * F10h MC0 decoder: adds the data-scrub case on top of the F12h decoding.
 */
static bool f10h_mc0_mce(u16 ec, u8 xec)
{
	bool is_scrub = (R4(ec) == R4_GEN && LL(ec) == LL_L1);

	if (!is_scrub)
		return f12h_mc0_mce(ec, xec);

	pr_cont("during data scrub.\n");
	return true;
}
174
/*
 * K8 (F0xh) MC0 decoder: adds the system-linefill (bus error) case on top
 * of the F10h decoding.
 */
static bool k8_mc0_mce(u16 ec, u8 xec)
{
	if (!BUS_ERROR(ec))
		return f10h_mc0_mce(ec, xec);

	pr_cont("during system linefill.\n");
	return true;
}
184
/*
 * MC0 decoder for the "cat" cores (assigned to families 0x14/0x16 in
 * mce_amd_init()).  Handles L1 data memory errors and generic-level bus
 * read errors; returns false for anything else so the caller can flag a
 * corrupted record.
 */
static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		/* Only data-transaction errors at L1 are meaningful here. */
		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		/* Bus errors must target mem/io at the generic level. */
		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}
236
/*
 * F15h MC0 (load/store unit) decoder.  Dispatches on the extended error
 * code for memory errors, with separate handling for bus and internal
 * errors.  Returns false if the record doesn't match any known signature.
 */
static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}
289
/*
 * Decode an MC0 (data cache) error: TLB errors are decoded here directly,
 * everything else is delegated to the per-family mc0_mce callback.
 *
 * NOTE(review): a TLB error whose transaction type is not TT_DATA falls
 * through without printing anything after the "MC0 Error: " prefix and
 * without the "Corrupted" warning — confirm this is intended.
 */
static void decode_mc0_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC0 Error: ");

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
		if (TT(ec) == TT_DATA) {
			pr_cont("%s TLB %s.\n", LL_MSG(ec),
				((xec == 2) ? "locked miss"
					    : (xec ? "multimatch" : "parity")));
			return;
		}
	} else if (fam_ops.mc0_mce(ec, xec))
		;
	else
		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
}
310
/*
 * K8 MC1 (instruction cache) decoder: only memory errors at L1/L2 are
 * recognized.  Returns true if a message was printed.
 */
static bool k8_mc1_mce(u16 ec, u8 xec)
{
	u8 ll = LL(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (ll == 0x2)
		pr_cont("during a linefill from L2.\n");
	else if (ll == 0x1) {
		switch (R4(ec)) {
		case R4_IRD:
			pr_cont("Parity error during data load.\n");
			break;

		case R4_EVICT:
			pr_cont("Copyback Parity/Victim error.\n");
			break;

		case R4_SNOOP:
			pr_cont("Tag Snoop error.\n");
			break;

		default:
			ret = false;
			break;
		}
	} else
		ret = false;

	return ret;
}
344
/*
 * MC1 (instruction cache) decoder for the "cat" cores (families 0x14/0x16).
 * Only instruction-fetch memory errors are recognized; returns true if a
 * message was printed.
 */
static bool cat_mc1_mce(u16 ec, u8 xec)
{
	u8 op = R4(ec);

	if (!MEM_ERROR(ec) || TT(ec) != TT_INSTR)
		return false;

	if (op == R4_IRD) {
		pr_cont("Data/tag array parity error for a tag hit.\n");
	} else if (op == R4_SNOOP) {
		pr_cont("Tag error during snoop/victimization.\n");
	} else if (xec == 0x0) {
		pr_cont("Tag parity error from victim castout.\n");
	} else if (xec == 0x2) {
		pr_cont("Microcode patch RAM parity error.\n");
	} else {
		return false;
	}

	return true;
}
369
/*
 * F15h MC1 (instruction fetch) decoder.  The xec space has holes, hence the
 * offset arithmetic into f15h_mc1_mce_desc[] (see the table's comments).
 */
static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		/* Direct mapping for the first 11 entries. */
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		/* xec 0xb/0xc are unused: 0xd lives at index 0xb. */
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		/* Skip the four-entry gap: 0x10 lives at index 0xc. */
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}
399
/*
 * Decode an MC1 (instruction cache) error: TLB, bus and internal errors are
 * handled here directly; everything else goes to the per-family callback.
 */
static void decode_mc1_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC1 Error: ");

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		/*
		 * On K8 (family 0xf), status bit 58 distinguishes a system
		 * linefill from an NB data read — presumably a K8-specific
		 * status bit; confirm against the BKDG.
		 */
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
	} else if (fam_ops.mc1_mce(ec, xec))
		;
	else
		goto wrong_mc1_mce;

	return;

wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
}
429
/*
 * K8 MC2 (bus unit / L2) decoder.  xec selects the buffer/array involved;
 * xec == 0 needs the error class (TLB/bus/mem) to disambiguate.
 */
static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}
465
/*
 * F15h MC2 (combined unit) decoder.
 *
 * NOTE(review): in the BUS_ERROR branch the message is printed even when
 * xec > 2 marks the record invalid (ret = false), and an ec matching none
 * of the four classes returns true without printing — both look deliberate
 * but are worth confirming against the F15h BKDG.
 */
static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			ret = false;

		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x4 ... 0xc:
			/* First table section starts at xec 0x4. */
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			/* Second section (L2 tag) starts at index 9. */
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	}

	return ret;
}
504
/*
 * F16h MC2 (L2 cache) decoder: only memory errors are recognized; the
 * memory transaction type (R4) refines which agent hit the error.
 */
static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
		break;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		pr_cont("ECC error in L2 tag (%s).\n",
			((r4 == R4_GEN)   ? "BankReq" :
			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
		break;

	case 0x10 ... 0x19:
	case 0x1b:
		pr_cont("ECC error in L2 data array (%s).\n",
			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
			((r4 == R4_GEN)   ? "Attr" :
			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
		break;

	case 0x1c ... 0x1d:
	case 0x1f:
		pr_cont("Parity error in L2 attribute bits (%s).\n",
			((r4 == R4_RD)  ? "Hit"  :
			((r4 == R4_GEN) ? "Attr" : "Fill")));
		break;

	default:
		return false;
	}

	return true;
}
545
static void decode_mc2_mce(struct mce *m)
546
{
547
u16 ec = EC(m->status);
548
u8 xec = XEC(m->status, xec_mask);
549
550
pr_emerg(HW_ERR "MC2 Error: ");
551
552
if (!fam_ops.mc2_mce(ec, xec))
553
pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
554
}
555
556
/*
 * Decode an MC3 error.  MC3 is not expected on family 0x14 and newer; on
 * older parts only xec 0 bus read/write errors are recognized.
 */
static void decode_mc3_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (boot_cpu_data.x86 >= 0x14) {
		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
			 " please report on LKML.\n");
		return;
	}

	pr_emerg(HW_ERR "MC3 Error");

	if (xec == 0x0) {
		u8 r4 = R4(ec);

		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
			goto wrong_mc3_mce;

		pr_cont(" during %s.\n", R4_MSG(ec));
	} else
		goto wrong_mc3_mce;

	return;

wrong_mc3_mce:
	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
}
585
/*
 * Decode an MC4 (northbridge) error.  DRAM ECC errors (xec 0x0/0x8) are
 * additionally forwarded to the registered decode_dram_ecc hook; note MC4
 * uses its own 5-bit xec mask rather than the family-wide xec_mask.
 */
static void decode_mc4_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	int node_id = topology_amd_node_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (fam == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			if (decode_dram_ecc)
				decode_dram_ecc(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		if (fam == 0x15 || fam == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ... 0x1f:
		/* L3 errors: map xec 0x1c-0x1f onto mc4_mce_desc[15..18]. */
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}
643
/*
 * Decode an MC5 (execution unit) error.  MC5 does not exist on K8/F11h.
 * xec 0x0 (watchdog) and 0xc (DE error) use the bare description; other
 * valid codes are parity errors in the named array.
 */
static void decode_mc5_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (fam == 0xf || fam == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}
675
static void decode_mc6_mce(struct mce *m)
676
{
677
u8 xec = XEC(m->status, xec_mask);
678
679
pr_emerg(HW_ERR "MC6 Error: ");
680
681
if (xec > 0x5)
682
goto wrong_mc6_mce;
683
684
pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
685
return;
686
687
wrong_mc6_mce:
688
pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
689
}
690
691
/* Human-readable bank names for Scalable MCA banks, indexed by bank type. */
static const char * const smca_long_names[] = {
	[SMCA_LS ... SMCA_LS_V2]	= "Load Store Unit",
	[SMCA_IF]			= "Instruction Fetch Unit",
	[SMCA_L2_CACHE]			= "L2 Cache",
	[SMCA_DE]			= "Decode Unit",
	[SMCA_RESERVED]			= "Reserved",
	[SMCA_EX]			= "Execution Unit",
	[SMCA_FP]			= "Floating Point Unit",
	[SMCA_L3_CACHE]			= "L3 Cache",
	[SMCA_CS ... SMCA_CS_V2]	= "Coherent Slave",
	[SMCA_PIE]			= "Power, Interrupts, etc.",

	/* UMC v2 is separate because both of them can exist in a single system. */
	[SMCA_UMC]			= "Unified Memory Controller",
	[SMCA_UMC_V2]			= "Unified Memory Controller v2",
	[SMCA_PB]			= "Parameter Block",
	[SMCA_PSP ... SMCA_PSP_V2]	= "Platform Security Processor",
	[SMCA_SMU ... SMCA_SMU_V2]	= "System Management Unit",
	[SMCA_MP5]			= "Microprocessor 5 Unit",
	[SMCA_MPDMA]			= "MPDMA Unit",
	[SMCA_NBIO]			= "Northbridge IO Unit",
	[SMCA_PCIE ... SMCA_PCIE_V2]	= "PCI Express Unit",
	[SMCA_XGMI_PCS]			= "Ext Global Memory Interconnect PCS Unit",
	[SMCA_NBIF]			= "NBIF Unit",
	[SMCA_SHUB]			= "System Hub Unit",
	[SMCA_SATA]			= "SATA Unit",
	[SMCA_USB]			= "USB Unit",
	[SMCA_GMI_PCS]			= "Global Memory Interconnect PCS Unit",
	[SMCA_XGMI_PHY]			= "Ext Global Memory Interconnect PHY Unit",
	[SMCA_WAFL_PHY]			= "WAFL PHY Unit",
	[SMCA_GMI_PHY]			= "Global Memory Interconnect PHY Unit",
};
724
static const char *smca_get_long_name(enum smca_bank_types t)
725
{
726
if (t >= N_SMCA_BANK_TYPES)
727
return NULL;
728
729
return smca_long_names[t];
730
}
731
732
/* Decode errors according to Scalable MCA specification */
static void decode_smca_error(struct mce *m)
{
	enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
	u8 xec = XEC(m->status, xec_mask);

	/* Unknown bank type: nothing we can say about it. */
	if (bank_type >= N_SMCA_BANK_TYPES)
		return;

	if (bank_type == SMCA_RESERVED) {
		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
		return;
	}

	pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec);

	/* xec 0 on a UMC bank is a DRAM ECC error: hand it to the EDAC hook. */
	if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
	    xec == 0 && decode_dram_ecc)
		decode_dram_ecc(topology_amd_node_id(m->extcpu), m);
}
753
/*
 * Print the generic (family-independent) breakdown of the low 16 status
 * bits: cache level, transaction/mem-io type, and for bus errors the
 * participation and timeout fields.
 */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}
777
static const char *decode_error_status(struct mce *m)
778
{
779
if (m->status & MCI_STATUS_UC) {
780
if (m->status & MCI_STATUS_PCC)
781
return "System Fatal error.";
782
if (m->mcgstatus & MCG_STATUS_RIPV)
783
return "Uncorrected, software restartable error.";
784
return "Uncorrected, software containable error.";
785
}
786
787
if (m->status & MCI_STATUS_DEFERRED)
788
return "Deferred error, no action required.";
789
790
return "Corrected error, no action required.";
791
}
792
793
/*
 * MCE decode-chain notifier callback: pretty-print one MCE record.
 *
 * Prints severity, the decoded STATUS flag line, address/PPIN/IPID/syndrome
 * where valid, then dispatches to the SMCA decoder (on SMCA parts) or the
 * legacy per-bank decoders, and finally the generic low-16-bit breakdown.
 * Always returns NOTIFY_OK unless the record was already handled by CEC.
 */
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct mce_hw_err *err = to_mce_hw_err(m);
	unsigned int fam = x86_family(m->cpuid);
	u32 mca_config_lo = 0, dummy;
	int ecc;

	/* Already consumed by the corrected-errors collector: nothing to do. */
	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &mca_config_lo, &dummy);

		/* TCC is only meaningful when the bank is in MCAX mode. */
		if (mca_config_lo & MCI_CONFIG_MCAX)
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
	}

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (fam >= 0x17)
		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (m->ppin)
		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV) {
			pr_cont(", Syndrome: 0x%016llx\n", m->synd);
			if (mca_config_lo & MCI_CONFIG_FRUTEXT) {
				char frutext[17];

				/* FRU text is the two syndromes as 16 raw chars. */
				frutext[16] = '\0';
				memcpy(&frutext[0], &err->vendor.amd.synd1, 8);
				memcpy(&frutext[8], &err->vendor.amd.synd2, 8);

				pr_emerg(HW_ERR "FRU Text: %s", frutext);
			}
		}

		pr_cont("\n");

		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Doesn't matter which member to test. */
	if (!fam_ops.mc0_mce)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	amd_decode_err_code(m->status & 0xffff);

	m->kflags |= MCE_HANDLED_EDAC;
	return NOTIFY_OK;
}
920
/* Notifier registered on the MCE decode chain at EDAC priority. */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};
925
/*
 * Module init: select the per-family decoder callbacks and xec mask, then
 * hook into the MCE decode chain.  Bails out on non-AMD/Hygon CPUs, under
 * a hypervisor, and on family 0x17/0x18 parts without SMCA.
 */
static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD &&
	    c->x86_vendor != X86_VENDOR_HYGON)
		return -ENODEV;

	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
		return -ENODEV;

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		/* SMCA parts use a 6-bit xec and no legacy fam_ops. */
		xec_mask = 0x3f;
		goto out;
	}

	switch (c->x86) {
	case 0xf:
		fam_ops.mc0_mce = k8_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops.mc0_mce = f10h_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops.mc0_mce = k8_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops.mc0_mce = f12h_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops.mc0_mce = cat_mc0_mce;
		fam_ops.mc1_mce = cat_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		/* Model 0x60 has a wider extended error code field. */
		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;

		fam_ops.mc0_mce = f15h_mc0_mce;
		fam_ops.mc1_mce = f15h_mc1_mce;
		fam_ops.mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops.mc0_mce = cat_mc0_mce;
		fam_ops.mc1_mce = cat_mc1_mce;
		fam_ops.mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
	case 0x18:
		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
		return -EINVAL;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		return -EINVAL;
	}

out:
	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;
}
early_initcall(mce_amd_init);
1006
#ifdef MODULE
/* Module unload: unhook from the MCE decode chain. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif
1018