Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/drivers/edac/ghes_edac.c
51072 views
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* GHES/EDAC Linux driver
4
*
5
* Copyright (c) 2013 by Mauro Carvalho Chehab
6
*
7
* Red Hat Inc. https://www.redhat.com
8
*/
9
10
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
12
#include <acpi/ghes.h>
13
#include <linux/edac.h>
14
#include <linux/dmi.h>
15
#include "edac_module.h"
16
#include <ras/ras_event.h>
17
#include <linux/notifier.h>
18
#include <linux/string.h>
19
20
#define OTHER_DETAIL_LEN 400
21
22
/*
 * Per-controller private data. A single live instance is published via
 * the ghes_pvt pointer (protected by ghes_lock) while the MCI is
 * registered.
 */
struct ghes_pvt {
	struct mem_ctl_info *mci;	/* back pointer to the owning EDAC MCI */

	/* Buffers for the error handling routine */
	char other_detail[OTHER_DETAIL_LEN];	/* decoded "other detail" text */
	char msg[80];				/* decoded error-type message */
};
29
30
/* Number of registered GHES instances sharing the single logical MCI. */
static refcount_t ghes_refcount = REFCOUNT_INIT(0);

/*
 * Access to ghes_pvt must be protected by ghes_lock. The spinlock
 * also provides the necessary (implicit) memory barrier for the SMP
 * case to make the pointer visible on another CPU.
 */
static struct ghes_pvt *ghes_pvt;

/*
 * This driver's representation of the system hardware, as collected
 * from DMI.
 */
static struct ghes_hw_desc {
	int num_dimms;			/* entries used in dimms[] */
	struct dimm_info *dimms;	/* grown in chunks by enumerate_dimms() */
} ghes_hw;

/* GHES registration mutex */
static DEFINE_MUTEX(ghes_reg_mutex);

/*
 * Sync with other, potentially concurrent callers of
 * ghes_edac_report_mem_error(). We don't know what the
 * "inventive" firmware would do.
 */
static DEFINE_SPINLOCK(ghes_lock);

/* Set once ghes_scan_system() has walked DMI; cleared on unregister. */
static bool system_scanned;

/* GHES device list obtained from ghes_get_devices() at module init. */
static struct list_head *ghes_devs;
61
62
/* Memory Device - Type 17 of SMBIOS spec */
struct memdev_dmi_entry {
	u8 type;			/* SMBIOS structure type (17) */
	u8 length;			/* structure length in bytes */
	u16 handle;			/* handle, matched against CPER mem_dev_handle */
	u16 phys_mem_array_handle;
	u16 mem_err_info_handle;
	u16 total_width;		/* data + check bits; == data_width means no ECC */
	u16 data_width;
	u16 size;			/* MiB (KiB if bit 15 set); 0xffff unknown, 0x7fff: see extended_size */
	u8 form_factor;
	u8 device_set;
	u8 device_locator;
	u8 bank_locator;
	u8 memory_type;			/* e.g. 0x12 DDR .. 0x1a DDR4; decoded in assign_dmi_dimm_info() */
	u16 type_detail;		/* bit flags: registered, non-volatile, ... */
	u16 speed;
	u8 manufacturer;
	u8 serial_number;
	u8 asset_tag;
	u8 part_number;
	u8 attributes;
	u32 extended_size;		/* size in MiB, used when size == 0x7fff */
	u16 conf_mem_clk_speed;
} __attribute__((__packed__));
87
88
/* Return the DIMM whose SMBIOS handle matches @handle, or NULL if none. */
static struct dimm_info *find_dimm_by_handle(struct mem_ctl_info *mci, u16 handle)
{
	struct dimm_info *d;

	mci_for_each_dimm(mci, d) {
		if (d->smbios_handle != handle)
			continue;

		return d;
	}

	return NULL;
}
99
100
/*
 * Build a "<bank> <device>" label for @dimm from the DMI strings of the
 * memory device identified by @handle.
 */
static void dimm_setup_label(struct dimm_info *dimm, u16 handle)
{
	const char *bank = NULL, *device = NULL;
	bool have_bank, have_device;

	dmi_memdev_name(handle, &bank, &device);

	have_bank = bank && *bank;
	have_device = device && *device;

	/*
	 * When neither string is available the label ends up empty, which
	 * preserves the default label assigned by the EDAC core.
	 */
	snprintf(dimm->label, sizeof(dimm->label), "%s%s%s",
		 have_bank ? bank : "",
		 (have_bank && have_device) ? " " : "",
		 have_device ? device : "");
}
115
116
/*
 * Fill @dimm from an SMBIOS Type 17 @entry: size (in pages), memory
 * type, ECC mode (inferred from total vs. data width), grain, label
 * and the SMBIOS handle used later to match CPER error records.
 */
static void assign_dmi_dimm_info(struct dimm_info *dimm, struct memdev_dmi_entry *entry)
{
	/* type_detail bits 7 and 13 set together denote a registered DIMM */
	u16 rdr_mask = BIT(7) | BIT(13);

	if (entry->size == 0xffff) {
		/* 0xffff: size not reported by the BIOS; assume 32 MiB */
		pr_info("Can't get DIMM%i size\n", dimm->idx);
		dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
	} else if (entry->size == 0x7fff) {
		/* 0x7fff: actual size lives in the extended_size field (MiB) */
		dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
	} else {
		/* bit 15 set: size is in KiB units; shift to MiB */
		if (entry->size & BIT(15))
			dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) << 10);
		else
			dimm->nr_pages = MiB_TO_PAGES(entry->size);
	}

	/* Map the SMBIOS memory type code onto the EDAC MEM_* enum. */
	switch (entry->memory_type) {
	case 0x12:	/* DDR */
		if (entry->type_detail & BIT(13))	/* registered? */
			dimm->mtype = MEM_RDDR;
		else
			dimm->mtype = MEM_DDR;
		break;
	case 0x13:	/* DDR2 */
		if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR2;
		else
			dimm->mtype = MEM_DDR2;
		break;
	case 0x14:	/* FB-DIMM */
		dimm->mtype = MEM_FB_DDR2;
		break;
	case 0x18:	/* DDR3 */
		if (entry->type_detail & BIT(12))	/* non-volatile? */
			dimm->mtype = MEM_NVDIMM;
		else if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR3;
		else
			dimm->mtype = MEM_DDR3;
		break;
	case 0x1a:	/* DDR4 */
		if (entry->type_detail & BIT(12))
			dimm->mtype = MEM_NVDIMM;
		else if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR4;
		else
			dimm->mtype = MEM_DDR4;
		break;
	default:
		/* Fall back to decoding the type_detail flags alone. */
		if (entry->type_detail & BIT(6))
			dimm->mtype = MEM_RMBS;
		else if ((entry->type_detail & rdr_mask) == rdr_mask)
			dimm->mtype = MEM_RDR;
		else if (entry->type_detail & BIT(7))
			dimm->mtype = MEM_SDR;
		else if (entry->type_detail & BIT(9))
			dimm->mtype = MEM_EDO;
		else
			dimm->mtype = MEM_UNKNOWN;
	}

	/*
	 * Actually, we can only detect if the memory has bits for
	 * checksum or not
	 */
	if (entry->total_width == entry->data_width)
		dimm->edac_mode = EDAC_NONE;
	else
		dimm->edac_mode = EDAC_SECDED;

	dimm->dtype = DEV_UNKNOWN;
	dimm->grain = 128; /* Likely, worse case */

	dimm_setup_label(dimm, entry->handle);

	if (dimm->nr_pages) {
		edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
			 dimm->idx, edac_mem_types[dimm->mtype],
			 PAGES_TO_MiB(dimm->nr_pages),
			 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
		edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
			 entry->memory_type, entry->type_detail,
			 entry->total_width, entry->data_width);
	}

	/* Remembered so CPER records can be matched back to this DIMM. */
	dimm->smbios_handle = entry->handle;
}
203
204
static void enumerate_dimms(const struct dmi_header *dh, void *arg)
205
{
206
struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
207
struct ghes_hw_desc *hw = (struct ghes_hw_desc *)arg;
208
struct dimm_info *d;
209
210
if (dh->type != DMI_ENTRY_MEM_DEVICE)
211
return;
212
213
/* Enlarge the array with additional 16 */
214
if (!hw->num_dimms || !(hw->num_dimms % 16)) {
215
struct dimm_info *new;
216
217
new = krealloc_array(hw->dimms, hw->num_dimms + 16,
218
sizeof(struct dimm_info), GFP_KERNEL);
219
if (!new) {
220
WARN_ON_ONCE(1);
221
return;
222
}
223
224
hw->dimms = new;
225
}
226
227
d = &hw->dimms[hw->num_dimms];
228
d->idx = hw->num_dimms;
229
230
assign_dmi_dimm_info(d, entry);
231
232
hw->num_dimms++;
233
}
234
235
static void ghes_scan_system(void)
236
{
237
if (system_scanned)
238
return;
239
240
dmi_walk(enumerate_dimms, &ghes_hw);
241
242
system_scanned = true;
243
}
244
245
static int print_mem_error_other_detail(const struct cper_sec_mem_err *mem, char *msg,
246
const char *location, unsigned int len)
247
{
248
u32 n;
249
250
if (!msg)
251
return 0;
252
253
n = 0;
254
len -= 1;
255
256
n += scnprintf(msg + n, len - n, "APEI location: %s ", location);
257
258
if (!(mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS))
259
goto out;
260
261
n += scnprintf(msg + n, len - n, "status(0x%016llx): ", mem->error_status);
262
n += scnprintf(msg + n, len - n, "%s ", cper_mem_err_status_str(mem->error_status));
263
264
out:
265
msg[n] = '\0';
266
267
return n;
268
}
269
270
/*
 * GHES report-chain callback: decode a CPER memory error record into
 * the EDAC raw error descriptor and hand it to the EDAC core.
 *
 * @nb:   notifier block (unused)
 * @val:  GHES severity (GHES_SEV_*)
 * @data: points to a struct cper_sec_mem_err
 *
 * Always returns NOTIFY_OK so the chain keeps running.
 */
static int ghes_edac_report_mem_error(struct notifier_block *nb,
				      unsigned long val, void *data)
{
	struct cper_sec_mem_err *mem_err = (struct cper_sec_mem_err *)data;
	struct cper_mem_err_compact cmem;
	struct edac_raw_error_desc *e;
	struct mem_ctl_info *mci;
	unsigned long sev = val;
	struct ghes_pvt *pvt;
	unsigned long flags;
	char *p;

	/*
	 * We can do the locking below because GHES defers error processing
	 * from NMI to IRQ context. Whenever that changes, we'd at least
	 * know.
	 */
	if (WARN_ON_ONCE(in_nmi()))
		return NOTIFY_OK;

	spin_lock_irqsave(&ghes_lock, flags);

	/* No controller registered (yet, or anymore): nothing to report to. */
	pvt = ghes_pvt;
	if (!pvt)
		goto unlock;

	mci = pvt->mci;
	e = &mci->error_desc;

	/* Cleans the error report buffer */
	memset(e, 0, sizeof (*e));
	e->error_count = 1;
	e->grain = 1;
	e->msg = pvt->msg;
	e->other_detail = pvt->other_detail;
	e->top_layer = -1;
	e->mid_layer = -1;
	e->low_layer = -1;
	*pvt->other_detail = '\0';
	*pvt->msg = '\0';

	/* Map the GHES severity onto the EDAC event type. */
	switch (sev) {
	case GHES_SEV_CORRECTED:
		e->type = HW_EVENT_ERR_CORRECTED;
		break;
	case GHES_SEV_RECOVERABLE:
		e->type = HW_EVENT_ERR_UNCORRECTED;
		break;
	case GHES_SEV_PANIC:
		e->type = HW_EVENT_ERR_FATAL;
		break;
	default:
	case GHES_SEV_NO:
		e->type = HW_EVENT_ERR_INFO;
	}

	edac_dbg(1, "error validation_bits: 0x%08llx\n",
		 (long long)mem_err->validation_bits);

	/* Error type, mapped on e->msg */
	if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
		u8 etype = mem_err->error_type;

		p = pvt->msg;
		p += snprintf(p, sizeof(pvt->msg), "%s", cper_mem_err_type_str(etype));
	} else {
		strscpy(pvt->msg, "unknown error");
	}

	/* Error address */
	if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
		e->page_frame_number = PHYS_PFN(mem_err->physical_addr);
		e->offset_in_page = offset_in_page(mem_err->physical_addr);
	}

	/* Error grain: two's complement of the CPER physical address mask. */
	if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
		e->grain = ~mem_err->physical_addr_mask + 1;

	/* Memory error location, mapped on e->location */
	p = e->location;
	cper_mem_err_pack(mem_err, &cmem);
	p += cper_mem_err_location(&cmem, p);

	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
		struct dimm_info *dimm;

		p += cper_dimm_err_location(&cmem, p);
		/* Resolve the CPER module handle back to an enumerated DIMM. */
		dimm = find_dimm_by_handle(mci, mem_err->mem_dev_handle);
		if (dimm) {
			e->top_layer = dimm->idx;
			strscpy(e->label, dimm->label);
		}
	}
	/* Drop the trailing separator left by the cper_*_location() helpers. */
	if (p > e->location)
		*(p - 1) = '\0';

	if (!*e->label)
		strscpy(e->label, "unknown memory");

	/* All other fields are mapped on e->other_detail */
	p = pvt->other_detail;
	p += print_mem_error_other_detail(mem_err, p, e->location, OTHER_DETAIL_LEN);
	/* Same trailing-separator trim as for e->location above. */
	if (p > pvt->other_detail)
		*(p - 1) = '\0';

	edac_raw_mc_handle_error(e);

unlock:
	spin_unlock_irqrestore(&ghes_lock, flags);

	return NOTIFY_OK;
}
383
384
/* Hooked into the GHES report chain on (un)registration below. */
static struct notifier_block ghes_edac_mem_err_nb = {
	.notifier_call = ghes_edac_report_mem_error,
	.priority = 0,
};
388
389
/*
 * Register (or take another reference on) the single logical memory
 * controller this driver exposes. The first caller scans DMI, allocates
 * and populates the MCI, registers it with the EDAC core, publishes
 * ghes_pvt under ghes_lock and attaches to the GHES report chain.
 * Returns 0 on success or a negative errno.
 */
static int ghes_edac_register(struct device *dev)
{
	bool fake = false;
	struct mem_ctl_info *mci;
	struct ghes_pvt *pvt;
	struct edac_mc_layer layers[1];
	unsigned long flags;
	int rc = 0;

	/* finish another registration/unregistration instance first */
	mutex_lock(&ghes_reg_mutex);

	/*
	 * We have only one logical memory controller to which all DIMMs belong.
	 */
	if (refcount_inc_not_zero(&ghes_refcount))
		goto unlock;

	ghes_scan_system();

	/* Check if we've got a bogus BIOS */
	if (!ghes_hw.num_dimms) {
		/* No DIMMs enumerated: fall back to a single fake DIMM. */
		fake = true;
		ghes_hw.num_dimms = 1;
	}

	/* One flat layer covering all DIMMs. */
	layers[0].type = EDAC_MC_LAYER_ALL_MEM;
	layers[0].size = ghes_hw.num_dimms;
	layers[0].is_virt_csrow = true;

	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_pvt));
	if (!mci) {
		pr_info("Can't allocate memory for EDAC data\n");
		rc = -ENOMEM;
		goto unlock;
	}

	pvt = mci->pvt_info;
	pvt->mci = mci;

	mci->pdev = dev;
	mci->mtype_cap = MEM_FLAG_EMPTY;
	mci->edac_ctl_cap = EDAC_FLAG_NONE;
	mci->edac_cap = EDAC_FLAG_NONE;
	mci->mod_name = "ghes_edac.c";
	mci->ctl_name = "ghes_edac";
	mci->dev_name = "ghes";

	if (fake) {
		pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
		pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
		pr_info("work on such system. Use this driver with caution\n");
	}

	pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms);

	if (!fake) {
		struct dimm_info *src, *dst;
		int i = 0;

		/* Copy the DMI-scanned DIMM data into the MCI's DIMMs. */
		mci_for_each_dimm(mci, dst) {
			src = &ghes_hw.dimms[i];

			dst->idx = src->idx;
			dst->smbios_handle = src->smbios_handle;
			dst->nr_pages = src->nr_pages;
			dst->mtype = src->mtype;
			dst->edac_mode = src->edac_mode;
			dst->dtype = src->dtype;
			dst->grain = src->grain;

			/*
			 * If no src->label, preserve default label assigned
			 * from EDAC core.
			 */
			if (strlen(src->label))
				memcpy(dst->label, src->label, sizeof(src->label));

			i++;
		}

	} else {
		/* Populate the single placeholder DIMM with safe defaults. */
		struct dimm_info *dimm = edac_get_dimm(mci, 0, 0, 0);

		dimm->nr_pages = 1;
		dimm->grain = 128;
		dimm->mtype = MEM_UNKNOWN;
		dimm->dtype = DEV_UNKNOWN;
		dimm->edac_mode = EDAC_SECDED;
	}

	rc = edac_mc_add_mc(mci);
	if (rc < 0) {
		pr_info("Can't register with the EDAC core\n");
		edac_mc_free(mci);
		rc = -ENODEV;
		goto unlock;
	}

	/* Publish pvt so the error handler can see it (see ghes_lock docs). */
	spin_lock_irqsave(&ghes_lock, flags);
	ghes_pvt = pvt;
	spin_unlock_irqrestore(&ghes_lock, flags);

	ghes_register_report_chain(&ghes_edac_mem_err_nb);

	/* only set on success */
	refcount_set(&ghes_refcount, 1);

unlock:

	/* Not needed anymore */
	kfree(ghes_hw.dimms);
	ghes_hw.dimms = NULL;

	mutex_unlock(&ghes_reg_mutex);

	return rc;
}
507
508
/*
 * Drop a reference on the shared memory controller. The last caller
 * unpublishes ghes_pvt, tears down the EDAC MCI and detaches from the
 * GHES report chain.
 */
static void ghes_edac_unregister(struct ghes *ghes)
{
	struct mem_ctl_info *mci;
	unsigned long flags;

	mutex_lock(&ghes_reg_mutex);

	/* Force a fresh DMI scan on the next registration. */
	system_scanned = false;
	memset(&ghes_hw, 0, sizeof(struct ghes_hw_desc));

	if (!refcount_dec_and_test(&ghes_refcount))
		goto unlock;

	/*
	 * Wait for the irq handler being finished.
	 */
	spin_lock_irqsave(&ghes_lock, flags);
	mci = ghes_pvt ? ghes_pvt->mci : NULL;
	ghes_pvt = NULL;
	spin_unlock_irqrestore(&ghes_lock, flags);

	if (!mci)
		goto unlock;

	mci = edac_mc_del_mc(mci->pdev);
	if (mci)
		edac_mc_free(mci);

	ghes_unregister_report_chain(&ghes_edac_mem_err_nb);

unlock:
	mutex_unlock(&ghes_reg_mutex);
}
541
542
/*
 * Module init: register an EDAC controller for every probed GHES
 * device. Fails with -ENODEV when no GHES devices are available.
 */
static int __init ghes_edac_init(void)
{
	struct ghes *ghes, *tmp;

	ghes_devs = ghes_get_devices();
	if (!ghes_devs)
		return -ENODEV;

	if (list_empty(ghes_devs)) {
		pr_info("GHES probing device list is empty\n");
		return -ENODEV;
	}

	list_for_each_entry_safe(ghes, tmp, ghes_devs, elist)
		ghes_edac_register(ghes->dev);

	return 0;
}
module_init(ghes_edac_init);
562
563
static void __exit ghes_edac_exit(void)
564
{
565
struct ghes *g, *g_tmp;
566
567
list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) {
568
ghes_edac_unregister(g);
569
}
570
}
571
module_exit(ghes_edac_exit);
572
573
MODULE_LICENSE("GPL");
574
MODULE_DESCRIPTION("Output ACPI APEI/GHES BIOS detected errors via EDAC");
575
576