GitHub Repository: torvalds/linux
Path: blob/master/drivers/cxl/pci.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
3
#include <linux/unaligned.h>
4
#include <linux/io-64-nonatomic-lo-hi.h>
5
#include <linux/moduleparam.h>
6
#include <linux/module.h>
7
#include <linux/delay.h>
8
#include <linux/sizes.h>
9
#include <linux/mutex.h>
10
#include <linux/list.h>
11
#include <linux/pci.h>
12
#include <linux/aer.h>
13
#include <linux/io.h>
14
#include <cxl/mailbox.h>
15
#include "cxlmem.h"
16
#include "cxlpci.h"
17
#include "cxl.h"
18
#include "pmu.h"
19
20
/**
21
* DOC: cxl pci
22
*
23
* This implements the PCI exclusive functionality for a CXL device as it is
24
* defined by the Compute Express Link specification. CXL devices may surface
25
 * certain functionality even when not CXL enabled. While this driver is
26
 * focused on the PCI-specific aspects of a CXL device, it binds to the
27
* specific CXL memory device class code, and therefore the implementation of
28
 * cxl_pci is focused on CXL memory devices.
29
*
30
* The driver has several responsibilities, mainly:
31
* - Create the memX device and register on the CXL bus.
32
 * - Enumerate the device's register interfaces and map them.
33
 * - Register an nvdimm bridge device with cxl_core.
34
 * - Register a CXL mailbox with cxl_core.
35
*/
36
37
#define cxl_doorbell_busy(cxlds) \
38
(readl((cxlds)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) & \
39
CXLDEV_MBOX_CTRL_DOORBELL)
40
41
/* CXL 2.0 - 8.2.8.4; despite the _MS suffix this is 2 seconds in jiffies */
42
#define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)
43
44
/*
45
* CXL 2.0 ECN "Add Mailbox Ready Time" defines a capability field to
46
* dictate how long to wait for the mailbox to become ready. The new
47
* field allows the device to tell software the amount of time to wait
48
* before mailbox ready. This field per the spec theoretically allows
49
 * for up to 255 seconds. 255 seconds is unreasonably long; it's longer
50
* than the maximum SATA port link recovery wait. Default to 60 seconds
51
* until someone builds a CXL device that needs more time in practice.
52
*/
53
static unsigned short mbox_ready_timeout = 60;
54
module_param(mbox_ready_timeout, ushort, 0644);
55
MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");
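/*
 * Usage note (illustrative, assuming the standard module parameter sysfs
 * layout and that this file builds as cxl_pci.ko): because the parameter
 * is registered with mode 0644 it can be set at load time or adjusted at
 * runtime, e.g.:
 *
 *   modprobe cxl_pci mbox_ready_timeout=120
 *   echo 120 > /sys/module/cxl_pci/parameters/mbox_ready_timeout
 */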
56
57
static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
58
{
59
const unsigned long start = jiffies;
60
unsigned long end = start;
61
62
while (cxl_doorbell_busy(cxlds)) {
63
end = jiffies;
64
65
if (time_after(end, start + CXL_MAILBOX_TIMEOUT_MS)) {
66
/* Check again in case preempted before timeout test */
67
if (!cxl_doorbell_busy(cxlds))
68
break;
69
return -ETIMEDOUT;
70
}
71
cpu_relax();
72
}
73
74
dev_dbg(cxlds->dev, "Doorbell wait took %dms",
75
jiffies_to_msecs(end) - jiffies_to_msecs(start));
76
return 0;
77
}
78
79
#define cxl_err(dev, status, msg) \
80
dev_err_ratelimited(dev, msg ", device state %s%s\n", \
81
status & CXLMDEV_DEV_FATAL ? " fatal" : "", \
82
status & CXLMDEV_FW_HALT ? " firmware-halt" : "")
83
84
#define cxl_cmd_err(dev, cmd, status, msg) \
85
dev_err_ratelimited(dev, msg " (opcode: %#x), device state %s%s\n", \
86
(cmd)->opcode, \
87
status & CXLMDEV_DEV_FATAL ? " fatal" : "", \
88
status & CXLMDEV_FW_HALT ? " firmware-halt" : "")
89
90
/*
91
* Threaded irq dev_id's must be globally unique. cxl_dev_id provides a unique
92
* wrapper object for each irq within the same cxlds.
93
*/
94
struct cxl_dev_id {
95
struct cxl_dev_state *cxlds;
96
};
97
98
static int cxl_request_irq(struct cxl_dev_state *cxlds, int irq,
99
irq_handler_t thread_fn)
100
{
101
struct device *dev = cxlds->dev;
102
struct cxl_dev_id *dev_id;
103
104
dev_id = devm_kzalloc(dev, sizeof(*dev_id), GFP_KERNEL);
105
if (!dev_id)
106
return -ENOMEM;
107
dev_id->cxlds = cxlds;
108
109
return devm_request_threaded_irq(dev, irq, NULL, thread_fn,
110
IRQF_SHARED | IRQF_ONESHOT, NULL,
111
dev_id);
112
}
113
114
static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
115
{
116
u64 reg;
117
118
reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
119
return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) == 100;
120
}
121
122
static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
123
{
124
u64 reg;
125
u16 opcode;
126
struct cxl_dev_id *dev_id = id;
127
struct cxl_dev_state *cxlds = dev_id->cxlds;
128
struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
129
struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
130
131
if (!cxl_mbox_background_complete(cxlds))
132
return IRQ_NONE;
133
134
reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
135
opcode = FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK, reg);
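	/* Sanitize completion is handled by the poller; have it run immediately */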
136
if (opcode == CXL_MBOX_OP_SANITIZE) {
137
mutex_lock(&cxl_mbox->mbox_mutex);
138
if (mds->security.sanitize_node)
139
mod_delayed_work(system_wq, &mds->security.poll_dwork, 0);
140
mutex_unlock(&cxl_mbox->mbox_mutex);
141
} else {
142
/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
143
rcuwait_wake_up(&cxl_mbox->mbox_wait);
144
}
145
146
return IRQ_HANDLED;
147
}
148
149
/*
150
 * Sanitization operation polling mode: the first poll fires after 1 second,
 * then backs off by 10 seconds per retry, capped at roughly 15 minutes.
151
*/
152
static void cxl_mbox_sanitize_work(struct work_struct *work)
153
{
154
struct cxl_memdev_state *mds =
155
container_of(work, typeof(*mds), security.poll_dwork.work);
156
struct cxl_dev_state *cxlds = &mds->cxlds;
157
struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
158
159
mutex_lock(&cxl_mbox->mbox_mutex);
160
if (cxl_mbox_background_complete(cxlds)) {
161
mds->security.poll_tmo_secs = 0;
162
if (mds->security.sanitize_node)
163
sysfs_notify_dirent(mds->security.sanitize_node);
164
mds->security.sanitize_active = false;
165
166
dev_dbg(cxlds->dev, "Sanitization operation ended\n");
167
} else {
168
int timeout = mds->security.poll_tmo_secs + 10;
169
170
mds->security.poll_tmo_secs = min(15 * 60, timeout);
171
schedule_delayed_work(&mds->security.poll_dwork, timeout * HZ);
172
}
173
mutex_unlock(&cxl_mbox->mbox_mutex);
174
}
175
176
/**
177
* __cxl_pci_mbox_send_cmd() - Execute a mailbox command
178
* @cxl_mbox: CXL mailbox context
179
* @mbox_cmd: Command to send to the memory device.
180
*
181
* Context: Any context. Expects mbox_mutex to be held.
182
* Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
183
* Caller should check the return code in @mbox_cmd to make sure it
184
* succeeded.
185
*
186
 * This is a generic form of the CXL mailbox send command, using only the
187
* registers defined by the mailbox capability ID - CXL 2.0 8.2.8.4. Memory
188
 * devices, and perhaps other types of CXL devices, may have further information
189
* available upon error conditions. Driver facilities wishing to send mailbox
190
* commands should use the wrapper command.
191
*
192
* The CXL spec allows for up to two mailboxes. The intention is for the primary
193
* mailbox to be OS controlled and the secondary mailbox to be used by system
194
* firmware. This allows the OS and firmware to communicate with the device and
195
* not need to coordinate with each other. The driver only uses the primary
196
* mailbox.
197
*/
198
static int __cxl_pci_mbox_send_cmd(struct cxl_mailbox *cxl_mbox,
199
struct cxl_mbox_cmd *mbox_cmd)
200
{
201
struct cxl_dev_state *cxlds = mbox_to_cxlds(cxl_mbox);
202
struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
203
void __iomem *payload = cxlds->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET;
204
struct device *dev = cxlds->dev;
205
u64 cmd_reg, status_reg;
206
size_t out_len;
207
int rc;
208
209
lockdep_assert_held(&cxl_mbox->mbox_mutex);
210
211
/*
212
* Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
213
* 1. Caller reads MB Control Register to verify doorbell is clear
214
* 2. Caller writes Command Register
215
* 3. Caller writes Command Payload Registers if input payload is non-empty
216
* 4. Caller writes MB Control Register to set doorbell
217
* 5. Caller either polls for doorbell to be clear or waits for interrupt if configured
218
* 6. Caller reads MB Status Register to fetch Return code
219
* 7. If command successful, Caller reads Command Register to get Payload Length
220
* 8. If output payload is non-empty, host reads Command Payload Registers
221
*
222
* Hardware is free to do whatever it wants before the doorbell is rung,
223
* and isn't allowed to change anything after it clears the doorbell. As
224
* such, steps 2 and 3 can happen in any order, and steps 6, 7, 8 can
225
* also happen in any order (though some orders might not make sense).
226
*/
227
228
/* #1 */
229
if (cxl_doorbell_busy(cxlds)) {
230
u64 md_status =
231
readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
232
233
cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
234
"mailbox queue busy");
235
return -EBUSY;
236
}
237
238
/*
239
* With sanitize polling, hardware might be done and the poller still
240
 * not be in sync. Ensure no new command is sent until then. Keep the
241
* hardware semantics and only allow device health status.
242
*/
243
if (mds->security.poll_tmo_secs > 0) {
244
if (mbox_cmd->opcode != CXL_MBOX_OP_GET_HEALTH_INFO)
245
return -EBUSY;
246
}
247
248
cmd_reg = FIELD_PREP(CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK,
249
mbox_cmd->opcode);
250
if (mbox_cmd->size_in) {
251
if (WARN_ON(!mbox_cmd->payload_in))
252
return -EINVAL;
253
254
cmd_reg |= FIELD_PREP(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK,
255
mbox_cmd->size_in);
256
memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
257
}
258
259
/* #2, #3 */
260
writeq(cmd_reg, cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
261
262
/* #4 */
263
dev_dbg(dev, "Sending command: 0x%04x\n", mbox_cmd->opcode);
264
writel(CXLDEV_MBOX_CTRL_DOORBELL,
265
cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
266
267
/* #5 */
268
rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
269
if (rc == -ETIMEDOUT) {
270
u64 md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
271
272
cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout");
273
return rc;
274
}
275
276
/* #6 */
277
status_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET);
278
mbox_cmd->return_code =
279
FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
280
281
/*
282
* Handle the background command in a synchronous manner.
283
*
284
* All other mailbox commands will serialize/queue on the mbox_mutex,
285
* which we currently hold. Furthermore this also guarantees that
286
* cxl_mbox_background_complete() checks are safe amongst each other,
287
* in that no new bg operation can occur in between.
288
*
289
* Background operations are timesliced in accordance with the nature
290
* of the command. In the event of timeout, the mailbox state is
291
* indeterminate until the next successful command submission and the
292
* driver can get back in sync with the hardware state.
293
*/
294
if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
295
u64 bg_status_reg;
296
int i, timeout;
297
298
/*
299
* Sanitization is a special case which monopolizes the device
300
* and cannot be timesliced. Handle asynchronously instead,
301
* and allow userspace to poll(2) for completion.
302
*/
303
if (mbox_cmd->opcode == CXL_MBOX_OP_SANITIZE) {
304
if (mds->security.sanitize_active)
305
return -EBUSY;
306
307
/* give first timeout a second */
308
timeout = 1;
309
mds->security.poll_tmo_secs = timeout;
310
mds->security.sanitize_active = true;
311
schedule_delayed_work(&mds->security.poll_dwork,
312
timeout * HZ);
313
dev_dbg(dev, "Sanitization operation started\n");
314
goto success;
315
}
316
317
dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
318
mbox_cmd->opcode);
319
320
timeout = mbox_cmd->poll_interval_ms;
321
for (i = 0; i < mbox_cmd->poll_count; i++) {
322
if (rcuwait_wait_event_timeout(&cxl_mbox->mbox_wait,
323
cxl_mbox_background_complete(cxlds),
324
TASK_UNINTERRUPTIBLE,
325
msecs_to_jiffies(timeout)) > 0)
326
break;
327
}
328
329
if (!cxl_mbox_background_complete(cxlds)) {
330
dev_err(dev, "timeout waiting for background (%d ms)\n",
331
timeout * mbox_cmd->poll_count);
332
return -ETIMEDOUT;
333
}
334
335
bg_status_reg = readq(cxlds->regs.mbox +
336
CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
337
mbox_cmd->return_code =
338
FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
339
bg_status_reg);
340
dev_dbg(dev,
341
"Mailbox background operation (0x%04x) completed\n",
342
mbox_cmd->opcode);
343
}
344
345
if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
346
dev_dbg(dev, "Mailbox operation had an error: %s\n",
347
cxl_mbox_cmd_rc2str(mbox_cmd));
348
return 0; /* completed but caller must check return_code */
349
}
350
351
success:
352
/* #7 */
353
cmd_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
354
out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg);
355
356
/* #8 */
357
if (out_len && mbox_cmd->payload_out) {
358
/*
359
* Sanitize the copy. If hardware misbehaves, out_len per the
360
* spec can actually be greater than the max allowed size (21
361
 * bits available but spec-defined 1M max). The caller may also
362
* have requested less data than the hardware supplied even
363
* within spec.
364
*/
365
size_t n;
366
367
n = min3(mbox_cmd->size_out, cxl_mbox->payload_size, out_len);
368
memcpy_fromio(mbox_cmd->payload_out, payload, n);
369
mbox_cmd->size_out = n;
370
} else {
371
mbox_cmd->size_out = 0;
372
}
373
374
return 0;
375
}
376
377
static int cxl_pci_mbox_send(struct cxl_mailbox *cxl_mbox,
378
struct cxl_mbox_cmd *cmd)
379
{
380
int rc;
381
382
mutex_lock(&cxl_mbox->mbox_mutex);
383
rc = __cxl_pci_mbox_send_cmd(cxl_mbox, cmd);
384
mutex_unlock(&cxl_mbox->mbox_mutex);
385
386
return rc;
387
}
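/*
 * Illustrative sketch, not part of the driver: per the
 * __cxl_pci_mbox_send_cmd() kernel-doc above, the send path returns 0 once
 * a command completes even when the device reports an error, so a
 * hypothetical caller of the ->mbox_send() op checks both results
 * ("health" below is a hypothetical output buffer):
 *
 *	struct cxl_mbox_cmd cmd = {
 *		.opcode = CXL_MBOX_OP_GET_HEALTH_INFO,
 *		.payload_out = &health,
 *		.size_out = sizeof(health),
 *	};
 *	int rc = cxl_mbox->mbox_send(cxl_mbox, &cmd);
 *
 *	if (rc)
 *		return rc;			(doorbell busy / timeout)
 *	if (cmd.return_code != CXL_MBOX_CMD_RC_SUCCESS)
 *		return -ENXIO;			(device-reported failure)
 *
 * In practice driver code goes through the cxl_core wrapper,
 * cxl_internal_send_cmd(), rather than calling the op directly.
 */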
388
389
static int cxl_pci_setup_mailbox(struct cxl_memdev_state *mds, bool irq_avail)
390
{
391
struct cxl_dev_state *cxlds = &mds->cxlds;
392
struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
393
const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
394
struct device *dev = cxlds->dev;
395
unsigned long timeout;
396
int irq, msgnum;
397
u64 md_status;
398
u32 ctrl;
399
400
timeout = jiffies + mbox_ready_timeout * HZ;
401
do {
402
md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
403
if (md_status & CXLMDEV_MBOX_IF_READY)
404
break;
405
if (msleep_interruptible(100))
406
break;
407
} while (!time_after(jiffies, timeout));
408
409
if (!(md_status & CXLMDEV_MBOX_IF_READY)) {
410
cxl_err(dev, md_status, "timeout awaiting mailbox ready");
411
return -ETIMEDOUT;
412
}
413
414
/*
415
* A command may be in flight from a previous driver instance,
416
* think kexec, do one doorbell wait so that
417
* __cxl_pci_mbox_send_cmd() can assume that it is the only
418
* source for future doorbell busy events.
419
*/
420
if (cxl_pci_mbox_wait_for_doorbell(cxlds) != 0) {
421
cxl_err(dev, md_status, "timeout awaiting mailbox idle");
422
return -ETIMEDOUT;
423
}
424
425
cxl_mbox->mbox_send = cxl_pci_mbox_send;
426
cxl_mbox->payload_size =
427
1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);
428
429
/*
430
* CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
431
*
432
* If the size is too small, mandatory commands will not work and so
433
* there's no point in going forward. If the size is too large, there's
434
 * no harm in soft limiting it.
435
*/
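/*
 * Worked example (derived from the register read above, not spec text):
 * the capability field encodes the payload size as a power of two, so a
 * field value of 8 yields the 256 byte minimum enforced below, and a
 * value of 20 yields the 1M maximum that gets soft-limited here.
 */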
436
cxl_mbox->payload_size = min_t(size_t, cxl_mbox->payload_size, SZ_1M);
437
if (cxl_mbox->payload_size < 256) {
438
dev_err(dev, "Mailbox is too small (%zub)",
439
cxl_mbox->payload_size);
440
return -ENXIO;
441
}
442
443
dev_dbg(dev, "Mailbox payload sized %zu", cxl_mbox->payload_size);
444
445
INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mbox_sanitize_work);
446
447
/* background command interrupts are optional */
448
if (!(cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) || !irq_avail)
449
return 0;
450
451
msgnum = FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap);
452
irq = pci_irq_vector(to_pci_dev(cxlds->dev), msgnum);
453
if (irq < 0)
454
return 0;
455
456
if (cxl_request_irq(cxlds, irq, cxl_pci_mbox_irq))
457
return 0;
458
459
dev_dbg(cxlds->dev, "Mailbox interrupts enabled\n");
460
/* enable background command mbox irq support */
461
ctrl = readl(cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
462
ctrl |= CXLDEV_MBOX_CTRL_BG_CMD_IRQ;
463
writel(ctrl, cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
464
465
return 0;
466
}
467
468
/*
469
* Assume that any RCIEP that emits the CXL memory expander class code
470
 * is an RCD.
471
*/
472
static bool is_cxl_restricted(struct pci_dev *pdev)
473
{
474
return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END;
475
}
476
477
static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev,
478
struct cxl_register_map *map,
479
struct cxl_dport *dport)
480
{
481
resource_size_t component_reg_phys;
482
483
*map = (struct cxl_register_map) {
484
.host = &pdev->dev,
485
.resource = CXL_RESOURCE_NONE,
486
};
487
488
struct cxl_port *port __free(put_cxl_port) =
489
cxl_pci_find_port(pdev, &dport);
490
if (!port)
491
return -EPROBE_DEFER;
492
493
component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport);
494
if (component_reg_phys == CXL_RESOURCE_NONE)
495
return -ENXIO;
496
497
map->resource = component_reg_phys;
498
map->reg_type = CXL_REGLOC_RBI_COMPONENT;
499
map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE;
500
501
return 0;
502
}
503
504
static int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
505
struct cxl_register_map *map)
506
{
507
int rc;
508
509
rc = cxl_find_regblock(pdev, type, map);
510
511
/*
512
* If the Register Locator DVSEC does not exist, check if it
513
* is an RCH and try to extract the Component Registers from
514
* an RCRB.
515
*/
516
if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) {
517
struct cxl_dport *dport;
518
struct cxl_port *port __free(put_cxl_port) =
519
cxl_pci_find_port(pdev, &dport);
520
if (!port)
521
return -EPROBE_DEFER;
522
523
rc = cxl_rcrb_get_comp_regs(pdev, map, dport);
524
if (rc)
525
return rc;
526
527
rc = cxl_dport_map_rcd_linkcap(pdev, dport);
528
if (rc)
529
return rc;
530
531
} else if (rc) {
532
return rc;
533
}
534
535
return cxl_setup_regs(map);
536
}
537
538
static int cxl_pci_ras_unmask(struct pci_dev *pdev)
539
{
540
struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
541
void __iomem *addr;
542
u32 orig_val, val, mask;
543
u16 cap;
544
int rc;
545
546
if (!cxlds->regs.ras) {
547
dev_dbg(&pdev->dev, "No RAS registers.\n");
548
return 0;
549
}
550
551
/* BIOS has PCIe AER error control */
552
if (!pcie_aer_is_native(pdev))
553
return 0;
554
555
rc = pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &cap);
556
if (rc)
557
return rc;
558
559
if (cap & PCI_EXP_DEVCTL_URRE) {
560
addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_MASK_OFFSET;
561
orig_val = readl(addr);
562
563
mask = CXL_RAS_UNCORRECTABLE_MASK_MASK |
564
CXL_RAS_UNCORRECTABLE_MASK_F256B_MASK;
565
val = orig_val & ~mask;
566
writel(val, addr);
567
dev_dbg(&pdev->dev,
568
"Uncorrectable RAS Errors Mask: %#x -> %#x\n",
569
orig_val, val);
570
}
571
572
if (cap & PCI_EXP_DEVCTL_CERE) {
573
addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_MASK_OFFSET;
574
orig_val = readl(addr);
575
val = orig_val & ~CXL_RAS_CORRECTABLE_MASK_MASK;
576
writel(val, addr);
577
dev_dbg(&pdev->dev, "Correctable RAS Errors Mask: %#x -> %#x\n",
578
orig_val, val);
579
}
580
581
return 0;
582
}
583
584
static void free_event_buf(void *buf)
585
{
586
kvfree(buf);
587
}
588
589
/*
590
* There is a single buffer for reading event logs from the mailbox. All logs
591
* share this buffer protected by the mds->event_log_lock.
592
*/
593
static int cxl_mem_alloc_event_buf(struct cxl_memdev_state *mds)
594
{
595
struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
596
struct cxl_get_event_payload *buf;
597
598
buf = kvmalloc(cxl_mbox->payload_size, GFP_KERNEL);
599
if (!buf)
600
return -ENOMEM;
601
mds->event.buf = buf;
602
603
return devm_add_action_or_reset(mds->cxlds.dev, free_event_buf, buf);
604
}
605
606
static bool cxl_alloc_irq_vectors(struct pci_dev *pdev)
607
{
608
int nvecs;
609
610
/*
611
* Per CXL 3.0 3.1.1 CXL.io Endpoint a function on a CXL device must
612
* not generate INTx messages if that function participates in
613
* CXL.cache or CXL.mem.
614
*
615
* Additionally pci_alloc_irq_vectors() handles calling
616
* pci_free_irq_vectors() automatically despite not being called
617
* pcim_*. See pci_setup_msi_context().
618
*/
619
nvecs = pci_alloc_irq_vectors(pdev, 1, CXL_PCI_DEFAULT_MAX_VECTORS,
620
PCI_IRQ_MSIX | PCI_IRQ_MSI);
621
if (nvecs < 1) {
622
dev_dbg(&pdev->dev, "Failed to alloc irq vectors: %d\n", nvecs);
623
return false;
624
}
625
return true;
626
}
627
628
static irqreturn_t cxl_event_thread(int irq, void *id)
629
{
630
struct cxl_dev_id *dev_id = id;
631
struct cxl_dev_state *cxlds = dev_id->cxlds;
632
struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
633
u32 status;
634
635
do {
636
/*
637
* CXL 3.0 8.2.8.3.1: The lower 32 bits are the status;
638
* ignore the reserved upper 32 bits
639
*/
640
status = readl(cxlds->regs.status + CXLDEV_DEV_EVENT_STATUS_OFFSET);
641
/* Ignore logs unknown to the driver */
642
status &= CXLDEV_EVENT_STATUS_ALL;
643
if (!status)
644
break;
645
cxl_mem_get_event_records(mds, status);
646
cond_resched();
647
} while (status);
648
649
return IRQ_HANDLED;
650
}
651
652
static int cxl_event_req_irq(struct cxl_dev_state *cxlds, u8 setting)
653
{
654
struct pci_dev *pdev = to_pci_dev(cxlds->dev);
655
int irq;
656
657
if (FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting) != CXL_INT_MSI_MSIX)
658
return -ENXIO;
659
660
irq = pci_irq_vector(pdev,
661
FIELD_GET(CXLDEV_EVENT_INT_MSGNUM_MASK, setting));
662
if (irq < 0)
663
return irq;
664
665
return cxl_request_irq(cxlds, irq, cxl_event_thread);
666
}
667
668
static int cxl_event_get_int_policy(struct cxl_memdev_state *mds,
669
struct cxl_event_interrupt_policy *policy)
670
{
671
struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
672
struct cxl_mbox_cmd mbox_cmd = {
673
.opcode = CXL_MBOX_OP_GET_EVT_INT_POLICY,
674
.payload_out = policy,
675
.size_out = sizeof(*policy),
676
};
677
int rc;
678
679
rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
680
if (rc < 0)
681
dev_err(mds->cxlds.dev,
682
"Failed to get event interrupt policy : %d", rc);
683
684
return rc;
685
}
686
687
static int cxl_event_config_msgnums(struct cxl_memdev_state *mds,
688
struct cxl_event_interrupt_policy *policy)
689
{
690
struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
691
struct cxl_mbox_cmd mbox_cmd;
692
int rc;
693
694
*policy = (struct cxl_event_interrupt_policy) {
695
.info_settings = CXL_INT_MSI_MSIX,
696
.warn_settings = CXL_INT_MSI_MSIX,
697
.failure_settings = CXL_INT_MSI_MSIX,
698
.fatal_settings = CXL_INT_MSI_MSIX,
699
};
700
701
mbox_cmd = (struct cxl_mbox_cmd) {
702
.opcode = CXL_MBOX_OP_SET_EVT_INT_POLICY,
703
.payload_in = policy,
704
.size_in = sizeof(*policy),
705
};
706
707
rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
708
if (rc < 0) {
709
dev_err(mds->cxlds.dev, "Failed to set event interrupt policy : %d",
710
rc);
711
return rc;
712
}
713
714
/* Retrieve final interrupt settings */
715
return cxl_event_get_int_policy(mds, policy);
716
}
717
718
static int cxl_event_irqsetup(struct cxl_memdev_state *mds)
719
{
720
struct cxl_dev_state *cxlds = &mds->cxlds;
721
struct cxl_event_interrupt_policy policy;
722
int rc;
723
724
rc = cxl_event_config_msgnums(mds, &policy);
725
if (rc)
726
return rc;
727
728
rc = cxl_event_req_irq(cxlds, policy.info_settings);
729
if (rc) {
730
dev_err(cxlds->dev, "Failed to get interrupt for event Info log\n");
731
return rc;
732
}
733
734
rc = cxl_event_req_irq(cxlds, policy.warn_settings);
735
if (rc) {
736
dev_err(cxlds->dev, "Failed to get interrupt for event Warn log\n");
737
return rc;
738
}
739
740
rc = cxl_event_req_irq(cxlds, policy.failure_settings);
741
if (rc) {
742
dev_err(cxlds->dev, "Failed to get interrupt for event Failure log\n");
743
return rc;
744
}
745
746
rc = cxl_event_req_irq(cxlds, policy.fatal_settings);
747
if (rc) {
748
dev_err(cxlds->dev, "Failed to get interrupt for event Fatal log\n");
749
return rc;
750
}
751
752
return 0;
753
}
754
755
static bool cxl_event_int_is_fw(u8 setting)
756
{
757
u8 mode = FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting);
758
759
return mode == CXL_INT_FW;
760
}
761
762
static int cxl_event_config(struct pci_host_bridge *host_bridge,
763
struct cxl_memdev_state *mds, bool irq_avail)
764
{
765
struct cxl_event_interrupt_policy policy;
766
int rc;
767
768
/*
769
* When BIOS maintains CXL error reporting control, it will process
770
* event records. Only one agent can do so.
771
*/
772
if (!host_bridge->native_cxl_error)
773
return 0;
774
775
if (!irq_avail) {
776
dev_info(mds->cxlds.dev, "No interrupt support, disabling event processing.\n");
777
return 0;
778
}
779
780
rc = cxl_event_get_int_policy(mds, &policy);
781
if (rc)
782
return rc;
783
784
if (cxl_event_int_is_fw(policy.info_settings) ||
785
cxl_event_int_is_fw(policy.warn_settings) ||
786
cxl_event_int_is_fw(policy.failure_settings) ||
787
cxl_event_int_is_fw(policy.fatal_settings)) {
788
dev_err(mds->cxlds.dev,
789
"FW still in control of Event Logs despite _OSC settings\n");
790
return -EBUSY;
791
}
792
793
rc = cxl_mem_alloc_event_buf(mds);
794
if (rc)
795
return rc;
796
797
rc = cxl_event_irqsetup(mds);
798
if (rc)
799
return rc;
800
801
cxl_mem_get_event_records(mds, CXLDEV_EVENT_STATUS_ALL);
802
803
return 0;
804
}
805
806
static int cxl_pci_type3_init_mailbox(struct cxl_dev_state *cxlds)
807
{
808
int rc;
809
810
/*
811
* Fail the init if there's no mailbox. For a type3 this is out of spec.
812
*/
813
if (!cxlds->reg_map.device_map.mbox.valid)
814
return -ENODEV;
815
816
rc = cxl_mailbox_init(&cxlds->cxl_mbox, cxlds->dev);
817
if (rc)
818
return rc;
819
820
return 0;
821
}
822
823
static ssize_t rcd_pcie_cap_emit(struct device *dev, u16 offset, char *buf, size_t width)
824
{
825
struct cxl_dev_state *cxlds = dev_get_drvdata(dev);
826
struct cxl_memdev *cxlmd = cxlds->cxlmd;
827
struct device *root_dev;
828
struct cxl_dport *dport;
829
struct cxl_port *root __free(put_cxl_port) =
830
cxl_mem_find_port(cxlmd, &dport);
831
832
if (!root)
833
return -ENXIO;
834
835
root_dev = root->uport_dev;
836
if (!root_dev)
837
return -ENXIO;
838
839
if (!dport->regs.rcd_pcie_cap)
840
return -ENXIO;
841
842
guard(device)(root_dev);
843
if (!root_dev->driver)
844
return -ENXIO;
845
846
switch (width) {
847
case 2:
848
return sysfs_emit(buf, "%#x\n",
849
readw(dport->regs.rcd_pcie_cap + offset));
850
case 4:
851
return sysfs_emit(buf, "%#x\n",
852
readl(dport->regs.rcd_pcie_cap + offset));
853
default:
854
return -EINVAL;
855
}
856
}
857
858
static ssize_t rcd_link_cap_show(struct device *dev,
859
struct device_attribute *attr, char *buf)
860
{
861
return rcd_pcie_cap_emit(dev, PCI_EXP_LNKCAP, buf, sizeof(u32));
862
}
863
static DEVICE_ATTR_RO(rcd_link_cap);
864
865
static ssize_t rcd_link_ctrl_show(struct device *dev,
866
struct device_attribute *attr, char *buf)
867
{
868
return rcd_pcie_cap_emit(dev, PCI_EXP_LNKCTL, buf, sizeof(u16));
869
}
870
static DEVICE_ATTR_RO(rcd_link_ctrl);
871
872
static ssize_t rcd_link_status_show(struct device *dev,
873
struct device_attribute *attr, char *buf)
874
{
875
return rcd_pcie_cap_emit(dev, PCI_EXP_LNKSTA, buf, sizeof(u16));
876
}
877
static DEVICE_ATTR_RO(rcd_link_status);
878
879
static struct attribute *cxl_rcd_attrs[] = {
880
&dev_attr_rcd_link_cap.attr,
881
&dev_attr_rcd_link_ctrl.attr,
882
&dev_attr_rcd_link_status.attr,
883
NULL
884
};
885
886
static umode_t cxl_rcd_visible(struct kobject *kobj, struct attribute *a, int n)
887
{
888
struct device *dev = kobj_to_dev(kobj);
889
struct pci_dev *pdev = to_pci_dev(dev);
890
891
if (is_cxl_restricted(pdev))
892
return a->mode;
893
894
return 0;
895
}
896
897
static struct attribute_group cxl_rcd_group = {
898
.attrs = cxl_rcd_attrs,
899
.is_visible = cxl_rcd_visible,
900
};
901
__ATTRIBUTE_GROUPS(cxl_rcd);
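/*
 * Usage note (illustrative, assuming standard driver dev_groups behavior):
 * for an RCD-mode device bound to this driver the attributes above appear
 * under the PCI device's sysfs directory, e.g. for a hypothetical
 * 0000:3c:00.0 device:
 *
 *   cat /sys/bus/pci/devices/0000:3c:00.0/rcd_link_status
 */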
902
903
static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
904
{
905
struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus);
906
struct cxl_dpa_info range_info = { 0 };
907
struct cxl_memdev_state *mds;
908
struct cxl_dev_state *cxlds;
909
struct cxl_register_map map;
910
struct cxl_memdev *cxlmd;
911
int rc, pmu_count;
912
unsigned int i;
913
bool irq_avail;
914
915
/*
916
* Double check the anonymous union trickery in struct cxl_regs
917
* FIXME switch to struct_group()
918
*/
919
BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) !=
920
offsetof(struct cxl_regs, device_regs.memdev));
921
922
rc = pcim_enable_device(pdev);
923
if (rc)
924
return rc;
925
pci_set_master(pdev);
926
927
mds = cxl_memdev_state_create(&pdev->dev);
928
if (IS_ERR(mds))
929
return PTR_ERR(mds);
930
cxlds = &mds->cxlds;
931
pci_set_drvdata(pdev, cxlds);
932
933
cxlds->rcd = is_cxl_restricted(pdev);
934
cxlds->serial = pci_get_dsn(pdev);
935
cxlds->cxl_dvsec = pci_find_dvsec_capability(
936
pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
937
if (!cxlds->cxl_dvsec)
938
dev_warn(&pdev->dev,
939
"Device DVSEC not present, skip CXL.mem init\n");
940
941
rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map);
942
if (rc)
943
return rc;
944
945
rc = cxl_map_device_regs(&map, &cxlds->regs.device_regs);
946
if (rc)
947
return rc;
948
949
/*
950
* If the component registers can't be found, the cxl_pci driver may
951
* still be useful for management functions so don't return an error.
952
*/
953
rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_COMPONENT,
954
&cxlds->reg_map);
955
if (rc)
956
dev_warn(&pdev->dev, "No component registers (%d)\n", rc);
957
else if (!cxlds->reg_map.component_map.ras.valid)
958
dev_dbg(&pdev->dev, "RAS registers not found\n");
959
960
rc = cxl_map_component_regs(&cxlds->reg_map, &cxlds->regs.component,
961
BIT(CXL_CM_CAP_CAP_ID_RAS));
962
if (rc)
963
dev_dbg(&pdev->dev, "Failed to map RAS capability.\n");
964
965
rc = cxl_pci_type3_init_mailbox(cxlds);
966
if (rc)
967
return rc;
968
969
rc = cxl_await_media_ready(cxlds);
970
if (rc == 0)
971
cxlds->media_ready = true;
972
else
973
dev_warn(&pdev->dev, "Media not active (%d)\n", rc);
974
975
irq_avail = cxl_alloc_irq_vectors(pdev);
976
977
rc = cxl_pci_setup_mailbox(mds, irq_avail);
978
if (rc)
979
return rc;
980
981
rc = cxl_enumerate_cmds(mds);
982
if (rc)
983
return rc;
984
985
rc = cxl_set_timestamp(mds);
986
if (rc)
987
return rc;
988
989
rc = cxl_poison_state_init(mds);
990
if (rc)
991
return rc;
992
993
rc = cxl_dev_state_identify(mds);
994
if (rc)
995
return rc;
996
997
rc = cxl_mem_dpa_fetch(mds, &range_info);
998
if (rc)
999
return rc;
1000
1001
rc = cxl_dpa_setup(cxlds, &range_info);
1002
if (rc)
1003
return rc;
1004
1005
rc = devm_cxl_setup_features(cxlds);
1006
if (rc)
1007
dev_dbg(&pdev->dev, "No CXL Features discovered\n");
1008
1009
cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds);
1010
if (IS_ERR(cxlmd))
1011
return PTR_ERR(cxlmd);
1012
1013
rc = devm_cxl_setup_fw_upload(&pdev->dev, mds);
1014
if (rc)
1015
return rc;
1016
1017
rc = devm_cxl_sanitize_setup_notifier(&pdev->dev, cxlmd);
1018
if (rc)
1019
return rc;
1020
1021
rc = devm_cxl_setup_fwctl(&pdev->dev, cxlmd);
1022
if (rc)
1023
dev_dbg(&pdev->dev, "No CXL FWCTL setup\n");
1024
1025
pmu_count = cxl_count_regblock(pdev, CXL_REGLOC_RBI_PMU);
1026
if (pmu_count < 0)
1027
return pmu_count;
1028
1029
for (i = 0; i < pmu_count; i++) {
1030
struct cxl_pmu_regs pmu_regs;
1031
1032
rc = cxl_find_regblock_instance(pdev, CXL_REGLOC_RBI_PMU, &map, i);
1033
if (rc) {
1034
dev_dbg(&pdev->dev, "Could not find PMU regblock\n");
1035
break;
1036
}
1037
1038
rc = cxl_map_pmu_regs(&map, &pmu_regs);
1039
if (rc) {
1040
dev_dbg(&pdev->dev, "Could not map PMU regs\n");
1041
break;
1042
}
1043
1044
rc = devm_cxl_pmu_add(cxlds->dev, &pmu_regs, cxlmd->id, i, CXL_PMU_MEMDEV);
1045
if (rc) {
1046
dev_dbg(&pdev->dev, "Could not add PMU instance\n");
1047
break;
1048
}
1049
}
1050
1051
rc = cxl_event_config(host_bridge, mds, irq_avail);
1052
if (rc)
1053
return rc;
1054
1055
if (cxl_pci_ras_unmask(pdev))
1056
dev_dbg(&pdev->dev, "No RAS reporting unmasked\n");
1057
1058
pci_save_state(pdev);
1059
1060
return rc;
1061
}
1062
1063
static const struct pci_device_id cxl_mem_pci_tbl[] = {
1064
	/* PCI class code for CXL.mem Type-3 Devices (class/sub-class/prog-if 05_02_10h) */
1065
{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
1066
{ /* terminate list */ },
1067
};
1068
MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);
1069
1070
static pci_ers_result_t cxl_slot_reset(struct pci_dev *pdev)
1071
{
1072
struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
1073
struct cxl_memdev *cxlmd = cxlds->cxlmd;
1074
struct device *dev = &cxlmd->dev;
1075
1076
dev_info(&pdev->dev, "%s: restart CXL.mem after slot reset\n",
1077
dev_name(dev));
1078
pci_restore_state(pdev);
1079
if (device_attach(dev) <= 0)
1080
return PCI_ERS_RESULT_DISCONNECT;
1081
return PCI_ERS_RESULT_RECOVERED;
1082
}
1083
1084
static void cxl_error_resume(struct pci_dev *pdev)
1085
{
1086
struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
1087
struct cxl_memdev *cxlmd = cxlds->cxlmd;
1088
struct device *dev = &cxlmd->dev;
1089
1090
dev_info(&pdev->dev, "%s: error resume %s\n", dev_name(dev),
1091
dev->driver ? "successful" : "failed");
1092
}
1093
1094
static void cxl_reset_done(struct pci_dev *pdev)
1095
{
1096
struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
1097
struct cxl_memdev *cxlmd = cxlds->cxlmd;
1098
struct device *dev = &pdev->dev;
1099
1100
/*
1101
	 * FLR is not expected to touch the HDM decoders and related
1102
* registers. SBR, however, will wipe all device configurations.
1103
* Issue a warning if there was an active decoder before the reset
1104
* that no longer exists.
1105
*/
1106
guard(device)(&cxlmd->dev);
1107
if (cxlmd->endpoint &&
1108
cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) {
1109
dev_crit(dev, "SBR happened without memory regions removal.\n");
1110
dev_crit(dev, "System may be unstable if regions hosted system memory.\n");
1111
add_taint(TAINT_USER, LOCKDEP_STILL_OK);
1112
}
1113
}
1114
1115
static const struct pci_error_handlers cxl_error_handlers = {
1116
.error_detected = cxl_error_detected,
1117
.slot_reset = cxl_slot_reset,
1118
.resume = cxl_error_resume,
1119
.cor_error_detected = cxl_cor_error_detected,
1120
.reset_done = cxl_reset_done,
1121
};
1122
1123
static struct pci_driver cxl_pci_driver = {
1124
.name = KBUILD_MODNAME,
1125
.id_table = cxl_mem_pci_tbl,
1126
.probe = cxl_pci_probe,
1127
.err_handler = &cxl_error_handlers,
1128
.dev_groups = cxl_rcd_groups,
1129
.driver = {
1130
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
1131
},
1132
};
1133
1134
#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
1135
static void cxl_handle_cper_event(enum cxl_event_type ev_type,
1136
struct cxl_cper_event_rec *rec)
1137
{
1138
struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
1139
struct pci_dev *pdev __free(pci_dev_put) = NULL;
1140
enum cxl_event_log_type log_type;
1141
struct cxl_dev_state *cxlds;
1142
unsigned int devfn;
1143
u32 hdr_flags;
1144
1145
pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
1146
device_id->segment_num, device_id->bus_num,
1147
device_id->device_num, device_id->func_num);
1148
1149
devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
1150
pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
1151
device_id->bus_num, devfn);
1152
if (!pdev)
1153
return;
1154
1155
guard(device)(&pdev->dev);
1156
if (pdev->driver != &cxl_pci_driver)
1157
return;
1158
1159
cxlds = pci_get_drvdata(pdev);
1160
if (!cxlds)
1161
return;
1162
1163
/* Fabricate a log type */
1164
hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags);
1165
log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);
1166
1167
cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type,
1168
&uuid_null, &rec->event);
1169
}
1170
1171
static void cxl_cper_work_fn(struct work_struct *work)
1172
{
1173
struct cxl_cper_work_data wd;
1174
1175
while (cxl_cper_kfifo_get(&wd))
1176
cxl_handle_cper_event(wd.event_type, &wd.rec);
1177
}
1178
static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
1179
1180
static int __init cxl_pci_driver_init(void)
1181
{
1182
int rc;
1183
1184
rc = pci_register_driver(&cxl_pci_driver);
1185
if (rc)
1186
return rc;
1187
1188
rc = cxl_cper_register_work(&cxl_cper_work);
1189
if (rc)
1190
pci_unregister_driver(&cxl_pci_driver);
1191
1192
return rc;
1193
}
1194
1195
static void __exit cxl_pci_driver_exit(void)
1196
{
1197
cxl_cper_unregister_work(&cxl_cper_work);
1198
cancel_work_sync(&cxl_cper_work);
1199
pci_unregister_driver(&cxl_pci_driver);
1200
}
1201
1202
module_init(cxl_pci_driver_init);
1203
module_exit(cxl_pci_driver_exit);
1204
MODULE_DESCRIPTION("CXL: PCI manageability");
1205
MODULE_LICENSE("GPL v2");
1206
MODULE_IMPORT_NS("CXL");
1207
1208