Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/drivers/accel/amdxdna/aie2_error.c
26427 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
4
*/
5
6
#include <drm/drm_cache.h>
7
#include <drm/drm_device.h>
8
#include <drm/drm_print.h>
9
#include <drm/gpu_scheduler.h>
10
#include <linux/dma-mapping.h>
11
#include <linux/kthread.h>
12
#include <linux/kernel.h>
13
14
#include "aie2_msg_priv.h"
15
#include "aie2_pci.h"
16
#include "amdxdna_mailbox.h"
17
#include "amdxdna_pci_drv.h"
18
19
/* One in-flight async error notification slot (one per AIE column). */
struct async_event {
	struct amdxdna_dev_hdl *ndev;		/* owning device handle */
	struct async_event_msg_resp resp;	/* last response from mailbox callback */
	struct workqueue_struct *wq;		/* queue that runs 'work' */
	struct work_struct work;		/* decodes the error buffer and re-arms */
	u8 *buf;				/* CPU address of this slot's DMA buffer */
	dma_addr_t addr;			/* device (DMA) address of 'buf' */
	u32 size;				/* slot size in bytes (ASYNC_BUF_SIZE) */
};
28
29
/* Container for all per-column async events; they share one DMA buffer. */
struct async_events {
	struct workqueue_struct *wq;	/* ordered workqueue shared by all events */
	u8 *buf;			/* CPU address of the whole DMA buffer */
	dma_addr_t addr;		/* device (DMA) address of 'buf' */
	u32 size;			/* total buffer size in bytes */
	u32 event_cnt;			/* number of entries in event[] */
	struct async_event event[] __counted_by(event_cnt);
};
37
38
/*
 * The enum, struct and lookup tables below are ported from the XAIE util
 * header file.
 *
 * This data is defined by the AIE device and is used to decode error
 * messages coming from the device.
 */
44
45
/* AIE tile module types; numeric values are defined by the AIE device. */
enum aie_module_type {
	AIE_MEM_MOD = 0,	/* memory module */
	AIE_CORE_MOD,		/* core module */
	AIE_PL_MOD,		/* PL/shim module */
};
50
51
/* Coarse classification of device error events, used for logging. */
enum aie_error_category {
	AIE_ERROR_SATURATION = 0,
	AIE_ERROR_FP,
	AIE_ERROR_STREAM,
	AIE_ERROR_ACCESS,
	AIE_ERROR_BUS,
	AIE_ERROR_INSTRUCTION,
	AIE_ERROR_ECC,
	AIE_ERROR_LOCK,
	AIE_ERROR_DMA,
	AIE_ERROR_MEM_PARITY,
	/* Unknown is not from XAIE, added for better category */
	AIE_ERROR_UNKNOWN,
};
65
66
/* Don't pack, unless XAIE side changed */
/*
 * One error record as reported by the device. Field widths (__u8/__u32)
 * mirror the XAIE definition; the layout relies on natural alignment.
 */
struct aie_error {
	__u8 row;		/* tile row */
	__u8 col;		/* tile column */
	__u32 mod_type;		/* enum aie_module_type */
	__u8 event_id;		/* device event id, see the lookup tables below */
};
73
74
/*
 * Header of the error payload the device DMAs into an async event buffer,
 * followed by err_cnt struct aie_error records.
 */
struct aie_err_info {
	u32 err_cnt;	/* number of entries in payload[]; device-supplied, validate */
	u32 ret_code;	/* device return code */
	u32 rsvd;	/* reserved/padding */
	struct aie_error payload[] __counted_by(err_cnt);
};
80
81
/* Maps one device event id to a coarse error category. */
struct aie_event_category {
	u8 event_id;			/* event id from struct aie_error */
	enum aie_error_category category;
};
85
86
#define EVENT_CATEGORY(id, cat) { id, cat }

/* Event id -> category for compute tile memory modules (AIE_MEM_MOD, row != 1). */
static const struct aie_event_category aie_ml_mem_event_cat[] = {
	EVENT_CATEGORY(88U, AIE_ERROR_ECC),
	EVENT_CATEGORY(90U, AIE_ERROR_ECC),
	EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(97U, AIE_ERROR_DMA),
	EVENT_CATEGORY(98U, AIE_ERROR_DMA),
	EVENT_CATEGORY(99U, AIE_ERROR_DMA),
	EVENT_CATEGORY(100U, AIE_ERROR_DMA),
	EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
};
102
103
/* Event id -> category for core modules (AIE_CORE_MOD). */
static const struct aie_event_category aie_ml_core_event_cat[] = {
	EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(58U, AIE_ERROR_BUS),
	EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(62U, AIE_ERROR_ECC),
	EVENT_CATEGORY(64U, AIE_ERROR_ECC),
	EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
	EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(72U, AIE_ERROR_BUS),
};
119
120
/* Event id -> category for mem tile memory modules (AIE_MEM_MOD, row == 1). */
static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
	EVENT_CATEGORY(130U, AIE_ERROR_ECC),
	EVENT_CATEGORY(132U, AIE_ERROR_ECC),
	EVENT_CATEGORY(133U, AIE_ERROR_DMA),
	EVENT_CATEGORY(134U, AIE_ERROR_DMA),
	EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(138U, AIE_ERROR_BUS),
	EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
};
131
132
/* Event id -> category for shim tile modules (AIE_PL_MOD). */
static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
	EVENT_CATEGORY(64U, AIE_ERROR_BUS),
	EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(67U, AIE_ERROR_BUS),
	EVENT_CATEGORY(68U, AIE_ERROR_BUS),
	EVENT_CATEGORY(69U, AIE_ERROR_BUS),
	EVENT_CATEGORY(70U, AIE_ERROR_BUS),
	EVENT_CATEGORY(71U, AIE_ERROR_BUS),
	EVENT_CATEGORY(72U, AIE_ERROR_DMA),
	EVENT_CATEGORY(73U, AIE_ERROR_DMA),
	EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
};
145
146
static enum aie_error_category
147
aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
148
{
149
const struct aie_event_category *lut;
150
int num_entry;
151
int i;
152
153
switch (mod_type) {
154
case AIE_PL_MOD:
155
lut = aie_ml_shim_tile_event_cat;
156
num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
157
break;
158
case AIE_CORE_MOD:
159
lut = aie_ml_core_event_cat;
160
num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
161
break;
162
case AIE_MEM_MOD:
163
if (row == 1) {
164
lut = aie_ml_mem_tile_event_cat;
165
num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
166
} else {
167
lut = aie_ml_mem_event_cat;
168
num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
169
}
170
break;
171
default:
172
return AIE_ERROR_UNKNOWN;
173
}
174
175
for (i = 0; i < num_entry; i++) {
176
if (event_id != lut[i].event_id)
177
continue;
178
179
return lut[i].category;
180
}
181
182
return AIE_ERROR_UNKNOWN;
183
}
184
185
/*
 * Decode each error record from the device, log it, and build a bitmap
 * of the columns that reported errors.
 *
 * Returns a bitmask with bit N set when column N had at least one error,
 * or 0 when no valid column was seen. Stops at the first record whose
 * column does not fit in the 32-bit bitmap.
 */
static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
	struct aie_error *errs = err_info;
	u32 err_col = 0; /* assume that AIE has less than 32 columns */
	u32 i;

	/* Get err column bitmap */
	for (i = 0; i < num_err; i++) {
		struct aie_error *err = &errs[i];
		enum aie_error_category cat;

		cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
		XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
			 err->row, err->col, err->mod_type,
			 err->event_id, cat);

		if (err->col >= 32) {
			XDNA_WARN(ndev->xdna, "Invalid column number");
			break;
		}

		/* Unsigned shift: 1 << 31 would overflow a signed int (UB). */
		err_col |= (1U << err->col);
	}

	return err_col;
}
211
212
/*
 * Mailbox callback for an async error message. Copies the response type
 * and status out of the mailbox I/O buffer, then defers all processing to
 * the event's workqueue. Always returns 0.
 */
static int aie2_error_async_cb(void *handle, void __iomem *data, size_t size)
{
	struct async_event *e = handle;

	if (data) {
		e->resp.type = readl(data + offsetof(struct async_event_msg_resp, type));
		wmb(); /* Update status in the end, so that no lock for here */
		e->resp.status = readl(data + offsetof(struct async_event_msg_resp, status));
	}
	queue_work(e->wq, &e->work);
	return 0;
}
224
225
/*
 * (Re-)arm one async event: flush its buffer out of CPU caches and
 * register it with firmware. Returns 0 or a negative error code from
 * aie2_register_asyn_event_msg().
 */
static int aie2_error_event_send(struct async_event *e)
{
	drm_clflush_virt_range(e->buf, e->size); /* device can access */
	return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
					    aie2_error_async_cb);
}
231
232
/*
 * Workqueue handler for one async event: validates and decodes the error
 * buffer written by the device, logs the errors, then re-registers the
 * event with firmware so further errors can be reported.
 */
static void aie2_error_worker(struct work_struct *err_work)
{
	struct aie_err_info *info;
	struct amdxdna_dev *xdna;
	struct async_event *e;
	u32 max_err;
	u32 err_col;

	e = container_of(err_work, struct async_event, work);

	xdna = e->ndev->xdna;

	/* MAX_AIE2_STATUS_CODE means "no new response" -- nothing to do. */
	if (e->resp.status == MAX_AIE2_STATUS_CODE)
		return;

	/* Consume the response so a spurious re-queue becomes a no-op. */
	e->resp.status = MAX_AIE2_STATUS_CODE;

	print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
			     e->buf, 0x100, false);

	info = (struct aie_err_info *)e->buf;
	XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);

	/* Sanity-check the device-supplied count against the buffer size. */
	max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
	if (unlikely(info->err_cnt > max_err)) {
		WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
		return;
	}
	err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
	if (!err_col) {
		XDNA_WARN(xdna, "Did not get error column");
		return;
	}

	mutex_lock(&xdna->dev_lock);
	/* Re-send this event to firmware */
	if (aie2_error_event_send(e))
		XDNA_WARN(xdna, "Unable to register async event");
	mutex_unlock(&xdna->dev_lock);
}
272
273
int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
274
{
275
struct amdxdna_dev *xdna = ndev->xdna;
276
struct async_event *e;
277
int i, ret;
278
279
drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
280
for (i = 0; i < ndev->async_events->event_cnt; i++) {
281
e = &ndev->async_events->event[i];
282
ret = aie2_error_event_send(e);
283
if (ret)
284
return ret;
285
}
286
287
return 0;
288
}
289
290
/*
 * Tear down all async events and release their resources.
 *
 * Called with xdna->dev_lock held. The lock is dropped around
 * destroy_workqueue() because the error worker itself takes dev_lock
 * (see aie2_error_worker), so draining pending work while holding the
 * lock would deadlock.
 */
void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	struct async_events *events;

	events = ndev->async_events;

	mutex_unlock(&xdna->dev_lock);
	destroy_workqueue(events->wq);
	mutex_lock(&xdna->dev_lock);

	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
			     events->addr, DMA_FROM_DEVICE);
	kfree(events);
}
305
306
int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
307
{
308
struct amdxdna_dev *xdna = ndev->xdna;
309
u32 total_col = ndev->total_col;
310
u32 total_size = ASYNC_BUF_SIZE * total_col;
311
struct async_events *events;
312
int i, ret;
313
314
events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL);
315
if (!events)
316
return -ENOMEM;
317
318
events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr,
319
DMA_FROM_DEVICE, GFP_KERNEL);
320
if (!events->buf) {
321
ret = -ENOMEM;
322
goto free_events;
323
}
324
events->size = total_size;
325
events->event_cnt = total_col;
326
327
events->wq = alloc_ordered_workqueue("async_wq", 0);
328
if (!events->wq) {
329
ret = -ENOMEM;
330
goto free_buf;
331
}
332
333
for (i = 0; i < events->event_cnt; i++) {
334
struct async_event *e = &events->event[i];
335
u32 offset = i * ASYNC_BUF_SIZE;
336
337
e->ndev = ndev;
338
e->wq = events->wq;
339
e->buf = &events->buf[offset];
340
e->addr = events->addr + offset;
341
e->size = ASYNC_BUF_SIZE;
342
e->resp.status = MAX_AIE2_STATUS_CODE;
343
INIT_WORK(&e->work, aie2_error_worker);
344
}
345
346
ndev->async_events = events;
347
348
XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
349
events->event_cnt, events->size);
350
return 0;
351
352
free_buf:
353
dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
354
events->addr, DMA_FROM_DEVICE);
355
free_events:
356
kfree(events);
357
return ret;
358
}
359
360