GitHub Repository: torvalds/linux
Path: blob/master/drivers/accel/amdxdna/aie2_error.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
 */

#include <drm/drm_cache.h>
#include <drm/drm_device.h>
#include <drm/drm_print.h>
#include <drm/gpu_scheduler.h>
#include <linux/dma-mapping.h>
#include <linux/kthread.h>
#include <linux/kernel.h>

#include "aie2_msg_priv.h"
#include "aie2_pci.h"
#include "amdxdna_error.h"
#include "amdxdna_mailbox.h"
#include "amdxdna_pci_drv.h"

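/*
 * One async_event is registered with the firmware for each AIE column.
 * @buf, @addr and @size describe this event's slice of the shared DMA
 * buffer that the firmware fills with error details; @work is queued on
 * @wq by the mailbox callback and runs aie2_error_worker() to decode it.
 */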
struct async_event {
        struct amdxdna_dev_hdl *ndev;
        struct async_event_msg_resp resp;
        struct workqueue_struct *wq;
        struct work_struct work;
        u8 *buf;
        dma_addr_t addr;
        u32 size;
};

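/*
 * Container for all per-column events: the ordered workqueue, the single
 * DMA buffer shared by every event, and a flexible array of async_event
 * entries, one per column (see __counted_by(event_cnt)).
 */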
struct async_events {
        struct workqueue_struct *wq;
        u8 *buf;
        dma_addr_t addr;
        u32 size;
        u32 event_cnt;
        struct async_event event[] __counted_by(event_cnt);
};

/*
 * The enum, struct and lookup tables below are ported from the XAIE util
 * header file.
 *
 * This data is defined by the AIE device and is used to decode error
 * messages coming from the device.
 */

enum aie_module_type {
        AIE_MEM_MOD = 0,
        AIE_CORE_MOD,
        AIE_PL_MOD,
        AIE_UNKNOWN_MOD,
};

enum aie_error_category {
        AIE_ERROR_SATURATION = 0,
        AIE_ERROR_FP,
        AIE_ERROR_STREAM,
        AIE_ERROR_ACCESS,
        AIE_ERROR_BUS,
        AIE_ERROR_INSTRUCTION,
        AIE_ERROR_ECC,
        AIE_ERROR_LOCK,
        AIE_ERROR_DMA,
        AIE_ERROR_MEM_PARITY,
        /* Unknown is not from XAIE, added for better categorization */
        AIE_ERROR_UNKNOWN,
};

/* Don't pack, unless the XAIE side changes */
struct aie_error {
        __u8 row;
        __u8 col;
        __u32 mod_type;
        __u8 event_id;
};

struct aie_err_info {
        u32 err_cnt;
        u32 ret_code;
        u32 rsvd;
        struct aie_error payload[] __counted_by(err_cnt);
};

struct aie_event_category {
        u8 event_id;
        enum aie_error_category category;
};

#define EVENT_CATEGORY(id, cat) { id, cat }
static const struct aie_event_category aie_ml_mem_event_cat[] = {
        EVENT_CATEGORY(88U, AIE_ERROR_ECC),
        EVENT_CATEGORY(90U, AIE_ERROR_ECC),
        EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY),
        EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY),
        EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY),
        EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY),
        EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY),
        EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY),
        EVENT_CATEGORY(97U, AIE_ERROR_DMA),
        EVENT_CATEGORY(98U, AIE_ERROR_DMA),
        EVENT_CATEGORY(99U, AIE_ERROR_DMA),
        EVENT_CATEGORY(100U, AIE_ERROR_DMA),
        EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_core_event_cat[] = {
        EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
        EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
        EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
        EVENT_CATEGORY(58U, AIE_ERROR_BUS),
        EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
        EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
        EVENT_CATEGORY(62U, AIE_ERROR_ECC),
        EVENT_CATEGORY(64U, AIE_ERROR_ECC),
        EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
        EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
        EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
        EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
        EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
        EVENT_CATEGORY(72U, AIE_ERROR_BUS),
};

static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
        EVENT_CATEGORY(130U, AIE_ERROR_ECC),
        EVENT_CATEGORY(132U, AIE_ERROR_ECC),
        EVENT_CATEGORY(133U, AIE_ERROR_DMA),
        EVENT_CATEGORY(134U, AIE_ERROR_DMA),
        EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
        EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
        EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
        EVENT_CATEGORY(138U, AIE_ERROR_BUS),
        EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
        EVENT_CATEGORY(64U, AIE_ERROR_BUS),
        EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
        EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
        EVENT_CATEGORY(67U, AIE_ERROR_BUS),
        EVENT_CATEGORY(68U, AIE_ERROR_BUS),
        EVENT_CATEGORY(69U, AIE_ERROR_BUS),
        EVENT_CATEGORY(70U, AIE_ERROR_BUS),
        EVENT_CATEGORY(71U, AIE_ERROR_BUS),
        EVENT_CATEGORY(72U, AIE_ERROR_DMA),
        EVENT_CATEGORY(73U, AIE_ERROR_DMA),
        EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
};

static const enum amdxdna_error_num aie_cat_err_num_map[] = {
        [AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION,
        [AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP,
        [AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM,
        [AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS,
        [AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS,
        [AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
        [AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC,
        [AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK,
        [AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA,
        [AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
        [AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN,
};

static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 1);

static const enum amdxdna_error_module aie_err_mod_map[] = {
        [AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY,
        [AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE,
        [AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL,
        [AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN,
};

static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1);

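/*
 * Map a raw AIE event ID to a coarse error category using the lookup
 * table that matches the reporting module. For memory modules, row 1 is
 * treated as a mem tile and uses the mem-tile table; any unknown module
 * or event ID falls back to AIE_ERROR_UNKNOWN.
 */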
static enum aie_error_category
aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
{
        const struct aie_event_category *lut;
        int num_entry;
        int i;

        switch (mod_type) {
        case AIE_PL_MOD:
                lut = aie_ml_shim_tile_event_cat;
                num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
                break;
        case AIE_CORE_MOD:
                lut = aie_ml_core_event_cat;
                num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
                break;
        case AIE_MEM_MOD:
                if (row == 1) {
                        lut = aie_ml_mem_tile_event_cat;
                        num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
                } else {
                        lut = aie_ml_mem_event_cat;
                        num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
                }
                break;
        default:
                return AIE_ERROR_UNKNOWN;
        }

        for (i = 0; i < num_entry; i++) {
                if (event_id != lut[i].event_id)
                        continue;

                if (lut[i].category > AIE_ERROR_UNKNOWN)
                        return AIE_ERROR_UNKNOWN;

                return lut[i].category;
        }

        return AIE_ERROR_UNKNOWN;
}

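/*
 * Record the most recent error from the firmware payload in
 * ndev->last_async_err: encode the error number and module, timestamp it,
 * and stash the row/column as the extra error code. Out-of-range module
 * types are reported as unknown.
 */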
static void aie2_update_last_async_error(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
        struct aie_error *errs = err_info;
        enum amdxdna_error_module err_mod;
        enum aie_error_category aie_err;
        enum amdxdna_error_num err_num;
        struct aie_error *last_err;

        last_err = &errs[num_err - 1];
        if (last_err->mod_type >= AIE_UNKNOWN_MOD) {
                err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN];
                err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD];
        } else {
                aie_err = aie_get_error_category(last_err->row,
                                                 last_err->event_id,
                                                 last_err->mod_type);
                err_num = aie_cat_err_num_map[aie_err];
                err_mod = aie_err_mod_map[last_err->mod_type];
        }

        ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, err_mod);
        ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real());
        ndev->last_async_err.ex_err_code = AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col);
}

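/*
 * Log every error in the firmware payload and return a bitmap of the
 * columns that reported one. A zero return means no valid column was
 * found.
 */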
static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
        struct aie_error *errs = err_info;
        u32 err_col = 0; /* assumes the AIE has fewer than 32 columns */
        int i;

        /* Build the error column bitmap */
        for (i = 0; i < num_err; i++) {
                struct aie_error *err = &errs[i];
                enum aie_error_category cat;

                cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
                XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
                         err->row, err->col, err->mod_type,
                         err->event_id, cat);

                if (err->col >= 32) {
                        XDNA_WARN(ndev->xdna, "Invalid column number");
                        break;
                }

                err_col |= (1 << err->col);
        }

        return err_col;
}

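/*
 * Callback passed to aie2_register_asyn_event_msg(). It copies the
 * response type and status out of the iomem window and defers the real
 * decoding to the workqueue via aie2_error_worker().
 */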
static int aie2_error_async_cb(void *handle, void __iomem *data, size_t size)
{
        struct async_event *e = handle;

        if (data) {
                e->resp.type = readl(data + offsetof(struct async_event_msg_resp, type));
                wmb(); /* Update status last, so no lock is needed here */
                e->resp.status = readl(data + offsetof(struct async_event_msg_resp, status));
        }
        queue_work(e->wq, &e->work);
        return 0;
}

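/*
 * (Re)arm one event: flush the CPU cache for its buffer so the device
 * can write fresh data, then register the async event message with the
 * firmware, with aie2_error_async_cb() as the completion callback.
 */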
static int aie2_error_event_send(struct async_event *e)
{
        drm_clflush_virt_range(e->buf, e->size); /* flush so the device can access the buffer */
        return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
                                            aie2_error_async_cb);
}

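/*
 * Workqueue handler: decode the error buffer filled by the firmware.
 * It validates the reported error count against the buffer size, builds
 * the error column bitmap, records the last error under dev_lock, and
 * re-arms the event so further errors can be reported.
 */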
static void aie2_error_worker(struct work_struct *err_work)
{
        struct aie_err_info *info;
        struct amdxdna_dev *xdna;
        struct async_event *e;
        u32 max_err;
        u32 err_col;

        e = container_of(err_work, struct async_event, work);

        xdna = e->ndev->xdna;

        if (e->resp.status == MAX_AIE2_STATUS_CODE)
                return;

        e->resp.status = MAX_AIE2_STATUS_CODE;

        print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
                             e->buf, 0x100, false);

        info = (struct aie_err_info *)e->buf;
        XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);

        max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
        if (unlikely(info->err_cnt > max_err)) {
                WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
                return;
        }
        err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
        if (!err_col) {
                XDNA_WARN(xdna, "Did not get error column");
                return;
        }

        mutex_lock(&xdna->dev_lock);
        aie2_update_last_async_error(e->ndev, info->payload, info->err_cnt);

        /* Re-send this event to firmware */
        if (aie2_error_event_send(e))
                XDNA_WARN(xdna, "Unable to register async event");
        mutex_unlock(&xdna->dev_lock);
}

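/*
 * Tear down async error handling. dev_lock is dropped around
 * destroy_workqueue() since aie2_error_worker() takes the same lock;
 * flushing the workqueue while holding it could deadlock.
 */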
void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
{
        struct amdxdna_dev *xdna = ndev->xdna;
        struct async_events *events;

        events = ndev->async_events;

        mutex_unlock(&xdna->dev_lock);
        destroy_workqueue(events->wq);
        mutex_lock(&xdna->dev_lock);

        dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
                             events->addr, DMA_FROM_DEVICE);
        kfree(events);
}

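/*
 * Allocate the shared DMA buffer (ASYNC_BUF_SIZE per column), the
 * ordered workqueue and one async_event per column, then register each
 * event with the firmware. On failure, everything allocated so far is
 * unwound via the goto labels.
 */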
int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
{
        struct amdxdna_dev *xdna = ndev->xdna;
        u32 total_col = ndev->total_col;
        u32 total_size = ASYNC_BUF_SIZE * total_col;
        struct async_events *events;
        int i, ret;

        events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL);
        if (!events)
                return -ENOMEM;

        events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr,
                                            DMA_FROM_DEVICE, GFP_KERNEL);
        if (!events->buf) {
                ret = -ENOMEM;
                goto free_events;
        }
        events->size = total_size;
        events->event_cnt = total_col;

        events->wq = alloc_ordered_workqueue("async_wq", 0);
        if (!events->wq) {
                ret = -ENOMEM;
                goto free_buf;
        }

        for (i = 0; i < events->event_cnt; i++) {
                struct async_event *e = &events->event[i];
                u32 offset = i * ASYNC_BUF_SIZE;

                e->ndev = ndev;
                e->wq = events->wq;
                e->buf = &events->buf[offset];
                e->addr = events->addr + offset;
                e->size = ASYNC_BUF_SIZE;
                e->resp.status = MAX_AIE2_STATUS_CODE;
                INIT_WORK(&e->work, aie2_error_worker);

                ret = aie2_error_event_send(e);
                if (ret)
                        goto free_wq;
        }

        ndev->async_events = events;

        XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
                 events->event_cnt, events->size);
        return 0;

free_wq:
        destroy_workqueue(events->wq);
free_buf:
        dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
                             events->addr, DMA_FROM_DEVICE);
free_events:
        kfree(events);
        return ret;
}

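/*
 * Copy the last recorded async error into the user buffer described by
 * @args. The caller must hold dev_lock, which is asserted with
 * drm_WARN_ON().
 */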
int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct amdxdna_drm_get_array *args)
{
        struct amdxdna_dev *xdna = ndev->xdna;

        drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));

        args->num_element = 1;
        args->element_size = sizeof(ndev->last_async_err);
        if (copy_to_user(u64_to_user_ptr(args->buffer),
                         &ndev->last_async_err, args->element_size))
                return -EFAULT;

        return 0;
}