GitHub Repository: torvalds/linux
Path: blob/master/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2015-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/pci.h>
#include <linux/acpi.h>
#include "kfd_crat.h"
#include "kfd_priv.h"
#include "kfd_topology.h"
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_xgmi.h"

/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
 * GPU processor IDs are expressed with Bit[31]=1.
 * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs
 * used in the CRAT.
 */
static uint32_t gpu_processor_id_low = 0x80001000;

/* Return the next available gpu_processor_id and increment it for the next
 * GPU
 * @total_cu_count - Total CUs present in the GPU, including ones
 *                   masked off
 */
static inline unsigned int get_and_inc_gpu_processor_id(
				unsigned int total_cu_count)
{
	int current_id = gpu_processor_id_low;

	gpu_processor_id_low += total_cu_count;
	return current_id;
}
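
/* Editor's illustrative sketch (not part of the upstream file): with the
 * base above, a hypothetical first dGPU reporting 64 total CUs would receive
 * processor ID 0x80001000, and the allocator advances so the next GPU starts
 * at 0x80001040. The CU count here is an assumed example value:
 *
 *	unsigned int id = get_and_inc_gpu_processor_id(64);
 *	// id == 0x80001000; gpu_processor_id_low is now 0x80001040
 */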


static struct kfd_gpu_cache_info kaveri_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache (in SQC module) per bank */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache (in SQC module) per bank */
		.cache_size = 8,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},

	/* TODO: Add L2 Cache information */
};


static struct kfd_gpu_cache_info carrizo_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache (in SQC module) per bank */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 4,
	},
	{
		/* Scalar L1 Data Cache (in SQC module) per bank. */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 4,
	},

	/* TODO: Add L2 Cache information */
};

#define hawaii_cache_info kaveri_cache_info
#define tonga_cache_info carrizo_cache_info
#define fiji_cache_info carrizo_cache_info
#define polaris10_cache_info carrizo_cache_info
#define polaris11_cache_info carrizo_cache_info
#define polaris12_cache_info carrizo_cache_info
#define vegam_cache_info carrizo_cache_info

/* NOTE: L1 cache information has been updated and L2/L3
 * cache information has been added for Vega10 and
 * newer ASICs. The unit for cache_size is KiB.
 * Going forward, cache details must be checked and
 * updated for every new ASIC.
 */

static struct kfd_gpu_cache_info vega10_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 4096,
		.cache_level = 2,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 16,
	},
};

static struct kfd_gpu_cache_info raven_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 1024,
		.cache_level = 2,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 11,
	},
};

static struct kfd_gpu_cache_info renoir_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 1024,
		.cache_level = 2,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
};

static struct kfd_gpu_cache_info vega12_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 5,
	},
};

static struct kfd_gpu_cache_info vega20_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 8192,
		.cache_level = 2,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 16,
	},
};

static struct kfd_gpu_cache_info aldebaran_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 8192,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 14,
	},
};

static struct kfd_gpu_cache_info navi10_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 4096,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
};

static struct kfd_gpu_cache_info vangogh_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 1024,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
};

static struct kfd_gpu_cache_info navi14_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 12,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 12,
	},
};

static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 4096,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L3 Data Cache per GPU */
		.cache_size = 128*1024,
		.cache_level = 3,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
};

static struct kfd_gpu_cache_info navy_flounder_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 3072,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L3 Data Cache per GPU */
		.cache_size = 96*1024,
		.cache_level = 3,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
};

static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L3 Data Cache per GPU */
		.cache_size = 32*1024,
		.cache_level = 3,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
};

static struct kfd_gpu_cache_info beige_goby_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 1024,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L3 Data Cache per GPU */
		.cache_size = 16*1024,
		.cache_level = 3,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
};

static struct kfd_gpu_cache_info yellow_carp_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 6,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 6,
	},
};

static struct kfd_gpu_cache_info gfx1037_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 256,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
};

static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 256,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
};

static struct kfd_gpu_cache_info dummy_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 6,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 6,
	},
};

static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
		struct crat_subtype_computeunit *cu)
{
	dev->node_props.cpu_cores_count = cu->num_cpu_cores;
	dev->node_props.cpu_core_id_base = cu->processor_id_low;
	if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)
		dev->node_props.capability |= HSA_CAP_ATS_PRESENT;

	pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,
			cu->processor_id_low);
}

static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,
		struct crat_subtype_computeunit *cu)
{
	dev->node_props.simd_id_base = cu->processor_id_low;
	dev->node_props.simd_count = cu->num_simd_cores;
	dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;
	dev->node_props.max_waves_per_simd = cu->max_waves_simd;
	dev->node_props.wave_front_size = cu->wave_front_size;
	dev->node_props.array_count = cu->array_count;
	dev->node_props.cu_per_simd_array = cu->num_cu_per_array;
	dev->node_props.simd_per_cu = cu->num_simd_per_cu;
	dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;
	if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)
		dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;
	pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low);
}

/* kfd_parse_subtype_cu - parse compute unit subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu,
				struct list_head *device_list)
{
	struct kfd_topology_device *dev;

	pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",
			cu->proximity_domain, cu->hsa_capability);
	list_for_each_entry(dev, device_list, list) {
		if (cu->proximity_domain == dev->proximity_domain) {
			if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)
				kfd_populated_cu_info_cpu(dev, cu);

			if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)
				kfd_populated_cu_info_gpu(dev, cu);
			break;
		}
	}

	return 0;
}

static struct kfd_mem_properties *
find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width,
		struct kfd_topology_device *dev)
{
	struct kfd_mem_properties *props;

	list_for_each_entry(props, &dev->mem_props, list) {
		if (props->heap_type == heap_type
				&& props->flags == flags
				&& props->width == width)
			return props;
	}

	return NULL;
}

/* kfd_parse_subtype_mem - parse memory subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem,
				struct list_head *device_list)
{
	struct kfd_mem_properties *props;
	struct kfd_topology_device *dev;
	uint32_t heap_type;
	uint64_t size_in_bytes;
	uint32_t flags = 0;
	uint32_t width;

	pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n",
			mem->proximity_domain);
	list_for_each_entry(dev, device_list, list) {
		if (mem->proximity_domain == dev->proximity_domain) {
			/* We're on a GPU node */
			if (dev->node_props.cpu_cores_count == 0) {
				/* APU */
				if (mem->visibility_type == 0)
					heap_type =
						HSA_MEM_HEAP_TYPE_FB_PRIVATE;
				/* dGPU */
				else
					heap_type = mem->visibility_type;
			} else
				heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;

			if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)
				flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;
			if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)
				flags |= HSA_MEM_FLAGS_NON_VOLATILE;

			size_in_bytes =
				((uint64_t)mem->length_high << 32) +
						mem->length_low;
			width = mem->width;

			/* Multiple banks of the same type are aggregated into
			 * one. User mode doesn't care about multiple physical
			 * memory segments. It's managed as a single virtual
			 * heap for user mode.
			 */
			props = find_subtype_mem(heap_type, flags, width, dev);
			if (props) {
				props->size_in_bytes += size_in_bytes;
				break;
			}

			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->heap_type = heap_type;
			props->flags = flags;
			props->size_in_bytes = size_in_bytes;
			props->width = width;

			dev->node_props.mem_banks_count++;
			list_add_tail(&props->list, &dev->mem_props);

			break;
		}
	}

	return 0;
}
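
/* Editor's note (assumed example, not in the upstream file): because banks
 * with identical heap_type/flags/width are merged above, two 8 GiB banks of
 * the same heap type reported by CRAT surface to user mode as one heap with
 * size_in_bytes == 16 GiB, and mem_banks_count is incremented only once.
 */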

/* kfd_parse_subtype_cache - parse cache subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
			struct list_head *device_list)
{
	struct kfd_cache_properties *props;
	struct kfd_topology_device *dev;
	uint32_t id;
	uint32_t total_num_of_cu;

	id = cache->processor_id_low;

	pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id);
	list_for_each_entry(dev, device_list, list) {
		total_num_of_cu = (dev->node_props.array_count *
					dev->node_props.cu_per_simd_array);

		/* Cache information in CRAT doesn't have proximity_domain
		 * information as it is associated with a CPU core or GPU
		 * Compute Unit. So map the cache using the CPU core ID or
		 * SIMD (GPU) ID.
		 * TODO: This works because currently we can safely assume
		 * that Compute Units are parsed before caches are parsed.
		 * In the future, remove this dependency.
		 */
		if ((id >= dev->node_props.cpu_core_id_base &&
			id <= dev->node_props.cpu_core_id_base +
				dev->node_props.cpu_cores_count) ||
			(id >= dev->node_props.simd_id_base &&
			id < dev->node_props.simd_id_base +
				total_num_of_cu)) {
			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->processor_id_low = id;
			props->cache_level = cache->cache_level;
			props->cache_size = cache->cache_size;
			props->cacheline_size = cache->cache_line_size;
			props->cachelines_per_tag = cache->lines_per_tag;
			props->cache_assoc = cache->associativity;
			props->cache_latency = cache->cache_latency;

			memcpy(props->sibling_map, cache->sibling_map,
					CRAT_SIBLINGMAP_SIZE);

			/* set the sibling_map_size as 32 for CRAT from ACPI */
			props->sibling_map_size = CRAT_SIBLINGMAP_SIZE;

			if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_DATA;
			if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
			if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_CPU;
			if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_HSACU;

			dev->node_props.caches_count++;
			list_add_tail(&props->list, &dev->cache_props);

			break;
		}
	}

	return 0;
}
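
/* Editor's note (not in the upstream file): sibling_map is a bitmask of the
 * processors that share a given cache. With CRAT_SIBLINGMAP_SIZE fixed at 32
 * bytes for ACPI CRAT (per the comment above), up to 256 processor IDs can
 * be encoded per cache entry.
 */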

/* kfd_parse_subtype_iolink - parse iolink subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
					struct list_head *device_list)
{
	struct kfd_iolink_properties *props = NULL, *props2;
	struct kfd_topology_device *dev, *to_dev;
	uint32_t id_from;
	uint32_t id_to;

	id_from = iolink->proximity_domain_from;
	id_to = iolink->proximity_domain_to;

	pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to %d\n",
			id_from, id_to);
	list_for_each_entry(dev, device_list, list) {
		if (id_from == dev->proximity_domain) {
			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->node_from = id_from;
			props->node_to = id_to;
			props->ver_maj = iolink->version_major;
			props->ver_min = iolink->version_minor;
			props->iolink_type = iolink->io_interface_type;

			if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
				props->weight = 20;
			else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)
				props->weight = iolink->weight_xgmi;
			else
				props->weight = node_distance(id_from, id_to);

			props->min_latency = iolink->minimum_latency;
			props->max_latency = iolink->maximum_latency;
			props->min_bandwidth = iolink->minimum_bandwidth_mbs;
			props->max_bandwidth = iolink->maximum_bandwidth_mbs;
			props->rec_transfer_size =
					iolink->recommended_transfer_size;

			dev->node_props.io_links_count++;
			list_add_tail(&props->list, &dev->io_link_props);
			break;
		}
	}

	/* CPU topology is created before GPUs are detected, so CPU->GPU
	 * links are not built at that time. If a PCIe type is discovered, it
	 * means a GPU is detected and we are adding GPU->CPU to the topology.
	 * At this time, also add the corresponding CPU->GPU link if the GPU
	 * is large-BAR.
	 * For xGMI, we only added the link with one direction in the CRAT
	 * table; add the corresponding reversed-direction link now.
	 */
	if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
		to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
		if (!to_dev)
			return -ENODEV;
		/* same everything but the other direction */
		props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
		if (!props2)
			return -ENOMEM;

		props2->node_from = id_to;
		props2->node_to = id_from;
		props2->kobj = NULL;
		to_dev->node_props.io_links_count++;
		list_add_tail(&props2->list, &to_dev->io_link_props);
	}

	return 0;
}
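
/* Editor's note (assumed example, not in the upstream file): for an xGMI
 * link recorded in CRAT only as domain 1 -> domain 2 with the BI_DIRECTIONAL
 * flag set, the code above also materializes the mirrored 2 -> 1
 * kfd_iolink_properties entry by kmemdup()ing props and swapping
 * node_from/node_to, so both nodes expose the link in topology.
 */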

/* kfd_parse_subtype - parse subtypes and attach them to the correct topology
 * device present in the device_list
 * @sub_type_hdr - subtype section of crat_image
 * @device_list - list of topology devices present in this crat_image
 */
static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,
				struct list_head *device_list)
{
	struct crat_subtype_computeunit *cu;
	struct crat_subtype_memory *mem;
	struct crat_subtype_cache *cache;
	struct crat_subtype_iolink *iolink;
	int ret = 0;

	switch (sub_type_hdr->type) {
	case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:
		cu = (struct crat_subtype_computeunit *)sub_type_hdr;
		ret = kfd_parse_subtype_cu(cu, device_list);
		break;
	case CRAT_SUBTYPE_MEMORY_AFFINITY:
		mem = (struct crat_subtype_memory *)sub_type_hdr;
		ret = kfd_parse_subtype_mem(mem, device_list);
		break;
	case CRAT_SUBTYPE_CACHE_AFFINITY:
		cache = (struct crat_subtype_cache *)sub_type_hdr;
		ret = kfd_parse_subtype_cache(cache, device_list);
		break;
	case CRAT_SUBTYPE_TLB_AFFINITY:
		/*
		 * For now, nothing to do here
		 */
		pr_debug("Found TLB entry in CRAT table (not processing)\n");
		break;
	case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
		/*
		 * For now, nothing to do here
		 */
		pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");
		break;
	case CRAT_SUBTYPE_IOLINK_AFFINITY:
		iolink = (struct crat_subtype_iolink *)sub_type_hdr;
		ret = kfd_parse_subtype_iolink(iolink, device_list);
		break;
	default:
		pr_warn("Unknown subtype %d in CRAT\n",
				sub_type_hdr->type);
	}

	return ret;
}

/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT,
 * create a kfd_topology_device and add it to device_list. Also parse
 * CRAT subtypes and attach them to the appropriate kfd_topology_device
 * @crat_image - input image containing CRAT
 * @device_list - [OUT] list of kfd_topology_device generated after
 *		  parsing crat_image
 * @proximity_domain - Proximity domain of the first device in the table
 *
 * Return - 0 if successful else -ve value
 */
int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
			 uint32_t proximity_domain)
{
	struct kfd_topology_device *top_dev = NULL;
	struct crat_subtype_generic *sub_type_hdr;
	uint16_t node_id;
	int ret = 0;
	struct crat_header *crat_table = (struct crat_header *)crat_image;
	uint16_t num_nodes;
	uint32_t image_len;

	if (!crat_image)
		return -EINVAL;

	if (!list_empty(device_list)) {
		pr_warn("Error device list should be empty\n");
		return -EINVAL;
	}

	num_nodes = crat_table->num_domains;
	image_len = crat_table->length;

	pr_debug("Parsing CRAT table with %d nodes\n", num_nodes);

	for (node_id = 0; node_id < num_nodes; node_id++) {
		top_dev = kfd_create_topology_device(device_list);
		if (!top_dev)
			break;
		top_dev->proximity_domain = proximity_domain++;
	}

	if (!top_dev) {
		ret = -ENOMEM;
		goto err;
	}

	memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);
	memcpy(top_dev->oem_table_id, crat_table->oem_table_id,
			CRAT_OEMTABLEID_LENGTH);
	top_dev->oem_revision = crat_table->oem_revision;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
	while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
			((char *)crat_image) + image_len) {
		if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
			ret = kfd_parse_subtype(sub_type_hdr, device_list);
			if (ret)
				break;
		}

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);
	}

err:
	if (ret)
		kfd_release_topology_device_list(device_list);

	return ret;
}
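
/* Editor's sketch of the image layout walked above (not in the upstream
 * file): a CRAT image is a crat_header followed by a packed sequence of
 * variable-length subtypes, each beginning with a crat_subtype_generic:
 *
 *	+-------------+------------------+-------------------+-----
 *	| crat_header | subtype (CU)     | subtype (memory)  | ...
 *	| .length     | .type, .length   | .type, .length    |
 *	+-------------+------------------+-------------------+-----
 *
 * The parser advances by sub_type_hdr->length until the next generic header
 * would extend past crat_table->length.
 */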


static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
						   bool cache_line_size_missing,
						   struct kfd_gpu_cache_info *pcache_info)
{
	struct amdgpu_device *adev = kdev->adev;
	int i = 0;

	/* TCP L1 Cache per CU */
	if (adev->gfx.config.gc_tcp_l1_size) {
		pcache_info[i].cache_size = adev->gfx.config.gc_tcp_l1_size;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2;
		pcache_info[i].cache_line_size = adev->gfx.config.gc_tcp_cache_line_size;
		if (cache_line_size_missing && !pcache_info[i].cache_line_size)
			pcache_info[i].cache_line_size = 128;
		i++;
	}
	/* Scalar L1 Instruction Cache per SQC */
	if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {
		pcache_info[i].cache_size =
			adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_INST_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;
		pcache_info[i].cache_line_size = adev->gfx.config.gc_instruction_cache_line_size;
		if (cache_line_size_missing && !pcache_info[i].cache_line_size)
			pcache_info[i].cache_line_size = 128;
		i++;
	}
	/* Scalar L1 Data Cache per SQC */
	if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {
		pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;
		pcache_info[i].cache_line_size = adev->gfx.config.gc_scalar_data_cache_line_size;
		if (cache_line_size_missing && !pcache_info[i].cache_line_size)
			pcache_info[i].cache_line_size = 64;
		i++;
	}
	/* GL1 Data Cache per SA */
	if (adev->gfx.config.gc_gl1c_per_sa &&
	    adev->gfx.config.gc_gl1c_size_per_instance) {
		pcache_info[i].cache_size = adev->gfx.config.gc_gl1c_per_sa *
			adev->gfx.config.gc_gl1c_size_per_instance;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		if (cache_line_size_missing)
			pcache_info[i].cache_line_size = 128;
		i++;
	}
	/* L2 Data Cache per GPU (Total Tex Cache) */
	if (adev->gfx.config.gc_gl2c_per_gpu) {
		pcache_info[i].cache_size = adev->gfx.config.gc_gl2c_per_gpu;
		pcache_info[i].cache_level = 2;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		pcache_info[i].cache_line_size = adev->gfx.config.gc_tcc_cache_line_size;
		if (cache_line_size_missing && !pcache_info[i].cache_line_size)
			pcache_info[i].cache_line_size = 128;
		i++;
	}
	/* L3 Data Cache per GPU */
	if (adev->gmc.mall_size) {
		pcache_info[i].cache_size = adev->gmc.mall_size / 1024;
		pcache_info[i].cache_level = 3;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		pcache_info[i].cache_line_size = 64;
		i++;
	}
	return i;
}

static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev,
						      struct kfd_gpu_cache_info *pcache_info)
{
	struct amdgpu_device *adev = kdev->adev;
	int i = 0;

	/* TCP L1 Cache per CU */
	if (adev->gfx.config.gc_tcp_size_per_cu) {
		pcache_info[i].cache_size = adev->gfx.config.gc_tcp_size_per_cu;
		pcache_info[i].cache_level = 1;
		/* Cacheline size not available in IP discovery for gc943,gc944 */
		pcache_info[i].cache_line_size = 128;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = 1;
		i++;
	}
	/* Scalar L1 Instruction Cache per SQC */
	if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {
		pcache_info[i].cache_size =
			adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;
		pcache_info[i].cache_level = 1;
		pcache_info[i].cache_line_size = 64;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_INST_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_cu_per_sqc;
		i++;
	}
	/* Scalar L1 Data Cache per SQC */
	if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {
		pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;
		pcache_info[i].cache_level = 1;
		pcache_info[i].cache_line_size = 64;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_cu_per_sqc;
		i++;
	}
	/* L2 Data Cache per GPU (Total Tex Cache) */
	if (adev->gfx.config.gc_tcc_size) {
		pcache_info[i].cache_size = adev->gfx.config.gc_tcc_size;
		pcache_info[i].cache_level = 2;
		pcache_info[i].cache_line_size = 128;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		i++;
	}
	/* L3 Data Cache per GPU */
	if (adev->gmc.mall_size) {
		pcache_info[i].cache_size = adev->gmc.mall_size / 1024;
		pcache_info[i].cache_level = 3;
		pcache_info[i].cache_line_size = 64;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		i++;
	}
	return i;
}

int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pcache_info)
{
	int num_of_cache_types = 0;
	bool cache_line_size_missing = false;

	switch (kdev->adev->asic_type) {
	case CHIP_KAVERI:
		*pcache_info = kaveri_cache_info;
		num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
		break;
	case CHIP_HAWAII:
		*pcache_info = hawaii_cache_info;
		num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
		break;
	case CHIP_CARRIZO:
		*pcache_info = carrizo_cache_info;
		num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
		break;
	case CHIP_TONGA:
		*pcache_info = tonga_cache_info;
		num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
		break;
	case CHIP_FIJI:
		*pcache_info = fiji_cache_info;
		num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
		break;
	case CHIP_POLARIS10:
		*pcache_info = polaris10_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
		break;
	case CHIP_POLARIS11:
		*pcache_info = polaris11_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
		break;
	case CHIP_POLARIS12:
		*pcache_info = polaris12_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
		break;
	case CHIP_VEGAM:
		*pcache_info = vegam_cache_info;
		num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
		break;
	default:
		switch (KFD_GC_VERSION(kdev)) {
		case IP_VERSION(9, 0, 1):
			*pcache_info = vega10_cache_info;
			num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
			break;
		case IP_VERSION(9, 2, 1):
			*pcache_info = vega12_cache_info;
			num_of_cache_types = ARRAY_SIZE(vega12_cache_info);
			break;
		case IP_VERSION(9, 4, 0):
		case IP_VERSION(9, 4, 1):
			*pcache_info = vega20_cache_info;
			num_of_cache_types = ARRAY_SIZE(vega20_cache_info);
			break;
		case IP_VERSION(9, 4, 2):
			*pcache_info = aldebaran_cache_info;
			num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info);
			break;
		case IP_VERSION(9, 4, 3):
		case IP_VERSION(9, 4, 4):
		case IP_VERSION(9, 5, 0):
			num_of_cache_types =
				kfd_fill_gpu_cache_info_from_gfx_config_v2(kdev->kfd,
									*pcache_info);
			break;
		case IP_VERSION(9, 1, 0):
		case IP_VERSION(9, 2, 2):
			*pcache_info = raven_cache_info;
			num_of_cache_types = ARRAY_SIZE(raven_cache_info);
			break;
		case IP_VERSION(9, 3, 0):
			*pcache_info = renoir_cache_info;
			num_of_cache_types = ARRAY_SIZE(renoir_cache_info);
			break;
		case IP_VERSION(10, 1, 10):
		case IP_VERSION(10, 1, 2):
		case IP_VERSION(10, 1, 3):
		case IP_VERSION(10, 1, 4):
			*pcache_info = navi10_cache_info;
			num_of_cache_types = ARRAY_SIZE(navi10_cache_info);
			break;
		case IP_VERSION(10, 1, 1):
			*pcache_info = navi14_cache_info;
			num_of_cache_types = ARRAY_SIZE(navi14_cache_info);
			break;
		case IP_VERSION(10, 3, 0):
			*pcache_info = sienna_cichlid_cache_info;
			num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info);
			break;
		case IP_VERSION(10, 3, 2):
			*pcache_info = navy_flounder_cache_info;
			num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info);
			break;
		case IP_VERSION(10, 3, 4):
			*pcache_info = dimgrey_cavefish_cache_info;
			num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info);
			break;
		case IP_VERSION(10, 3, 1):
			*pcache_info = vangogh_cache_info;
			num_of_cache_types = ARRAY_SIZE(vangogh_cache_info);
			break;
		case IP_VERSION(10, 3, 5):
			*pcache_info = beige_goby_cache_info;
			num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info);
			break;
		case IP_VERSION(10, 3, 3):
			*pcache_info = yellow_carp_cache_info;
			num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info);
			break;
		case IP_VERSION(10, 3, 6):
			*pcache_info = gc_10_3_6_cache_info;
			num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info);
			break;
		case IP_VERSION(10, 3, 7):
			*pcache_info = gfx1037_cache_info;
			num_of_cache_types = ARRAY_SIZE(gfx1037_cache_info);
			break;
		case IP_VERSION(11, 0, 0):
		case IP_VERSION(11, 0, 1):
		case IP_VERSION(11, 0, 2):
		case IP_VERSION(11, 0, 3):
		case IP_VERSION(11, 0, 4):
		case IP_VERSION(11, 5, 0):
		case IP_VERSION(11, 5, 1):
		case IP_VERSION(11, 5, 2):
		case IP_VERSION(11, 5, 3):
			/* Cacheline size is not available in IP discovery
			 * for gc11; kfd_fill_gpu_cache_info_from_gfx_config
			 * hard-codes it.
			 */
			cache_line_size_missing = true;
			fallthrough;
		case IP_VERSION(12, 0, 0):
		case IP_VERSION(12, 0, 1):
			num_of_cache_types =
				kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd,
									cache_line_size_missing,
									*pcache_info);
			break;
		default:
			*pcache_info = dummy_cache_info;
			num_of_cache_types = ARRAY_SIZE(dummy_cache_info);
			pr_warn("dummy cache info is used temporarily and real cache info needs updating later.\n");
			break;
		}
	}
	return num_of_cache_types;
}
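
/* Editor's usage note (a sketch under stated assumptions, not in the
 * upstream file): callers must point *pcache_info at a writable scratch
 * array before calling, because the gfx-config-driven branches above fill
 * that buffer in place, whereas the table-driven ASIC branches overwrite the
 * pointer to reference a static table. The buffer size below is an assumed
 * bound for illustration:
 *
 *	struct kfd_gpu_cache_info cache_info[8];
 *	struct kfd_gpu_cache_info *pcache_info = cache_info;
 *	int n = kfd_get_gpu_cache_info(kdev, &pcache_info);
 *	// pcache_info now references n valid entries
 */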

/* Memory required to create Virtual CRAT.
 * Since there is no easy way to predict the amount of memory required, the
 * following amount is allocated for the GPU Virtual CRAT. This is
 * expected to cover all known conditions. To be safe, an additional check
 * is put in the code to ensure we don't overwrite.
 */
#define VCRAT_SIZE_FOR_GPU	(4 * PAGE_SIZE)
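
/* Editor's note (not in the upstream file): on a typical x86_64 kernel with
 * 4 KiB pages, VCRAT_SIZE_FOR_GPU works out to 4 * 4096 = 16384 bytes
 * (16 KiB) per GPU virtual CRAT image.
 */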

/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node
 *
 * @numa_node_id: CPU NUMA node id
 * @avail_size: Available size in the memory
 * @sub_type_hdr: Memory into which compute info will be filled in
 *
 * Return 0 if successful else return -ve value
 */
static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size,
				int proximity_domain,
				struct crat_subtype_computeunit *sub_type_hdr)
{
	const struct cpumask *cpumask;

	*avail_size -= sizeof(struct crat_subtype_computeunit);
	if (*avail_size < 0)
		return -ENOMEM;

	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	cpumask = cpumask_of_node(numa_node_id);

	/* Fill in CU data */
	sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT;
	sub_type_hdr->proximity_domain = proximity_domain;
	sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id);
	if (sub_type_hdr->processor_id_low == -1)
		return -EINVAL;

	sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask);

	return 0;
}

/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node
 *
 * @numa_node_id: CPU NUMA node id
 * @avail_size: Available size in the memory
 * @sub_type_hdr: Memory into which memory info will be filled in
 *
 * Return 0 if successful else return -ve value
 */
static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size,
			int proximity_domain,
			struct crat_subtype_memory *sub_type_hdr)
{
	uint64_t mem_in_bytes = 0;
	pg_data_t *pgdat;
	int zone_type;

	*avail_size -= sizeof(struct crat_subtype_memory);
	if (*avail_size < 0)
		return -ENOMEM;

	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	/* Fill in Memory Subunit data */

	/* Unlike si_meminfo, si_meminfo_node is not exported. So
	 * the following lines are duplicated from the si_meminfo_node
	 * function.
	 */
	pgdat = NODE_DATA(numa_node_id);
	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
		mem_in_bytes += zone_managed_pages(&pgdat->node_zones[zone_type]);
	mem_in_bytes <<= PAGE_SHIFT;

	sub_type_hdr->length_low = lower_32_bits(mem_in_bytes);
	sub_type_hdr->length_high = upper_32_bits(mem_in_bytes);
	sub_type_hdr->proximity_domain = proximity_domain;

	return 0;
}
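
/* Editor's note (assumed example, not in the upstream file): the 64-bit byte
 * count is split across two 32-bit CRAT fields above; e.g. a 16 GiB node
 * (0x400000000 bytes) is stored as length_low = 0 and length_high = 4.
 */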

#ifdef CONFIG_X86_64
static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size,
				uint32_t *num_entries,
				struct crat_subtype_iolink *sub_type_hdr)
{
	int nid;
	struct cpuinfo_x86 *c = &cpu_data(0);
	uint8_t link_type;

	if (c->x86_vendor == X86_VENDOR_AMD)
		link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT;
	else
		link_type = CRAT_IOLINK_TYPE_QPI_1_1;

	*num_entries = 0;

	/* Create IO links from this node to other CPU nodes */
	for_each_online_node(nid) {
		if (nid == numa_node_id) /* node itself */
			continue;

		*avail_size -= sizeof(struct crat_subtype_iolink);
		if (*avail_size < 0)
			return -ENOMEM;

		memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

		/* Fill in subtype header data */
		sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
		sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
		sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

		/* Fill in IO link data */
		sub_type_hdr->proximity_domain_from = numa_node_id;
		sub_type_hdr->proximity_domain_to = nid;
		sub_type_hdr->io_interface_type = link_type;

		(*num_entries)++;
		sub_type_hdr++;
	}

	return 0;
}
#endif

/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU
 *
 *	@pcrat_image: Fill in VCRAT for CPU
 *	@size: [IN] allocated size of crat_image.
 *	       [OUT] actual size of data filled in crat_image
 */
static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
{
	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
	struct acpi_table_header *acpi_table;
	acpi_status status;
	struct crat_subtype_generic *sub_type_hdr;
	int avail_size = *size;
	int numa_node_id;
#ifdef CONFIG_X86_64
	uint32_t entries = 0;
#endif
	int ret = 0;

	if (!pcrat_image)
		return -EINVAL;

	/* Fill in CRAT Header.
	 * Modify length and total_entries as subunits are added.
	 */
	avail_size -= sizeof(struct crat_header);
	if (avail_size < 0)
		return -ENOMEM;

	memset(crat_table, 0, sizeof(struct crat_header));
	memcpy(&crat_table->signature, CRAT_SIGNATURE,
	       sizeof(crat_table->signature));
	crat_table->length = sizeof(struct crat_header);

	status = acpi_get_table("DSDT", 0, &acpi_table);
	if (status != AE_OK)
		pr_warn("DSDT table not found for OEM information\n");
	else {
		crat_table->oem_revision = acpi_table->revision;
		memcpy(crat_table->oem_id, acpi_table->oem_id,
		       CRAT_OEMID_LENGTH);
		memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,
		       CRAT_OEMTABLEID_LENGTH);
		acpi_put_table(acpi_table);
	}
	crat_table->total_entries = 0;
	crat_table->num_domains = 0;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
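
	/*
	 * Layout: a crat_header immediately followed by a packed sequence
	 * of subtype entries. Each loop iteration below appends a compute
	 * unit entry, a memory entry and (on x86-64) iolink entries for one
	 * NUMA node, advancing sub_type_hdr by each entry's length.
	 */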
	for_each_online_node(numa_node_id) {
		if (kfd_numa_node_to_apic_id(numa_node_id) == -1)
			continue;

		/* Fill in Subtype: Compute Unit */
		ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size,
			crat_table->num_domains,
			(struct crat_subtype_computeunit *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += sub_type_hdr->length;
		crat_table->total_entries++;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);

		/* Fill in Subtype: Memory */
		ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size,
			crat_table->num_domains,
			(struct crat_subtype_memory *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += sub_type_hdr->length;
		crat_table->total_entries++;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);

		/* Fill in Subtype: IO Link */
#ifdef CONFIG_X86_64
		ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size,
				&entries,
				(struct crat_subtype_iolink *)sub_type_hdr);
		if (ret < 0)
			return ret;

		if (entries) {
			crat_table->length += (sub_type_hdr->length * entries);
			crat_table->total_entries += entries;

			sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
					sub_type_hdr->length * entries);
		}
#else
		pr_info("IO link not available for non-x86 platforms\n");
#endif

		crat_table->num_domains++;
	}

	/* TODO: Add cache Subtype for CPU.
	 * Currently, CPU cache information is available in the function
	 * detect_cache_attributes(cpu) defined in the file
	 * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not
	 * exported, so obtaining the same information here would mean
	 * duplicating that code.
	 */

	*size = crat_table->length;
	pr_info("Virtual CRAT table created for CPU\n");

	return 0;
}

static int kfd_fill_gpu_memory_affinity(int *avail_size,
		struct kfd_node *kdev, uint8_t type, uint64_t size,
		struct crat_subtype_memory *sub_type_hdr,
		uint32_t proximity_domain,
		const struct kfd_local_mem_info *local_mem_info)
{
	*avail_size -= sizeof(struct crat_subtype_memory);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;

	sub_type_hdr->proximity_domain = proximity_domain;

	pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",
		 type, size);

	sub_type_hdr->length_low = lower_32_bits(size);
	sub_type_hdr->length_high = upper_32_bits(size);

	sub_type_hdr->width = local_mem_info->vram_width;
	sub_type_hdr->visibility_type = type;

	return 0;
}

#ifdef CONFIG_ACPI_NUMA
static void kfd_find_numa_node_in_srat(struct kfd_node *kdev)
{
	struct acpi_table_header *table_header = NULL;
	struct acpi_subtable_header *sub_header = NULL;
	unsigned long table_end, subtable_len;
	u32 pci_id = pci_domain_nr(kdev->adev->pdev->bus) << 16 |
			pci_dev_id(kdev->adev->pdev);
	u32 bdf;
	acpi_status status;
	struct acpi_srat_cpu_affinity *cpu;
	struct acpi_srat_generic_affinity *gpu;
	int pxm = 0, max_pxm = 0;
	int numa_node = NUMA_NO_NODE;
	bool found = false;

	/* Fetch the SRAT table from ACPI */
	status = acpi_get_table(ACPI_SIG_SRAT, 0, &table_header);
	if (status == AE_NOT_FOUND) {
		pr_warn("SRAT table not found\n");
		return;
	} else if (ACPI_FAILURE(status)) {
		const char *err = acpi_format_exception(status);
		pr_err("SRAT table error: %s\n", err);
		return;
	}

	table_end = (unsigned long)table_header + table_header->length;

	/* Parse all entries looking for a match. */
	sub_header = (struct acpi_subtable_header *)
			((unsigned long)table_header +
			sizeof(struct acpi_table_srat));
	subtable_len = sub_header->length;

	while (((unsigned long)sub_header) + subtable_len < table_end) {
		/*
		 * If length is 0, break from this loop to avoid
		 * infinite loop.
		 */
		if (subtable_len == 0) {
			pr_err("SRAT invalid zero length\n");
			break;
		}

		switch (sub_header->type) {
		case ACPI_SRAT_TYPE_CPU_AFFINITY:
			cpu = (struct acpi_srat_cpu_affinity *)sub_header;
			pxm = *((u32 *)cpu->proximity_domain_hi) << 8 |
					cpu->proximity_domain_lo;
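			/*
			 * pxm reassembles the 32-bit proximity domain from
			 * the lo/hi split kept for ACPI 1.0 compatibility;
			 * max_pxm (below) tracks the largest domain seen,
			 * for the sanity check after the loop.
			 */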
			if (pxm > max_pxm)
				max_pxm = pxm;
			break;
		case ACPI_SRAT_TYPE_GENERIC_AFFINITY:
			gpu = (struct acpi_srat_generic_affinity *)sub_header;
			bdf = *((u16 *)(&gpu->device_handle[0])) << 16 |
					*((u16 *)(&gpu->device_handle[2]));
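			/*
			 * For PCI devices, the SRAT generic-affinity handle
			 * carries the segment in bytes 0-1 and the BDF in
			 * bytes 2-3, so bdf rebuilds the same
			 * (domain << 16 | bus << 8 | devfn) value as pci_id
			 * above.
			 */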
			if (bdf == pci_id) {
				found = true;
				numa_node = pxm_to_node(gpu->proximity_domain);
			}
			break;
		default:
			break;
		}

		if (found)
			break;

		sub_header = (struct acpi_subtable_header *)
				((unsigned long)sub_header + subtable_len);
		subtable_len = sub_header->length;
	}

	acpi_put_table(table_header);

	/* Workaround bad cpu-gpu binding case */
	if (found && (numa_node < 0 ||
			numa_node > pxm_to_node(max_pxm)))
		numa_node = 0;

	if (numa_node != NUMA_NO_NODE)
		set_dev_node(&kdev->adev->pdev->dev, numa_node);
}
#endif

#define KFD_CRAT_INTRA_SOCKET_WEIGHT	13
#define KFD_CRAT_XGMI_WEIGHT		15
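
/*
 * Relative iolink weights (cost, lower is closer): 13 for a hop within a
 * socket, 15 for an xGMI hop. Multi-hop paths combine these below, e.g.
 * 2 * 13 + 15 for a GPU-to-GPU link that crosses sockets.
 */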

/* kfd_fill_gpu_direct_io_link_to_cpu - Fill in direct io link from GPU
 * to its NUMA node
 *	@avail_size: Available size in the memory
 *	@kdev: [IN] GPU device
 *	@sub_type_hdr: Memory into which io link info will be filled in
 *	@proximity_domain: proximity domain of the GPU node
 *
 *	Return 0 if successful else return -ve value
 */
static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
			struct kfd_node *kdev,
			struct crat_subtype_iolink *sub_type_hdr,
			uint32_t proximity_domain)
{
	*avail_size -= sizeof(struct crat_subtype_iolink);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
	if (kfd_dev_is_large_bar(kdev))
		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;

	/* Fill in IOLINK subtype.
	 * TODO: Fill in other fields of iolink subtype
	 */
	if (kdev->adev->gmc.xgmi.connected_to_cpu ||
	    (KFD_GC_VERSION(kdev) == IP_VERSION(9, 4, 3) &&
	     kdev->adev->smuio.funcs->get_pkg_type(kdev->adev) ==
	     AMDGPU_PKG_TYPE_APU)) {
		bool ext_cpu = KFD_GC_VERSION(kdev) != IP_VERSION(9, 4, 3);
		int mem_bw = 819200, weight = ext_cpu ? KFD_CRAT_XGMI_WEIGHT :
							KFD_CRAT_INTRA_SOCKET_WEIGHT;
		/*
		 * With a host-GPU xGMI link the host can access GPU memory
		 * whether or not the PCIe BAR type is large, so always
		 * create a bidirectional io link.
		 */
		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
		sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
		sub_type_hdr->weight_xgmi = weight;
		if (ext_cpu) {
			amdgpu_xgmi_get_bandwidth(kdev->adev, NULL,
					AMDGPU_XGMI_BW_MODE_PER_LINK,
					AMDGPU_XGMI_BW_UNIT_MBYTES,
					&sub_type_hdr->minimum_bandwidth_mbs,
					&sub_type_hdr->maximum_bandwidth_mbs);
		} else {
			sub_type_hdr->minimum_bandwidth_mbs = mem_bw;
			sub_type_hdr->maximum_bandwidth_mbs = mem_bw;
		}
	} else {
		sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
		sub_type_hdr->minimum_bandwidth_mbs =
			amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, true);
		sub_type_hdr->maximum_bandwidth_mbs =
			amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, false);
	}

	sub_type_hdr->proximity_domain_from = proximity_domain;

#ifdef CONFIG_ACPI_NUMA
	if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE &&
	    num_possible_nodes() > 1)
		kfd_find_numa_node_in_srat(kdev);
#endif
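	/*
	 * The SRAT lookup above may have assigned the device a NUMA node;
	 * if it is still NUMA_NO_NODE, fall back to node 0 below.
	 */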
#ifdef CONFIG_NUMA
	if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE)
		sub_type_hdr->proximity_domain_to = 0;
	else
		sub_type_hdr->proximity_domain_to = kdev->adev->pdev->dev.numa_node;
#else
	sub_type_hdr->proximity_domain_to = 0;
#endif
	return 0;
}

static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
			struct kfd_node *kdev,
			struct kfd_node *peer_kdev,
			struct crat_subtype_iolink *sub_type_hdr,
			uint32_t proximity_domain_from,
			uint32_t proximity_domain_to)
{
	bool use_ta_info = kdev->kfd->num_nodes == 1;

	*avail_size -= sizeof(struct crat_subtype_iolink);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED |
			       CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;

	sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
	sub_type_hdr->proximity_domain_from = proximity_domain_from;
	sub_type_hdr->proximity_domain_to = proximity_domain_to;

	if (use_ta_info) {
		sub_type_hdr->weight_xgmi = KFD_CRAT_XGMI_WEIGHT *
			amdgpu_xgmi_get_hops_count(kdev->adev, peer_kdev->adev);
		amdgpu_xgmi_get_bandwidth(kdev->adev, peer_kdev->adev,
				AMDGPU_XGMI_BW_MODE_PER_PEER,
				AMDGPU_XGMI_BW_UNIT_MBYTES,
				&sub_type_hdr->minimum_bandwidth_mbs,
				&sub_type_hdr->maximum_bandwidth_mbs);
	} else {
		bool is_single_hop = kdev->kfd == peer_kdev->kfd;
		int weight = is_single_hop ? KFD_CRAT_INTRA_SOCKET_WEIGHT :
			(2 * KFD_CRAT_INTRA_SOCKET_WEIGHT) + KFD_CRAT_XGMI_WEIGHT;
		int mem_bw = 819200;
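
		/*
		 * Without TA info, assume a fixed topology: a single hop
		 * inside one socket costs one intra-socket weight (13),
		 * while crossing sockets is modelled as two intra-socket
		 * hops plus one xGMI hop (2 * 13 + 15 = 41).
		 */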
		sub_type_hdr->weight_xgmi = weight;
		sub_type_hdr->maximum_bandwidth_mbs = is_single_hop ? mem_bw : 0;
		sub_type_hdr->minimum_bandwidth_mbs = is_single_hop ? mem_bw : 0;
	}

	return 0;
}

/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
 *
 *	@pcrat_image: Fill in VCRAT for GPU
 *	@size: [IN] allocated size of crat_image.
 *	       [OUT] actual size of data filled in crat_image
 */
static int kfd_create_vcrat_image_gpu(void *pcrat_image,
				      size_t *size, struct kfd_node *kdev,
				      uint32_t proximity_domain)
{
	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
	struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config;
	struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info;
	struct crat_subtype_generic *sub_type_hdr;
	struct kfd_local_mem_info local_mem_info;
	struct kfd_topology_device *peer_dev;
	struct crat_subtype_computeunit *cu;
	int avail_size = *size;
	uint32_t total_num_of_cu;
	uint32_t nid = 0;
	int ret = 0;

	if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
		return -EINVAL;

	/* Fill the CRAT Header.
	 * Modify length and total_entries as subunits are added.
	 */
	avail_size -= sizeof(struct crat_header);
	memset(crat_table, 0, sizeof(struct crat_header));

	memcpy(&crat_table->signature, CRAT_SIGNATURE,
	       sizeof(crat_table->signature));
	/* Change length as we add more subtypes */
	crat_table->length = sizeof(struct crat_header);
	crat_table->num_domains = 1;
	crat_table->total_entries = 0;

	/* Fill in Subtype: Compute Unit
	 * First fill in the sub type header and then sub type data
	 */
	avail_size -= sizeof(struct crat_subtype_computeunit);
	sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));

	sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	/* Fill CU subtype data */
	cu = (struct crat_subtype_computeunit *)sub_type_hdr;
	cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;
	cu->proximity_domain = proximity_domain;

	cu->num_simd_per_cu = cu_info->simd_per_cu;
	cu->num_simd_cores = cu_info->simd_per_cu *
			(cu_info->number / kdev->kfd->num_nodes);
	cu->max_waves_simd = cu_info->max_waves_per_simd;

	cu->wave_front_size = cu_info->wave_front_size;
	cu->array_count = gfx_info->max_sh_per_se *
		gfx_info->max_shader_engines;
	total_num_of_cu = (cu->array_count * gfx_info->max_cu_per_sh);
	cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
	cu->num_cu_per_array = gfx_info->max_cu_per_sh;
	cu->max_slots_scatch_cu = cu_info->max_scratch_slots_per_cu;
	cu->num_banks = gfx_info->max_shader_engines;
	cu->lds_size_in_kb = cu_info->lds_size;

	cu->hsa_capability = 0;

	crat_table->length += sub_type_hdr->length;
	crat_table->total_entries++;

	/* Fill in Subtype: Memory. Only on systems with large BAR (no
	 * private FB), report memory as public. On other systems
	 * report the total FB size (public+private) as a single
	 * private heap.
	 */
	local_mem_info = kdev->local_mem_info;
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
			sub_type_hdr->length);

	if (kdev->adev->debug_largebar)
		local_mem_info.local_mem_size_private = 0;

	if (local_mem_info.local_mem_size_private == 0)
		ret = kfd_fill_gpu_memory_affinity(&avail_size,
				kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,
				local_mem_info.local_mem_size_public,
				(struct crat_subtype_memory *)sub_type_hdr,
				proximity_domain,
				&local_mem_info);
	else
		ret = kfd_fill_gpu_memory_affinity(&avail_size,
				kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE,
				local_mem_info.local_mem_size_public +
				local_mem_info.local_mem_size_private,
				(struct crat_subtype_memory *)sub_type_hdr,
				proximity_domain,
				&local_mem_info);
	if (ret < 0)
		return ret;

	crat_table->length += sizeof(struct crat_subtype_memory);
	crat_table->total_entries++;

	/* Fill in Subtype: IO_LINKS
	 * Only direct links are added here, i.e. the link from the GPU
	 * to its NUMA node. Indirect links are added by userspace.
	 */
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
			sub_type_hdr->length);
	ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
		(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);

	if (ret < 0)
		return ret;

	crat_table->length += sub_type_hdr->length;
	crat_table->total_entries++;

	/* Fill in Subtype: IO_LINKS
	 * Direct links from GPU to other GPUs through xGMI.
	 * Loop over the GPUs that have already been processed (those with
	 * a lower proximity_domain value) and add a link for each GPU in
	 * the same hive (from this GPU to the other GPU). The reversed
	 * iolink (from the other GPU to this GPU) is added in
	 * kfd_parse_subtype_iolink.
	 */
	if (kdev->kfd->hive_id) {
		for (nid = 0; nid < proximity_domain; ++nid) {
			peer_dev = kfd_topology_device_by_proximity_domain_no_lock(nid);
			if (!peer_dev->gpu)
				continue;
			if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)
				continue;
			if (!amdgpu_xgmi_get_is_sharing_enabled(kdev->adev, peer_dev->gpu->adev))
				continue;
			sub_type_hdr = (typeof(sub_type_hdr))(
				(char *)sub_type_hdr +
				sizeof(struct crat_subtype_iolink));
			ret = kfd_fill_gpu_xgmi_link_to_gpu(
				&avail_size, kdev, peer_dev->gpu,
				(struct crat_subtype_iolink *)sub_type_hdr,
				proximity_domain, nid);
			if (ret < 0)
				return ret;
			crat_table->length += sub_type_hdr->length;
			crat_table->total_entries++;
		}
	}
	*size = crat_table->length;
	pr_info("Virtual CRAT table created for GPU\n");

	return ret;
}

/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
 * creates a Virtual CRAT (VCRAT) image
 *
 * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
 *
 *	@crat_image: VCRAT image created because ACPI does not have a
 *		     CRAT for this device
 *	@size: [OUT] size of virtual crat_image
 *	@flags:	COMPUTE_UNIT_CPU - Create VCRAT for CPU device
 *		COMPUTE_UNIT_GPU - Create VCRAT for GPU
 *		(COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
 *			-- this option is not currently implemented.
 *			The assumption is that all AMD APUs will have a CRAT
 *	@kdev: Valid kfd_node required if flags contain COMPUTE_UNIT_GPU
 *
 *	Return 0 if successful else return -ve value
 */
int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
				  int flags, struct kfd_node *kdev,
				  uint32_t proximity_domain)
{
	void *pcrat_image = NULL;
	int ret = 0, num_nodes;
	size_t dyn_size;

	if (!crat_image)
		return -EINVAL;

	*crat_image = NULL;

	/* Allocate the CPU Virtual CRAT size based on the number of online
	 * nodes. Allocate VCRAT_SIZE_FOR_GPU for the GPU virtual CRAT image.
	 * This should cover all the current conditions. A check is in place
	 * to avoid writing beyond the allocated size for GPUs.
	 */
	switch (flags) {
	case COMPUTE_UNIT_CPU:
		num_nodes = num_online_nodes();
		dyn_size = sizeof(struct crat_header) +
			num_nodes * (sizeof(struct crat_subtype_computeunit) +
			sizeof(struct crat_subtype_memory) +
			(num_nodes - 1) * sizeof(struct crat_subtype_iolink));
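		/*
		 * Per online node: one compute unit entry, one memory entry
		 * and one iolink to each of the other nodes. E.g. two online
		 * nodes need a header plus 2 CU + 2 memory + 2 iolink
		 * entries.
		 */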
		pcrat_image = kvmalloc(dyn_size, GFP_KERNEL);
		if (!pcrat_image)
			return -ENOMEM;
		*size = dyn_size;
		pr_debug("CRAT size is %zu\n", dyn_size);
		ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
		break;
	case COMPUTE_UNIT_GPU:
		if (!kdev)
			return -EINVAL;
		pcrat_image = kvmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
		if (!pcrat_image)
			return -ENOMEM;
		*size = VCRAT_SIZE_FOR_GPU;
		ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev,
						 proximity_domain);
		break;
	case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
		/* TODO: */
		ret = -EINVAL;
		pr_err("VCRAT not implemented for APU\n");
		break;
	default:
		ret = -EINVAL;
	}

	if (!ret)
		*crat_image = pcrat_image;
	else
		kvfree(pcrat_image);

	return ret;
}

/* kfd_destroy_crat_image
 *
 *	@crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)
 *
 */
void kfd_destroy_crat_image(void *crat_image)
{
	kvfree(crat_image);
}