GitHub Repository: torvalds/linux
Path: blob/master/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2015-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/pci.h>
#include <linux/acpi.h>
#include "kfd_crat.h"
#include "kfd_priv.h"
#include "kfd_topology.h"
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_xgmi.h"

/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
 * GPU processor IDs are expressed with Bit[31]=1.
 * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs
 * used in the CRAT.
 */
static uint32_t gpu_processor_id_low = 0x80001000;

/* Return the next available gpu_processor_id and increment it for the next
 * GPU
 * @total_cu_count - Total CUs present in the GPU, including ones
 *                   masked off
 */
static inline unsigned int get_and_inc_gpu_processor_id(
				unsigned int total_cu_count)
{
	int current_id = gpu_processor_id_low;

	gpu_processor_id_low += total_cu_count;
	return current_id;
}
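
/* Editor's illustrative sketch (not part of the upstream file): with the
 * base above, a hypothetical first dGPU reporting 64 total CUs would receive
 * processor ID 0x80001000, and the allocator advances so the next GPU starts
 * at 0x80001040. The CU count here is an assumed example value:
 *
 *	unsigned int id = get_and_inc_gpu_processor_id(64);
 *	// id == 0x80001000; gpu_processor_id_low is now 0x80001040
 */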


static struct kfd_gpu_cache_info kaveri_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache (in SQC module) per bank */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache (in SQC module) per bank */
		.cache_size = 8,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},

	/* TODO: Add L2 Cache information */
};


static struct kfd_gpu_cache_info carrizo_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache (in SQC module) per bank */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 4,
	},
	{
		/* Scalar L1 Data Cache (in SQC module) per bank. */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 4,
	},

	/* TODO: Add L2 Cache information */
};

#define hawaii_cache_info kaveri_cache_info
#define tonga_cache_info carrizo_cache_info
#define fiji_cache_info carrizo_cache_info
#define polaris10_cache_info carrizo_cache_info
#define polaris11_cache_info carrizo_cache_info
#define polaris12_cache_info carrizo_cache_info
#define vegam_cache_info carrizo_cache_info

/* NOTE: L1 cache information has been updated and L2/L3
 * cache information has been added for Vega10 and
 * newer ASICs. The unit for cache_size is KiB.
 * Going forward, cache details must be checked and
 * updated for every new ASIC.
 */

static struct kfd_gpu_cache_info vega10_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 4096,
		.cache_level = 2,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 16,
	},
};

static struct kfd_gpu_cache_info raven_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 1024,
		.cache_level = 2,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 11,
	},
};

static struct kfd_gpu_cache_info renoir_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 1024,
		.cache_level = 2,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
};

static struct kfd_gpu_cache_info vega12_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 5,
	},
};

static struct kfd_gpu_cache_info vega20_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 3,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 8192,
		.cache_level = 2,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 16,
	},
};

static struct kfd_gpu_cache_info aldebaran_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 8192,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 14,
	},
};

static struct kfd_gpu_cache_info navi10_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 4096,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
};

static struct kfd_gpu_cache_info vangogh_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 1024,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
};

static struct kfd_gpu_cache_info navi14_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 12,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 12,
	},
};

static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 4096,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L3 Data Cache per GPU */
		.cache_size = 128*1024,
		.cache_level = 3,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
};

static struct kfd_gpu_cache_info navy_flounder_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 3072,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
	{
		/* L3 Data Cache per GPU */
		.cache_size = 96*1024,
		.cache_level = 3,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 10,
	},
};

static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L3 Data Cache per GPU */
		.cache_size = 32*1024,
		.cache_level = 3,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
};

static struct kfd_gpu_cache_info beige_goby_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 1024,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
	{
		/* L3 Data Cache per GPU */
		.cache_size = 16*1024,
		.cache_level = 3,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 8,
	},
};

static struct kfd_gpu_cache_info yellow_carp_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 6,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 6,
	},
};

static struct kfd_gpu_cache_info gfx1037_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 256,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
};

static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 256,
		.cache_level = 2,
		.cache_line_size = 128,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
};

static struct kfd_gpu_cache_info dummy_cache_info[] = {
	{
		/* TCP L1 Cache per CU */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 1,
	},
	{
		/* Scalar L1 Instruction Cache per SQC */
		.cache_size = 32,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_INST_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* Scalar L1 Data Cache per SQC */
		.cache_size = 16,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 2,
	},
	{
		/* GL1 Data Cache per SA */
		.cache_size = 128,
		.cache_level = 1,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 6,
	},
	{
		/* L2 Data Cache per GPU (Total Tex Cache) */
		.cache_size = 2048,
		.cache_level = 2,
		.cache_line_size = 64,
		.flags = (CRAT_CACHE_FLAGS_ENABLED |
				CRAT_CACHE_FLAGS_DATA_CACHE |
				CRAT_CACHE_FLAGS_SIMD_CACHE),
		.num_cu_shared = 6,
	},
};

static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
		struct crat_subtype_computeunit *cu)
{
	dev->node_props.cpu_cores_count = cu->num_cpu_cores;
	dev->node_props.cpu_core_id_base = cu->processor_id_low;
	if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)
		dev->node_props.capability |= HSA_CAP_ATS_PRESENT;

	pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,
			cu->processor_id_low);
}

static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,
		struct crat_subtype_computeunit *cu)
{
	dev->node_props.simd_id_base = cu->processor_id_low;
	dev->node_props.simd_count = cu->num_simd_cores;
	dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;
	dev->node_props.max_waves_per_simd = cu->max_waves_simd;
	dev->node_props.wave_front_size = cu->wave_front_size;
	dev->node_props.array_count = cu->array_count;
	dev->node_props.cu_per_simd_array = cu->num_cu_per_array;
	dev->node_props.simd_per_cu = cu->num_simd_per_cu;
	dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;
	if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)
		dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;
	pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low);
}

/* kfd_parse_subtype_cu - parse compute unit subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu,
				struct list_head *device_list)
{
	struct kfd_topology_device *dev;

	pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",
			cu->proximity_domain, cu->hsa_capability);
	list_for_each_entry(dev, device_list, list) {
		if (cu->proximity_domain == dev->proximity_domain) {
			if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)
				kfd_populated_cu_info_cpu(dev, cu);

			if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)
				kfd_populated_cu_info_gpu(dev, cu);
			break;
		}
	}

	return 0;
}

static struct kfd_mem_properties *
find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width,
		struct kfd_topology_device *dev)
{
	struct kfd_mem_properties *props;

	list_for_each_entry(props, &dev->mem_props, list) {
		if (props->heap_type == heap_type
				&& props->flags == flags
				&& props->width == width)
			return props;
	}

	return NULL;
}

/* kfd_parse_subtype_mem - parse memory subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem,
				struct list_head *device_list)
{
	struct kfd_mem_properties *props;
	struct kfd_topology_device *dev;
	uint32_t heap_type;
	uint64_t size_in_bytes;
	uint32_t flags = 0;
	uint32_t width;

	pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n",
			mem->proximity_domain);
	list_for_each_entry(dev, device_list, list) {
		if (mem->proximity_domain == dev->proximity_domain) {
			/* We're on a GPU node */
			if (dev->node_props.cpu_cores_count == 0) {
				/* APU */
				if (mem->visibility_type == 0)
					heap_type =
						HSA_MEM_HEAP_TYPE_FB_PRIVATE;
				/* dGPU */
				else
					heap_type = mem->visibility_type;
			} else
				heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;

			if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)
				flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;
			if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)
				flags |= HSA_MEM_FLAGS_NON_VOLATILE;

			size_in_bytes =
				((uint64_t)mem->length_high << 32) +
						mem->length_low;
			width = mem->width;

			/* Multiple banks of the same type are aggregated into
			 * one. User mode doesn't care about multiple physical
			 * memory segments. It's managed as a single virtual
			 * heap for user mode.
			 */
			props = find_subtype_mem(heap_type, flags, width, dev);
			if (props) {
				props->size_in_bytes += size_in_bytes;
				break;
			}

			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->heap_type = heap_type;
			props->flags = flags;
			props->size_in_bytes = size_in_bytes;
			props->width = width;

			dev->node_props.mem_banks_count++;
			list_add_tail(&props->list, &dev->mem_props);

			break;
		}
	}

	return 0;
}
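
/* Editor's note (assumed example, not in the upstream file): because banks
 * with identical heap_type/flags/width are merged above, two 8 GiB banks of
 * the same heap type reported by CRAT surface to user mode as one heap with
 * size_in_bytes == 16 GiB, and mem_banks_count is incremented only once.
 */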

/* kfd_parse_subtype_cache - parse cache subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
			struct list_head *device_list)
{
	struct kfd_cache_properties *props;
	struct kfd_topology_device *dev;
	uint32_t id;
	uint32_t total_num_of_cu;

	id = cache->processor_id_low;

	pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id);
	list_for_each_entry(dev, device_list, list) {
		total_num_of_cu = (dev->node_props.array_count *
					dev->node_props.cu_per_simd_array);

		/* Cache information in CRAT doesn't have proximity_domain
		 * information as it is associated with a CPU core or GPU
		 * Compute Unit. So map the cache using the CPU core ID or
		 * SIMD (GPU) ID.
		 * TODO: This works because currently we can safely assume
		 * that Compute Units are parsed before caches are parsed.
		 * In the future, remove this dependency.
		 */
		if ((id >= dev->node_props.cpu_core_id_base &&
			id <= dev->node_props.cpu_core_id_base +
				dev->node_props.cpu_cores_count) ||
			(id >= dev->node_props.simd_id_base &&
			id < dev->node_props.simd_id_base +
				total_num_of_cu)) {
			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->processor_id_low = id;
			props->cache_level = cache->cache_level;
			props->cache_size = cache->cache_size;
			props->cacheline_size = cache->cache_line_size;
			props->cachelines_per_tag = cache->lines_per_tag;
			props->cache_assoc = cache->associativity;
			props->cache_latency = cache->cache_latency;

			memcpy(props->sibling_map, cache->sibling_map,
					CRAT_SIBLINGMAP_SIZE);

			/* set the sibling_map_size as 32 for CRAT from ACPI */
			props->sibling_map_size = CRAT_SIBLINGMAP_SIZE;

			if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_DATA;
			if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
			if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_CPU;
			if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_HSACU;

			dev->node_props.caches_count++;
			list_add_tail(&props->list, &dev->cache_props);

			break;
		}
	}

	return 0;
}
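
/* Editor's note (not in the upstream file): sibling_map is a bitmask of the
 * processors that share a given cache. With CRAT_SIBLINGMAP_SIZE fixed at 32
 * bytes for ACPI CRAT (per the comment above), up to 256 processor IDs can
 * be encoded per cache entry.
 */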

/* kfd_parse_subtype_iolink - parse iolink subtypes and attach them to the
 * correct topology device present in the device_list
 */
static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
					struct list_head *device_list)
{
	struct kfd_iolink_properties *props = NULL, *props2;
	struct kfd_topology_device *dev, *to_dev;
	uint32_t id_from;
	uint32_t id_to;

	id_from = iolink->proximity_domain_from;
	id_to = iolink->proximity_domain_to;

	pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to %d\n",
			id_from, id_to);
	list_for_each_entry(dev, device_list, list) {
		if (id_from == dev->proximity_domain) {
			props = kfd_alloc_struct(props);
			if (!props)
				return -ENOMEM;

			props->node_from = id_from;
			props->node_to = id_to;
			props->ver_maj = iolink->version_major;
			props->ver_min = iolink->version_minor;
			props->iolink_type = iolink->io_interface_type;

			if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
				props->weight = 20;
			else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)
				props->weight = iolink->weight_xgmi;
			else
				props->weight = node_distance(id_from, id_to);

			props->min_latency = iolink->minimum_latency;
			props->max_latency = iolink->maximum_latency;
			props->min_bandwidth = iolink->minimum_bandwidth_mbs;
			props->max_bandwidth = iolink->maximum_bandwidth_mbs;
			props->rec_transfer_size =
					iolink->recommended_transfer_size;

			dev->node_props.io_links_count++;
			list_add_tail(&props->list, &dev->io_link_props);
			break;
		}
	}

	/* CPU topology is created before GPUs are detected, so CPU->GPU
	 * links are not built at that time. If a PCIe type is discovered, it
	 * means a GPU is detected and we are adding GPU->CPU to the topology.
	 * At this time, also add the corresponding CPU->GPU link if the GPU
	 * is large-BAR.
	 * For xGMI, we only added the link with one direction in the CRAT
	 * table; add the corresponding reversed-direction link now.
	 */
	if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
		to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
		if (!to_dev)
			return -ENODEV;
		/* same everything but the other direction */
		props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
		if (!props2)
			return -ENOMEM;

		props2->node_from = id_to;
		props2->node_to = id_from;
		props2->kobj = NULL;
		to_dev->node_props.io_links_count++;
		list_add_tail(&props2->list, &to_dev->io_link_props);
	}

	return 0;
}
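
/* Editor's note (assumed example, not in the upstream file): for an xGMI
 * link recorded in CRAT only as domain 1 -> domain 2 with the BI_DIRECTIONAL
 * flag set, the code above also materializes the mirrored 2 -> 1
 * kfd_iolink_properties entry by kmemdup()ing props and swapping
 * node_from/node_to, so both nodes expose the link in topology.
 */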

/* kfd_parse_subtype - parse subtypes and attach them to the correct topology
 * device present in the device_list
 * @sub_type_hdr - subtype section of crat_image
 * @device_list - list of topology devices present in this crat_image
 */
static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,
				struct list_head *device_list)
{
	struct crat_subtype_computeunit *cu;
	struct crat_subtype_memory *mem;
	struct crat_subtype_cache *cache;
	struct crat_subtype_iolink *iolink;
	int ret = 0;

	switch (sub_type_hdr->type) {
	case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:
		cu = (struct crat_subtype_computeunit *)sub_type_hdr;
		ret = kfd_parse_subtype_cu(cu, device_list);
		break;
	case CRAT_SUBTYPE_MEMORY_AFFINITY:
		mem = (struct crat_subtype_memory *)sub_type_hdr;
		ret = kfd_parse_subtype_mem(mem, device_list);
		break;
	case CRAT_SUBTYPE_CACHE_AFFINITY:
		cache = (struct crat_subtype_cache *)sub_type_hdr;
		ret = kfd_parse_subtype_cache(cache, device_list);
		break;
	case CRAT_SUBTYPE_TLB_AFFINITY:
		/*
		 * For now, nothing to do here
		 */
		pr_debug("Found TLB entry in CRAT table (not processing)\n");
		break;
	case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
		/*
		 * For now, nothing to do here
		 */
		pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");
		break;
	case CRAT_SUBTYPE_IOLINK_AFFINITY:
		iolink = (struct crat_subtype_iolink *)sub_type_hdr;
		ret = kfd_parse_subtype_iolink(iolink, device_list);
		break;
	default:
		pr_warn("Unknown subtype %d in CRAT\n",
				sub_type_hdr->type);
	}

	return ret;
}

/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT,
 * create a kfd_topology_device and add it to device_list. Also parse
 * CRAT subtypes and attach them to the appropriate kfd_topology_device
 * @crat_image - input image containing CRAT
 * @device_list - [OUT] list of kfd_topology_device generated after
 *		  parsing crat_image
 * @proximity_domain - Proximity domain of the first device in the table
 *
 * Return - 0 if successful else -ve value
 */
int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
			 uint32_t proximity_domain)
{
	struct kfd_topology_device *top_dev = NULL;
	struct crat_subtype_generic *sub_type_hdr;
	uint16_t node_id;
	int ret = 0;
	struct crat_header *crat_table = (struct crat_header *)crat_image;
	uint16_t num_nodes;
	uint32_t image_len;

	if (!crat_image)
		return -EINVAL;

	if (!list_empty(device_list)) {
		pr_warn("Error device list should be empty\n");
		return -EINVAL;
	}

	num_nodes = crat_table->num_domains;
	image_len = crat_table->length;

	pr_debug("Parsing CRAT table with %d nodes\n", num_nodes);

	for (node_id = 0; node_id < num_nodes; node_id++) {
		top_dev = kfd_create_topology_device(device_list);
		if (!top_dev)
			break;
		top_dev->proximity_domain = proximity_domain++;
	}

	if (!top_dev) {
		ret = -ENOMEM;
		goto err;
	}

	memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);
	memcpy(top_dev->oem_table_id, crat_table->oem_table_id,
			CRAT_OEMTABLEID_LENGTH);
	top_dev->oem_revision = crat_table->oem_revision;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
	while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
			((char *)crat_image) + image_len) {
		if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
			ret = kfd_parse_subtype(sub_type_hdr, device_list);
			if (ret)
				break;
		}

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);
	}

err:
	if (ret)
		kfd_release_topology_device_list(device_list);

	return ret;
}
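
/* Editor's sketch of the image layout walked above (not in the upstream
 * file): a CRAT image is a crat_header followed by a packed sequence of
 * variable-length subtypes, each beginning with a crat_subtype_generic:
 *
 *	+-------------+------------------+-------------------+-----
 *	| crat_header | subtype (CU)     | subtype (memory)  | ...
 *	| .length     | .type, .length   | .type, .length    |
 *	+-------------+------------------+-------------------+-----
 *
 * The parser advances by sub_type_hdr->length until the next generic header
 * would extend past crat_table->length.
 */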


static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
						   bool cache_line_size_missing,
						   struct kfd_gpu_cache_info *pcache_info)
{
	struct amdgpu_device *adev = kdev->adev;
	int i = 0;

	/* TCP L1 Cache per CU */
	if (adev->gfx.config.gc_tcp_l1_size) {
		pcache_info[i].cache_size = adev->gfx.config.gc_tcp_l1_size;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2;
		pcache_info[i].cache_line_size = adev->gfx.config.gc_tcp_cache_line_size;
		if (cache_line_size_missing && !pcache_info[i].cache_line_size)
			pcache_info[i].cache_line_size = 128;
		i++;
	}
	/* Scalar L1 Instruction Cache per SQC */
	if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {
		pcache_info[i].cache_size =
			adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_INST_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;
		pcache_info[i].cache_line_size = adev->gfx.config.gc_instruction_cache_line_size;
		if (cache_line_size_missing && !pcache_info[i].cache_line_size)
			pcache_info[i].cache_line_size = 128;
		i++;
	}
	/* Scalar L1 Data Cache per SQC */
	if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {
		pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;
		pcache_info[i].cache_line_size = adev->gfx.config.gc_scalar_data_cache_line_size;
		if (cache_line_size_missing && !pcache_info[i].cache_line_size)
			pcache_info[i].cache_line_size = 64;
		i++;
	}
	/* GL1 Data Cache per SA */
	if (adev->gfx.config.gc_gl1c_per_sa &&
	    adev->gfx.config.gc_gl1c_size_per_instance) {
		pcache_info[i].cache_size = adev->gfx.config.gc_gl1c_per_sa *
			adev->gfx.config.gc_gl1c_size_per_instance;
		pcache_info[i].cache_level = 1;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		if (cache_line_size_missing)
			pcache_info[i].cache_line_size = 128;
		i++;
	}
	/* L2 Data Cache per GPU (Total Tex Cache) */
	if (adev->gfx.config.gc_gl2c_per_gpu) {
		pcache_info[i].cache_size = adev->gfx.config.gc_gl2c_per_gpu;
		pcache_info[i].cache_level = 2;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		pcache_info[i].cache_line_size = adev->gfx.config.gc_tcc_cache_line_size;
		if (cache_line_size_missing && !pcache_info[i].cache_line_size)
			pcache_info[i].cache_line_size = 128;
		i++;
	}
	/* L3 Data Cache per GPU */
	if (adev->gmc.mall_size) {
		pcache_info[i].cache_size = adev->gmc.mall_size / 1024;
		pcache_info[i].cache_level = 3;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		pcache_info[i].cache_line_size = 64;
		i++;
	}
	return i;
}

static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev,
						      struct kfd_gpu_cache_info *pcache_info)
{
	struct amdgpu_device *adev = kdev->adev;
	int i = 0;

	/* TCP L1 Cache per CU */
	if (adev->gfx.config.gc_tcp_size_per_cu) {
		pcache_info[i].cache_size = adev->gfx.config.gc_tcp_size_per_cu;
		pcache_info[i].cache_level = 1;
		/* Cacheline size not available in IP discovery for gc943,gc944 */
		pcache_info[i].cache_line_size = 128;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = 1;
		i++;
	}
	/* Scalar L1 Instruction Cache per SQC */
	if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {
		pcache_info[i].cache_size =
			adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;
		pcache_info[i].cache_level = 1;
		pcache_info[i].cache_line_size = 64;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_INST_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_cu_per_sqc;
		i++;
	}
	/* Scalar L1 Data Cache per SQC */
	if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {
		pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;
		pcache_info[i].cache_level = 1;
		pcache_info[i].cache_line_size = 64;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_cu_per_sqc;
		i++;
	}
	/* L2 Data Cache per GPU (Total Tex Cache) */
	if (adev->gfx.config.gc_tcc_size) {
		pcache_info[i].cache_size = adev->gfx.config.gc_tcc_size;
		pcache_info[i].cache_level = 2;
		pcache_info[i].cache_line_size = 128;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		i++;
	}
	/* L3 Data Cache per GPU */
	if (adev->gmc.mall_size) {
		pcache_info[i].cache_size = adev->gmc.mall_size / 1024;
		pcache_info[i].cache_level = 3;
		pcache_info[i].cache_line_size = 64;
		pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |
					CRAT_CACHE_FLAGS_DATA_CACHE |
					CRAT_CACHE_FLAGS_SIMD_CACHE);
		pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;
		i++;
	}
	return i;
}

int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pcache_info)
{
	int num_of_cache_types = 0;
	bool cache_line_size_missing = false;

	switch (kdev->adev->asic_type) {
	case CHIP_KAVERI:
		*pcache_info = kaveri_cache_info;
		num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
		break;
	case CHIP_HAWAII:
		*pcache_info = hawaii_cache_info;
		num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
		break;
	case CHIP_CARRIZO:
		*pcache_info = carrizo_cache_info;
		num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
		break;
	case CHIP_TONGA:
		*pcache_info = tonga_cache_info;
		num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
		break;
	case CHIP_FIJI:
		*pcache_info = fiji_cache_info;
		num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
		break;
	case CHIP_POLARIS10:
		*pcache_info = polaris10_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
		break;
	case CHIP_POLARIS11:
		*pcache_info = polaris11_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
		break;
	case CHIP_POLARIS12:
		*pcache_info = polaris12_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
		break;
	case CHIP_VEGAM:
		*pcache_info = vegam_cache_info;
		num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
		break;
	default:
		switch (KFD_GC_VERSION(kdev)) {
		case IP_VERSION(9, 0, 1):
			*pcache_info = vega10_cache_info;
			num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
			break;
		case IP_VERSION(9, 2, 1):
			*pcache_info = vega12_cache_info;
			num_of_cache_types = ARRAY_SIZE(vega12_cache_info);
			break;
		case IP_VERSION(9, 4, 0):
		case IP_VERSION(9, 4, 1):
			*pcache_info = vega20_cache_info;
			num_of_cache_types = ARRAY_SIZE(vega20_cache_info);
			break;
		case IP_VERSION(9, 4, 2):
			*pcache_info = aldebaran_cache_info;
			num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info);
			break;
		case IP_VERSION(9, 4, 3):
		case IP_VERSION(9, 4, 4):
		case IP_VERSION(9, 5, 0):
			num_of_cache_types =
				kfd_fill_gpu_cache_info_from_gfx_config_v2(kdev->kfd,
									*pcache_info);
			break;
		case IP_VERSION(9, 1, 0):
		case IP_VERSION(9, 2, 2):
			*pcache_info = raven_cache_info;
			num_of_cache_types = ARRAY_SIZE(raven_cache_info);
			break;
		case IP_VERSION(9, 3, 0):
			*pcache_info = renoir_cache_info;
			num_of_cache_types = ARRAY_SIZE(renoir_cache_info);
			break;
		case IP_VERSION(10, 1, 10):
		case IP_VERSION(10, 1, 2):
		case IP_VERSION(10, 1, 3):
		case IP_VERSION(10, 1, 4):
			*pcache_info = navi10_cache_info;
			num_of_cache_types = ARRAY_SIZE(navi10_cache_info);
			break;
		case IP_VERSION(10, 1, 1):
			*pcache_info = navi14_cache_info;
			num_of_cache_types = ARRAY_SIZE(navi14_cache_info);
			break;
		case IP_VERSION(10, 3, 0):
			*pcache_info = sienna_cichlid_cache_info;
			num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info);
			break;
		case IP_VERSION(10, 3, 2):
			*pcache_info = navy_flounder_cache_info;
			num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info);
			break;
		case IP_VERSION(10, 3, 4):
			*pcache_info = dimgrey_cavefish_cache_info;
			num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info);
			break;
		case IP_VERSION(10, 3, 1):
			*pcache_info = vangogh_cache_info;
			num_of_cache_types = ARRAY_SIZE(vangogh_cache_info);
			break;
		case IP_VERSION(10, 3, 5):
			*pcache_info = beige_goby_cache_info;
			num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info);
			break;
		case IP_VERSION(10, 3, 3):
			*pcache_info = yellow_carp_cache_info;
			num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info);
			break;
		case IP_VERSION(10, 3, 6):
			*pcache_info = gc_10_3_6_cache_info;
			num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info);
			break;
		case IP_VERSION(10, 3, 7):
			*pcache_info = gfx1037_cache_info;
			num_of_cache_types = ARRAY_SIZE(gfx1037_cache_info);
			break;
		case IP_VERSION(11, 0, 0):
		case IP_VERSION(11, 0, 1):
		case IP_VERSION(11, 0, 2):
		case IP_VERSION(11, 0, 3):
		case IP_VERSION(11, 0, 4):
		case IP_VERSION(11, 5, 0):
		case IP_VERSION(11, 5, 1):
		case IP_VERSION(11, 5, 2):
		case IP_VERSION(11, 5, 3):
			/* Cacheline size is not available in IP discovery
			 * for gc11; kfd_fill_gpu_cache_info_from_gfx_config
			 * hard-codes it.
			 */
			cache_line_size_missing = true;
			fallthrough;
		case IP_VERSION(12, 0, 0):
		case IP_VERSION(12, 0, 1):
			num_of_cache_types =
				kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd,
									cache_line_size_missing,
									*pcache_info);
			break;
		default:
			*pcache_info = dummy_cache_info;
			num_of_cache_types = ARRAY_SIZE(dummy_cache_info);
			pr_warn("dummy cache info is used temporarily and real cache info needs updating later.\n");
			break;
		}
	}
	return num_of_cache_types;
}
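
/* Editor's usage note (a sketch under stated assumptions, not in the
 * upstream file): callers must point *pcache_info at a writable scratch
 * array before calling, because the gfx-config-driven branches above fill
 * that buffer in place, whereas the table-driven ASIC branches overwrite the
 * pointer to reference a static table. The buffer size below is an assumed
 * bound for illustration:
 *
 *	struct kfd_gpu_cache_info cache_info[8];
 *	struct kfd_gpu_cache_info *pcache_info = cache_info;
 *	int n = kfd_get_gpu_cache_info(kdev, &pcache_info);
 *	// pcache_info now references n valid entries
 */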

/* Memory required to create Virtual CRAT.
 * Since there is no easy way to predict the amount of memory required, the
 * following amount is allocated for the GPU Virtual CRAT. This is
 * expected to cover all known conditions. To be safe, an additional check
 * is put in the code to ensure we don't overwrite.
 */
#define VCRAT_SIZE_FOR_GPU	(4 * PAGE_SIZE)
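
/* Editor's note (not in the upstream file): on a typical x86_64 kernel with
 * 4 KiB pages, VCRAT_SIZE_FOR_GPU works out to 4 * 4096 = 16384 bytes
 * (16 KiB) per GPU virtual CRAT image.
 */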

/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node
 *
 * @numa_node_id: CPU NUMA node id
 * @avail_size: Available size in the memory
 * @sub_type_hdr: Memory into which compute info will be filled in
 *
 * Return 0 if successful else return -ve value
 */
static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size,
				int proximity_domain,
				struct crat_subtype_computeunit *sub_type_hdr)
{
	const struct cpumask *cpumask;

	*avail_size -= sizeof(struct crat_subtype_computeunit);
	if (*avail_size < 0)
		return -ENOMEM;

	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	cpumask = cpumask_of_node(numa_node_id);

	/* Fill in CU data */
	sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT;
	sub_type_hdr->proximity_domain = proximity_domain;
	sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id);
	if (sub_type_hdr->processor_id_low == -1)
		return -EINVAL;

	sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask);

	return 0;
}

/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node
 *
 * @numa_node_id: CPU NUMA node id
 * @avail_size: Available size in the memory
 * @sub_type_hdr: Memory into which memory info will be filled in
 *
 * Return 0 if successful else return -ve value
 */
static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size,
			int proximity_domain,
			struct crat_subtype_memory *sub_type_hdr)
{
	uint64_t mem_in_bytes = 0;
	pg_data_t *pgdat;
	int zone_type;

	*avail_size -= sizeof(struct crat_subtype_memory);
	if (*avail_size < 0)
		return -ENOMEM;

	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	/* Fill in Memory Subunit data */

	/* Unlike si_meminfo, si_meminfo_node is not exported. So
	 * the following lines are duplicated from the si_meminfo_node
	 * function.
	 */
	pgdat = NODE_DATA(numa_node_id);
	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
		mem_in_bytes += zone_managed_pages(&pgdat->node_zones[zone_type]);
	mem_in_bytes <<= PAGE_SHIFT;

	sub_type_hdr->length_low = lower_32_bits(mem_in_bytes);
	sub_type_hdr->length_high = upper_32_bits(mem_in_bytes);
	sub_type_hdr->proximity_domain = proximity_domain;

	return 0;
}
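
/* Editor's note (assumed example, not in the upstream file): the 64-bit byte
 * count is split across two 32-bit CRAT fields above; e.g. a 16 GiB node
 * (0x400000000 bytes) is stored as length_low = 0 and length_high = 4.
 */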

#ifdef CONFIG_X86_64
static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size,
				uint32_t *num_entries,
				struct crat_subtype_iolink *sub_type_hdr)
{
	int nid;
	struct cpuinfo_x86 *c = &cpu_data(0);
	uint8_t link_type;

	if (c->x86_vendor == X86_VENDOR_AMD)
		link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT;
	else
		link_type = CRAT_IOLINK_TYPE_QPI_1_1;

	*num_entries = 0;

	/* Create IO links from this node to other CPU nodes */
	for_each_online_node(nid) {
		if (nid == numa_node_id) /* node itself */
			continue;

		*avail_size -= sizeof(struct crat_subtype_iolink);
		if (*avail_size < 0)
			return -ENOMEM;

		memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

		/* Fill in subtype header data */
		sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
		sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
		sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

		/* Fill in IO link data */
		sub_type_hdr->proximity_domain_from = numa_node_id;
		sub_type_hdr->proximity_domain_to = nid;
		sub_type_hdr->io_interface_type = link_type;

		(*num_entries)++;
		sub_type_hdr++;
	}

	return 0;
}
#endif

/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU
 *
 *	@pcrat_image: Fill in VCRAT for CPU
 *	@size: [IN] allocated size of crat_image.
 *	       [OUT] actual size of data filled in crat_image
 */
static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
{
	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
	struct acpi_table_header *acpi_table;
	acpi_status status;
	struct crat_subtype_generic *sub_type_hdr;
	int avail_size = *size;
	int numa_node_id;
#ifdef CONFIG_X86_64
	uint32_t entries = 0;
#endif
	int ret = 0;

	if (!pcrat_image)
		return -EINVAL;

	/* Fill in CRAT Header.
	 * Modify length and total_entries as subunits are added.
	 */
	avail_size -= sizeof(struct crat_header);
	if (avail_size < 0)
		return -ENOMEM;

	memset(crat_table, 0, sizeof(struct crat_header));
	memcpy(&crat_table->signature, CRAT_SIGNATURE,
	       sizeof(crat_table->signature));
	crat_table->length = sizeof(struct crat_header);

	status = acpi_get_table("DSDT", 0, &acpi_table);
	if (status != AE_OK)
		pr_warn("DSDT table not found for OEM information\n");
	else {
		crat_table->oem_revision = acpi_table->revision;
		memcpy(crat_table->oem_id, acpi_table->oem_id,
		       CRAT_OEMID_LENGTH);
		memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,
		       CRAT_OEMTABLEID_LENGTH);
		acpi_put_table(acpi_table);
	}
	crat_table->total_entries = 0;
	crat_table->num_domains = 0;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
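
	/*
	 * Layout: a crat_header immediately followed by a packed sequence
	 * of subtype entries. Each loop iteration below appends a compute
	 * unit entry, a memory entry and (on x86-64) iolink entries for one
	 * NUMA node, advancing sub_type_hdr by each entry's length.
	 */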
	for_each_online_node(numa_node_id) {
		if (kfd_numa_node_to_apic_id(numa_node_id) == -1)
			continue;

		/* Fill in Subtype: Compute Unit */
		ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size,
			crat_table->num_domains,
			(struct crat_subtype_computeunit *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += sub_type_hdr->length;
		crat_table->total_entries++;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);

		/* Fill in Subtype: Memory */
		ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size,
			crat_table->num_domains,
			(struct crat_subtype_memory *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += sub_type_hdr->length;
		crat_table->total_entries++;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);

		/* Fill in Subtype: IO Link */
#ifdef CONFIG_X86_64
		ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size,
				&entries,
				(struct crat_subtype_iolink *)sub_type_hdr);
		if (ret < 0)
			return ret;

		if (entries) {
			crat_table->length += (sub_type_hdr->length * entries);
			crat_table->total_entries += entries;

			sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
					sub_type_hdr->length * entries);
		}
#else
		pr_info("IO link not available for non-x86 platforms\n");
#endif

		crat_table->num_domains++;
	}

	/* TODO: Add cache Subtype for CPU.
	 * Currently, CPU cache information is available in the function
	 * detect_cache_attributes(cpu) defined in the file
	 * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not
	 * exported, so obtaining the same information here would mean
	 * duplicating that code.
	 */

	*size = crat_table->length;
	pr_info("Virtual CRAT table created for CPU\n");

	return 0;
}

static int kfd_fill_gpu_memory_affinity(int *avail_size,
		struct kfd_node *kdev, uint8_t type, uint64_t size,
		struct crat_subtype_memory *sub_type_hdr,
		uint32_t proximity_domain,
		const struct kfd_local_mem_info *local_mem_info)
{
	*avail_size -= sizeof(struct crat_subtype_memory);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;

	sub_type_hdr->proximity_domain = proximity_domain;

	pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",
		 type, size);

	sub_type_hdr->length_low = lower_32_bits(size);
	sub_type_hdr->length_high = upper_32_bits(size);

	sub_type_hdr->width = local_mem_info->vram_width;
	sub_type_hdr->visibility_type = type;

	return 0;
}

#ifdef CONFIG_ACPI_NUMA
static void kfd_find_numa_node_in_srat(struct kfd_node *kdev)
{
	struct acpi_table_header *table_header = NULL;
	struct acpi_subtable_header *sub_header = NULL;
	unsigned long table_end, subtable_len;
	u32 pci_id = pci_domain_nr(kdev->adev->pdev->bus) << 16 |
			pci_dev_id(kdev->adev->pdev);
	u32 bdf;
	acpi_status status;
	struct acpi_srat_cpu_affinity *cpu;
	struct acpi_srat_generic_affinity *gpu;
	int pxm = 0, max_pxm = 0;
	int numa_node = NUMA_NO_NODE;
	bool found = false;

	/* Fetch the SRAT table from ACPI */
	status = acpi_get_table(ACPI_SIG_SRAT, 0, &table_header);
	if (status == AE_NOT_FOUND) {
		pr_warn("SRAT table not found\n");
		return;
	} else if (ACPI_FAILURE(status)) {
		const char *err = acpi_format_exception(status);
		pr_err("SRAT table error: %s\n", err);
		return;
	}

	table_end = (unsigned long)table_header + table_header->length;

	/* Parse all entries looking for a match. */
	sub_header = (struct acpi_subtable_header *)
			((unsigned long)table_header +
			sizeof(struct acpi_table_srat));
	subtable_len = sub_header->length;

	while (((unsigned long)sub_header) + subtable_len < table_end) {
		/*
		 * If length is 0, break from this loop to avoid
		 * infinite loop.
		 */
		if (subtable_len == 0) {
			pr_err("SRAT invalid zero length\n");
			break;
		}

		switch (sub_header->type) {
		case ACPI_SRAT_TYPE_CPU_AFFINITY:
			cpu = (struct acpi_srat_cpu_affinity *)sub_header;
			pxm = *((u32 *)cpu->proximity_domain_hi) << 8 |
					cpu->proximity_domain_lo;
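			/*
			 * pxm reassembles the 32-bit proximity domain from
			 * the lo/hi split kept for ACPI 1.0 compatibility;
			 * max_pxm (below) tracks the largest domain seen,
			 * for the sanity check after the loop.
			 */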
			if (pxm > max_pxm)
				max_pxm = pxm;
			break;
		case ACPI_SRAT_TYPE_GENERIC_AFFINITY:
			gpu = (struct acpi_srat_generic_affinity *)sub_header;
			bdf = *((u16 *)(&gpu->device_handle[0])) << 16 |
					*((u16 *)(&gpu->device_handle[2]));
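			/*
			 * For PCI devices, the SRAT generic-affinity handle
			 * carries the segment in bytes 0-1 and the BDF in
			 * bytes 2-3, so bdf rebuilds the same
			 * (domain << 16 | bus << 8 | devfn) value as pci_id
			 * above.
			 */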
			if (bdf == pci_id) {
				found = true;
				numa_node = pxm_to_node(gpu->proximity_domain);
			}
			break;
		default:
			break;
		}

		if (found)
			break;

		sub_header = (struct acpi_subtable_header *)
				((unsigned long)sub_header + subtable_len);
		subtable_len = sub_header->length;
	}

	acpi_put_table(table_header);

	/* Workaround bad cpu-gpu binding case */
	if (found && (numa_node < 0 ||
			numa_node > pxm_to_node(max_pxm)))
		numa_node = 0;

	if (numa_node != NUMA_NO_NODE)
		set_dev_node(&kdev->adev->pdev->dev, numa_node);
}
#endif

#define KFD_CRAT_INTRA_SOCKET_WEIGHT	13
#define KFD_CRAT_XGMI_WEIGHT		15
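
/*
 * Relative iolink weights (cost, lower is closer): 13 for a hop within a
 * socket, 15 for an xGMI hop. Multi-hop paths combine these below, e.g.
 * 2 * 13 + 15 for a GPU-to-GPU link that crosses sockets.
 */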

/* kfd_fill_gpu_direct_io_link_to_cpu - Fill in direct io link from GPU
 * to its NUMA node
 *	@avail_size: Available size in the memory
 *	@kdev: [IN] GPU device
 *	@sub_type_hdr: Memory into which io link info will be filled in
 *	@proximity_domain: proximity domain of the GPU node
 *
 *	Return 0 if successful else return -ve value
 */
static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
			struct kfd_node *kdev,
			struct crat_subtype_iolink *sub_type_hdr,
			uint32_t proximity_domain)
{
	*avail_size -= sizeof(struct crat_subtype_iolink);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
	if (kfd_dev_is_large_bar(kdev))
		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;

	/* Fill in IOLINK subtype.
	 * TODO: Fill in other fields of iolink subtype
	 */
	if (kdev->adev->gmc.xgmi.connected_to_cpu ||
	    (KFD_GC_VERSION(kdev) == IP_VERSION(9, 4, 3) &&
	     kdev->adev->smuio.funcs->get_pkg_type(kdev->adev) ==
	     AMDGPU_PKG_TYPE_APU)) {
		bool ext_cpu = KFD_GC_VERSION(kdev) != IP_VERSION(9, 4, 3);
		int mem_bw = 819200, weight = ext_cpu ? KFD_CRAT_XGMI_WEIGHT :
							KFD_CRAT_INTRA_SOCKET_WEIGHT;
		/*
		 * With a host-GPU xGMI link the host can access GPU memory
		 * whether or not the PCIe BAR type is large, so always
		 * create a bidirectional io link.
		 */
		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
		sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
		sub_type_hdr->weight_xgmi = weight;
		if (ext_cpu) {
			amdgpu_xgmi_get_bandwidth(kdev->adev, NULL,
					AMDGPU_XGMI_BW_MODE_PER_LINK,
					AMDGPU_XGMI_BW_UNIT_MBYTES,
					&sub_type_hdr->minimum_bandwidth_mbs,
					&sub_type_hdr->maximum_bandwidth_mbs);
		} else {
			sub_type_hdr->minimum_bandwidth_mbs = mem_bw;
			sub_type_hdr->maximum_bandwidth_mbs = mem_bw;
		}
	} else {
		sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
		sub_type_hdr->minimum_bandwidth_mbs =
			amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, true);
		sub_type_hdr->maximum_bandwidth_mbs =
			amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, false);
	}

	sub_type_hdr->proximity_domain_from = proximity_domain;

#ifdef CONFIG_ACPI_NUMA
	if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE &&
	    num_possible_nodes() > 1)
		kfd_find_numa_node_in_srat(kdev);
#endif
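	/*
	 * The SRAT lookup above may have assigned the device a NUMA node;
	 * if it is still NUMA_NO_NODE, fall back to node 0 below.
	 */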
#ifdef CONFIG_NUMA
	if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE)
		sub_type_hdr->proximity_domain_to = 0;
	else
		sub_type_hdr->proximity_domain_to = kdev->adev->pdev->dev.numa_node;
#else
	sub_type_hdr->proximity_domain_to = 0;
#endif
	return 0;
}

static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
			struct kfd_node *kdev,
			struct kfd_node *peer_kdev,
			struct crat_subtype_iolink *sub_type_hdr,
			uint32_t proximity_domain_from,
			uint32_t proximity_domain_to)
{
	bool use_ta_info = kdev->kfd->num_nodes == 1;

	*avail_size -= sizeof(struct crat_subtype_iolink);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED |
			       CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;

	sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
	sub_type_hdr->proximity_domain_from = proximity_domain_from;
	sub_type_hdr->proximity_domain_to = proximity_domain_to;

	if (use_ta_info) {
		sub_type_hdr->weight_xgmi = KFD_CRAT_XGMI_WEIGHT *
			amdgpu_xgmi_get_hops_count(kdev->adev, peer_kdev->adev);
		amdgpu_xgmi_get_bandwidth(kdev->adev, peer_kdev->adev,
				AMDGPU_XGMI_BW_MODE_PER_PEER,
				AMDGPU_XGMI_BW_UNIT_MBYTES,
				&sub_type_hdr->minimum_bandwidth_mbs,
				&sub_type_hdr->maximum_bandwidth_mbs);
	} else {
		bool is_single_hop = kdev->kfd == peer_kdev->kfd;
		int weight = is_single_hop ? KFD_CRAT_INTRA_SOCKET_WEIGHT :
			(2 * KFD_CRAT_INTRA_SOCKET_WEIGHT) + KFD_CRAT_XGMI_WEIGHT;
		int mem_bw = 819200;
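
		/*
		 * Without TA info, assume a fixed topology: a single hop
		 * inside one socket costs one intra-socket weight (13),
		 * while crossing sockets is modelled as two intra-socket
		 * hops plus one xGMI hop (2 * 13 + 15 = 41).
		 */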
		sub_type_hdr->weight_xgmi = weight;
		sub_type_hdr->maximum_bandwidth_mbs = is_single_hop ? mem_bw : 0;
		sub_type_hdr->minimum_bandwidth_mbs = is_single_hop ? mem_bw : 0;
	}

	return 0;
}

/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
 *
 *	@pcrat_image: Fill in VCRAT for GPU
 *	@size: [IN] allocated size of crat_image.
 *	       [OUT] actual size of data filled in crat_image
 */
static int kfd_create_vcrat_image_gpu(void *pcrat_image,
				      size_t *size, struct kfd_node *kdev,
				      uint32_t proximity_domain)
{
	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
	struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config;
	struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info;
	struct crat_subtype_generic *sub_type_hdr;
	struct kfd_local_mem_info local_mem_info;
	struct kfd_topology_device *peer_dev;
	struct crat_subtype_computeunit *cu;
	int avail_size = *size;
	uint32_t total_num_of_cu;
	uint32_t nid = 0;
	int ret = 0;

	if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
		return -EINVAL;

	/* Fill the CRAT Header.
	 * Modify length and total_entries as subunits are added.
	 */
	avail_size -= sizeof(struct crat_header);
	memset(crat_table, 0, sizeof(struct crat_header));

	memcpy(&crat_table->signature, CRAT_SIGNATURE,
	       sizeof(crat_table->signature));
	/* Change length as we add more subtypes */
	crat_table->length = sizeof(struct crat_header);
	crat_table->num_domains = 1;
	crat_table->total_entries = 0;

	/* Fill in Subtype: Compute Unit
	 * First fill in the sub type header and then sub type data
	 */
	avail_size -= sizeof(struct crat_subtype_computeunit);
	sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));

	sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	/* Fill CU subtype data */
	cu = (struct crat_subtype_computeunit *)sub_type_hdr;
	cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;
	cu->proximity_domain = proximity_domain;

	cu->num_simd_per_cu = cu_info->simd_per_cu;
	cu->num_simd_cores = cu_info->simd_per_cu *
			(cu_info->number / kdev->kfd->num_nodes);
	cu->max_waves_simd = cu_info->max_waves_per_simd;

	cu->wave_front_size = cu_info->wave_front_size;
	cu->array_count = gfx_info->max_sh_per_se *
		gfx_info->max_shader_engines;
	total_num_of_cu = (cu->array_count * gfx_info->max_cu_per_sh);
	cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
	cu->num_cu_per_array = gfx_info->max_cu_per_sh;
	cu->max_slots_scatch_cu = cu_info->max_scratch_slots_per_cu;
	cu->num_banks = gfx_info->max_shader_engines;
	cu->lds_size_in_kb = cu_info->lds_size;

	cu->hsa_capability = 0;

	crat_table->length += sub_type_hdr->length;
	crat_table->total_entries++;

	/* Fill in Subtype: Memory. Only on systems with large BAR (no
	 * private FB), report memory as public. On other systems
	 * report the total FB size (public+private) as a single
	 * private heap.
	 */
	local_mem_info = kdev->local_mem_info;
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
			sub_type_hdr->length);

	if (kdev->adev->debug_largebar)
		local_mem_info.local_mem_size_private = 0;

	if (local_mem_info.local_mem_size_private == 0)
		ret = kfd_fill_gpu_memory_affinity(&avail_size,
				kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,
				local_mem_info.local_mem_size_public,
				(struct crat_subtype_memory *)sub_type_hdr,
				proximity_domain,
				&local_mem_info);
	else
		ret = kfd_fill_gpu_memory_affinity(&avail_size,
				kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE,
				local_mem_info.local_mem_size_public +
				local_mem_info.local_mem_size_private,
				(struct crat_subtype_memory *)sub_type_hdr,
				proximity_domain,
				&local_mem_info);
	if (ret < 0)
		return ret;

	crat_table->length += sizeof(struct crat_subtype_memory);
	crat_table->total_entries++;

	/* Fill in Subtype: IO_LINKS
	 * Only direct links are added here, i.e. the link from the GPU
	 * to its NUMA node. Indirect links are added by userspace.
	 */
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
			sub_type_hdr->length);
	ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
		(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);

	if (ret < 0)
		return ret;

	crat_table->length += sub_type_hdr->length;
	crat_table->total_entries++;

	/* Fill in Subtype: IO_LINKS
	 * Direct links from GPU to other GPUs through xGMI.
	 * Loop over the GPUs that have already been processed (those with
	 * a lower proximity_domain value) and add a link for each GPU in
	 * the same hive (from this GPU to the other GPU). The reversed
	 * iolink (from the other GPU to this GPU) is added in
	 * kfd_parse_subtype_iolink.
	 */
	if (kdev->kfd->hive_id) {
		for (nid = 0; nid < proximity_domain; ++nid) {
			peer_dev = kfd_topology_device_by_proximity_domain_no_lock(nid);
			if (!peer_dev->gpu)
				continue;
			if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)
				continue;
			if (!amdgpu_xgmi_get_is_sharing_enabled(kdev->adev, peer_dev->gpu->adev))
				continue;
			sub_type_hdr = (typeof(sub_type_hdr))(
				(char *)sub_type_hdr +
				sizeof(struct crat_subtype_iolink));
			ret = kfd_fill_gpu_xgmi_link_to_gpu(
				&avail_size, kdev, peer_dev->gpu,
				(struct crat_subtype_iolink *)sub_type_hdr,
				proximity_domain, nid);
			if (ret < 0)
				return ret;
			crat_table->length += sub_type_hdr->length;
			crat_table->total_entries++;
		}
	}
	*size = crat_table->length;
	pr_info("Virtual CRAT table created for GPU\n");

	return ret;
}

/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
 * creates a Virtual CRAT (VCRAT) image
 *
 * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
 *
 *	@crat_image: VCRAT image created because ACPI does not have a
 *		     CRAT for this device
 *	@size: [OUT] size of virtual crat_image
 *	@flags:	COMPUTE_UNIT_CPU - Create VCRAT for CPU device
 *		COMPUTE_UNIT_GPU - Create VCRAT for GPU
 *		(COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
 *			-- this option is not currently implemented.
 *			The assumption is that all AMD APUs will have a CRAT
 *	@kdev: Valid kfd_node required if flags contain COMPUTE_UNIT_GPU
 *
 *	Return 0 if successful else return -ve value
 */
int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
				  int flags, struct kfd_node *kdev,
				  uint32_t proximity_domain)
{
	void *pcrat_image = NULL;
	int ret = 0, num_nodes;
	size_t dyn_size;

	if (!crat_image)
		return -EINVAL;

	*crat_image = NULL;

	/* Allocate the CPU Virtual CRAT size based on the number of online
	 * nodes. Allocate VCRAT_SIZE_FOR_GPU for the GPU virtual CRAT image.
	 * This should cover all the current conditions. A check is in place
	 * to avoid writing beyond the allocated size for GPUs.
	 */
	switch (flags) {
	case COMPUTE_UNIT_CPU:
		num_nodes = num_online_nodes();
		dyn_size = sizeof(struct crat_header) +
			num_nodes * (sizeof(struct crat_subtype_computeunit) +
			sizeof(struct crat_subtype_memory) +
			(num_nodes - 1) * sizeof(struct crat_subtype_iolink));
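		/*
		 * Per online node: one compute unit entry, one memory entry
		 * and one iolink to each of the other nodes. E.g. two online
		 * nodes need a header plus 2 CU + 2 memory + 2 iolink
		 * entries.
		 */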
		pcrat_image = kvmalloc(dyn_size, GFP_KERNEL);
		if (!pcrat_image)
			return -ENOMEM;
		*size = dyn_size;
		pr_debug("CRAT size is %zu\n", dyn_size);
		ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
		break;
	case COMPUTE_UNIT_GPU:
		if (!kdev)
			return -EINVAL;
		pcrat_image = kvmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
		if (!pcrat_image)
			return -ENOMEM;
		*size = VCRAT_SIZE_FOR_GPU;
		ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev,
						 proximity_domain);
		break;
	case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
		/* TODO: */
		ret = -EINVAL;
		pr_err("VCRAT not implemented for APU\n");
		break;
	default:
		ret = -EINVAL;
	}

	if (!ret)
		*crat_image = pcrat_image;
	else
		kvfree(pcrat_image);

	return ret;
}

/* kfd_destroy_crat_image
 *
 *	@crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)
 *
 */
void kfd_destroy_crat_image(void *crat_image)
{
	kvfree(crat_image);
}