Path: blob/master/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
50903 views
// SPDX-License-Identifier: GPL-2.0 OR MIT1/*2* Copyright 2015-2022 Advanced Micro Devices, Inc.3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* the rights to use, copy, modify, merge, publish, distribute, sublicense,8* and/or sell copies of the Software, and to permit persons to whom the9* Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice shall be included in12* all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR18* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,19* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR20* OTHER DEALINGS IN THE SOFTWARE.21*/2223#include <linux/pci.h>24#include <linux/acpi.h>25#include "kfd_crat.h"26#include "kfd_priv.h"27#include "kfd_topology.h"28#include "amdgpu.h"29#include "amdgpu_amdkfd.h"30#include "amdgpu_xgmi.h"3132/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.33* GPU processor ID are expressed with Bit[31]=1.34* The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs35* used in the CRAT.36*/37static uint32_t gpu_processor_id_low = 0x80001000;3839/* Return the next available gpu_processor_id and increment it for next GPU40* @total_cu_count - Total CUs present in the GPU including ones41* masked off42*/43static inline unsigned int get_and_inc_gpu_processor_id(44unsigned int total_cu_count)45{46int current_id = gpu_processor_id_low;4748gpu_processor_id_low += total_cu_count;49return current_id;50}515253static struct kfd_gpu_cache_info kaveri_cache_info[] = {54{55/* TCP L1 Cache per CU */56.cache_size = 16,57.cache_level = 1,58.cache_line_size = 64,59.flags = (CRAT_CACHE_FLAGS_ENABLED |60CRAT_CACHE_FLAGS_DATA_CACHE |61CRAT_CACHE_FLAGS_SIMD_CACHE),62.num_cu_shared = 1,63},64{65/* Scalar L1 Instruction Cache (in SQC module) per bank */66.cache_size = 16,67.cache_level = 1,68.cache_line_size = 64,69.flags = (CRAT_CACHE_FLAGS_ENABLED |70CRAT_CACHE_FLAGS_INST_CACHE |71CRAT_CACHE_FLAGS_SIMD_CACHE),72.num_cu_shared = 2,73},74{75/* Scalar L1 Data Cache (in SQC module) per bank */76.cache_size = 8,77.cache_level = 1,78.cache_line_size = 64,79.flags = (CRAT_CACHE_FLAGS_ENABLED |80CRAT_CACHE_FLAGS_DATA_CACHE |81CRAT_CACHE_FLAGS_SIMD_CACHE),82.num_cu_shared = 2,83},8485/* TODO: Add L2 Cache information */86};878889static struct kfd_gpu_cache_info carrizo_cache_info[] = {90{91/* TCP L1 Cache per CU */92.cache_size = 16,93.cache_level = 1,94.cache_line_size = 64,95.flags = (CRAT_CACHE_FLAGS_ENABLED |96CRAT_CACHE_FLAGS_DATA_CACHE |97CRAT_CACHE_FLAGS_SIMD_CACHE),98.num_cu_shared = 1,99},100{101/* Scalar L1 Instruction Cache (in SQC module) per bank */102.cache_size = 32,103.cache_level = 1,104.cache_line_size = 64,105.flags = (CRAT_CACHE_FLAGS_ENABLED |106CRAT_CACHE_FLAGS_INST_CACHE |107CRAT_CACHE_FLAGS_SIMD_CACHE),108.num_cu_shared = 4,109},110{111/* Scalar L1 Data Cache (in SQC module) per bank. */112.cache_size = 16,113.cache_level = 1,114.cache_line_size = 64,115.flags = (CRAT_CACHE_FLAGS_ENABLED |116CRAT_CACHE_FLAGS_DATA_CACHE |117CRAT_CACHE_FLAGS_SIMD_CACHE),118.num_cu_shared = 4,119},120121/* TODO: Add L2 Cache information */122};123124#define hawaii_cache_info kaveri_cache_info125#define tonga_cache_info carrizo_cache_info126#define fiji_cache_info carrizo_cache_info127#define polaris10_cache_info carrizo_cache_info128#define polaris11_cache_info carrizo_cache_info129#define polaris12_cache_info carrizo_cache_info130#define vegam_cache_info carrizo_cache_info131132/* NOTE: L1 cache information has been updated and L2/L3133* cache information has been added for Vega10 and134* newer ASICs. The unit for cache_size is KiB.135* In future, check & update cache details136* for every new ASIC is required.137*/138139static struct kfd_gpu_cache_info vega10_cache_info[] = {140{141/* TCP L1 Cache per CU */142.cache_size = 16,143.cache_level = 1,144.cache_line_size = 64,145.flags = (CRAT_CACHE_FLAGS_ENABLED |146CRAT_CACHE_FLAGS_DATA_CACHE |147CRAT_CACHE_FLAGS_SIMD_CACHE),148.num_cu_shared = 1,149},150{151/* Scalar L1 Instruction Cache per SQC */152.cache_size = 32,153.cache_level = 1,154.cache_line_size = 64,155.flags = (CRAT_CACHE_FLAGS_ENABLED |156CRAT_CACHE_FLAGS_INST_CACHE |157CRAT_CACHE_FLAGS_SIMD_CACHE),158.num_cu_shared = 3,159},160{161/* Scalar L1 Data Cache per SQC */162.cache_size = 16,163.cache_level = 1,164.cache_line_size = 64,165.flags = (CRAT_CACHE_FLAGS_ENABLED |166CRAT_CACHE_FLAGS_DATA_CACHE |167CRAT_CACHE_FLAGS_SIMD_CACHE),168.num_cu_shared = 3,169},170{171/* L2 Data Cache per GPU (Total Tex Cache) */172.cache_size = 4096,173.cache_level = 2,174.cache_line_size = 64,175.flags = (CRAT_CACHE_FLAGS_ENABLED |176CRAT_CACHE_FLAGS_DATA_CACHE |177CRAT_CACHE_FLAGS_SIMD_CACHE),178.num_cu_shared = 16,179},180};181182static struct kfd_gpu_cache_info raven_cache_info[] = {183{184/* TCP L1 Cache per CU */185.cache_size = 16,186.cache_level = 1,187.cache_line_size = 64,188.flags = (CRAT_CACHE_FLAGS_ENABLED |189CRAT_CACHE_FLAGS_DATA_CACHE |190CRAT_CACHE_FLAGS_SIMD_CACHE),191.num_cu_shared = 1,192},193{194/* Scalar L1 Instruction Cache per SQC */195.cache_size = 32,196.cache_level = 1,197.cache_line_size = 64,198.flags = (CRAT_CACHE_FLAGS_ENABLED |199CRAT_CACHE_FLAGS_INST_CACHE |200CRAT_CACHE_FLAGS_SIMD_CACHE),201.num_cu_shared = 3,202},203{204/* Scalar L1 Data Cache per SQC */205.cache_size = 16,206.cache_level = 1,207.cache_line_size = 64,208.flags = (CRAT_CACHE_FLAGS_ENABLED |209CRAT_CACHE_FLAGS_DATA_CACHE |210CRAT_CACHE_FLAGS_SIMD_CACHE),211.num_cu_shared = 3,212},213{214/* L2 Data Cache per GPU (Total Tex Cache) */215.cache_size = 1024,216.cache_level = 2,217.cache_line_size = 64,218.flags = (CRAT_CACHE_FLAGS_ENABLED |219CRAT_CACHE_FLAGS_DATA_CACHE |220CRAT_CACHE_FLAGS_SIMD_CACHE),221.num_cu_shared = 11,222},223};224225static struct kfd_gpu_cache_info renoir_cache_info[] = {226{227/* TCP L1 Cache per CU */228.cache_size = 16,229.cache_level = 1,230.cache_line_size = 64,231.flags = (CRAT_CACHE_FLAGS_ENABLED |232CRAT_CACHE_FLAGS_DATA_CACHE |233CRAT_CACHE_FLAGS_SIMD_CACHE),234.num_cu_shared = 1,235},236{237/* Scalar L1 Instruction Cache per SQC */238.cache_size = 32,239.cache_level = 1,240.cache_line_size = 64,241.flags = (CRAT_CACHE_FLAGS_ENABLED |242CRAT_CACHE_FLAGS_INST_CACHE |243CRAT_CACHE_FLAGS_SIMD_CACHE),244.num_cu_shared = 3,245},246{247/* Scalar L1 Data Cache per SQC */248.cache_size = 16,249.cache_level = 1,250.cache_line_size = 64,251.flags = (CRAT_CACHE_FLAGS_ENABLED |252CRAT_CACHE_FLAGS_DATA_CACHE |253CRAT_CACHE_FLAGS_SIMD_CACHE),254.num_cu_shared = 3,255},256{257/* L2 Data Cache per GPU (Total Tex Cache) */258.cache_size = 1024,259.cache_level = 2,260.cache_line_size = 64,261.flags = (CRAT_CACHE_FLAGS_ENABLED |262CRAT_CACHE_FLAGS_DATA_CACHE |263CRAT_CACHE_FLAGS_SIMD_CACHE),264.num_cu_shared = 8,265},266};267268static struct kfd_gpu_cache_info vega12_cache_info[] = {269{270/* TCP L1 Cache per CU */271.cache_size = 16,272.cache_level = 1,273.cache_line_size = 64,274.flags = (CRAT_CACHE_FLAGS_ENABLED |275CRAT_CACHE_FLAGS_DATA_CACHE |276CRAT_CACHE_FLAGS_SIMD_CACHE),277.num_cu_shared = 1,278},279{280/* Scalar L1 Instruction Cache per SQC */281.cache_size = 32,282.cache_level = 1,283.cache_line_size = 64,284.flags = (CRAT_CACHE_FLAGS_ENABLED |285CRAT_CACHE_FLAGS_INST_CACHE |286CRAT_CACHE_FLAGS_SIMD_CACHE),287.num_cu_shared = 3,288},289{290/* Scalar L1 Data Cache per SQC */291.cache_size = 16,292.cache_level = 1,293.cache_line_size = 64,294.flags = (CRAT_CACHE_FLAGS_ENABLED |295CRAT_CACHE_FLAGS_DATA_CACHE |296CRAT_CACHE_FLAGS_SIMD_CACHE),297.num_cu_shared = 3,298},299{300/* L2 Data Cache per GPU (Total Tex Cache) */301.cache_size = 2048,302.cache_level = 2,303.cache_line_size = 64,304.flags = (CRAT_CACHE_FLAGS_ENABLED |305CRAT_CACHE_FLAGS_DATA_CACHE |306CRAT_CACHE_FLAGS_SIMD_CACHE),307.num_cu_shared = 5,308},309};310311static struct kfd_gpu_cache_info vega20_cache_info[] = {312{313/* TCP L1 Cache per CU */314.cache_size = 16,315.cache_level = 1,316.cache_line_size = 64,317.flags = (CRAT_CACHE_FLAGS_ENABLED |318CRAT_CACHE_FLAGS_DATA_CACHE |319CRAT_CACHE_FLAGS_SIMD_CACHE),320.num_cu_shared = 1,321},322{323/* Scalar L1 Instruction Cache per SQC */324.cache_size = 32,325.cache_level = 1,326.cache_line_size = 64,327.flags = (CRAT_CACHE_FLAGS_ENABLED |328CRAT_CACHE_FLAGS_INST_CACHE |329CRAT_CACHE_FLAGS_SIMD_CACHE),330.num_cu_shared = 3,331},332{333/* Scalar L1 Data Cache per SQC */334.cache_size = 16,335.cache_level = 1,336.cache_line_size = 64,337.flags = (CRAT_CACHE_FLAGS_ENABLED |338CRAT_CACHE_FLAGS_DATA_CACHE |339CRAT_CACHE_FLAGS_SIMD_CACHE),340.num_cu_shared = 3,341},342{343/* L2 Data Cache per GPU (Total Tex Cache) */344.cache_size = 8192,345.cache_level = 2,346.cache_line_size = 64,347.flags = (CRAT_CACHE_FLAGS_ENABLED |348CRAT_CACHE_FLAGS_DATA_CACHE |349CRAT_CACHE_FLAGS_SIMD_CACHE),350.num_cu_shared = 16,351},352};353354static struct kfd_gpu_cache_info aldebaran_cache_info[] = {355{356/* TCP L1 Cache per CU */357.cache_size = 16,358.cache_level = 1,359.cache_line_size = 64,360.flags = (CRAT_CACHE_FLAGS_ENABLED |361CRAT_CACHE_FLAGS_DATA_CACHE |362CRAT_CACHE_FLAGS_SIMD_CACHE),363.num_cu_shared = 1,364},365{366/* Scalar L1 Instruction Cache per SQC */367.cache_size = 32,368.cache_level = 1,369.cache_line_size = 64,370.flags = (CRAT_CACHE_FLAGS_ENABLED |371CRAT_CACHE_FLAGS_INST_CACHE |372CRAT_CACHE_FLAGS_SIMD_CACHE),373.num_cu_shared = 2,374},375{376/* Scalar L1 Data Cache per SQC */377.cache_size = 16,378.cache_level = 1,379.cache_line_size = 64,380.flags = (CRAT_CACHE_FLAGS_ENABLED |381CRAT_CACHE_FLAGS_DATA_CACHE |382CRAT_CACHE_FLAGS_SIMD_CACHE),383.num_cu_shared = 2,384},385{386/* L2 Data Cache per GPU (Total Tex Cache) */387.cache_size = 8192,388.cache_level = 2,389.cache_line_size = 128,390.flags = (CRAT_CACHE_FLAGS_ENABLED |391CRAT_CACHE_FLAGS_DATA_CACHE |392CRAT_CACHE_FLAGS_SIMD_CACHE),393.num_cu_shared = 14,394},395};396397static struct kfd_gpu_cache_info navi10_cache_info[] = {398{399/* TCP L1 Cache per CU */400.cache_size = 16,401.cache_level = 1,402.cache_line_size = 128,403.flags = (CRAT_CACHE_FLAGS_ENABLED |404CRAT_CACHE_FLAGS_DATA_CACHE |405CRAT_CACHE_FLAGS_SIMD_CACHE),406.num_cu_shared = 1,407},408{409/* Scalar L1 Instruction Cache per SQC */410.cache_size = 32,411.cache_level = 1,412.cache_line_size = 64,413.flags = (CRAT_CACHE_FLAGS_ENABLED |414CRAT_CACHE_FLAGS_INST_CACHE |415CRAT_CACHE_FLAGS_SIMD_CACHE),416.num_cu_shared = 2,417},418{419/* Scalar L1 Data Cache per SQC */420.cache_size = 16,421.cache_level = 1,422.cache_line_size = 64,423.flags = (CRAT_CACHE_FLAGS_ENABLED |424CRAT_CACHE_FLAGS_DATA_CACHE |425CRAT_CACHE_FLAGS_SIMD_CACHE),426.num_cu_shared = 2,427},428{429/* GL1 Data Cache per SA */430.cache_size = 128,431.cache_level = 1,432.cache_line_size = 128,433.flags = (CRAT_CACHE_FLAGS_ENABLED |434CRAT_CACHE_FLAGS_DATA_CACHE |435CRAT_CACHE_FLAGS_SIMD_CACHE),436.num_cu_shared = 10,437},438{439/* L2 Data Cache per GPU (Total Tex Cache) */440.cache_size = 4096,441.cache_level = 2,442.cache_line_size = 128,443.flags = (CRAT_CACHE_FLAGS_ENABLED |444CRAT_CACHE_FLAGS_DATA_CACHE |445CRAT_CACHE_FLAGS_SIMD_CACHE),446.num_cu_shared = 10,447},448};449450static struct kfd_gpu_cache_info vangogh_cache_info[] = {451{452/* TCP L1 Cache per CU */453.cache_size = 16,454.cache_level = 1,455.cache_line_size = 128,456.flags = (CRAT_CACHE_FLAGS_ENABLED |457CRAT_CACHE_FLAGS_DATA_CACHE |458CRAT_CACHE_FLAGS_SIMD_CACHE),459.num_cu_shared = 1,460},461{462/* Scalar L1 Instruction Cache per SQC */463.cache_size = 32,464.cache_level = 1,465.cache_line_size = 64,466.flags = (CRAT_CACHE_FLAGS_ENABLED |467CRAT_CACHE_FLAGS_INST_CACHE |468CRAT_CACHE_FLAGS_SIMD_CACHE),469.num_cu_shared = 2,470},471{472/* Scalar L1 Data Cache per SQC */473.cache_size = 16,474.cache_level = 1,475.cache_line_size = 64,476.flags = (CRAT_CACHE_FLAGS_ENABLED |477CRAT_CACHE_FLAGS_DATA_CACHE |478CRAT_CACHE_FLAGS_SIMD_CACHE),479.num_cu_shared = 2,480},481{482/* GL1 Data Cache per SA */483.cache_size = 128,484.cache_level = 1,485.cache_line_size = 128,486.flags = (CRAT_CACHE_FLAGS_ENABLED |487CRAT_CACHE_FLAGS_DATA_CACHE |488CRAT_CACHE_FLAGS_SIMD_CACHE),489.num_cu_shared = 8,490},491{492/* L2 Data Cache per GPU (Total Tex Cache) */493.cache_size = 1024,494.cache_level = 2,495.cache_line_size = 128,496.flags = (CRAT_CACHE_FLAGS_ENABLED |497CRAT_CACHE_FLAGS_DATA_CACHE |498CRAT_CACHE_FLAGS_SIMD_CACHE),499.num_cu_shared = 8,500},501};502503static struct kfd_gpu_cache_info navi14_cache_info[] = {504{505/* TCP L1 Cache per CU */506.cache_size = 16,507.cache_level = 1,508.cache_line_size = 128,509.flags = (CRAT_CACHE_FLAGS_ENABLED |510CRAT_CACHE_FLAGS_DATA_CACHE |511CRAT_CACHE_FLAGS_SIMD_CACHE),512.num_cu_shared = 1,513},514{515/* Scalar L1 Instruction Cache per SQC */516.cache_size = 32,517.cache_level = 1,518.cache_line_size = 64,519.flags = (CRAT_CACHE_FLAGS_ENABLED |520CRAT_CACHE_FLAGS_INST_CACHE |521CRAT_CACHE_FLAGS_SIMD_CACHE),522.num_cu_shared = 2,523},524{525/* Scalar L1 Data Cache per SQC */526.cache_size = 16,527.cache_level = 1,528.cache_line_size = 64,529.flags = (CRAT_CACHE_FLAGS_ENABLED |530CRAT_CACHE_FLAGS_DATA_CACHE |531CRAT_CACHE_FLAGS_SIMD_CACHE),532.num_cu_shared = 2,533},534{535/* GL1 Data Cache per SA */536.cache_size = 128,537.cache_level = 1,538.cache_line_size = 128,539.flags = (CRAT_CACHE_FLAGS_ENABLED |540CRAT_CACHE_FLAGS_DATA_CACHE |541CRAT_CACHE_FLAGS_SIMD_CACHE),542.num_cu_shared = 12,543},544{545/* L2 Data Cache per GPU (Total Tex Cache) */546.cache_size = 2048,547.cache_level = 2,548.cache_line_size = 128,549.flags = (CRAT_CACHE_FLAGS_ENABLED |550CRAT_CACHE_FLAGS_DATA_CACHE |551CRAT_CACHE_FLAGS_SIMD_CACHE),552.num_cu_shared = 12,553},554};555556static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = {557{558/* TCP L1 Cache per CU */559.cache_size = 16,560.cache_level = 1,561.cache_line_size = 128,562.flags = (CRAT_CACHE_FLAGS_ENABLED |563CRAT_CACHE_FLAGS_DATA_CACHE |564CRAT_CACHE_FLAGS_SIMD_CACHE),565.num_cu_shared = 1,566},567{568/* Scalar L1 Instruction Cache per SQC */569.cache_size = 32,570.cache_level = 1,571.cache_line_size = 64,572.flags = (CRAT_CACHE_FLAGS_ENABLED |573CRAT_CACHE_FLAGS_INST_CACHE |574CRAT_CACHE_FLAGS_SIMD_CACHE),575.num_cu_shared = 2,576},577{578/* Scalar L1 Data Cache per SQC */579.cache_size = 16,580.cache_level = 1,581.cache_line_size = 64,582.flags = (CRAT_CACHE_FLAGS_ENABLED |583CRAT_CACHE_FLAGS_DATA_CACHE |584CRAT_CACHE_FLAGS_SIMD_CACHE),585.num_cu_shared = 2,586},587{588/* GL1 Data Cache per SA */589.cache_size = 128,590.cache_level = 1,591.cache_line_size = 128,592.flags = (CRAT_CACHE_FLAGS_ENABLED |593CRAT_CACHE_FLAGS_DATA_CACHE |594CRAT_CACHE_FLAGS_SIMD_CACHE),595.num_cu_shared = 10,596},597{598/* L2 Data Cache per GPU (Total Tex Cache) */599.cache_size = 4096,600.cache_level = 2,601.cache_line_size = 128,602.flags = (CRAT_CACHE_FLAGS_ENABLED |603CRAT_CACHE_FLAGS_DATA_CACHE |604CRAT_CACHE_FLAGS_SIMD_CACHE),605.num_cu_shared = 10,606},607{608/* L3 Data Cache per GPU */609.cache_size = 128*1024,610.cache_level = 3,611.cache_line_size = 64,612.flags = (CRAT_CACHE_FLAGS_ENABLED |613CRAT_CACHE_FLAGS_DATA_CACHE |614CRAT_CACHE_FLAGS_SIMD_CACHE),615.num_cu_shared = 10,616},617};618619static struct kfd_gpu_cache_info navy_flounder_cache_info[] = {620{621/* TCP L1 Cache per CU */622.cache_size = 16,623.cache_level = 1,624.cache_line_size = 128,625.flags = (CRAT_CACHE_FLAGS_ENABLED |626CRAT_CACHE_FLAGS_DATA_CACHE |627CRAT_CACHE_FLAGS_SIMD_CACHE),628.num_cu_shared = 1,629},630{631/* Scalar L1 Instruction Cache per SQC */632.cache_size = 32,633.cache_level = 1,634.cache_line_size = 64,635.flags = (CRAT_CACHE_FLAGS_ENABLED |636CRAT_CACHE_FLAGS_INST_CACHE |637CRAT_CACHE_FLAGS_SIMD_CACHE),638.num_cu_shared = 2,639},640{641/* Scalar L1 Data Cache per SQC */642.cache_size = 16,643.cache_level = 1,644.cache_line_size = 64,645.flags = (CRAT_CACHE_FLAGS_ENABLED |646CRAT_CACHE_FLAGS_DATA_CACHE |647CRAT_CACHE_FLAGS_SIMD_CACHE),648.num_cu_shared = 2,649},650{651/* GL1 Data Cache per SA */652.cache_size = 128,653.cache_level = 1,654.cache_line_size = 128,655.flags = (CRAT_CACHE_FLAGS_ENABLED |656CRAT_CACHE_FLAGS_DATA_CACHE |657CRAT_CACHE_FLAGS_SIMD_CACHE),658.num_cu_shared = 10,659},660{661/* L2 Data Cache per GPU (Total Tex Cache) */662.cache_size = 3072,663.cache_level = 2,664.cache_line_size = 128,665.flags = (CRAT_CACHE_FLAGS_ENABLED |666CRAT_CACHE_FLAGS_DATA_CACHE |667CRAT_CACHE_FLAGS_SIMD_CACHE),668.num_cu_shared = 10,669},670{671/* L3 Data Cache per GPU */672.cache_size = 96*1024,673.cache_level = 3,674.cache_line_size = 64,675.flags = (CRAT_CACHE_FLAGS_ENABLED |676CRAT_CACHE_FLAGS_DATA_CACHE |677CRAT_CACHE_FLAGS_SIMD_CACHE),678.num_cu_shared = 10,679},680};681682static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = {683{684/* TCP L1 Cache per CU */685.cache_size = 16,686.cache_level = 1,687.cache_line_size = 128,688.flags = (CRAT_CACHE_FLAGS_ENABLED |689CRAT_CACHE_FLAGS_DATA_CACHE |690CRAT_CACHE_FLAGS_SIMD_CACHE),691.num_cu_shared = 1,692},693{694/* Scalar L1 Instruction Cache per SQC */695.cache_size = 32,696.cache_level = 1,697.cache_line_size = 64,698.flags = (CRAT_CACHE_FLAGS_ENABLED |699CRAT_CACHE_FLAGS_INST_CACHE |700CRAT_CACHE_FLAGS_SIMD_CACHE),701.num_cu_shared = 2,702},703{704/* Scalar L1 Data Cache per SQC */705.cache_size = 16,706.cache_level = 1,707.cache_line_size = 64,708.flags = (CRAT_CACHE_FLAGS_ENABLED |709CRAT_CACHE_FLAGS_DATA_CACHE |710CRAT_CACHE_FLAGS_SIMD_CACHE),711.num_cu_shared = 2,712},713{714/* GL1 Data Cache per SA */715.cache_size = 128,716.cache_level = 1,717.cache_line_size = 128,718.flags = (CRAT_CACHE_FLAGS_ENABLED |719CRAT_CACHE_FLAGS_DATA_CACHE |720CRAT_CACHE_FLAGS_SIMD_CACHE),721.num_cu_shared = 8,722},723{724/* L2 Data Cache per GPU (Total Tex Cache) */725.cache_size = 2048,726.cache_level = 2,727.cache_line_size = 128,728.flags = (CRAT_CACHE_FLAGS_ENABLED |729CRAT_CACHE_FLAGS_DATA_CACHE |730CRAT_CACHE_FLAGS_SIMD_CACHE),731.num_cu_shared = 8,732},733{734/* L3 Data Cache per GPU */735.cache_size = 32*1024,736.cache_level = 3,737.cache_line_size = 64,738.flags = (CRAT_CACHE_FLAGS_ENABLED |739CRAT_CACHE_FLAGS_DATA_CACHE |740CRAT_CACHE_FLAGS_SIMD_CACHE),741.num_cu_shared = 8,742},743};744745static struct kfd_gpu_cache_info beige_goby_cache_info[] = {746{747/* TCP L1 Cache per CU */748.cache_size = 16,749.cache_level = 1,750.cache_line_size = 128,751.flags = (CRAT_CACHE_FLAGS_ENABLED |752CRAT_CACHE_FLAGS_DATA_CACHE |753CRAT_CACHE_FLAGS_SIMD_CACHE),754.num_cu_shared = 1,755},756{757/* Scalar L1 Instruction Cache per SQC */758.cache_size = 32,759.cache_level = 1,760.cache_line_size = 64,761.flags = (CRAT_CACHE_FLAGS_ENABLED |762CRAT_CACHE_FLAGS_INST_CACHE |763CRAT_CACHE_FLAGS_SIMD_CACHE),764.num_cu_shared = 2,765},766{767/* Scalar L1 Data Cache per SQC */768.cache_size = 16,769.cache_level = 1,770.cache_line_size = 64,771.flags = (CRAT_CACHE_FLAGS_ENABLED |772CRAT_CACHE_FLAGS_DATA_CACHE |773CRAT_CACHE_FLAGS_SIMD_CACHE),774.num_cu_shared = 2,775},776{777/* GL1 Data Cache per SA */778.cache_size = 128,779.cache_level = 1,780.cache_line_size = 128,781.flags = (CRAT_CACHE_FLAGS_ENABLED |782CRAT_CACHE_FLAGS_DATA_CACHE |783CRAT_CACHE_FLAGS_SIMD_CACHE),784.num_cu_shared = 8,785},786{787/* L2 Data Cache per GPU (Total Tex Cache) */788.cache_size = 1024,789.cache_level = 2,790.cache_line_size = 128,791.flags = (CRAT_CACHE_FLAGS_ENABLED |792CRAT_CACHE_FLAGS_DATA_CACHE |793CRAT_CACHE_FLAGS_SIMD_CACHE),794.num_cu_shared = 8,795},796{797/* L3 Data Cache per GPU */798.cache_size = 16*1024,799.cache_level = 3,800.cache_line_size = 64,801.flags = (CRAT_CACHE_FLAGS_ENABLED |802CRAT_CACHE_FLAGS_DATA_CACHE |803CRAT_CACHE_FLAGS_SIMD_CACHE),804.num_cu_shared = 8,805},806};807808static struct kfd_gpu_cache_info yellow_carp_cache_info[] = {809{810/* TCP L1 Cache per CU */811.cache_size = 16,812.cache_level = 1,813.cache_line_size = 128,814.flags = (CRAT_CACHE_FLAGS_ENABLED |815CRAT_CACHE_FLAGS_DATA_CACHE |816CRAT_CACHE_FLAGS_SIMD_CACHE),817.num_cu_shared = 1,818},819{820/* Scalar L1 Instruction Cache per SQC */821.cache_size = 32,822.cache_level = 1,823.cache_line_size = 64,824.flags = (CRAT_CACHE_FLAGS_ENABLED |825CRAT_CACHE_FLAGS_INST_CACHE |826CRAT_CACHE_FLAGS_SIMD_CACHE),827.num_cu_shared = 2,828},829{830/* Scalar L1 Data Cache per SQC */831.cache_size = 16,832.cache_level = 1,833.cache_line_size = 64,834.flags = (CRAT_CACHE_FLAGS_ENABLED |835CRAT_CACHE_FLAGS_DATA_CACHE |836CRAT_CACHE_FLAGS_SIMD_CACHE),837.num_cu_shared = 2,838},839{840/* GL1 Data Cache per SA */841.cache_size = 128,842.cache_level = 1,843.cache_line_size = 128,844.flags = (CRAT_CACHE_FLAGS_ENABLED |845CRAT_CACHE_FLAGS_DATA_CACHE |846CRAT_CACHE_FLAGS_SIMD_CACHE),847.num_cu_shared = 6,848},849{850/* L2 Data Cache per GPU (Total Tex Cache) */851.cache_size = 2048,852.cache_level = 2,853.cache_line_size = 128,854.flags = (CRAT_CACHE_FLAGS_ENABLED |855CRAT_CACHE_FLAGS_DATA_CACHE |856CRAT_CACHE_FLAGS_SIMD_CACHE),857.num_cu_shared = 6,858},859};860861static struct kfd_gpu_cache_info gfx1037_cache_info[] = {862{863/* TCP L1 Cache per CU */864.cache_size = 16,865.cache_level = 1,866.cache_line_size = 128,867.flags = (CRAT_CACHE_FLAGS_ENABLED |868CRAT_CACHE_FLAGS_DATA_CACHE |869CRAT_CACHE_FLAGS_SIMD_CACHE),870.num_cu_shared = 1,871},872{873/* Scalar L1 Instruction Cache per SQC */874.cache_size = 32,875.cache_level = 1,876.cache_line_size = 64,877.flags = (CRAT_CACHE_FLAGS_ENABLED |878CRAT_CACHE_FLAGS_INST_CACHE |879CRAT_CACHE_FLAGS_SIMD_CACHE),880.num_cu_shared = 2,881},882{883/* Scalar L1 Data Cache per SQC */884.cache_size = 16,885.cache_level = 1,886.cache_line_size = 64,887.flags = (CRAT_CACHE_FLAGS_ENABLED |888CRAT_CACHE_FLAGS_DATA_CACHE |889CRAT_CACHE_FLAGS_SIMD_CACHE),890.num_cu_shared = 2,891},892{893/* GL1 Data Cache per SA */894.cache_size = 128,895.cache_level = 1,896.cache_line_size = 128,897.flags = (CRAT_CACHE_FLAGS_ENABLED |898CRAT_CACHE_FLAGS_DATA_CACHE |899CRAT_CACHE_FLAGS_SIMD_CACHE),900.num_cu_shared = 2,901},902{903/* L2 Data Cache per GPU (Total Tex Cache) */904.cache_size = 256,905.cache_level = 2,906.cache_line_size = 128,907.flags = (CRAT_CACHE_FLAGS_ENABLED |908CRAT_CACHE_FLAGS_DATA_CACHE |909CRAT_CACHE_FLAGS_SIMD_CACHE),910.num_cu_shared = 2,911},912};913914static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = {915{916/* TCP L1 Cache per CU */917.cache_size = 16,918.cache_level = 1,919.cache_line_size = 128,920.flags = (CRAT_CACHE_FLAGS_ENABLED |921CRAT_CACHE_FLAGS_DATA_CACHE |922CRAT_CACHE_FLAGS_SIMD_CACHE),923.num_cu_shared = 1,924},925{926/* Scalar L1 Instruction Cache per SQC */927.cache_size = 32,928.cache_level = 1,929.cache_line_size = 64,930.flags = (CRAT_CACHE_FLAGS_ENABLED |931CRAT_CACHE_FLAGS_INST_CACHE |932CRAT_CACHE_FLAGS_SIMD_CACHE),933.num_cu_shared = 2,934},935{936/* Scalar L1 Data Cache per SQC */937.cache_size = 16,938.cache_level = 1,939.cache_line_size = 64,940.flags = (CRAT_CACHE_FLAGS_ENABLED |941CRAT_CACHE_FLAGS_DATA_CACHE |942CRAT_CACHE_FLAGS_SIMD_CACHE),943.num_cu_shared = 2,944},945{946/* GL1 Data Cache per SA */947.cache_size = 128,948.cache_level = 1,949.cache_line_size = 128,950.flags = (CRAT_CACHE_FLAGS_ENABLED |951CRAT_CACHE_FLAGS_DATA_CACHE |952CRAT_CACHE_FLAGS_SIMD_CACHE),953.num_cu_shared = 2,954},955{956/* L2 Data Cache per GPU (Total Tex Cache) */957.cache_size = 256,958.cache_level = 2,959.cache_line_size = 128,960.flags = (CRAT_CACHE_FLAGS_ENABLED |961CRAT_CACHE_FLAGS_DATA_CACHE |962CRAT_CACHE_FLAGS_SIMD_CACHE),963.num_cu_shared = 2,964},965};966967static struct kfd_gpu_cache_info dummy_cache_info[] = {968{969/* TCP L1 Cache per CU */970.cache_size = 16,971.cache_level = 1,972.cache_line_size = 64,973.flags = (CRAT_CACHE_FLAGS_ENABLED |974CRAT_CACHE_FLAGS_DATA_CACHE |975CRAT_CACHE_FLAGS_SIMD_CACHE),976.num_cu_shared = 1,977},978{979/* Scalar L1 Instruction Cache per SQC */980.cache_size = 32,981.cache_level = 1,982.cache_line_size = 64,983.flags = (CRAT_CACHE_FLAGS_ENABLED |984CRAT_CACHE_FLAGS_INST_CACHE |985CRAT_CACHE_FLAGS_SIMD_CACHE),986.num_cu_shared = 2,987},988{989/* Scalar L1 Data Cache per SQC */990.cache_size = 16,991.cache_level = 1,992.cache_line_size = 64,993.flags = (CRAT_CACHE_FLAGS_ENABLED |994CRAT_CACHE_FLAGS_DATA_CACHE |995CRAT_CACHE_FLAGS_SIMD_CACHE),996.num_cu_shared = 2,997},998{999/* GL1 Data Cache per SA */1000.cache_size = 128,1001.cache_level = 1,1002.cache_line_size = 64,1003.flags = (CRAT_CACHE_FLAGS_ENABLED |1004CRAT_CACHE_FLAGS_DATA_CACHE |1005CRAT_CACHE_FLAGS_SIMD_CACHE),1006.num_cu_shared = 6,1007},1008{1009/* L2 Data Cache per GPU (Total Tex Cache) */1010.cache_size = 2048,1011.cache_level = 2,1012.cache_line_size = 64,1013.flags = (CRAT_CACHE_FLAGS_ENABLED |1014CRAT_CACHE_FLAGS_DATA_CACHE |1015CRAT_CACHE_FLAGS_SIMD_CACHE),1016.num_cu_shared = 6,1017},1018};10191020static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,1021struct crat_subtype_computeunit *cu)1022{1023dev->node_props.cpu_cores_count = cu->num_cpu_cores;1024dev->node_props.cpu_core_id_base = cu->processor_id_low;1025if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)1026dev->node_props.capability |= HSA_CAP_ATS_PRESENT;10271028pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,1029cu->processor_id_low);1030}10311032static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,1033struct crat_subtype_computeunit *cu)1034{1035dev->node_props.simd_id_base = cu->processor_id_low;1036dev->node_props.simd_count = cu->num_simd_cores;1037dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;1038dev->node_props.max_waves_per_simd = cu->max_waves_simd;1039dev->node_props.wave_front_size = cu->wave_front_size;1040dev->node_props.array_count = cu->array_count;1041dev->node_props.cu_per_simd_array = cu->num_cu_per_array;1042dev->node_props.simd_per_cu = cu->num_simd_per_cu;1043dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;1044if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)1045dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;1046pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low);1047}10481049/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct1050* topology device present in the device_list1051*/1052static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu,1053struct list_head *device_list)1054{1055struct kfd_topology_device *dev;10561057pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",1058cu->proximity_domain, cu->hsa_capability);1059list_for_each_entry(dev, device_list, list) {1060if (cu->proximity_domain == dev->proximity_domain) {1061if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)1062kfd_populated_cu_info_cpu(dev, cu);10631064if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)1065kfd_populated_cu_info_gpu(dev, cu);1066break;1067}1068}10691070return 0;1071}10721073static struct kfd_mem_properties *1074find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width,1075struct kfd_topology_device *dev)1076{1077struct kfd_mem_properties *props;10781079list_for_each_entry(props, &dev->mem_props, list) {1080if (props->heap_type == heap_type1081&& props->flags == flags1082&& props->width == width)1083return props;1084}10851086return NULL;1087}1088/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct1089* topology device present in the device_list1090*/1091static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem,1092struct list_head *device_list)1093{1094struct kfd_mem_properties *props;1095struct kfd_topology_device *dev;1096uint32_t heap_type;1097uint64_t size_in_bytes;1098uint32_t flags = 0;1099uint32_t width;11001101pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n",1102mem->proximity_domain);1103list_for_each_entry(dev, device_list, list) {1104if (mem->proximity_domain == dev->proximity_domain) {1105/* We're on GPU node */1106if (dev->node_props.cpu_cores_count == 0) {1107/* APU */1108if (mem->visibility_type == 0)1109heap_type =1110HSA_MEM_HEAP_TYPE_FB_PRIVATE;1111/* dGPU */1112else1113heap_type = mem->visibility_type;1114} else1115heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;11161117if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)1118flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;1119if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)1120flags |= HSA_MEM_FLAGS_NON_VOLATILE;11211122size_in_bytes =1123((uint64_t)mem->length_high << 32) +1124mem->length_low;1125width = mem->width;11261127/* Multiple banks of the same type are aggregated into1128* one. User mode doesn't care about multiple physical1129* memory segments. It's managed as a single virtual1130* heap for user mode.1131*/1132props = find_subtype_mem(heap_type, flags, width, dev);1133if (props) {1134props->size_in_bytes += size_in_bytes;1135break;1136}11371138props = kfd_alloc_struct(props);1139if (!props)1140return -ENOMEM;11411142props->heap_type = heap_type;1143props->flags = flags;1144props->size_in_bytes = size_in_bytes;1145props->width = width;11461147dev->node_props.mem_banks_count++;1148list_add_tail(&props->list, &dev->mem_props);11491150break;1151}1152}11531154return 0;1155}11561157/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct1158* topology device present in the device_list1159*/1160static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,1161struct list_head *device_list)1162{1163struct kfd_cache_properties *props;1164struct kfd_topology_device *dev;1165uint32_t id;1166uint32_t total_num_of_cu;11671168id = cache->processor_id_low;11691170pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id);1171list_for_each_entry(dev, device_list, list) {1172total_num_of_cu = (dev->node_props.array_count *1173dev->node_props.cu_per_simd_array);11741175/* Cache infomration in CRAT doesn't have proximity_domain1176* information as it is associated with a CPU core or GPU1177* Compute Unit. So map the cache using CPU core Id or SIMD1178* (GPU) ID.1179* TODO: This works because currently we can safely assume that1180* Compute Units are parsed before caches are parsed. In1181* future, remove this dependency1182*/1183if ((id >= dev->node_props.cpu_core_id_base &&1184id <= dev->node_props.cpu_core_id_base +1185dev->node_props.cpu_cores_count) ||1186(id >= dev->node_props.simd_id_base &&1187id < dev->node_props.simd_id_base +1188total_num_of_cu)) {1189props = kfd_alloc_struct(props);1190if (!props)1191return -ENOMEM;11921193props->processor_id_low = id;1194props->cache_level = cache->cache_level;1195props->cache_size = cache->cache_size;1196props->cacheline_size = cache->cache_line_size;1197props->cachelines_per_tag = cache->lines_per_tag;1198props->cache_assoc = cache->associativity;1199props->cache_latency = cache->cache_latency;12001201memcpy(props->sibling_map, cache->sibling_map,1202CRAT_SIBLINGMAP_SIZE);12031204/* set the sibling_map_size as 32 for CRAT from ACPI */1205props->sibling_map_size = CRAT_SIBLINGMAP_SIZE;12061207if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)1208props->cache_type |= HSA_CACHE_TYPE_DATA;1209if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)1210props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;1211if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)1212props->cache_type |= HSA_CACHE_TYPE_CPU;1213if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)1214props->cache_type |= HSA_CACHE_TYPE_HSACU;12151216dev->node_props.caches_count++;1217list_add_tail(&props->list, &dev->cache_props);12181219break;1220}1221}12221223return 0;1224}12251226/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct1227* topology device present in the device_list1228*/1229static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,1230struct list_head *device_list)1231{1232struct kfd_iolink_properties *props = NULL, *props2;1233struct kfd_topology_device *dev, *to_dev;1234uint32_t id_from;1235uint32_t id_to;12361237id_from = iolink->proximity_domain_from;1238id_to = iolink->proximity_domain_to;12391240pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to %d\n",1241id_from, id_to);1242list_for_each_entry(dev, device_list, list) {1243if (id_from == dev->proximity_domain) {1244props = kfd_alloc_struct(props);1245if (!props)1246return -ENOMEM;12471248props->node_from = id_from;1249props->node_to = id_to;1250props->ver_maj = iolink->version_major;1251props->ver_min = iolink->version_minor;1252props->iolink_type = iolink->io_interface_type;12531254if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)1255props->weight = 20;1256else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)1257props->weight = iolink->weight_xgmi;1258else1259props->weight = node_distance(id_from, id_to);12601261props->min_latency = iolink->minimum_latency;1262props->max_latency = iolink->maximum_latency;1263props->min_bandwidth = iolink->minimum_bandwidth_mbs;1264props->max_bandwidth = iolink->maximum_bandwidth_mbs;1265props->rec_transfer_size =1266iolink->recommended_transfer_size;12671268dev->node_props.io_links_count++;1269list_add_tail(&props->list, &dev->io_link_props);1270break;1271}1272}12731274/* CPU topology is created before GPUs are detected, so CPU->GPU1275* links are not built at that time. If a PCIe type is discovered, it1276* means a GPU is detected and we are adding GPU->CPU to the topology.1277* At this time, also add the corresponded CPU->GPU link if GPU1278* is large bar.1279* For xGMI, we only added the link with one direction in the crat1280* table, add corresponded reversed direction link now.1281*/1282if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {1283to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);1284if (!to_dev)1285return -ENODEV;1286/* same everything but the other direction */1287props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);1288if (!props2)1289return -ENOMEM;12901291props2->node_from = id_to;1292props2->node_to = id_from;1293props2->kobj = NULL;1294to_dev->node_props.io_links_count++;1295list_add_tail(&props2->list, &to_dev->io_link_props);1296}12971298return 0;1299}13001301/* kfd_parse_subtype - parse subtypes and attach it to correct topology device1302* present in the device_list1303* @sub_type_hdr - subtype section of crat_image1304* @device_list - list of topology devices present in this crat_image1305*/1306static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,1307struct list_head *device_list)1308{1309struct crat_subtype_computeunit *cu;1310struct crat_subtype_memory *mem;1311struct crat_subtype_cache *cache;1312struct crat_subtype_iolink *iolink;1313int ret = 0;13141315switch (sub_type_hdr->type) {1316case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:1317cu = (struct crat_subtype_computeunit *)sub_type_hdr;1318ret = kfd_parse_subtype_cu(cu, device_list);1319break;1320case CRAT_SUBTYPE_MEMORY_AFFINITY:1321mem = (struct crat_subtype_memory *)sub_type_hdr;1322ret = kfd_parse_subtype_mem(mem, device_list);1323break;1324case CRAT_SUBTYPE_CACHE_AFFINITY:1325cache = (struct crat_subtype_cache *)sub_type_hdr;1326ret = kfd_parse_subtype_cache(cache, device_list);1327break;1328case CRAT_SUBTYPE_TLB_AFFINITY:1329/*1330* For now, nothing to do here1331*/1332pr_debug("Found TLB entry in CRAT table (not processing)\n");1333break;1334case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:1335/*1336* For now, nothing to do here1337*/1338pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");1339break;1340case CRAT_SUBTYPE_IOLINK_AFFINITY:1341iolink = (struct crat_subtype_iolink *)sub_type_hdr;1342ret = kfd_parse_subtype_iolink(iolink, device_list);1343break;1344default:1345pr_warn("Unknown subtype %d in CRAT\n",1346sub_type_hdr->type);1347}13481349return ret;1350}13511352/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT1353* create a kfd_topology_device and add in to device_list. Also parse1354* CRAT subtypes and attach it to appropriate kfd_topology_device1355* @crat_image - input image containing CRAT1356* @device_list - [OUT] list of kfd_topology_device generated after1357* parsing crat_image1358* @proximity_domain - Proximity domain of the first device in the table1359*1360* Return - 0 if successful else -ve value1361*/1362int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,1363uint32_t proximity_domain)1364{1365struct kfd_topology_device *top_dev = NULL;1366struct crat_subtype_generic *sub_type_hdr;1367uint16_t node_id;1368int ret = 0;1369struct crat_header *crat_table = (struct crat_header *)crat_image;1370uint16_t num_nodes;1371uint32_t image_len;13721373if (!crat_image)1374return -EINVAL;13751376if (!list_empty(device_list)) {1377pr_warn("Error device list should be empty\n");1378return -EINVAL;1379}13801381num_nodes = crat_table->num_domains;1382image_len = crat_table->length;13831384pr_debug("Parsing CRAT table with %d nodes\n", num_nodes);13851386for (node_id = 0; node_id < num_nodes; node_id++) {1387top_dev = kfd_create_topology_device(device_list);1388if (!top_dev)1389break;1390top_dev->proximity_domain = proximity_domain++;1391}13921393if (!top_dev) {1394ret = -ENOMEM;1395goto err;1396}13971398memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);1399memcpy(top_dev->oem_table_id, crat_table->oem_table_id,1400CRAT_OEMTABLEID_LENGTH);1401top_dev->oem_revision = crat_table->oem_revision;14021403sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);1404while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <1405((char *)crat_image) + image_len) {1406if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {1407ret = kfd_parse_subtype(sub_type_hdr, device_list);1408if (ret)1409break;1410}14111412sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +1413sub_type_hdr->length);1414}14151416err:1417if (ret)1418kfd_release_topology_device_list(device_list);14191420return ret;1421}142214231424static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,1425bool cache_line_size_missing,1426struct kfd_gpu_cache_info *pcache_info)1427{1428struct amdgpu_device *adev = kdev->adev;1429int i = 0;14301431/* TCP L1 Cache per CU */1432if (adev->gfx.config.gc_tcp_l1_size) {1433pcache_info[i].cache_size = adev->gfx.config.gc_tcp_l1_size;1434pcache_info[i].cache_level = 1;1435pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1436CRAT_CACHE_FLAGS_DATA_CACHE |1437CRAT_CACHE_FLAGS_SIMD_CACHE);1438pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2;1439pcache_info[i].cache_line_size = adev->gfx.config.gc_tcp_cache_line_size;1440if (cache_line_size_missing && !pcache_info[i].cache_line_size)1441pcache_info[i].cache_line_size = 128;1442i++;1443}1444/* Scalar L1 Instruction Cache per SQC */1445if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {1446pcache_info[i].cache_size =1447adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;1448pcache_info[i].cache_level = 1;1449pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1450CRAT_CACHE_FLAGS_INST_CACHE |1451CRAT_CACHE_FLAGS_SIMD_CACHE);1452pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;1453pcache_info[i].cache_line_size = adev->gfx.config.gc_instruction_cache_line_size;1454if (cache_line_size_missing && !pcache_info[i].cache_line_size)1455pcache_info[i].cache_line_size = 128;1456i++;1457}1458/* Scalar L1 Data Cache per SQC */1459if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {1460pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;1461pcache_info[i].cache_level = 1;1462pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1463CRAT_CACHE_FLAGS_DATA_CACHE |1464CRAT_CACHE_FLAGS_SIMD_CACHE);1465pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;1466pcache_info[i].cache_line_size = adev->gfx.config.gc_scalar_data_cache_line_size;1467if (cache_line_size_missing && !pcache_info[i].cache_line_size)1468pcache_info[i].cache_line_size = 64;1469i++;1470}1471/* GL1 Data Cache per SA */1472if (adev->gfx.config.gc_gl1c_per_sa &&1473adev->gfx.config.gc_gl1c_size_per_instance) {1474pcache_info[i].cache_size = adev->gfx.config.gc_gl1c_per_sa *1475adev->gfx.config.gc_gl1c_size_per_instance;1476pcache_info[i].cache_level = 1;1477pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1478CRAT_CACHE_FLAGS_DATA_CACHE |1479CRAT_CACHE_FLAGS_SIMD_CACHE);1480pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;1481if (cache_line_size_missing)1482pcache_info[i].cache_line_size = 128;1483i++;1484}1485/* L2 Data Cache per GPU (Total Tex Cache) */1486if (adev->gfx.config.gc_gl2c_per_gpu) {1487pcache_info[i].cache_size = adev->gfx.config.gc_gl2c_per_gpu;1488pcache_info[i].cache_level = 2;1489pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1490CRAT_CACHE_FLAGS_DATA_CACHE |1491CRAT_CACHE_FLAGS_SIMD_CACHE);1492pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;1493pcache_info[i].cache_line_size = adev->gfx.config.gc_tcc_cache_line_size;1494if (cache_line_size_missing && !pcache_info[i].cache_line_size)1495pcache_info[i].cache_line_size = 128;1496i++;1497}1498/* L3 Data Cache per GPU */1499if (adev->gmc.mall_size) {1500pcache_info[i].cache_size = adev->gmc.mall_size / 1024;1501pcache_info[i].cache_level = 3;1502pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1503CRAT_CACHE_FLAGS_DATA_CACHE |1504CRAT_CACHE_FLAGS_SIMD_CACHE);1505pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;1506pcache_info[i].cache_line_size = 64;1507i++;1508}1509return i;1510}15111512static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev,1513struct kfd_gpu_cache_info *pcache_info)1514{1515struct amdgpu_device *adev = kdev->adev;1516int i = 0;15171518/* TCP L1 Cache per CU */1519if (adev->gfx.config.gc_tcp_size_per_cu) {1520pcache_info[i].cache_size = adev->gfx.config.gc_tcp_size_per_cu;1521pcache_info[i].cache_level = 1;1522/* Cacheline size not available in IP discovery for gc943,gc944 */1523pcache_info[i].cache_line_size = 128;1524pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1525CRAT_CACHE_FLAGS_DATA_CACHE |1526CRAT_CACHE_FLAGS_SIMD_CACHE);1527pcache_info[i].num_cu_shared = 1;1528i++;1529}1530/* Scalar L1 Instruction Cache per SQC */1531if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {1532pcache_info[i].cache_size =1533adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;1534pcache_info[i].cache_level = 1;1535pcache_info[i].cache_line_size = 64;1536pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1537CRAT_CACHE_FLAGS_INST_CACHE |1538CRAT_CACHE_FLAGS_SIMD_CACHE);1539pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_cu_per_sqc;1540i++;1541}1542/* Scalar L1 Data Cache per SQC */1543if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {1544pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;1545pcache_info[i].cache_level = 1;1546pcache_info[i].cache_line_size = 64;1547pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1548CRAT_CACHE_FLAGS_DATA_CACHE |1549CRAT_CACHE_FLAGS_SIMD_CACHE);1550pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_cu_per_sqc;1551i++;1552}1553/* L2 Data Cache per GPU (Total Tex Cache) */1554if (adev->gfx.config.gc_tcc_size) {1555pcache_info[i].cache_size = adev->gfx.config.gc_tcc_size;1556pcache_info[i].cache_level = 2;1557pcache_info[i].cache_line_size = 128;1558pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1559CRAT_CACHE_FLAGS_DATA_CACHE |1560CRAT_CACHE_FLAGS_SIMD_CACHE);1561pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;1562i++;1563}1564/* L3 Data Cache per GPU */1565if (adev->gmc.mall_size) {1566pcache_info[i].cache_size = adev->gmc.mall_size / 1024;1567pcache_info[i].cache_level = 3;1568pcache_info[i].cache_line_size = 64;1569pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1570CRAT_CACHE_FLAGS_DATA_CACHE |1571CRAT_CACHE_FLAGS_SIMD_CACHE);1572pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;1573i++;1574}1575return i;1576}15771578int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pcache_info)1579{1580int num_of_cache_types = 0;1581bool cache_line_size_missing = false;15821583switch (kdev->adev->asic_type) {1584case CHIP_KAVERI:1585*pcache_info = kaveri_cache_info;1586num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);1587break;1588case CHIP_HAWAII:1589*pcache_info = hawaii_cache_info;1590num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);1591break;1592case CHIP_CARRIZO:1593*pcache_info = carrizo_cache_info;1594num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);1595break;1596case CHIP_TONGA:1597*pcache_info = tonga_cache_info;1598num_of_cache_types = ARRAY_SIZE(tonga_cache_info);1599break;1600case CHIP_FIJI:1601*pcache_info = fiji_cache_info;1602num_of_cache_types = ARRAY_SIZE(fiji_cache_info);1603break;1604case CHIP_POLARIS10:1605*pcache_info = polaris10_cache_info;1606num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);1607break;1608case CHIP_POLARIS11:1609*pcache_info = polaris11_cache_info;1610num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);1611break;1612case CHIP_POLARIS12:1613*pcache_info = polaris12_cache_info;1614num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);1615break;1616case CHIP_VEGAM:1617*pcache_info = vegam_cache_info;1618num_of_cache_types = ARRAY_SIZE(vegam_cache_info);1619break;1620default:1621switch (KFD_GC_VERSION(kdev)) {1622case IP_VERSION(9, 0, 1):1623*pcache_info = vega10_cache_info;1624num_of_cache_types = ARRAY_SIZE(vega10_cache_info);1625break;1626case IP_VERSION(9, 2, 1):1627*pcache_info = vega12_cache_info;1628num_of_cache_types = ARRAY_SIZE(vega12_cache_info);1629break;1630case IP_VERSION(9, 4, 0):1631case IP_VERSION(9, 4, 1):1632*pcache_info = vega20_cache_info;1633num_of_cache_types = ARRAY_SIZE(vega20_cache_info);1634break;1635case IP_VERSION(9, 4, 2):1636*pcache_info = aldebaran_cache_info;1637num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info);1638break;1639case IP_VERSION(9, 4, 3):1640case IP_VERSION(9, 4, 4):1641case IP_VERSION(9, 5, 0):1642num_of_cache_types =1643kfd_fill_gpu_cache_info_from_gfx_config_v2(kdev->kfd,1644*pcache_info);1645break;1646case IP_VERSION(9, 1, 0):1647case IP_VERSION(9, 2, 2):1648*pcache_info = raven_cache_info;1649num_of_cache_types = ARRAY_SIZE(raven_cache_info);1650break;1651case IP_VERSION(9, 3, 0):1652*pcache_info = renoir_cache_info;1653num_of_cache_types = ARRAY_SIZE(renoir_cache_info);1654break;1655case IP_VERSION(10, 1, 10):1656case IP_VERSION(10, 1, 2):1657case IP_VERSION(10, 1, 3):1658case IP_VERSION(10, 1, 4):1659*pcache_info = navi10_cache_info;1660num_of_cache_types = ARRAY_SIZE(navi10_cache_info);1661break;1662case IP_VERSION(10, 1, 1):1663*pcache_info = navi14_cache_info;1664num_of_cache_types = ARRAY_SIZE(navi14_cache_info);1665break;1666case IP_VERSION(10, 3, 0):1667*pcache_info = sienna_cichlid_cache_info;1668num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info);1669break;1670case IP_VERSION(10, 3, 2):1671*pcache_info = navy_flounder_cache_info;1672num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info);1673break;1674case IP_VERSION(10, 3, 4):1675*pcache_info = dimgrey_cavefish_cache_info;1676num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info);1677break;1678case IP_VERSION(10, 3, 1):1679*pcache_info = vangogh_cache_info;1680num_of_cache_types = ARRAY_SIZE(vangogh_cache_info);1681break;1682case IP_VERSION(10, 3, 5):1683*pcache_info = beige_goby_cache_info;1684num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info);1685break;1686case IP_VERSION(10, 3, 3):1687*pcache_info = yellow_carp_cache_info;1688num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info);1689break;1690case IP_VERSION(10, 3, 6):1691*pcache_info = gc_10_3_6_cache_info;1692num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info);1693break;1694case IP_VERSION(10, 3, 7):1695*pcache_info = gfx1037_cache_info;1696num_of_cache_types = ARRAY_SIZE(gfx1037_cache_info);1697break;1698case IP_VERSION(11, 0, 0):1699case IP_VERSION(11, 0, 1):1700case IP_VERSION(11, 0, 2):1701case IP_VERSION(11, 0, 3):1702case IP_VERSION(11, 0, 4):1703case IP_VERSION(11, 5, 0):1704case IP_VERSION(11, 5, 1):1705case IP_VERSION(11, 5, 2):1706case IP_VERSION(11, 5, 3):1707case IP_VERSION(11, 5, 4):1708/* Cacheline size not available in IP discovery for gc11.1709* kfd_fill_gpu_cache_info_from_gfx_config to hard code it1710*/1711cache_line_size_missing = true;1712fallthrough;1713case IP_VERSION(12, 0, 0):1714case IP_VERSION(12, 0, 1):1715case IP_VERSION(12, 1, 0):1716num_of_cache_types =1717kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd,1718cache_line_size_missing,1719*pcache_info);1720break;1721default:1722*pcache_info = dummy_cache_info;1723num_of_cache_types = ARRAY_SIZE(dummy_cache_info);1724pr_warn("dummy cache info is used temporarily and real cache info need update later.\n");1725break;1726}1727}1728return num_of_cache_types;1729}17301731/* Memory required to create Virtual CRAT.1732* Since there is no easy way to predict the amount of memory required, the1733* following amount is allocated for GPU Virtual CRAT. This is1734* expected to cover all known conditions. But to be safe additional check1735* is put in the code to ensure we don't overwrite.1736*/1737#define VCRAT_SIZE_FOR_GPU (4 * PAGE_SIZE)17381739/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node1740*1741* @numa_node_id: CPU NUMA node id1742* @avail_size: Available size in the memory1743* @sub_type_hdr: Memory into which compute info will be filled in1744*1745* Return 0 if successful else return -ve value1746*/1747static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size,1748int proximity_domain,1749struct crat_subtype_computeunit *sub_type_hdr)1750{1751const struct cpumask *cpumask;17521753*avail_size -= sizeof(struct crat_subtype_computeunit);1754if (*avail_size < 0)1755return -ENOMEM;17561757memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));17581759/* Fill in subtype header data */1760sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;1761sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);1762sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;17631764cpumask = cpumask_of_node(numa_node_id);17651766/* Fill in CU data */1767sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT;1768sub_type_hdr->proximity_domain = proximity_domain;1769sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id);1770if (sub_type_hdr->processor_id_low == -1)1771return -EINVAL;17721773sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask);17741775return 0;1776}17771778/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node1779*1780* @numa_node_id: CPU NUMA node id1781* @avail_size: Available size in the memory1782* @sub_type_hdr: Memory into which compute info will be filled in1783*1784* Return 0 if successful else return -ve value1785*/1786static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size,1787int proximity_domain,1788struct crat_subtype_memory *sub_type_hdr)1789{1790uint64_t mem_in_bytes = 0;1791pg_data_t *pgdat;1792int zone_type;17931794*avail_size -= sizeof(struct crat_subtype_memory);1795if (*avail_size < 0)1796return -ENOMEM;17971798memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory));17991800/* Fill in subtype header data */1801sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;1802sub_type_hdr->length = sizeof(struct crat_subtype_memory);1803sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;18041805/* Fill in Memory Subunit data */18061807/* Unlike si_meminfo, si_meminfo_node is not exported. So1808* the following lines are duplicated from si_meminfo_node1809* function1810*/1811pgdat = NODE_DATA(numa_node_id);1812for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)1813mem_in_bytes += zone_managed_pages(&pgdat->node_zones[zone_type]);1814mem_in_bytes <<= PAGE_SHIFT;18151816sub_type_hdr->length_low = lower_32_bits(mem_in_bytes);1817sub_type_hdr->length_high = upper_32_bits(mem_in_bytes);1818sub_type_hdr->proximity_domain = proximity_domain;18191820return 0;1821}18221823#ifdef CONFIG_X86_641824static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size,1825uint32_t *num_entries,1826struct crat_subtype_iolink *sub_type_hdr)1827{1828int nid;1829struct cpuinfo_x86 *c = &cpu_data(0);1830uint8_t link_type;18311832if (c->x86_vendor == X86_VENDOR_AMD)1833link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT;1834else1835link_type = CRAT_IOLINK_TYPE_QPI_1_1;18361837*num_entries = 0;18381839/* Create IO links from this node to other CPU nodes */1840for_each_online_node(nid) {1841if (nid == numa_node_id) /* node itself */1842continue;18431844*avail_size -= sizeof(struct crat_subtype_iolink);1845if (*avail_size < 0)1846return -ENOMEM;18471848memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));18491850/* Fill in subtype header data */1851sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;1852sub_type_hdr->length = sizeof(struct crat_subtype_iolink);1853sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;18541855/* Fill in IO link data */1856sub_type_hdr->proximity_domain_from = numa_node_id;1857sub_type_hdr->proximity_domain_to = nid;1858sub_type_hdr->io_interface_type = link_type;18591860(*num_entries)++;1861sub_type_hdr++;1862}18631864return 0;1865}1866#endif18671868/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU1869*1870* @pcrat_image: Fill in VCRAT for CPU1871* @size: [IN] allocated size of crat_image.1872* [OUT] actual size of data filled in crat_image1873*/1874static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)1875{1876struct crat_header *crat_table = (struct crat_header *)pcrat_image;1877struct acpi_table_header *acpi_table;1878acpi_status status;1879struct crat_subtype_generic *sub_type_hdr;1880int avail_size = *size;1881int numa_node_id;1882#ifdef CONFIG_X86_641883uint32_t entries = 0;1884#endif1885int ret = 0;18861887if (!pcrat_image)1888return -EINVAL;18891890/* Fill in CRAT Header.1891* Modify length and total_entries as subunits are added.1892*/1893avail_size -= sizeof(struct crat_header);1894if (avail_size < 0)1895return -ENOMEM;18961897memset(crat_table, 0, sizeof(struct crat_header));1898memcpy(&crat_table->signature, CRAT_SIGNATURE,1899sizeof(crat_table->signature));1900crat_table->length = sizeof(struct crat_header);19011902status = acpi_get_table("DSDT", 0, &acpi_table);1903if (status != AE_OK)1904pr_warn("DSDT table not found for OEM information\n");1905else {1906crat_table->oem_revision = acpi_table->revision;1907memcpy(crat_table->oem_id, acpi_table->oem_id,1908CRAT_OEMID_LENGTH);1909memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,1910CRAT_OEMTABLEID_LENGTH);1911acpi_put_table(acpi_table);1912}1913crat_table->total_entries = 0;1914crat_table->num_domains = 0;19151916sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);19171918for_each_online_node(numa_node_id) {1919if (kfd_numa_node_to_apic_id(numa_node_id) == -1)1920continue;19211922/* Fill in Subtype: Compute Unit */1923ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size,1924crat_table->num_domains,1925(struct crat_subtype_computeunit *)sub_type_hdr);1926if (ret < 0)1927return ret;1928crat_table->length += sub_type_hdr->length;1929crat_table->total_entries++;19301931sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +1932sub_type_hdr->length);19331934/* Fill in Subtype: Memory */1935ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size,1936crat_table->num_domains,1937(struct crat_subtype_memory *)sub_type_hdr);1938if (ret < 0)1939return ret;1940crat_table->length += sub_type_hdr->length;1941crat_table->total_entries++;19421943sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +1944sub_type_hdr->length);19451946/* Fill in Subtype: IO Link */1947#ifdef CONFIG_X86_641948ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size,1949&entries,1950(struct crat_subtype_iolink *)sub_type_hdr);1951if (ret < 0)1952return ret;19531954if (entries) {1955crat_table->length += (sub_type_hdr->length * entries);1956crat_table->total_entries += entries;19571958sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +1959sub_type_hdr->length * entries);1960}1961#else1962pr_info("IO link not available for non x86 platforms\n");1963#endif19641965crat_table->num_domains++;1966}19671968/* TODO: Add cache Subtype for CPU.1969* Currently, CPU cache information is available in function1970* detect_cache_attributes(cpu) defined in the file1971* ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not1972* exported and to get the same information the code needs to be1973* duplicated.1974*/19751976*size = crat_table->length;1977pr_info("Virtual CRAT table created for CPU\n");19781979return 0;1980}19811982static int kfd_fill_gpu_memory_affinity(int *avail_size,1983struct kfd_node *kdev, uint8_t type, uint64_t size,1984struct crat_subtype_memory *sub_type_hdr,1985uint32_t proximity_domain,1986const struct kfd_local_mem_info *local_mem_info)1987{1988*avail_size -= sizeof(struct crat_subtype_memory);1989if (*avail_size < 0)1990return -ENOMEM;19911992memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));1993sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;1994sub_type_hdr->length = sizeof(struct crat_subtype_memory);1995sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;19961997sub_type_hdr->proximity_domain = proximity_domain;19981999pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",2000type, size);20012002sub_type_hdr->length_low = lower_32_bits(size);2003sub_type_hdr->length_high = upper_32_bits(size);20042005sub_type_hdr->width = local_mem_info->vram_width;2006sub_type_hdr->visibility_type = type;20072008return 0;2009}20102011#ifdef CONFIG_ACPI_NUMA2012static void kfd_find_numa_node_in_srat(struct kfd_node *kdev)2013{2014struct acpi_table_header *table_header = NULL;2015struct acpi_subtable_header *sub_header = NULL;2016unsigned long table_end, subtable_len;2017u32 pci_id = pci_domain_nr(kdev->adev->pdev->bus) << 16 |2018pci_dev_id(kdev->adev->pdev);2019u32 bdf;2020acpi_status status;2021struct acpi_srat_cpu_affinity *cpu;2022struct acpi_srat_generic_affinity *gpu;2023int pxm = 0, max_pxm = 0;2024int numa_node = NUMA_NO_NODE;2025bool found = false;20262027/* Fetch the SRAT table from ACPI */2028status = acpi_get_table(ACPI_SIG_SRAT, 0, &table_header);2029if (status == AE_NOT_FOUND) {2030pr_warn("SRAT table not found\n");2031return;2032} else if (ACPI_FAILURE(status)) {2033const char *err = acpi_format_exception(status);2034pr_err("SRAT table error: %s\n", err);2035return;2036}20372038table_end = (unsigned long)table_header + table_header->length;20392040/* Parse all entries looking for a match. */2041sub_header = (struct acpi_subtable_header *)2042((unsigned long)table_header +2043sizeof(struct acpi_table_srat));2044subtable_len = sub_header->length;20452046while (((unsigned long)sub_header) + subtable_len < table_end) {2047/*2048* If length is 0, break from this loop to avoid2049* infinite loop.2050*/2051if (subtable_len == 0) {2052pr_err("SRAT invalid zero length\n");2053break;2054}20552056switch (sub_header->type) {2057case ACPI_SRAT_TYPE_CPU_AFFINITY:2058cpu = (struct acpi_srat_cpu_affinity *)sub_header;2059pxm = *((u32 *)cpu->proximity_domain_hi) << 8 |2060cpu->proximity_domain_lo;2061if (pxm > max_pxm)2062max_pxm = pxm;2063break;2064case ACPI_SRAT_TYPE_GENERIC_AFFINITY:2065gpu = (struct acpi_srat_generic_affinity *)sub_header;2066bdf = *((u16 *)(&gpu->device_handle[0])) << 16 |2067*((u16 *)(&gpu->device_handle[2]));2068if (bdf == pci_id) {2069found = true;2070numa_node = pxm_to_node(gpu->proximity_domain);2071}2072break;2073default:2074break;2075}20762077if (found)2078break;20792080sub_header = (struct acpi_subtable_header *)2081((unsigned long)sub_header + subtable_len);2082subtable_len = sub_header->length;2083}20842085acpi_put_table(table_header);20862087/* Workaround bad cpu-gpu binding case */2088if (found && (numa_node < 0 ||2089numa_node > pxm_to_node(max_pxm)))2090numa_node = 0;20912092if (numa_node != NUMA_NO_NODE)2093set_dev_node(&kdev->adev->pdev->dev, numa_node);2094}2095#endif20962097#define KFD_CRAT_INTRA_SOCKET_WEIGHT 132098#define KFD_CRAT_XGMI_WEIGHT 1520992100/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU2101* to its NUMA node2102* @avail_size: Available size in the memory2103* @kdev - [IN] GPU device2104* @sub_type_hdr: Memory into which io link info will be filled in2105* @proximity_domain - proximity domain of the GPU node2106*2107* Return 0 if successful else return -ve value2108*/2109static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,2110struct kfd_node *kdev,2111struct crat_subtype_iolink *sub_type_hdr,2112uint32_t proximity_domain)2113{2114*avail_size -= sizeof(struct crat_subtype_iolink);2115if (*avail_size < 0)2116return -ENOMEM;21172118memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));21192120/* Fill in subtype header data */2121sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;2122sub_type_hdr->length = sizeof(struct crat_subtype_iolink);2123sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;2124if (kfd_dev_is_large_bar(kdev))2125sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;21262127/* Fill in IOLINK subtype.2128* TODO: Fill-in other fields of iolink subtype2129*/2130if (kdev->adev->gmc.xgmi.connected_to_cpu ||2131(KFD_GC_VERSION(kdev) == IP_VERSION(9, 4, 3) &&2132kdev->adev->smuio.funcs->get_pkg_type(kdev->adev) ==2133AMDGPU_PKG_TYPE_APU)) {2134bool ext_cpu = KFD_GC_VERSION(kdev) != IP_VERSION(9, 4, 3);2135int mem_bw = 819200, weight = ext_cpu ? KFD_CRAT_XGMI_WEIGHT :2136KFD_CRAT_INTRA_SOCKET_WEIGHT;2137/*2138* with host gpu xgmi link, host can access gpu memory whether2139* or not pcie bar type is large, so always create bidirectional2140* io link.2141*/2142sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;2143sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;2144sub_type_hdr->weight_xgmi = weight;2145if (ext_cpu) {2146amdgpu_xgmi_get_bandwidth(kdev->adev, NULL,2147AMDGPU_XGMI_BW_MODE_PER_LINK,2148AMDGPU_XGMI_BW_UNIT_MBYTES,2149&sub_type_hdr->minimum_bandwidth_mbs,2150&sub_type_hdr->maximum_bandwidth_mbs);2151} else {2152sub_type_hdr->minimum_bandwidth_mbs = mem_bw;2153sub_type_hdr->maximum_bandwidth_mbs = mem_bw;2154}2155} else {2156sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;2157sub_type_hdr->minimum_bandwidth_mbs =2158amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, true);2159sub_type_hdr->maximum_bandwidth_mbs =2160amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, false);2161}21622163sub_type_hdr->proximity_domain_from = proximity_domain;21642165#ifdef CONFIG_ACPI_NUMA2166if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE &&2167num_possible_nodes() > 1)2168kfd_find_numa_node_in_srat(kdev);2169#endif2170#ifdef CONFIG_NUMA2171if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE)2172sub_type_hdr->proximity_domain_to = 0;2173else2174sub_type_hdr->proximity_domain_to = kdev->adev->pdev->dev.numa_node;2175#else2176sub_type_hdr->proximity_domain_to = 0;2177#endif2178return 0;2179}21802181static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,2182struct kfd_node *kdev,2183struct kfd_node *peer_kdev,2184struct crat_subtype_iolink *sub_type_hdr,2185uint32_t proximity_domain_from,2186uint32_t proximity_domain_to)2187{2188bool use_ta_info = kdev->kfd->num_nodes == 1;21892190*avail_size -= sizeof(struct crat_subtype_iolink);2191if (*avail_size < 0)2192return -ENOMEM;21932194memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));21952196sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;2197sub_type_hdr->length = sizeof(struct crat_subtype_iolink);2198sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED |2199CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;22002201sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;2202sub_type_hdr->proximity_domain_from = proximity_domain_from;2203sub_type_hdr->proximity_domain_to = proximity_domain_to;22042205if (use_ta_info) {2206sub_type_hdr->weight_xgmi = KFD_CRAT_XGMI_WEIGHT *2207amdgpu_xgmi_get_hops_count(kdev->adev, peer_kdev->adev);2208amdgpu_xgmi_get_bandwidth(kdev->adev, peer_kdev->adev,2209AMDGPU_XGMI_BW_MODE_PER_PEER,2210AMDGPU_XGMI_BW_UNIT_MBYTES,2211&sub_type_hdr->minimum_bandwidth_mbs,2212&sub_type_hdr->maximum_bandwidth_mbs);2213} else {2214bool is_single_hop = kdev->kfd == peer_kdev->kfd;2215int weight = is_single_hop ? KFD_CRAT_INTRA_SOCKET_WEIGHT :2216(2 * KFD_CRAT_INTRA_SOCKET_WEIGHT) + KFD_CRAT_XGMI_WEIGHT;2217int mem_bw = 819200;22182219sub_type_hdr->weight_xgmi = weight;2220sub_type_hdr->maximum_bandwidth_mbs = is_single_hop ? mem_bw : 0;2221sub_type_hdr->minimum_bandwidth_mbs = is_single_hop ? mem_bw : 0;2222}22232224return 0;2225}22262227/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU2228*2229* @pcrat_image: Fill in VCRAT for GPU2230* @size: [IN] allocated size of crat_image.2231* [OUT] actual size of data filled in crat_image2232*/2233static int kfd_create_vcrat_image_gpu(void *pcrat_image,2234size_t *size, struct kfd_node *kdev,2235uint32_t proximity_domain)2236{2237struct crat_header *crat_table = (struct crat_header *)pcrat_image;2238struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config;2239struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info;2240struct crat_subtype_generic *sub_type_hdr;2241struct kfd_local_mem_info local_mem_info;2242struct kfd_topology_device *peer_dev;2243struct crat_subtype_computeunit *cu;2244int avail_size = *size;2245uint32_t total_num_of_cu;2246uint32_t nid = 0;2247int ret = 0;22482249if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)2250return -EINVAL;22512252/* Fill the CRAT Header.2253* Modify length and total_entries as subunits are added.2254*/2255avail_size -= sizeof(struct crat_header);2256memset(crat_table, 0, sizeof(struct crat_header));22572258memcpy(&crat_table->signature, CRAT_SIGNATURE,2259sizeof(crat_table->signature));2260/* Change length as we add more subtypes*/2261crat_table->length = sizeof(struct crat_header);2262crat_table->num_domains = 1;2263crat_table->total_entries = 0;22642265/* Fill in Subtype: Compute Unit2266* First fill in the sub type header and then sub type data2267*/2268avail_size -= sizeof(struct crat_subtype_computeunit);2269sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);2270memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));22712272sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;2273sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);2274sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;22752276/* Fill CU subtype data */2277cu = (struct crat_subtype_computeunit *)sub_type_hdr;2278cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;2279cu->proximity_domain = proximity_domain;22802281cu->num_simd_per_cu = cu_info->simd_per_cu;2282cu->num_simd_cores = cu_info->simd_per_cu *2283(cu_info->number / kdev->kfd->num_nodes);2284cu->max_waves_simd = cu_info->max_waves_per_simd;22852286cu->wave_front_size = cu_info->wave_front_size;2287cu->array_count = gfx_info->max_sh_per_se *2288gfx_info->max_shader_engines;2289total_num_of_cu = (cu->array_count * gfx_info->max_cu_per_sh);2290cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);2291cu->num_cu_per_array = gfx_info->max_cu_per_sh;2292cu->max_slots_scatch_cu = cu_info->max_scratch_slots_per_cu;2293cu->num_banks = gfx_info->max_shader_engines;2294cu->lds_size_in_kb = cu_info->lds_size;22952296cu->hsa_capability = 0;22972298crat_table->length += sub_type_hdr->length;2299crat_table->total_entries++;23002301/* Fill in Subtype: Memory. Only on systems with large BAR (no2302* private FB), report memory as public. On other systems2303* report the total FB size (public+private) as a single2304* private heap.2305*/2306local_mem_info = kdev->local_mem_info;2307sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +2308sub_type_hdr->length);23092310if (kdev->adev->debug_largebar)2311local_mem_info.local_mem_size_private = 0;23122313if (local_mem_info.local_mem_size_private == 0)2314ret = kfd_fill_gpu_memory_affinity(&avail_size,2315kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,2316local_mem_info.local_mem_size_public,2317(struct crat_subtype_memory *)sub_type_hdr,2318proximity_domain,2319&local_mem_info);2320else2321ret = kfd_fill_gpu_memory_affinity(&avail_size,2322kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE,2323local_mem_info.local_mem_size_public +2324local_mem_info.local_mem_size_private,2325(struct crat_subtype_memory *)sub_type_hdr,2326proximity_domain,2327&local_mem_info);2328if (ret < 0)2329return ret;23302331crat_table->length += sizeof(struct crat_subtype_memory);2332crat_table->total_entries++;23332334/* Fill in Subtype: IO_LINKS2335* Only direct links are added here which is Link from GPU to2336* its NUMA node. Indirect links are added by userspace.2337*/2338sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +2339sub_type_hdr->length);2340ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,2341(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);23422343if (ret < 0)2344return ret;23452346crat_table->length += sub_type_hdr->length;2347crat_table->total_entries++;234823492350/* Fill in Subtype: IO_LINKS2351* Direct links from GPU to other GPUs through xGMI.2352* We will loop GPUs that already be processed (with lower value2353* of proximity_domain), add the link for the GPUs with same2354* hive id (from this GPU to other GPU) . The reversed iolink2355* (from other GPU to this GPU) will be added2356* in kfd_parse_subtype_iolink.2357*/2358if (kdev->kfd->hive_id) {2359for (nid = 0; nid < proximity_domain; ++nid) {2360peer_dev = kfd_topology_device_by_proximity_domain_no_lock(nid);2361if (!peer_dev->gpu)2362continue;2363if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)2364continue;2365if (!amdgpu_xgmi_get_is_sharing_enabled(kdev->adev, peer_dev->gpu->adev))2366continue;2367sub_type_hdr = (typeof(sub_type_hdr))(2368(char *)sub_type_hdr +2369sizeof(struct crat_subtype_iolink));2370ret = kfd_fill_gpu_xgmi_link_to_gpu(2371&avail_size, kdev, peer_dev->gpu,2372(struct crat_subtype_iolink *)sub_type_hdr,2373proximity_domain, nid);2374if (ret < 0)2375return ret;2376crat_table->length += sub_type_hdr->length;2377crat_table->total_entries++;2378}2379}2380*size = crat_table->length;2381pr_info("Virtual CRAT table created for GPU\n");23822383return ret;2384}23852386/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and2387* creates a Virtual CRAT (VCRAT) image2388*2389* NOTE: Call kfd_destroy_crat_image to free CRAT image memory2390*2391* @crat_image: VCRAT image created because ACPI does not have a2392* CRAT for this device2393* @size: [OUT] size of virtual crat_image2394* @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device2395* COMPUTE_UNIT_GPU - Create VCRAT for GPU2396* (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU2397* -- this option is not currently implemented.2398* The assumption is that all AMD APUs will have CRAT2399* @kdev: Valid kfd_node required if flags contain COMPUTE_UNIT_GPU2400*2401* Return 0 if successful else return -ve value2402*/2403int kfd_create_crat_image_virtual(void **crat_image, size_t *size,2404int flags, struct kfd_node *kdev,2405uint32_t proximity_domain)2406{2407void *pcrat_image = NULL;2408int ret = 0, num_nodes;2409size_t dyn_size;24102411if (!crat_image)2412return -EINVAL;24132414*crat_image = NULL;24152416/* Allocate the CPU Virtual CRAT size based on the number of online2417* nodes. Allocate VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image.2418* This should cover all the current conditions. A check is put not2419* to overwrite beyond allocated size for GPUs2420*/2421switch (flags) {2422case COMPUTE_UNIT_CPU:2423num_nodes = num_online_nodes();2424dyn_size = sizeof(struct crat_header) +2425num_nodes * (sizeof(struct crat_subtype_computeunit) +2426sizeof(struct crat_subtype_memory) +2427(num_nodes - 1) * sizeof(struct crat_subtype_iolink));2428pcrat_image = kvmalloc(dyn_size, GFP_KERNEL);2429if (!pcrat_image)2430return -ENOMEM;2431*size = dyn_size;2432pr_debug("CRAT size is %ld", dyn_size);2433ret = kfd_create_vcrat_image_cpu(pcrat_image, size);2434break;2435case COMPUTE_UNIT_GPU:2436if (!kdev)2437return -EINVAL;2438pcrat_image = kvmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);2439if (!pcrat_image)2440return -ENOMEM;2441*size = VCRAT_SIZE_FOR_GPU;2442ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev,2443proximity_domain);2444break;2445case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):2446/* TODO: */2447ret = -EINVAL;2448pr_err("VCRAT not implemented for APU\n");2449break;2450default:2451ret = -EINVAL;2452}24532454if (!ret)2455*crat_image = pcrat_image;2456else2457kvfree(pcrat_image);24582459return ret;2460}246124622463/* kfd_destroy_crat_image2464*2465* @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)2466*2467*/2468void kfd_destroy_crat_image(void *crat_image)2469{2470kvfree(crat_image);2471}247224732474