/* drivers/gpu/drm/amd/amdkfd/kfd_crat.c */
// SPDX-License-Identifier: GPL-2.0 OR MIT1/*2* Copyright 2015-2022 Advanced Micro Devices, Inc.3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* the rights to use, copy, modify, merge, publish, distribute, sublicense,8* and/or sell copies of the Software, and to permit persons to whom the9* Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice shall be included in12* all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR18* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,19* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR20* OTHER DEALINGS IN THE SOFTWARE.21*/2223#include <linux/pci.h>24#include <linux/acpi.h>25#include "kfd_crat.h"26#include "kfd_priv.h"27#include "kfd_topology.h"28#include "amdgpu.h"29#include "amdgpu_amdkfd.h"30#include "amdgpu_xgmi.h"3132/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.33* GPU processor ID are expressed with Bit[31]=1.34* The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs35* used in the CRAT.36*/37static uint32_t gpu_processor_id_low = 0x80001000;3839/* Return the next available gpu_processor_id and increment it for next GPU40* @total_cu_count - Total CUs present in the GPU including ones41* masked off42*/43static inline unsigned int get_and_inc_gpu_processor_id(44unsigned int total_cu_count)45{46int current_id = gpu_processor_id_low;4748gpu_processor_id_low += total_cu_count;49return 
current_id;50}515253static struct kfd_gpu_cache_info kaveri_cache_info[] = {54{55/* TCP L1 Cache per CU */56.cache_size = 16,57.cache_level = 1,58.cache_line_size = 64,59.flags = (CRAT_CACHE_FLAGS_ENABLED |60CRAT_CACHE_FLAGS_DATA_CACHE |61CRAT_CACHE_FLAGS_SIMD_CACHE),62.num_cu_shared = 1,63},64{65/* Scalar L1 Instruction Cache (in SQC module) per bank */66.cache_size = 16,67.cache_level = 1,68.cache_line_size = 64,69.flags = (CRAT_CACHE_FLAGS_ENABLED |70CRAT_CACHE_FLAGS_INST_CACHE |71CRAT_CACHE_FLAGS_SIMD_CACHE),72.num_cu_shared = 2,73},74{75/* Scalar L1 Data Cache (in SQC module) per bank */76.cache_size = 8,77.cache_level = 1,78.cache_line_size = 64,79.flags = (CRAT_CACHE_FLAGS_ENABLED |80CRAT_CACHE_FLAGS_DATA_CACHE |81CRAT_CACHE_FLAGS_SIMD_CACHE),82.num_cu_shared = 2,83},8485/* TODO: Add L2 Cache information */86};878889static struct kfd_gpu_cache_info carrizo_cache_info[] = {90{91/* TCP L1 Cache per CU */92.cache_size = 16,93.cache_level = 1,94.cache_line_size = 64,95.flags = (CRAT_CACHE_FLAGS_ENABLED |96CRAT_CACHE_FLAGS_DATA_CACHE |97CRAT_CACHE_FLAGS_SIMD_CACHE),98.num_cu_shared = 1,99},100{101/* Scalar L1 Instruction Cache (in SQC module) per bank */102.cache_size = 32,103.cache_level = 1,104.cache_line_size = 64,105.flags = (CRAT_CACHE_FLAGS_ENABLED |106CRAT_CACHE_FLAGS_INST_CACHE |107CRAT_CACHE_FLAGS_SIMD_CACHE),108.num_cu_shared = 4,109},110{111/* Scalar L1 Data Cache (in SQC module) per bank. 
*/112.cache_size = 16,113.cache_level = 1,114.cache_line_size = 64,115.flags = (CRAT_CACHE_FLAGS_ENABLED |116CRAT_CACHE_FLAGS_DATA_CACHE |117CRAT_CACHE_FLAGS_SIMD_CACHE),118.num_cu_shared = 4,119},120121/* TODO: Add L2 Cache information */122};123124#define hawaii_cache_info kaveri_cache_info125#define tonga_cache_info carrizo_cache_info126#define fiji_cache_info carrizo_cache_info127#define polaris10_cache_info carrizo_cache_info128#define polaris11_cache_info carrizo_cache_info129#define polaris12_cache_info carrizo_cache_info130#define vegam_cache_info carrizo_cache_info131132/* NOTE: L1 cache information has been updated and L2/L3133* cache information has been added for Vega10 and134* newer ASICs. The unit for cache_size is KiB.135* In future, check & update cache details136* for every new ASIC is required.137*/138139static struct kfd_gpu_cache_info vega10_cache_info[] = {140{141/* TCP L1 Cache per CU */142.cache_size = 16,143.cache_level = 1,144.cache_line_size = 64,145.flags = (CRAT_CACHE_FLAGS_ENABLED |146CRAT_CACHE_FLAGS_DATA_CACHE |147CRAT_CACHE_FLAGS_SIMD_CACHE),148.num_cu_shared = 1,149},150{151/* Scalar L1 Instruction Cache per SQC */152.cache_size = 32,153.cache_level = 1,154.cache_line_size = 64,155.flags = (CRAT_CACHE_FLAGS_ENABLED |156CRAT_CACHE_FLAGS_INST_CACHE |157CRAT_CACHE_FLAGS_SIMD_CACHE),158.num_cu_shared = 3,159},160{161/* Scalar L1 Data Cache per SQC */162.cache_size = 16,163.cache_level = 1,164.cache_line_size = 64,165.flags = (CRAT_CACHE_FLAGS_ENABLED |166CRAT_CACHE_FLAGS_DATA_CACHE |167CRAT_CACHE_FLAGS_SIMD_CACHE),168.num_cu_shared = 3,169},170{171/* L2 Data Cache per GPU (Total Tex Cache) */172.cache_size = 4096,173.cache_level = 2,174.cache_line_size = 64,175.flags = (CRAT_CACHE_FLAGS_ENABLED |176CRAT_CACHE_FLAGS_DATA_CACHE |177CRAT_CACHE_FLAGS_SIMD_CACHE),178.num_cu_shared = 16,179},180};181182static struct kfd_gpu_cache_info raven_cache_info[] = {183{184/* TCP L1 Cache per CU */185.cache_size = 16,186.cache_level = 
1,187.cache_line_size = 64,188.flags = (CRAT_CACHE_FLAGS_ENABLED |189CRAT_CACHE_FLAGS_DATA_CACHE |190CRAT_CACHE_FLAGS_SIMD_CACHE),191.num_cu_shared = 1,192},193{194/* Scalar L1 Instruction Cache per SQC */195.cache_size = 32,196.cache_level = 1,197.cache_line_size = 64,198.flags = (CRAT_CACHE_FLAGS_ENABLED |199CRAT_CACHE_FLAGS_INST_CACHE |200CRAT_CACHE_FLAGS_SIMD_CACHE),201.num_cu_shared = 3,202},203{204/* Scalar L1 Data Cache per SQC */205.cache_size = 16,206.cache_level = 1,207.cache_line_size = 64,208.flags = (CRAT_CACHE_FLAGS_ENABLED |209CRAT_CACHE_FLAGS_DATA_CACHE |210CRAT_CACHE_FLAGS_SIMD_CACHE),211.num_cu_shared = 3,212},213{214/* L2 Data Cache per GPU (Total Tex Cache) */215.cache_size = 1024,216.cache_level = 2,217.cache_line_size = 64,218.flags = (CRAT_CACHE_FLAGS_ENABLED |219CRAT_CACHE_FLAGS_DATA_CACHE |220CRAT_CACHE_FLAGS_SIMD_CACHE),221.num_cu_shared = 11,222},223};224225static struct kfd_gpu_cache_info renoir_cache_info[] = {226{227/* TCP L1 Cache per CU */228.cache_size = 16,229.cache_level = 1,230.cache_line_size = 64,231.flags = (CRAT_CACHE_FLAGS_ENABLED |232CRAT_CACHE_FLAGS_DATA_CACHE |233CRAT_CACHE_FLAGS_SIMD_CACHE),234.num_cu_shared = 1,235},236{237/* Scalar L1 Instruction Cache per SQC */238.cache_size = 32,239.cache_level = 1,240.cache_line_size = 64,241.flags = (CRAT_CACHE_FLAGS_ENABLED |242CRAT_CACHE_FLAGS_INST_CACHE |243CRAT_CACHE_FLAGS_SIMD_CACHE),244.num_cu_shared = 3,245},246{247/* Scalar L1 Data Cache per SQC */248.cache_size = 16,249.cache_level = 1,250.cache_line_size = 64,251.flags = (CRAT_CACHE_FLAGS_ENABLED |252CRAT_CACHE_FLAGS_DATA_CACHE |253CRAT_CACHE_FLAGS_SIMD_CACHE),254.num_cu_shared = 3,255},256{257/* L2 Data Cache per GPU (Total Tex Cache) */258.cache_size = 1024,259.cache_level = 2,260.cache_line_size = 64,261.flags = (CRAT_CACHE_FLAGS_ENABLED |262CRAT_CACHE_FLAGS_DATA_CACHE |263CRAT_CACHE_FLAGS_SIMD_CACHE),264.num_cu_shared = 8,265},266};267268static struct kfd_gpu_cache_info vega12_cache_info[] = {269{270/* TCP L1 Cache 
per CU */271.cache_size = 16,272.cache_level = 1,273.cache_line_size = 64,274.flags = (CRAT_CACHE_FLAGS_ENABLED |275CRAT_CACHE_FLAGS_DATA_CACHE |276CRAT_CACHE_FLAGS_SIMD_CACHE),277.num_cu_shared = 1,278},279{280/* Scalar L1 Instruction Cache per SQC */281.cache_size = 32,282.cache_level = 1,283.cache_line_size = 64,284.flags = (CRAT_CACHE_FLAGS_ENABLED |285CRAT_CACHE_FLAGS_INST_CACHE |286CRAT_CACHE_FLAGS_SIMD_CACHE),287.num_cu_shared = 3,288},289{290/* Scalar L1 Data Cache per SQC */291.cache_size = 16,292.cache_level = 1,293.cache_line_size = 64,294.flags = (CRAT_CACHE_FLAGS_ENABLED |295CRAT_CACHE_FLAGS_DATA_CACHE |296CRAT_CACHE_FLAGS_SIMD_CACHE),297.num_cu_shared = 3,298},299{300/* L2 Data Cache per GPU (Total Tex Cache) */301.cache_size = 2048,302.cache_level = 2,303.cache_line_size = 64,304.flags = (CRAT_CACHE_FLAGS_ENABLED |305CRAT_CACHE_FLAGS_DATA_CACHE |306CRAT_CACHE_FLAGS_SIMD_CACHE),307.num_cu_shared = 5,308},309};310311static struct kfd_gpu_cache_info vega20_cache_info[] = {312{313/* TCP L1 Cache per CU */314.cache_size = 16,315.cache_level = 1,316.cache_line_size = 64,317.flags = (CRAT_CACHE_FLAGS_ENABLED |318CRAT_CACHE_FLAGS_DATA_CACHE |319CRAT_CACHE_FLAGS_SIMD_CACHE),320.num_cu_shared = 1,321},322{323/* Scalar L1 Instruction Cache per SQC */324.cache_size = 32,325.cache_level = 1,326.cache_line_size = 64,327.flags = (CRAT_CACHE_FLAGS_ENABLED |328CRAT_CACHE_FLAGS_INST_CACHE |329CRAT_CACHE_FLAGS_SIMD_CACHE),330.num_cu_shared = 3,331},332{333/* Scalar L1 Data Cache per SQC */334.cache_size = 16,335.cache_level = 1,336.cache_line_size = 64,337.flags = (CRAT_CACHE_FLAGS_ENABLED |338CRAT_CACHE_FLAGS_DATA_CACHE |339CRAT_CACHE_FLAGS_SIMD_CACHE),340.num_cu_shared = 3,341},342{343/* L2 Data Cache per GPU (Total Tex Cache) */344.cache_size = 8192,345.cache_level = 2,346.cache_line_size = 64,347.flags = (CRAT_CACHE_FLAGS_ENABLED |348CRAT_CACHE_FLAGS_DATA_CACHE |349CRAT_CACHE_FLAGS_SIMD_CACHE),350.num_cu_shared = 16,351},352};353354static struct kfd_gpu_cache_info 
aldebaran_cache_info[] = {355{356/* TCP L1 Cache per CU */357.cache_size = 16,358.cache_level = 1,359.cache_line_size = 64,360.flags = (CRAT_CACHE_FLAGS_ENABLED |361CRAT_CACHE_FLAGS_DATA_CACHE |362CRAT_CACHE_FLAGS_SIMD_CACHE),363.num_cu_shared = 1,364},365{366/* Scalar L1 Instruction Cache per SQC */367.cache_size = 32,368.cache_level = 1,369.cache_line_size = 64,370.flags = (CRAT_CACHE_FLAGS_ENABLED |371CRAT_CACHE_FLAGS_INST_CACHE |372CRAT_CACHE_FLAGS_SIMD_CACHE),373.num_cu_shared = 2,374},375{376/* Scalar L1 Data Cache per SQC */377.cache_size = 16,378.cache_level = 1,379.cache_line_size = 64,380.flags = (CRAT_CACHE_FLAGS_ENABLED |381CRAT_CACHE_FLAGS_DATA_CACHE |382CRAT_CACHE_FLAGS_SIMD_CACHE),383.num_cu_shared = 2,384},385{386/* L2 Data Cache per GPU (Total Tex Cache) */387.cache_size = 8192,388.cache_level = 2,389.cache_line_size = 128,390.flags = (CRAT_CACHE_FLAGS_ENABLED |391CRAT_CACHE_FLAGS_DATA_CACHE |392CRAT_CACHE_FLAGS_SIMD_CACHE),393.num_cu_shared = 14,394},395};396397static struct kfd_gpu_cache_info navi10_cache_info[] = {398{399/* TCP L1 Cache per CU */400.cache_size = 16,401.cache_level = 1,402.cache_line_size = 128,403.flags = (CRAT_CACHE_FLAGS_ENABLED |404CRAT_CACHE_FLAGS_DATA_CACHE |405CRAT_CACHE_FLAGS_SIMD_CACHE),406.num_cu_shared = 1,407},408{409/* Scalar L1 Instruction Cache per SQC */410.cache_size = 32,411.cache_level = 1,412.cache_line_size = 64,413.flags = (CRAT_CACHE_FLAGS_ENABLED |414CRAT_CACHE_FLAGS_INST_CACHE |415CRAT_CACHE_FLAGS_SIMD_CACHE),416.num_cu_shared = 2,417},418{419/* Scalar L1 Data Cache per SQC */420.cache_size = 16,421.cache_level = 1,422.cache_line_size = 64,423.flags = (CRAT_CACHE_FLAGS_ENABLED |424CRAT_CACHE_FLAGS_DATA_CACHE |425CRAT_CACHE_FLAGS_SIMD_CACHE),426.num_cu_shared = 2,427},428{429/* GL1 Data Cache per SA */430.cache_size = 128,431.cache_level = 1,432.cache_line_size = 128,433.flags = (CRAT_CACHE_FLAGS_ENABLED |434CRAT_CACHE_FLAGS_DATA_CACHE |435CRAT_CACHE_FLAGS_SIMD_CACHE),436.num_cu_shared = 10,437},438{439/* 
L2 Data Cache per GPU (Total Tex Cache) */440.cache_size = 4096,441.cache_level = 2,442.cache_line_size = 128,443.flags = (CRAT_CACHE_FLAGS_ENABLED |444CRAT_CACHE_FLAGS_DATA_CACHE |445CRAT_CACHE_FLAGS_SIMD_CACHE),446.num_cu_shared = 10,447},448};449450static struct kfd_gpu_cache_info vangogh_cache_info[] = {451{452/* TCP L1 Cache per CU */453.cache_size = 16,454.cache_level = 1,455.cache_line_size = 128,456.flags = (CRAT_CACHE_FLAGS_ENABLED |457CRAT_CACHE_FLAGS_DATA_CACHE |458CRAT_CACHE_FLAGS_SIMD_CACHE),459.num_cu_shared = 1,460},461{462/* Scalar L1 Instruction Cache per SQC */463.cache_size = 32,464.cache_level = 1,465.cache_line_size = 64,466.flags = (CRAT_CACHE_FLAGS_ENABLED |467CRAT_CACHE_FLAGS_INST_CACHE |468CRAT_CACHE_FLAGS_SIMD_CACHE),469.num_cu_shared = 2,470},471{472/* Scalar L1 Data Cache per SQC */473.cache_size = 16,474.cache_level = 1,475.cache_line_size = 64,476.flags = (CRAT_CACHE_FLAGS_ENABLED |477CRAT_CACHE_FLAGS_DATA_CACHE |478CRAT_CACHE_FLAGS_SIMD_CACHE),479.num_cu_shared = 2,480},481{482/* GL1 Data Cache per SA */483.cache_size = 128,484.cache_level = 1,485.cache_line_size = 128,486.flags = (CRAT_CACHE_FLAGS_ENABLED |487CRAT_CACHE_FLAGS_DATA_CACHE |488CRAT_CACHE_FLAGS_SIMD_CACHE),489.num_cu_shared = 8,490},491{492/* L2 Data Cache per GPU (Total Tex Cache) */493.cache_size = 1024,494.cache_level = 2,495.cache_line_size = 128,496.flags = (CRAT_CACHE_FLAGS_ENABLED |497CRAT_CACHE_FLAGS_DATA_CACHE |498CRAT_CACHE_FLAGS_SIMD_CACHE),499.num_cu_shared = 8,500},501};502503static struct kfd_gpu_cache_info navi14_cache_info[] = {504{505/* TCP L1 Cache per CU */506.cache_size = 16,507.cache_level = 1,508.cache_line_size = 128,509.flags = (CRAT_CACHE_FLAGS_ENABLED |510CRAT_CACHE_FLAGS_DATA_CACHE |511CRAT_CACHE_FLAGS_SIMD_CACHE),512.num_cu_shared = 1,513},514{515/* Scalar L1 Instruction Cache per SQC */516.cache_size = 32,517.cache_level = 1,518.cache_line_size = 64,519.flags = (CRAT_CACHE_FLAGS_ENABLED |520CRAT_CACHE_FLAGS_INST_CACHE 
|521CRAT_CACHE_FLAGS_SIMD_CACHE),522.num_cu_shared = 2,523},524{525/* Scalar L1 Data Cache per SQC */526.cache_size = 16,527.cache_level = 1,528.cache_line_size = 64,529.flags = (CRAT_CACHE_FLAGS_ENABLED |530CRAT_CACHE_FLAGS_DATA_CACHE |531CRAT_CACHE_FLAGS_SIMD_CACHE),532.num_cu_shared = 2,533},534{535/* GL1 Data Cache per SA */536.cache_size = 128,537.cache_level = 1,538.cache_line_size = 128,539.flags = (CRAT_CACHE_FLAGS_ENABLED |540CRAT_CACHE_FLAGS_DATA_CACHE |541CRAT_CACHE_FLAGS_SIMD_CACHE),542.num_cu_shared = 12,543},544{545/* L2 Data Cache per GPU (Total Tex Cache) */546.cache_size = 2048,547.cache_level = 2,548.cache_line_size = 128,549.flags = (CRAT_CACHE_FLAGS_ENABLED |550CRAT_CACHE_FLAGS_DATA_CACHE |551CRAT_CACHE_FLAGS_SIMD_CACHE),552.num_cu_shared = 12,553},554};555556static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = {557{558/* TCP L1 Cache per CU */559.cache_size = 16,560.cache_level = 1,561.cache_line_size = 128,562.flags = (CRAT_CACHE_FLAGS_ENABLED |563CRAT_CACHE_FLAGS_DATA_CACHE |564CRAT_CACHE_FLAGS_SIMD_CACHE),565.num_cu_shared = 1,566},567{568/* Scalar L1 Instruction Cache per SQC */569.cache_size = 32,570.cache_level = 1,571.cache_line_size = 64,572.flags = (CRAT_CACHE_FLAGS_ENABLED |573CRAT_CACHE_FLAGS_INST_CACHE |574CRAT_CACHE_FLAGS_SIMD_CACHE),575.num_cu_shared = 2,576},577{578/* Scalar L1 Data Cache per SQC */579.cache_size = 16,580.cache_level = 1,581.cache_line_size = 64,582.flags = (CRAT_CACHE_FLAGS_ENABLED |583CRAT_CACHE_FLAGS_DATA_CACHE |584CRAT_CACHE_FLAGS_SIMD_CACHE),585.num_cu_shared = 2,586},587{588/* GL1 Data Cache per SA */589.cache_size = 128,590.cache_level = 1,591.cache_line_size = 128,592.flags = (CRAT_CACHE_FLAGS_ENABLED |593CRAT_CACHE_FLAGS_DATA_CACHE |594CRAT_CACHE_FLAGS_SIMD_CACHE),595.num_cu_shared = 10,596},597{598/* L2 Data Cache per GPU (Total Tex Cache) */599.cache_size = 4096,600.cache_level = 2,601.cache_line_size = 128,602.flags = (CRAT_CACHE_FLAGS_ENABLED |603CRAT_CACHE_FLAGS_DATA_CACHE 
|604CRAT_CACHE_FLAGS_SIMD_CACHE),605.num_cu_shared = 10,606},607{608/* L3 Data Cache per GPU */609.cache_size = 128*1024,610.cache_level = 3,611.cache_line_size = 64,612.flags = (CRAT_CACHE_FLAGS_ENABLED |613CRAT_CACHE_FLAGS_DATA_CACHE |614CRAT_CACHE_FLAGS_SIMD_CACHE),615.num_cu_shared = 10,616},617};618619static struct kfd_gpu_cache_info navy_flounder_cache_info[] = {620{621/* TCP L1 Cache per CU */622.cache_size = 16,623.cache_level = 1,624.cache_line_size = 128,625.flags = (CRAT_CACHE_FLAGS_ENABLED |626CRAT_CACHE_FLAGS_DATA_CACHE |627CRAT_CACHE_FLAGS_SIMD_CACHE),628.num_cu_shared = 1,629},630{631/* Scalar L1 Instruction Cache per SQC */632.cache_size = 32,633.cache_level = 1,634.cache_line_size = 64,635.flags = (CRAT_CACHE_FLAGS_ENABLED |636CRAT_CACHE_FLAGS_INST_CACHE |637CRAT_CACHE_FLAGS_SIMD_CACHE),638.num_cu_shared = 2,639},640{641/* Scalar L1 Data Cache per SQC */642.cache_size = 16,643.cache_level = 1,644.cache_line_size = 64,645.flags = (CRAT_CACHE_FLAGS_ENABLED |646CRAT_CACHE_FLAGS_DATA_CACHE |647CRAT_CACHE_FLAGS_SIMD_CACHE),648.num_cu_shared = 2,649},650{651/* GL1 Data Cache per SA */652.cache_size = 128,653.cache_level = 1,654.cache_line_size = 128,655.flags = (CRAT_CACHE_FLAGS_ENABLED |656CRAT_CACHE_FLAGS_DATA_CACHE |657CRAT_CACHE_FLAGS_SIMD_CACHE),658.num_cu_shared = 10,659},660{661/* L2 Data Cache per GPU (Total Tex Cache) */662.cache_size = 3072,663.cache_level = 2,664.cache_line_size = 128,665.flags = (CRAT_CACHE_FLAGS_ENABLED |666CRAT_CACHE_FLAGS_DATA_CACHE |667CRAT_CACHE_FLAGS_SIMD_CACHE),668.num_cu_shared = 10,669},670{671/* L3 Data Cache per GPU */672.cache_size = 96*1024,673.cache_level = 3,674.cache_line_size = 64,675.flags = (CRAT_CACHE_FLAGS_ENABLED |676CRAT_CACHE_FLAGS_DATA_CACHE |677CRAT_CACHE_FLAGS_SIMD_CACHE),678.num_cu_shared = 10,679},680};681682static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = {683{684/* TCP L1 Cache per CU */685.cache_size = 16,686.cache_level = 1,687.cache_line_size = 128,688.flags = 
(CRAT_CACHE_FLAGS_ENABLED |689CRAT_CACHE_FLAGS_DATA_CACHE |690CRAT_CACHE_FLAGS_SIMD_CACHE),691.num_cu_shared = 1,692},693{694/* Scalar L1 Instruction Cache per SQC */695.cache_size = 32,696.cache_level = 1,697.cache_line_size = 64,698.flags = (CRAT_CACHE_FLAGS_ENABLED |699CRAT_CACHE_FLAGS_INST_CACHE |700CRAT_CACHE_FLAGS_SIMD_CACHE),701.num_cu_shared = 2,702},703{704/* Scalar L1 Data Cache per SQC */705.cache_size = 16,706.cache_level = 1,707.cache_line_size = 64,708.flags = (CRAT_CACHE_FLAGS_ENABLED |709CRAT_CACHE_FLAGS_DATA_CACHE |710CRAT_CACHE_FLAGS_SIMD_CACHE),711.num_cu_shared = 2,712},713{714/* GL1 Data Cache per SA */715.cache_size = 128,716.cache_level = 1,717.cache_line_size = 128,718.flags = (CRAT_CACHE_FLAGS_ENABLED |719CRAT_CACHE_FLAGS_DATA_CACHE |720CRAT_CACHE_FLAGS_SIMD_CACHE),721.num_cu_shared = 8,722},723{724/* L2 Data Cache per GPU (Total Tex Cache) */725.cache_size = 2048,726.cache_level = 2,727.cache_line_size = 128,728.flags = (CRAT_CACHE_FLAGS_ENABLED |729CRAT_CACHE_FLAGS_DATA_CACHE |730CRAT_CACHE_FLAGS_SIMD_CACHE),731.num_cu_shared = 8,732},733{734/* L3 Data Cache per GPU */735.cache_size = 32*1024,736.cache_level = 3,737.cache_line_size = 64,738.flags = (CRAT_CACHE_FLAGS_ENABLED |739CRAT_CACHE_FLAGS_DATA_CACHE |740CRAT_CACHE_FLAGS_SIMD_CACHE),741.num_cu_shared = 8,742},743};744745static struct kfd_gpu_cache_info beige_goby_cache_info[] = {746{747/* TCP L1 Cache per CU */748.cache_size = 16,749.cache_level = 1,750.cache_line_size = 128,751.flags = (CRAT_CACHE_FLAGS_ENABLED |752CRAT_CACHE_FLAGS_DATA_CACHE |753CRAT_CACHE_FLAGS_SIMD_CACHE),754.num_cu_shared = 1,755},756{757/* Scalar L1 Instruction Cache per SQC */758.cache_size = 32,759.cache_level = 1,760.cache_line_size = 64,761.flags = (CRAT_CACHE_FLAGS_ENABLED |762CRAT_CACHE_FLAGS_INST_CACHE |763CRAT_CACHE_FLAGS_SIMD_CACHE),764.num_cu_shared = 2,765},766{767/* Scalar L1 Data Cache per SQC */768.cache_size = 16,769.cache_level = 1,770.cache_line_size = 64,771.flags = (CRAT_CACHE_FLAGS_ENABLED 
|772CRAT_CACHE_FLAGS_DATA_CACHE |773CRAT_CACHE_FLAGS_SIMD_CACHE),774.num_cu_shared = 2,775},776{777/* GL1 Data Cache per SA */778.cache_size = 128,779.cache_level = 1,780.cache_line_size = 128,781.flags = (CRAT_CACHE_FLAGS_ENABLED |782CRAT_CACHE_FLAGS_DATA_CACHE |783CRAT_CACHE_FLAGS_SIMD_CACHE),784.num_cu_shared = 8,785},786{787/* L2 Data Cache per GPU (Total Tex Cache) */788.cache_size = 1024,789.cache_level = 2,790.cache_line_size = 128,791.flags = (CRAT_CACHE_FLAGS_ENABLED |792CRAT_CACHE_FLAGS_DATA_CACHE |793CRAT_CACHE_FLAGS_SIMD_CACHE),794.num_cu_shared = 8,795},796{797/* L3 Data Cache per GPU */798.cache_size = 16*1024,799.cache_level = 3,800.cache_line_size = 64,801.flags = (CRAT_CACHE_FLAGS_ENABLED |802CRAT_CACHE_FLAGS_DATA_CACHE |803CRAT_CACHE_FLAGS_SIMD_CACHE),804.num_cu_shared = 8,805},806};807808static struct kfd_gpu_cache_info yellow_carp_cache_info[] = {809{810/* TCP L1 Cache per CU */811.cache_size = 16,812.cache_level = 1,813.cache_line_size = 128,814.flags = (CRAT_CACHE_FLAGS_ENABLED |815CRAT_CACHE_FLAGS_DATA_CACHE |816CRAT_CACHE_FLAGS_SIMD_CACHE),817.num_cu_shared = 1,818},819{820/* Scalar L1 Instruction Cache per SQC */821.cache_size = 32,822.cache_level = 1,823.cache_line_size = 64,824.flags = (CRAT_CACHE_FLAGS_ENABLED |825CRAT_CACHE_FLAGS_INST_CACHE |826CRAT_CACHE_FLAGS_SIMD_CACHE),827.num_cu_shared = 2,828},829{830/* Scalar L1 Data Cache per SQC */831.cache_size = 16,832.cache_level = 1,833.cache_line_size = 64,834.flags = (CRAT_CACHE_FLAGS_ENABLED |835CRAT_CACHE_FLAGS_DATA_CACHE |836CRAT_CACHE_FLAGS_SIMD_CACHE),837.num_cu_shared = 2,838},839{840/* GL1 Data Cache per SA */841.cache_size = 128,842.cache_level = 1,843.cache_line_size = 128,844.flags = (CRAT_CACHE_FLAGS_ENABLED |845CRAT_CACHE_FLAGS_DATA_CACHE |846CRAT_CACHE_FLAGS_SIMD_CACHE),847.num_cu_shared = 6,848},849{850/* L2 Data Cache per GPU (Total Tex Cache) */851.cache_size = 2048,852.cache_level = 2,853.cache_line_size = 128,854.flags = (CRAT_CACHE_FLAGS_ENABLED 
|855CRAT_CACHE_FLAGS_DATA_CACHE |856CRAT_CACHE_FLAGS_SIMD_CACHE),857.num_cu_shared = 6,858},859};860861static struct kfd_gpu_cache_info gfx1037_cache_info[] = {862{863/* TCP L1 Cache per CU */864.cache_size = 16,865.cache_level = 1,866.cache_line_size = 128,867.flags = (CRAT_CACHE_FLAGS_ENABLED |868CRAT_CACHE_FLAGS_DATA_CACHE |869CRAT_CACHE_FLAGS_SIMD_CACHE),870.num_cu_shared = 1,871},872{873/* Scalar L1 Instruction Cache per SQC */874.cache_size = 32,875.cache_level = 1,876.cache_line_size = 64,877.flags = (CRAT_CACHE_FLAGS_ENABLED |878CRAT_CACHE_FLAGS_INST_CACHE |879CRAT_CACHE_FLAGS_SIMD_CACHE),880.num_cu_shared = 2,881},882{883/* Scalar L1 Data Cache per SQC */884.cache_size = 16,885.cache_level = 1,886.cache_line_size = 64,887.flags = (CRAT_CACHE_FLAGS_ENABLED |888CRAT_CACHE_FLAGS_DATA_CACHE |889CRAT_CACHE_FLAGS_SIMD_CACHE),890.num_cu_shared = 2,891},892{893/* GL1 Data Cache per SA */894.cache_size = 128,895.cache_level = 1,896.cache_line_size = 128,897.flags = (CRAT_CACHE_FLAGS_ENABLED |898CRAT_CACHE_FLAGS_DATA_CACHE |899CRAT_CACHE_FLAGS_SIMD_CACHE),900.num_cu_shared = 2,901},902{903/* L2 Data Cache per GPU (Total Tex Cache) */904.cache_size = 256,905.cache_level = 2,906.cache_line_size = 128,907.flags = (CRAT_CACHE_FLAGS_ENABLED |908CRAT_CACHE_FLAGS_DATA_CACHE |909CRAT_CACHE_FLAGS_SIMD_CACHE),910.num_cu_shared = 2,911},912};913914static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = {915{916/* TCP L1 Cache per CU */917.cache_size = 16,918.cache_level = 1,919.cache_line_size = 128,920.flags = (CRAT_CACHE_FLAGS_ENABLED |921CRAT_CACHE_FLAGS_DATA_CACHE |922CRAT_CACHE_FLAGS_SIMD_CACHE),923.num_cu_shared = 1,924},925{926/* Scalar L1 Instruction Cache per SQC */927.cache_size = 32,928.cache_level = 1,929.cache_line_size = 64,930.flags = (CRAT_CACHE_FLAGS_ENABLED |931CRAT_CACHE_FLAGS_INST_CACHE |932CRAT_CACHE_FLAGS_SIMD_CACHE),933.num_cu_shared = 2,934},935{936/* Scalar L1 Data Cache per SQC */937.cache_size = 16,938.cache_level = 1,939.cache_line_size = 
64,940.flags = (CRAT_CACHE_FLAGS_ENABLED |941CRAT_CACHE_FLAGS_DATA_CACHE |942CRAT_CACHE_FLAGS_SIMD_CACHE),943.num_cu_shared = 2,944},945{946/* GL1 Data Cache per SA */947.cache_size = 128,948.cache_level = 1,949.cache_line_size = 128,950.flags = (CRAT_CACHE_FLAGS_ENABLED |951CRAT_CACHE_FLAGS_DATA_CACHE |952CRAT_CACHE_FLAGS_SIMD_CACHE),953.num_cu_shared = 2,954},955{956/* L2 Data Cache per GPU (Total Tex Cache) */957.cache_size = 256,958.cache_level = 2,959.cache_line_size = 128,960.flags = (CRAT_CACHE_FLAGS_ENABLED |961CRAT_CACHE_FLAGS_DATA_CACHE |962CRAT_CACHE_FLAGS_SIMD_CACHE),963.num_cu_shared = 2,964},965};966967static struct kfd_gpu_cache_info dummy_cache_info[] = {968{969/* TCP L1 Cache per CU */970.cache_size = 16,971.cache_level = 1,972.cache_line_size = 64,973.flags = (CRAT_CACHE_FLAGS_ENABLED |974CRAT_CACHE_FLAGS_DATA_CACHE |975CRAT_CACHE_FLAGS_SIMD_CACHE),976.num_cu_shared = 1,977},978{979/* Scalar L1 Instruction Cache per SQC */980.cache_size = 32,981.cache_level = 1,982.cache_line_size = 64,983.flags = (CRAT_CACHE_FLAGS_ENABLED |984CRAT_CACHE_FLAGS_INST_CACHE |985CRAT_CACHE_FLAGS_SIMD_CACHE),986.num_cu_shared = 2,987},988{989/* Scalar L1 Data Cache per SQC */990.cache_size = 16,991.cache_level = 1,992.cache_line_size = 64,993.flags = (CRAT_CACHE_FLAGS_ENABLED |994CRAT_CACHE_FLAGS_DATA_CACHE |995CRAT_CACHE_FLAGS_SIMD_CACHE),996.num_cu_shared = 2,997},998{999/* GL1 Data Cache per SA */1000.cache_size = 128,1001.cache_level = 1,1002.cache_line_size = 64,1003.flags = (CRAT_CACHE_FLAGS_ENABLED |1004CRAT_CACHE_FLAGS_DATA_CACHE |1005CRAT_CACHE_FLAGS_SIMD_CACHE),1006.num_cu_shared = 6,1007},1008{1009/* L2 Data Cache per GPU (Total Tex Cache) */1010.cache_size = 2048,1011.cache_level = 2,1012.cache_line_size = 64,1013.flags = (CRAT_CACHE_FLAGS_ENABLED |1014CRAT_CACHE_FLAGS_DATA_CACHE |1015CRAT_CACHE_FLAGS_SIMD_CACHE),1016.num_cu_shared = 6,1017},1018};10191020static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,1021struct 
crat_subtype_computeunit *cu)1022{1023dev->node_props.cpu_cores_count = cu->num_cpu_cores;1024dev->node_props.cpu_core_id_base = cu->processor_id_low;1025if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)1026dev->node_props.capability |= HSA_CAP_ATS_PRESENT;10271028pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,1029cu->processor_id_low);1030}10311032static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,1033struct crat_subtype_computeunit *cu)1034{1035dev->node_props.simd_id_base = cu->processor_id_low;1036dev->node_props.simd_count = cu->num_simd_cores;1037dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;1038dev->node_props.max_waves_per_simd = cu->max_waves_simd;1039dev->node_props.wave_front_size = cu->wave_front_size;1040dev->node_props.array_count = cu->array_count;1041dev->node_props.cu_per_simd_array = cu->num_cu_per_array;1042dev->node_props.simd_per_cu = cu->num_simd_per_cu;1043dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;1044if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)1045dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;1046pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low);1047}10481049/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct1050* topology device present in the device_list1051*/1052static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu,1053struct list_head *device_list)1054{1055struct kfd_topology_device *dev;10561057pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",1058cu->proximity_domain, cu->hsa_capability);1059list_for_each_entry(dev, device_list, list) {1060if (cu->proximity_domain == dev->proximity_domain) {1061if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)1062kfd_populated_cu_info_cpu(dev, cu);10631064if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)1065kfd_populated_cu_info_gpu(dev, cu);1066break;1067}1068}10691070return 0;1071}10721073static struct kfd_mem_properties *1074find_subtype_mem(uint32_t heap_type, 
uint32_t flags, uint32_t width,1075struct kfd_topology_device *dev)1076{1077struct kfd_mem_properties *props;10781079list_for_each_entry(props, &dev->mem_props, list) {1080if (props->heap_type == heap_type1081&& props->flags == flags1082&& props->width == width)1083return props;1084}10851086return NULL;1087}1088/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct1089* topology device present in the device_list1090*/1091static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem,1092struct list_head *device_list)1093{1094struct kfd_mem_properties *props;1095struct kfd_topology_device *dev;1096uint32_t heap_type;1097uint64_t size_in_bytes;1098uint32_t flags = 0;1099uint32_t width;11001101pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n",1102mem->proximity_domain);1103list_for_each_entry(dev, device_list, list) {1104if (mem->proximity_domain == dev->proximity_domain) {1105/* We're on GPU node */1106if (dev->node_props.cpu_cores_count == 0) {1107/* APU */1108if (mem->visibility_type == 0)1109heap_type =1110HSA_MEM_HEAP_TYPE_FB_PRIVATE;1111/* dGPU */1112else1113heap_type = mem->visibility_type;1114} else1115heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;11161117if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)1118flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;1119if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)1120flags |= HSA_MEM_FLAGS_NON_VOLATILE;11211122size_in_bytes =1123((uint64_t)mem->length_high << 32) +1124mem->length_low;1125width = mem->width;11261127/* Multiple banks of the same type are aggregated into1128* one. User mode doesn't care about multiple physical1129* memory segments. 
It's managed as a single virtual1130* heap for user mode.1131*/1132props = find_subtype_mem(heap_type, flags, width, dev);1133if (props) {1134props->size_in_bytes += size_in_bytes;1135break;1136}11371138props = kfd_alloc_struct(props);1139if (!props)1140return -ENOMEM;11411142props->heap_type = heap_type;1143props->flags = flags;1144props->size_in_bytes = size_in_bytes;1145props->width = width;11461147dev->node_props.mem_banks_count++;1148list_add_tail(&props->list, &dev->mem_props);11491150break;1151}1152}11531154return 0;1155}11561157/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct1158* topology device present in the device_list1159*/1160static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,1161struct list_head *device_list)1162{1163struct kfd_cache_properties *props;1164struct kfd_topology_device *dev;1165uint32_t id;1166uint32_t total_num_of_cu;11671168id = cache->processor_id_low;11691170pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id);1171list_for_each_entry(dev, device_list, list) {1172total_num_of_cu = (dev->node_props.array_count *1173dev->node_props.cu_per_simd_array);11741175/* Cache infomration in CRAT doesn't have proximity_domain1176* information as it is associated with a CPU core or GPU1177* Compute Unit. So map the cache using CPU core Id or SIMD1178* (GPU) ID.1179* TODO: This works because currently we can safely assume that1180* Compute Units are parsed before caches are parsed. 
In1181* future, remove this dependency1182*/1183if ((id >= dev->node_props.cpu_core_id_base &&1184id <= dev->node_props.cpu_core_id_base +1185dev->node_props.cpu_cores_count) ||1186(id >= dev->node_props.simd_id_base &&1187id < dev->node_props.simd_id_base +1188total_num_of_cu)) {1189props = kfd_alloc_struct(props);1190if (!props)1191return -ENOMEM;11921193props->processor_id_low = id;1194props->cache_level = cache->cache_level;1195props->cache_size = cache->cache_size;1196props->cacheline_size = cache->cache_line_size;1197props->cachelines_per_tag = cache->lines_per_tag;1198props->cache_assoc = cache->associativity;1199props->cache_latency = cache->cache_latency;12001201memcpy(props->sibling_map, cache->sibling_map,1202CRAT_SIBLINGMAP_SIZE);12031204/* set the sibling_map_size as 32 for CRAT from ACPI */1205props->sibling_map_size = CRAT_SIBLINGMAP_SIZE;12061207if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)1208props->cache_type |= HSA_CACHE_TYPE_DATA;1209if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)1210props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;1211if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)1212props->cache_type |= HSA_CACHE_TYPE_CPU;1213if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)1214props->cache_type |= HSA_CACHE_TYPE_HSACU;12151216dev->node_props.caches_count++;1217list_add_tail(&props->list, &dev->cache_props);12181219break;1220}1221}12221223return 0;1224}12251226/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct1227* topology device present in the device_list1228*/1229static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,1230struct list_head *device_list)1231{1232struct kfd_iolink_properties *props = NULL, *props2;1233struct kfd_topology_device *dev, *to_dev;1234uint32_t id_from;1235uint32_t id_to;12361237id_from = iolink->proximity_domain_from;1238id_to = iolink->proximity_domain_to;12391240pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to %d\n",1241id_from, 
id_to);1242list_for_each_entry(dev, device_list, list) {1243if (id_from == dev->proximity_domain) {1244props = kfd_alloc_struct(props);1245if (!props)1246return -ENOMEM;12471248props->node_from = id_from;1249props->node_to = id_to;1250props->ver_maj = iolink->version_major;1251props->ver_min = iolink->version_minor;1252props->iolink_type = iolink->io_interface_type;12531254if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)1255props->weight = 20;1256else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)1257props->weight = iolink->weight_xgmi;1258else1259props->weight = node_distance(id_from, id_to);12601261props->min_latency = iolink->minimum_latency;1262props->max_latency = iolink->maximum_latency;1263props->min_bandwidth = iolink->minimum_bandwidth_mbs;1264props->max_bandwidth = iolink->maximum_bandwidth_mbs;1265props->rec_transfer_size =1266iolink->recommended_transfer_size;12671268dev->node_props.io_links_count++;1269list_add_tail(&props->list, &dev->io_link_props);1270break;1271}1272}12731274/* CPU topology is created before GPUs are detected, so CPU->GPU1275* links are not built at that time. 
If a PCIe type is discovered, it1276* means a GPU is detected and we are adding GPU->CPU to the topology.1277* At this time, also add the corresponded CPU->GPU link if GPU1278* is large bar.1279* For xGMI, we only added the link with one direction in the crat1280* table, add corresponded reversed direction link now.1281*/1282if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {1283to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);1284if (!to_dev)1285return -ENODEV;1286/* same everything but the other direction */1287props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);1288if (!props2)1289return -ENOMEM;12901291props2->node_from = id_to;1292props2->node_to = id_from;1293props2->kobj = NULL;1294to_dev->node_props.io_links_count++;1295list_add_tail(&props2->list, &to_dev->io_link_props);1296}12971298return 0;1299}13001301/* kfd_parse_subtype - parse subtypes and attach it to correct topology device1302* present in the device_list1303* @sub_type_hdr - subtype section of crat_image1304* @device_list - list of topology devices present in this crat_image1305*/1306static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,1307struct list_head *device_list)1308{1309struct crat_subtype_computeunit *cu;1310struct crat_subtype_memory *mem;1311struct crat_subtype_cache *cache;1312struct crat_subtype_iolink *iolink;1313int ret = 0;13141315switch (sub_type_hdr->type) {1316case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:1317cu = (struct crat_subtype_computeunit *)sub_type_hdr;1318ret = kfd_parse_subtype_cu(cu, device_list);1319break;1320case CRAT_SUBTYPE_MEMORY_AFFINITY:1321mem = (struct crat_subtype_memory *)sub_type_hdr;1322ret = kfd_parse_subtype_mem(mem, device_list);1323break;1324case CRAT_SUBTYPE_CACHE_AFFINITY:1325cache = (struct crat_subtype_cache *)sub_type_hdr;1326ret = kfd_parse_subtype_cache(cache, device_list);1327break;1328case CRAT_SUBTYPE_TLB_AFFINITY:1329/*1330* For now, nothing to do here1331*/1332pr_debug("Found TLB entry in CRAT 
table (not processing)\n");1333break;1334case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:1335/*1336* For now, nothing to do here1337*/1338pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");1339break;1340case CRAT_SUBTYPE_IOLINK_AFFINITY:1341iolink = (struct crat_subtype_iolink *)sub_type_hdr;1342ret = kfd_parse_subtype_iolink(iolink, device_list);1343break;1344default:1345pr_warn("Unknown subtype %d in CRAT\n",1346sub_type_hdr->type);1347}13481349return ret;1350}13511352/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT1353* create a kfd_topology_device and add in to device_list. Also parse1354* CRAT subtypes and attach it to appropriate kfd_topology_device1355* @crat_image - input image containing CRAT1356* @device_list - [OUT] list of kfd_topology_device generated after1357* parsing crat_image1358* @proximity_domain - Proximity domain of the first device in the table1359*1360* Return - 0 if successful else -ve value1361*/1362int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,1363uint32_t proximity_domain)1364{1365struct kfd_topology_device *top_dev = NULL;1366struct crat_subtype_generic *sub_type_hdr;1367uint16_t node_id;1368int ret = 0;1369struct crat_header *crat_table = (struct crat_header *)crat_image;1370uint16_t num_nodes;1371uint32_t image_len;13721373if (!crat_image)1374return -EINVAL;13751376if (!list_empty(device_list)) {1377pr_warn("Error device list should be empty\n");1378return -EINVAL;1379}13801381num_nodes = crat_table->num_domains;1382image_len = crat_table->length;13831384pr_debug("Parsing CRAT table with %d nodes\n", num_nodes);13851386for (node_id = 0; node_id < num_nodes; node_id++) {1387top_dev = kfd_create_topology_device(device_list);1388if (!top_dev)1389break;1390top_dev->proximity_domain = proximity_domain++;1391}13921393if (!top_dev) {1394ret = -ENOMEM;1395goto err;1396}13971398memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);1399memcpy(top_dev->oem_table_id, 
crat_table->oem_table_id,1400CRAT_OEMTABLEID_LENGTH);1401top_dev->oem_revision = crat_table->oem_revision;14021403sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);1404while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <1405((char *)crat_image) + image_len) {1406if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {1407ret = kfd_parse_subtype(sub_type_hdr, device_list);1408if (ret)1409break;1410}14111412sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +1413sub_type_hdr->length);1414}14151416err:1417if (ret)1418kfd_release_topology_device_list(device_list);14191420return ret;1421}142214231424static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,1425bool cache_line_size_missing,1426struct kfd_gpu_cache_info *pcache_info)1427{1428struct amdgpu_device *adev = kdev->adev;1429int i = 0;14301431/* TCP L1 Cache per CU */1432if (adev->gfx.config.gc_tcp_l1_size) {1433pcache_info[i].cache_size = adev->gfx.config.gc_tcp_l1_size;1434pcache_info[i].cache_level = 1;1435pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1436CRAT_CACHE_FLAGS_DATA_CACHE |1437CRAT_CACHE_FLAGS_SIMD_CACHE);1438pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2;1439pcache_info[i].cache_line_size = adev->gfx.config.gc_tcp_cache_line_size;1440if (cache_line_size_missing && !pcache_info[i].cache_line_size)1441pcache_info[i].cache_line_size = 128;1442i++;1443}1444/* Scalar L1 Instruction Cache per SQC */1445if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {1446pcache_info[i].cache_size =1447adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;1448pcache_info[i].cache_level = 1;1449pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1450CRAT_CACHE_FLAGS_INST_CACHE |1451CRAT_CACHE_FLAGS_SIMD_CACHE);1452pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;1453pcache_info[i].cache_line_size = adev->gfx.config.gc_instruction_cache_line_size;1454if (cache_line_size_missing && 
!pcache_info[i].cache_line_size)1455pcache_info[i].cache_line_size = 128;1456i++;1457}1458/* Scalar L1 Data Cache per SQC */1459if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {1460pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;1461pcache_info[i].cache_level = 1;1462pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1463CRAT_CACHE_FLAGS_DATA_CACHE |1464CRAT_CACHE_FLAGS_SIMD_CACHE);1465pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2;1466pcache_info[i].cache_line_size = adev->gfx.config.gc_scalar_data_cache_line_size;1467if (cache_line_size_missing && !pcache_info[i].cache_line_size)1468pcache_info[i].cache_line_size = 64;1469i++;1470}1471/* GL1 Data Cache per SA */1472if (adev->gfx.config.gc_gl1c_per_sa &&1473adev->gfx.config.gc_gl1c_size_per_instance) {1474pcache_info[i].cache_size = adev->gfx.config.gc_gl1c_per_sa *1475adev->gfx.config.gc_gl1c_size_per_instance;1476pcache_info[i].cache_level = 1;1477pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1478CRAT_CACHE_FLAGS_DATA_CACHE |1479CRAT_CACHE_FLAGS_SIMD_CACHE);1480pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;1481if (cache_line_size_missing)1482pcache_info[i].cache_line_size = 128;1483i++;1484}1485/* L2 Data Cache per GPU (Total Tex Cache) */1486if (adev->gfx.config.gc_gl2c_per_gpu) {1487pcache_info[i].cache_size = adev->gfx.config.gc_gl2c_per_gpu;1488pcache_info[i].cache_level = 2;1489pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1490CRAT_CACHE_FLAGS_DATA_CACHE |1491CRAT_CACHE_FLAGS_SIMD_CACHE);1492pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;1493pcache_info[i].cache_line_size = adev->gfx.config.gc_tcc_cache_line_size;1494if (cache_line_size_missing && !pcache_info[i].cache_line_size)1495pcache_info[i].cache_line_size = 128;1496i++;1497}1498/* L3 Data Cache per GPU */1499if (adev->gmc.mall_size) {1500pcache_info[i].cache_size = adev->gmc.mall_size / 1024;1501pcache_info[i].cache_level = 3;1502pcache_info[i].flags = 
(CRAT_CACHE_FLAGS_ENABLED |1503CRAT_CACHE_FLAGS_DATA_CACHE |1504CRAT_CACHE_FLAGS_SIMD_CACHE);1505pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;1506pcache_info[i].cache_line_size = 64;1507i++;1508}1509return i;1510}15111512static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev,1513struct kfd_gpu_cache_info *pcache_info)1514{1515struct amdgpu_device *adev = kdev->adev;1516int i = 0;15171518/* TCP L1 Cache per CU */1519if (adev->gfx.config.gc_tcp_size_per_cu) {1520pcache_info[i].cache_size = adev->gfx.config.gc_tcp_size_per_cu;1521pcache_info[i].cache_level = 1;1522/* Cacheline size not available in IP discovery for gc943,gc944 */1523pcache_info[i].cache_line_size = 128;1524pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1525CRAT_CACHE_FLAGS_DATA_CACHE |1526CRAT_CACHE_FLAGS_SIMD_CACHE);1527pcache_info[i].num_cu_shared = 1;1528i++;1529}1530/* Scalar L1 Instruction Cache per SQC */1531if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) {1532pcache_info[i].cache_size =1533adev->gfx.config.gc_l1_instruction_cache_size_per_sqc;1534pcache_info[i].cache_level = 1;1535pcache_info[i].cache_line_size = 64;1536pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1537CRAT_CACHE_FLAGS_INST_CACHE |1538CRAT_CACHE_FLAGS_SIMD_CACHE);1539pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_cu_per_sqc;1540i++;1541}1542/* Scalar L1 Data Cache per SQC */1543if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) {1544pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc;1545pcache_info[i].cache_level = 1;1546pcache_info[i].cache_line_size = 64;1547pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1548CRAT_CACHE_FLAGS_DATA_CACHE |1549CRAT_CACHE_FLAGS_SIMD_CACHE);1550pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_cu_per_sqc;1551i++;1552}1553/* L2 Data Cache per GPU (Total Tex Cache) */1554if (adev->gfx.config.gc_tcc_size) {1555pcache_info[i].cache_size = adev->gfx.config.gc_tcc_size;1556pcache_info[i].cache_level = 
2;1557pcache_info[i].cache_line_size = 128;1558pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1559CRAT_CACHE_FLAGS_DATA_CACHE |1560CRAT_CACHE_FLAGS_SIMD_CACHE);1561pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;1562i++;1563}1564/* L3 Data Cache per GPU */1565if (adev->gmc.mall_size) {1566pcache_info[i].cache_size = adev->gmc.mall_size / 1024;1567pcache_info[i].cache_level = 3;1568pcache_info[i].cache_line_size = 64;1569pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED |1570CRAT_CACHE_FLAGS_DATA_CACHE |1571CRAT_CACHE_FLAGS_SIMD_CACHE);1572pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh;1573i++;1574}1575return i;1576}15771578int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pcache_info)1579{1580int num_of_cache_types = 0;1581bool cache_line_size_missing = false;15821583switch (kdev->adev->asic_type) {1584case CHIP_KAVERI:1585*pcache_info = kaveri_cache_info;1586num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);1587break;1588case CHIP_HAWAII:1589*pcache_info = hawaii_cache_info;1590num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);1591break;1592case CHIP_CARRIZO:1593*pcache_info = carrizo_cache_info;1594num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);1595break;1596case CHIP_TONGA:1597*pcache_info = tonga_cache_info;1598num_of_cache_types = ARRAY_SIZE(tonga_cache_info);1599break;1600case CHIP_FIJI:1601*pcache_info = fiji_cache_info;1602num_of_cache_types = ARRAY_SIZE(fiji_cache_info);1603break;1604case CHIP_POLARIS10:1605*pcache_info = polaris10_cache_info;1606num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);1607break;1608case CHIP_POLARIS11:1609*pcache_info = polaris11_cache_info;1610num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);1611break;1612case CHIP_POLARIS12:1613*pcache_info = polaris12_cache_info;1614num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);1615break;1616case CHIP_VEGAM:1617*pcache_info = vegam_cache_info;1618num_of_cache_types = 
ARRAY_SIZE(vegam_cache_info);1619break;1620default:1621switch (KFD_GC_VERSION(kdev)) {1622case IP_VERSION(9, 0, 1):1623*pcache_info = vega10_cache_info;1624num_of_cache_types = ARRAY_SIZE(vega10_cache_info);1625break;1626case IP_VERSION(9, 2, 1):1627*pcache_info = vega12_cache_info;1628num_of_cache_types = ARRAY_SIZE(vega12_cache_info);1629break;1630case IP_VERSION(9, 4, 0):1631case IP_VERSION(9, 4, 1):1632*pcache_info = vega20_cache_info;1633num_of_cache_types = ARRAY_SIZE(vega20_cache_info);1634break;1635case IP_VERSION(9, 4, 2):1636*pcache_info = aldebaran_cache_info;1637num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info);1638break;1639case IP_VERSION(9, 4, 3):1640case IP_VERSION(9, 4, 4):1641case IP_VERSION(9, 5, 0):1642num_of_cache_types =1643kfd_fill_gpu_cache_info_from_gfx_config_v2(kdev->kfd,1644*pcache_info);1645break;1646case IP_VERSION(9, 1, 0):1647case IP_VERSION(9, 2, 2):1648*pcache_info = raven_cache_info;1649num_of_cache_types = ARRAY_SIZE(raven_cache_info);1650break;1651case IP_VERSION(9, 3, 0):1652*pcache_info = renoir_cache_info;1653num_of_cache_types = ARRAY_SIZE(renoir_cache_info);1654break;1655case IP_VERSION(10, 1, 10):1656case IP_VERSION(10, 1, 2):1657case IP_VERSION(10, 1, 3):1658case IP_VERSION(10, 1, 4):1659*pcache_info = navi10_cache_info;1660num_of_cache_types = ARRAY_SIZE(navi10_cache_info);1661break;1662case IP_VERSION(10, 1, 1):1663*pcache_info = navi14_cache_info;1664num_of_cache_types = ARRAY_SIZE(navi14_cache_info);1665break;1666case IP_VERSION(10, 3, 0):1667*pcache_info = sienna_cichlid_cache_info;1668num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info);1669break;1670case IP_VERSION(10, 3, 2):1671*pcache_info = navy_flounder_cache_info;1672num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info);1673break;1674case IP_VERSION(10, 3, 4):1675*pcache_info = dimgrey_cavefish_cache_info;1676num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info);1677break;1678case IP_VERSION(10, 3, 1):1679*pcache_info = 
vangogh_cache_info;1680num_of_cache_types = ARRAY_SIZE(vangogh_cache_info);1681break;1682case IP_VERSION(10, 3, 5):1683*pcache_info = beige_goby_cache_info;1684num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info);1685break;1686case IP_VERSION(10, 3, 3):1687*pcache_info = yellow_carp_cache_info;1688num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info);1689break;1690case IP_VERSION(10, 3, 6):1691*pcache_info = gc_10_3_6_cache_info;1692num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info);1693break;1694case IP_VERSION(10, 3, 7):1695*pcache_info = gfx1037_cache_info;1696num_of_cache_types = ARRAY_SIZE(gfx1037_cache_info);1697break;1698case IP_VERSION(11, 0, 0):1699case IP_VERSION(11, 0, 1):1700case IP_VERSION(11, 0, 2):1701case IP_VERSION(11, 0, 3):1702case IP_VERSION(11, 0, 4):1703case IP_VERSION(11, 5, 0):1704case IP_VERSION(11, 5, 1):1705case IP_VERSION(11, 5, 2):1706case IP_VERSION(11, 5, 3):1707/* Cacheline size not available in IP discovery for gc11.1708* kfd_fill_gpu_cache_info_from_gfx_config to hard code it1709*/1710cache_line_size_missing = true;1711fallthrough;1712case IP_VERSION(12, 0, 0):1713case IP_VERSION(12, 0, 1):1714num_of_cache_types =1715kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd,1716cache_line_size_missing,1717*pcache_info);1718break;1719default:1720*pcache_info = dummy_cache_info;1721num_of_cache_types = ARRAY_SIZE(dummy_cache_info);1722pr_warn("dummy cache info is used temporarily and real cache info need update later.\n");1723break;1724}1725}1726return num_of_cache_types;1727}17281729/* Memory required to create Virtual CRAT.1730* Since there is no easy way to predict the amount of memory required, the1731* following amount is allocated for GPU Virtual CRAT. This is1732* expected to cover all known conditions. 
But to be safe additional check1733* is put in the code to ensure we don't overwrite.1734*/1735#define VCRAT_SIZE_FOR_GPU (4 * PAGE_SIZE)17361737/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node1738*1739* @numa_node_id: CPU NUMA node id1740* @avail_size: Available size in the memory1741* @sub_type_hdr: Memory into which compute info will be filled in1742*1743* Return 0 if successful else return -ve value1744*/1745static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size,1746int proximity_domain,1747struct crat_subtype_computeunit *sub_type_hdr)1748{1749const struct cpumask *cpumask;17501751*avail_size -= sizeof(struct crat_subtype_computeunit);1752if (*avail_size < 0)1753return -ENOMEM;17541755memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));17561757/* Fill in subtype header data */1758sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;1759sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);1760sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;17611762cpumask = cpumask_of_node(numa_node_id);17631764/* Fill in CU data */1765sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT;1766sub_type_hdr->proximity_domain = proximity_domain;1767sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id);1768if (sub_type_hdr->processor_id_low == -1)1769return -EINVAL;17701771sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask);17721773return 0;1774}17751776/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node1777*1778* @numa_node_id: CPU NUMA node id1779* @avail_size: Available size in the memory1780* @sub_type_hdr: Memory into which compute info will be filled in1781*1782* Return 0 if successful else return -ve value1783*/1784static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size,1785int proximity_domain,1786struct crat_subtype_memory *sub_type_hdr)1787{1788uint64_t mem_in_bytes = 0;1789pg_data_t *pgdat;1790int zone_type;17911792*avail_size -= sizeof(struct 
crat_subtype_memory);1793if (*avail_size < 0)1794return -ENOMEM;17951796memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory));17971798/* Fill in subtype header data */1799sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;1800sub_type_hdr->length = sizeof(struct crat_subtype_memory);1801sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;18021803/* Fill in Memory Subunit data */18041805/* Unlike si_meminfo, si_meminfo_node is not exported. So1806* the following lines are duplicated from si_meminfo_node1807* function1808*/1809pgdat = NODE_DATA(numa_node_id);1810for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)1811mem_in_bytes += zone_managed_pages(&pgdat->node_zones[zone_type]);1812mem_in_bytes <<= PAGE_SHIFT;18131814sub_type_hdr->length_low = lower_32_bits(mem_in_bytes);1815sub_type_hdr->length_high = upper_32_bits(mem_in_bytes);1816sub_type_hdr->proximity_domain = proximity_domain;18171818return 0;1819}18201821#ifdef CONFIG_X86_641822static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size,1823uint32_t *num_entries,1824struct crat_subtype_iolink *sub_type_hdr)1825{1826int nid;1827struct cpuinfo_x86 *c = &cpu_data(0);1828uint8_t link_type;18291830if (c->x86_vendor == X86_VENDOR_AMD)1831link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT;1832else1833link_type = CRAT_IOLINK_TYPE_QPI_1_1;18341835*num_entries = 0;18361837/* Create IO links from this node to other CPU nodes */1838for_each_online_node(nid) {1839if (nid == numa_node_id) /* node itself */1840continue;18411842*avail_size -= sizeof(struct crat_subtype_iolink);1843if (*avail_size < 0)1844return -ENOMEM;18451846memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));18471848/* Fill in subtype header data */1849sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;1850sub_type_hdr->length = sizeof(struct crat_subtype_iolink);1851sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;18521853/* Fill in IO link data */1854sub_type_hdr->proximity_domain_from = 
numa_node_id;1855sub_type_hdr->proximity_domain_to = nid;1856sub_type_hdr->io_interface_type = link_type;18571858(*num_entries)++;1859sub_type_hdr++;1860}18611862return 0;1863}1864#endif18651866/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU1867*1868* @pcrat_image: Fill in VCRAT for CPU1869* @size: [IN] allocated size of crat_image.1870* [OUT] actual size of data filled in crat_image1871*/1872static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)1873{1874struct crat_header *crat_table = (struct crat_header *)pcrat_image;1875struct acpi_table_header *acpi_table;1876acpi_status status;1877struct crat_subtype_generic *sub_type_hdr;1878int avail_size = *size;1879int numa_node_id;1880#ifdef CONFIG_X86_641881uint32_t entries = 0;1882#endif1883int ret = 0;18841885if (!pcrat_image)1886return -EINVAL;18871888/* Fill in CRAT Header.1889* Modify length and total_entries as subunits are added.1890*/1891avail_size -= sizeof(struct crat_header);1892if (avail_size < 0)1893return -ENOMEM;18941895memset(crat_table, 0, sizeof(struct crat_header));1896memcpy(&crat_table->signature, CRAT_SIGNATURE,1897sizeof(crat_table->signature));1898crat_table->length = sizeof(struct crat_header);18991900status = acpi_get_table("DSDT", 0, &acpi_table);1901if (status != AE_OK)1902pr_warn("DSDT table not found for OEM information\n");1903else {1904crat_table->oem_revision = acpi_table->revision;1905memcpy(crat_table->oem_id, acpi_table->oem_id,1906CRAT_OEMID_LENGTH);1907memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,1908CRAT_OEMTABLEID_LENGTH);1909acpi_put_table(acpi_table);1910}1911crat_table->total_entries = 0;1912crat_table->num_domains = 0;19131914sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);19151916for_each_online_node(numa_node_id) {1917if (kfd_numa_node_to_apic_id(numa_node_id) == -1)1918continue;19191920/* Fill in Subtype: Compute Unit */1921ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size,1922crat_table->num_domains,1923(struct 
crat_subtype_computeunit *)sub_type_hdr);1924if (ret < 0)1925return ret;1926crat_table->length += sub_type_hdr->length;1927crat_table->total_entries++;19281929sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +1930sub_type_hdr->length);19311932/* Fill in Subtype: Memory */1933ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size,1934crat_table->num_domains,1935(struct crat_subtype_memory *)sub_type_hdr);1936if (ret < 0)1937return ret;1938crat_table->length += sub_type_hdr->length;1939crat_table->total_entries++;19401941sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +1942sub_type_hdr->length);19431944/* Fill in Subtype: IO Link */1945#ifdef CONFIG_X86_641946ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size,1947&entries,1948(struct crat_subtype_iolink *)sub_type_hdr);1949if (ret < 0)1950return ret;19511952if (entries) {1953crat_table->length += (sub_type_hdr->length * entries);1954crat_table->total_entries += entries;19551956sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +1957sub_type_hdr->length * entries);1958}1959#else1960pr_info("IO link not available for non x86 platforms\n");1961#endif19621963crat_table->num_domains++;1964}19651966/* TODO: Add cache Subtype for CPU.1967* Currently, CPU cache information is available in function1968* detect_cache_attributes(cpu) defined in the file1969* ./arch/x86/kernel/cpu/intel_cacheinfo.c. 
This function is not1970* exported and to get the same information the code needs to be1971* duplicated.1972*/19731974*size = crat_table->length;1975pr_info("Virtual CRAT table created for CPU\n");19761977return 0;1978}19791980static int kfd_fill_gpu_memory_affinity(int *avail_size,1981struct kfd_node *kdev, uint8_t type, uint64_t size,1982struct crat_subtype_memory *sub_type_hdr,1983uint32_t proximity_domain,1984const struct kfd_local_mem_info *local_mem_info)1985{1986*avail_size -= sizeof(struct crat_subtype_memory);1987if (*avail_size < 0)1988return -ENOMEM;19891990memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));1991sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;1992sub_type_hdr->length = sizeof(struct crat_subtype_memory);1993sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;19941995sub_type_hdr->proximity_domain = proximity_domain;19961997pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",1998type, size);19992000sub_type_hdr->length_low = lower_32_bits(size);2001sub_type_hdr->length_high = upper_32_bits(size);20022003sub_type_hdr->width = local_mem_info->vram_width;2004sub_type_hdr->visibility_type = type;20052006return 0;2007}20082009#ifdef CONFIG_ACPI_NUMA2010static void kfd_find_numa_node_in_srat(struct kfd_node *kdev)2011{2012struct acpi_table_header *table_header = NULL;2013struct acpi_subtable_header *sub_header = NULL;2014unsigned long table_end, subtable_len;2015u32 pci_id = pci_domain_nr(kdev->adev->pdev->bus) << 16 |2016pci_dev_id(kdev->adev->pdev);2017u32 bdf;2018acpi_status status;2019struct acpi_srat_cpu_affinity *cpu;2020struct acpi_srat_generic_affinity *gpu;2021int pxm = 0, max_pxm = 0;2022int numa_node = NUMA_NO_NODE;2023bool found = false;20242025/* Fetch the SRAT table from ACPI */2026status = acpi_get_table(ACPI_SIG_SRAT, 0, &table_header);2027if (status == AE_NOT_FOUND) {2028pr_warn("SRAT table not found\n");2029return;2030} else if (ACPI_FAILURE(status)) {2031const char *err = 
acpi_format_exception(status);2032pr_err("SRAT table error: %s\n", err);2033return;2034}20352036table_end = (unsigned long)table_header + table_header->length;20372038/* Parse all entries looking for a match. */2039sub_header = (struct acpi_subtable_header *)2040((unsigned long)table_header +2041sizeof(struct acpi_table_srat));2042subtable_len = sub_header->length;20432044while (((unsigned long)sub_header) + subtable_len < table_end) {2045/*2046* If length is 0, break from this loop to avoid2047* infinite loop.2048*/2049if (subtable_len == 0) {2050pr_err("SRAT invalid zero length\n");2051break;2052}20532054switch (sub_header->type) {2055case ACPI_SRAT_TYPE_CPU_AFFINITY:2056cpu = (struct acpi_srat_cpu_affinity *)sub_header;2057pxm = *((u32 *)cpu->proximity_domain_hi) << 8 |2058cpu->proximity_domain_lo;2059if (pxm > max_pxm)2060max_pxm = pxm;2061break;2062case ACPI_SRAT_TYPE_GENERIC_AFFINITY:2063gpu = (struct acpi_srat_generic_affinity *)sub_header;2064bdf = *((u16 *)(&gpu->device_handle[0])) << 16 |2065*((u16 *)(&gpu->device_handle[2]));2066if (bdf == pci_id) {2067found = true;2068numa_node = pxm_to_node(gpu->proximity_domain);2069}2070break;2071default:2072break;2073}20742075if (found)2076break;20772078sub_header = (struct acpi_subtable_header *)2079((unsigned long)sub_header + subtable_len);2080subtable_len = sub_header->length;2081}20822083acpi_put_table(table_header);20842085/* Workaround bad cpu-gpu binding case */2086if (found && (numa_node < 0 ||2087numa_node > pxm_to_node(max_pxm)))2088numa_node = 0;20892090if (numa_node != NUMA_NO_NODE)2091set_dev_node(&kdev->adev->pdev->dev, numa_node);2092}2093#endif20942095#define KFD_CRAT_INTRA_SOCKET_WEIGHT 132096#define KFD_CRAT_XGMI_WEIGHT 1520972098/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU2099* to its NUMA node2100* @avail_size: Available size in the memory2101* @kdev - [IN] GPU device2102* @sub_type_hdr: Memory into which io link info will be filled in2103* @proximity_domain - proximity 
domain of the GPU node2104*2105* Return 0 if successful else return -ve value2106*/2107static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,2108struct kfd_node *kdev,2109struct crat_subtype_iolink *sub_type_hdr,2110uint32_t proximity_domain)2111{2112*avail_size -= sizeof(struct crat_subtype_iolink);2113if (*avail_size < 0)2114return -ENOMEM;21152116memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));21172118/* Fill in subtype header data */2119sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;2120sub_type_hdr->length = sizeof(struct crat_subtype_iolink);2121sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;2122if (kfd_dev_is_large_bar(kdev))2123sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;21242125/* Fill in IOLINK subtype.2126* TODO: Fill-in other fields of iolink subtype2127*/2128if (kdev->adev->gmc.xgmi.connected_to_cpu ||2129(KFD_GC_VERSION(kdev) == IP_VERSION(9, 4, 3) &&2130kdev->adev->smuio.funcs->get_pkg_type(kdev->adev) ==2131AMDGPU_PKG_TYPE_APU)) {2132bool ext_cpu = KFD_GC_VERSION(kdev) != IP_VERSION(9, 4, 3);2133int mem_bw = 819200, weight = ext_cpu ? 
KFD_CRAT_XGMI_WEIGHT :2134KFD_CRAT_INTRA_SOCKET_WEIGHT;2135/*2136* with host gpu xgmi link, host can access gpu memory whether2137* or not pcie bar type is large, so always create bidirectional2138* io link.2139*/2140sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;2141sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;2142sub_type_hdr->weight_xgmi = weight;2143if (ext_cpu) {2144amdgpu_xgmi_get_bandwidth(kdev->adev, NULL,2145AMDGPU_XGMI_BW_MODE_PER_LINK,2146AMDGPU_XGMI_BW_UNIT_MBYTES,2147&sub_type_hdr->minimum_bandwidth_mbs,2148&sub_type_hdr->maximum_bandwidth_mbs);2149} else {2150sub_type_hdr->minimum_bandwidth_mbs = mem_bw;2151sub_type_hdr->maximum_bandwidth_mbs = mem_bw;2152}2153} else {2154sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;2155sub_type_hdr->minimum_bandwidth_mbs =2156amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, true);2157sub_type_hdr->maximum_bandwidth_mbs =2158amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, false);2159}21602161sub_type_hdr->proximity_domain_from = proximity_domain;21622163#ifdef CONFIG_ACPI_NUMA2164if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE &&2165num_possible_nodes() > 1)2166kfd_find_numa_node_in_srat(kdev);2167#endif2168#ifdef CONFIG_NUMA2169if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE)2170sub_type_hdr->proximity_domain_to = 0;2171else2172sub_type_hdr->proximity_domain_to = kdev->adev->pdev->dev.numa_node;2173#else2174sub_type_hdr->proximity_domain_to = 0;2175#endif2176return 0;2177}21782179static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,2180struct kfd_node *kdev,2181struct kfd_node *peer_kdev,2182struct crat_subtype_iolink *sub_type_hdr,2183uint32_t proximity_domain_from,2184uint32_t proximity_domain_to)2185{2186bool use_ta_info = kdev->kfd->num_nodes == 1;21872188*avail_size -= sizeof(struct crat_subtype_iolink);2189if (*avail_size < 0)2190return -ENOMEM;21912192memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));21932194sub_type_hdr->type = 
CRAT_SUBTYPE_IOLINK_AFFINITY;2195sub_type_hdr->length = sizeof(struct crat_subtype_iolink);2196sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED |2197CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;21982199sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;2200sub_type_hdr->proximity_domain_from = proximity_domain_from;2201sub_type_hdr->proximity_domain_to = proximity_domain_to;22022203if (use_ta_info) {2204sub_type_hdr->weight_xgmi = KFD_CRAT_XGMI_WEIGHT *2205amdgpu_xgmi_get_hops_count(kdev->adev, peer_kdev->adev);2206amdgpu_xgmi_get_bandwidth(kdev->adev, peer_kdev->adev,2207AMDGPU_XGMI_BW_MODE_PER_PEER,2208AMDGPU_XGMI_BW_UNIT_MBYTES,2209&sub_type_hdr->minimum_bandwidth_mbs,2210&sub_type_hdr->maximum_bandwidth_mbs);2211} else {2212bool is_single_hop = kdev->kfd == peer_kdev->kfd;2213int weight = is_single_hop ? KFD_CRAT_INTRA_SOCKET_WEIGHT :2214(2 * KFD_CRAT_INTRA_SOCKET_WEIGHT) + KFD_CRAT_XGMI_WEIGHT;2215int mem_bw = 819200;22162217sub_type_hdr->weight_xgmi = weight;2218sub_type_hdr->maximum_bandwidth_mbs = is_single_hop ? mem_bw : 0;2219sub_type_hdr->minimum_bandwidth_mbs = is_single_hop ? 
mem_bw : 0;2220}22212222return 0;2223}22242225/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU2226*2227* @pcrat_image: Fill in VCRAT for GPU2228* @size: [IN] allocated size of crat_image.2229* [OUT] actual size of data filled in crat_image2230*/2231static int kfd_create_vcrat_image_gpu(void *pcrat_image,2232size_t *size, struct kfd_node *kdev,2233uint32_t proximity_domain)2234{2235struct crat_header *crat_table = (struct crat_header *)pcrat_image;2236struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config;2237struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info;2238struct crat_subtype_generic *sub_type_hdr;2239struct kfd_local_mem_info local_mem_info;2240struct kfd_topology_device *peer_dev;2241struct crat_subtype_computeunit *cu;2242int avail_size = *size;2243uint32_t total_num_of_cu;2244uint32_t nid = 0;2245int ret = 0;22462247if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)2248return -EINVAL;22492250/* Fill the CRAT Header.2251* Modify length and total_entries as subunits are added.2252*/2253avail_size -= sizeof(struct crat_header);2254memset(crat_table, 0, sizeof(struct crat_header));22552256memcpy(&crat_table->signature, CRAT_SIGNATURE,2257sizeof(crat_table->signature));2258/* Change length as we add more subtypes*/2259crat_table->length = sizeof(struct crat_header);2260crat_table->num_domains = 1;2261crat_table->total_entries = 0;22622263/* Fill in Subtype: Compute Unit2264* First fill in the sub type header and then sub type data2265*/2266avail_size -= sizeof(struct crat_subtype_computeunit);2267sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);2268memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));22692270sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;2271sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);2272sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;22732274/* Fill CU subtype data */2275cu = (struct crat_subtype_computeunit *)sub_type_hdr;2276cu->flags |= 
CRAT_CU_FLAGS_GPU_PRESENT;2277cu->proximity_domain = proximity_domain;22782279cu->num_simd_per_cu = cu_info->simd_per_cu;2280cu->num_simd_cores = cu_info->simd_per_cu *2281(cu_info->number / kdev->kfd->num_nodes);2282cu->max_waves_simd = cu_info->max_waves_per_simd;22832284cu->wave_front_size = cu_info->wave_front_size;2285cu->array_count = gfx_info->max_sh_per_se *2286gfx_info->max_shader_engines;2287total_num_of_cu = (cu->array_count * gfx_info->max_cu_per_sh);2288cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);2289cu->num_cu_per_array = gfx_info->max_cu_per_sh;2290cu->max_slots_scatch_cu = cu_info->max_scratch_slots_per_cu;2291cu->num_banks = gfx_info->max_shader_engines;2292cu->lds_size_in_kb = cu_info->lds_size;22932294cu->hsa_capability = 0;22952296crat_table->length += sub_type_hdr->length;2297crat_table->total_entries++;22982299/* Fill in Subtype: Memory. Only on systems with large BAR (no2300* private FB), report memory as public. On other systems2301* report the total FB size (public+private) as a single2302* private heap.2303*/2304local_mem_info = kdev->local_mem_info;2305sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +2306sub_type_hdr->length);23072308if (kdev->adev->debug_largebar)2309local_mem_info.local_mem_size_private = 0;23102311if (local_mem_info.local_mem_size_private == 0)2312ret = kfd_fill_gpu_memory_affinity(&avail_size,2313kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,2314local_mem_info.local_mem_size_public,2315(struct crat_subtype_memory *)sub_type_hdr,2316proximity_domain,2317&local_mem_info);2318else2319ret = kfd_fill_gpu_memory_affinity(&avail_size,2320kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE,2321local_mem_info.local_mem_size_public +2322local_mem_info.local_mem_size_private,2323(struct crat_subtype_memory *)sub_type_hdr,2324proximity_domain,2325&local_mem_info);2326if (ret < 0)2327return ret;23282329crat_table->length += sizeof(struct crat_subtype_memory);2330crat_table->total_entries++;23312332/* Fill in Subtype: 
IO_LINKS2333* Only direct links are added here which is Link from GPU to2334* its NUMA node. Indirect links are added by userspace.2335*/2336sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +2337sub_type_hdr->length);2338ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,2339(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);23402341if (ret < 0)2342return ret;23432344crat_table->length += sub_type_hdr->length;2345crat_table->total_entries++;234623472348/* Fill in Subtype: IO_LINKS2349* Direct links from GPU to other GPUs through xGMI.2350* We will loop GPUs that already be processed (with lower value2351* of proximity_domain), add the link for the GPUs with same2352* hive id (from this GPU to other GPU) . The reversed iolink2353* (from other GPU to this GPU) will be added2354* in kfd_parse_subtype_iolink.2355*/2356if (kdev->kfd->hive_id) {2357for (nid = 0; nid < proximity_domain; ++nid) {2358peer_dev = kfd_topology_device_by_proximity_domain_no_lock(nid);2359if (!peer_dev->gpu)2360continue;2361if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)2362continue;2363if (!amdgpu_xgmi_get_is_sharing_enabled(kdev->adev, peer_dev->gpu->adev))2364continue;2365sub_type_hdr = (typeof(sub_type_hdr))(2366(char *)sub_type_hdr +2367sizeof(struct crat_subtype_iolink));2368ret = kfd_fill_gpu_xgmi_link_to_gpu(2369&avail_size, kdev, peer_dev->gpu,2370(struct crat_subtype_iolink *)sub_type_hdr,2371proximity_domain, nid);2372if (ret < 0)2373return ret;2374crat_table->length += sub_type_hdr->length;2375crat_table->total_entries++;2376}2377}2378*size = crat_table->length;2379pr_info("Virtual CRAT table created for GPU\n");23802381return ret;2382}23832384/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and2385* creates a Virtual CRAT (VCRAT) image2386*2387* NOTE: Call kfd_destroy_crat_image to free CRAT image memory2388*2389* @crat_image: VCRAT image created because ACPI does not have a2390* CRAT for this device2391* @size: [OUT] size of 
virtual crat_image2392* @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device2393* COMPUTE_UNIT_GPU - Create VCRAT for GPU2394* (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU2395* -- this option is not currently implemented.2396* The assumption is that all AMD APUs will have CRAT2397* @kdev: Valid kfd_node required if flags contain COMPUTE_UNIT_GPU2398*2399* Return 0 if successful else return -ve value2400*/2401int kfd_create_crat_image_virtual(void **crat_image, size_t *size,2402int flags, struct kfd_node *kdev,2403uint32_t proximity_domain)2404{2405void *pcrat_image = NULL;2406int ret = 0, num_nodes;2407size_t dyn_size;24082409if (!crat_image)2410return -EINVAL;24112412*crat_image = NULL;24132414/* Allocate the CPU Virtual CRAT size based on the number of online2415* nodes. Allocate VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image.2416* This should cover all the current conditions. A check is put not2417* to overwrite beyond allocated size for GPUs2418*/2419switch (flags) {2420case COMPUTE_UNIT_CPU:2421num_nodes = num_online_nodes();2422dyn_size = sizeof(struct crat_header) +2423num_nodes * (sizeof(struct crat_subtype_computeunit) +2424sizeof(struct crat_subtype_memory) +2425(num_nodes - 1) * sizeof(struct crat_subtype_iolink));2426pcrat_image = kvmalloc(dyn_size, GFP_KERNEL);2427if (!pcrat_image)2428return -ENOMEM;2429*size = dyn_size;2430pr_debug("CRAT size is %ld", dyn_size);2431ret = kfd_create_vcrat_image_cpu(pcrat_image, size);2432break;2433case COMPUTE_UNIT_GPU:2434if (!kdev)2435return -EINVAL;2436pcrat_image = kvmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);2437if (!pcrat_image)2438return -ENOMEM;2439*size = VCRAT_SIZE_FOR_GPU;2440ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev,2441proximity_domain);2442break;2443case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):2444/* TODO: */2445ret = -EINVAL;2446pr_err("VCRAT not implemented for APU\n");2447break;2448default:2449ret = -EINVAL;2450}24512452if (!ret)2453*crat_image = 
/* kfd_destroy_crat_image - Free a CRAT image allocated by
 *	kfd_create_crat_image_xxx(..)
 *
 * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..);
 *	may be NULL (kvfree(NULL) is a no-op)
 */
void kfd_destroy_crat_image(void *crat_image)
{
	kvfree(crat_image);
}