Path: blob/main/contrib/llvm-project/clang/lib/Headers/amxintrin.h
35233 views
/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===1*2* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3* See https://llvm.org/LICENSE.txt for license information.4* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5*6*===------------------------------------------------------------------------===7*/89#ifndef __IMMINTRIN_H10#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."11#endif /* __IMMINTRIN_H */1213#ifndef __AMXINTRIN_H14#define __AMXINTRIN_H15#ifdef __x86_64__1617/* Define the default attributes for the functions in this file. */18#define __DEFAULT_FN_ATTRS_TILE \19__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))20#define __DEFAULT_FN_ATTRS_INT8 \21__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))22#define __DEFAULT_FN_ATTRS_BF16 \23__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))24#define __DEFAULT_FN_ATTRS_FP16 \25__attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))2627/// Load tile configuration from a 64-byte memory location specified by28/// "mem_addr". The tile configuration includes the tile type palette, the29/// number of bytes per row, and the number of rows. If the specified30/// palette_id is zero, that signifies the init state for both the tile31/// config and the tile data, and the tiles are zeroed. Any invalid32/// configurations will result in #GP fault.33///34/// \headerfile <immintrin.h>35///36/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.37///38/// \param __config39/// A pointer to 512-bits configuration40static __inline__ void __DEFAULT_FN_ATTRS_TILE41_tile_loadconfig(const void *__config) {42__builtin_ia32_tile_loadconfig(__config);43}4445/// Stores the current tile configuration to a 64-byte memory location46/// specified by "mem_addr". The tile configuration includes the tile type47/// palette, the number of bytes per row, and the number of rows. If tiles48/// are not configured, all zeroes will be stored to memory.49///50/// \headerfile <immintrin.h>51///52/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.53///54/// \param __config55/// A pointer to 512-bits configuration56static __inline__ void __DEFAULT_FN_ATTRS_TILE57_tile_storeconfig(void *__config) {58__builtin_ia32_tile_storeconfig(__config);59}6061/// Release the tile configuration to return to the init state, which62/// releases all storage it currently holds.63///64/// \headerfile <immintrin.h>65///66/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.67static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {68__builtin_ia32_tilerelease();69}7071/// Load tile rows from memory specifieid by "base" address and "stride" into72/// destination tile "dst" using the tile configuration previously configured73/// via "_tile_loadconfig".74///75/// \headerfile <immintrin.h>76///77/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.78///79/// \param dst80/// A destination tile. Max size is 1024 Bytes.81/// \param base82/// A pointer to base address.83/// \param stride84/// The stride between the rows' data to be loaded in memory.85#define _tile_loadd(dst, base, stride) \86__builtin_ia32_tileloadd64((dst), ((const void *)(base)), \87(__SIZE_TYPE__)(stride))8889/// Load tile rows from memory specifieid by "base" address and "stride" into90/// destination tile "dst" using the tile configuration previously configured91/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation92/// that the data will likely not be reused in the near future and the data93/// caching can be optimized accordingly.94///95/// \headerfile <immintrin.h>96///97/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.98///99/// \param dst100/// A destination tile. Max size is 1024 Bytes.101/// \param base102/// A pointer to base address.103/// \param stride104/// The stride between the rows' data to be loaded in memory.105#define _tile_stream_loadd(dst, base, stride) \106__builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \107(__SIZE_TYPE__)(stride))108109/// Store the tile specified by "src" to memory specifieid by "base" address and110/// "stride" using the tile configuration previously configured via111/// "_tile_loadconfig".112///113/// \headerfile <immintrin.h>114///115/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.116///117/// \param dst118/// A destination tile. Max size is 1024 Bytes.119/// \param base120/// A pointer to base address.121/// \param stride122/// The stride between the rows' data to be stored in memory.123#define _tile_stored(dst, base, stride) \124__builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))125126/// Zero the tile specified by "tdest".127///128/// \headerfile <immintrin.h>129///130/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.131///132/// \param tile133/// The destination tile to be zero. Max size is 1024 Bytes.134#define _tile_zero(tile) __builtin_ia32_tilezero((tile))135136/// Compute dot-product of bytes in tiles with a source/destination accumulator.137/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with138/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit139/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",140/// and store the 32-bit result back to tile "dst".141///142/// \headerfile <immintrin.h>143///144/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.145///146/// \param dst147/// The destination tile. Max size is 1024 Bytes.148/// \param src0149/// The 1st source tile. Max size is 1024 Bytes.150/// \param src1151/// The 2nd source tile. Max size is 1024 Bytes.152#define _tile_dpbssd(dst, src0, src1) \153__builtin_ia32_tdpbssd((dst), (src0), (src1))154155/// Compute dot-product of bytes in tiles with a source/destination accumulator.156/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with157/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate158/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer159/// in "dst", and store the 32-bit result back to tile "dst".160///161/// \headerfile <immintrin.h>162///163/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.164///165/// \param dst166/// The destination tile. Max size is 1024 Bytes.167/// \param src0168/// The 1st source tile. Max size is 1024 Bytes.169/// \param src1170/// The 2nd source tile. Max size is 1024 Bytes.171#define _tile_dpbsud(dst, src0, src1) \172__builtin_ia32_tdpbsud((dst), (src0), (src1))173174/// Compute dot-product of bytes in tiles with a source/destination accumulator.175/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with176/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit177/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",178/// and store the 32-bit result back to tile "dst".179///180/// \headerfile <immintrin.h>181///182/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.183///184/// \param dst185/// The destination tile. Max size is 1024 Bytes.186/// \param src0187/// The 1st source tile. Max size is 1024 Bytes.188/// \param src1189/// The 2nd source tile. Max size is 1024 Bytes.190#define _tile_dpbusd(dst, src0, src1) \191__builtin_ia32_tdpbusd((dst), (src0), (src1))192193/// Compute dot-product of bytes in tiles with a source/destination accumulator.194/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with195/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate196/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in197/// "dst", and store the 32-bit result back to tile "dst".198///199/// \headerfile <immintrin.h>200///201/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.202///203/// \param dst204/// The destination tile. Max size is 1024 Bytes.205/// \param src0206/// The 1st source tile. Max size is 1024 Bytes.207/// \param src1208/// The 2nd source tile. Max size is 1024 Bytes.209#define _tile_dpbuud(dst, src0, src1) \210__builtin_ia32_tdpbuud((dst), (src0), (src1))211212/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and213/// src1, accumulating the intermediate single-precision (32-bit) floating-point214/// elements with elements in "dst", and store the 32-bit result back to tile215/// "dst".216///217/// \headerfile <immintrin.h>218///219/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.220///221/// \param dst222/// The destination tile. Max size is 1024 Bytes.223/// \param src0224/// The 1st source tile. Max size is 1024 Bytes.225/// \param src1226/// The 2nd source tile. Max size is 1024 Bytes.227#define _tile_dpbf16ps(dst, src0, src1) \228__builtin_ia32_tdpbf16ps((dst), (src0), (src1))229230/// AMX tile register size can be configured, the maximum size is 16x64=1024231/// bytes. Since there is no 2D type in llvm IR, we use vector type to232/// represent 2D tile and the fixed size is maximum amx tile register size.233typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));234235/// This is internal intrinsic. C/C++ user should avoid calling it directly.236static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8237_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,238__SIZE_TYPE__ stride) {239return __builtin_ia32_tileloadd64_internal(m, n, base,240(__SIZE_TYPE__)(stride));241}242243/// This is internal intrinsic. C/C++ user should avoid calling it directly.244static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8245_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,246__SIZE_TYPE__ stride) {247return __builtin_ia32_tileloaddt164_internal(m, n, base,248(__SIZE_TYPE__)(stride));249}250251/// This is internal intrinsic. C/C++ user should avoid calling it directly.252static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8253_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,254_tile1024i dst, _tile1024i src1, _tile1024i src2) {255return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);256}257258/// This is internal intrinsic. C/C++ user should avoid calling it directly.259static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8260_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,261_tile1024i dst, _tile1024i src1, _tile1024i src2) {262return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);263}264265/// This is internal intrinsic. C/C++ user should avoid calling it directly.266static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8267_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,268_tile1024i dst, _tile1024i src1, _tile1024i src2) {269return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);270}271272/// This is internal intrinsic. C/C++ user should avoid calling it directly.273static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8274_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,275_tile1024i dst, _tile1024i src1, _tile1024i src2) {276return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);277}278279/// This is internal intrinsic. C/C++ user should avoid calling it directly.280static __inline__ void __DEFAULT_FN_ATTRS_INT8281_tile_stored_internal(unsigned short m, unsigned short n, void *base,282__SIZE_TYPE__ stride, _tile1024i tile) {283return __builtin_ia32_tilestored64_internal(m, n, base,284(__SIZE_TYPE__)(stride), tile);285}286287/// This is internal intrinsic. C/C++ user should avoid calling it directly.288static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16289_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,290_tile1024i dst, _tile1024i src1, _tile1024i src2) {291return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);292}293294/// This is internal intrinsic. C/C++ user should avoid calling it directly.295static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16296_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,297_tile1024i dst, _tile1024i src1, _tile1024i src2) {298return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);299}300301/// This struct pack the shape and tile data together for user. We suggest302/// initializing the struct as early as possible, because compiler depends303/// on the shape information to do configure. The constant value is preferred304/// for optimization by compiler.305typedef struct __tile1024i_str {306const unsigned short row;307const unsigned short col;308_tile1024i tile;309} __tile1024i;310311/// Load tile rows from memory specifieid by "base" address and "stride" into312/// destination tile "dst".313///314/// \headerfile <immintrin.h>315///316/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.317///318/// \param dst319/// A destination tile. Max size is 1024 Bytes.320/// \param base321/// A pointer to base address.322/// \param stride323/// The stride between the rows' data to be loaded in memory.324__DEFAULT_FN_ATTRS_TILE325static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,326__SIZE_TYPE__ stride) {327dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);328}329330/// Load tile rows from memory specifieid by "base" address and "stride" into331/// destination tile "dst". This intrinsic provides a hint to the implementation332/// that the data will likely not be reused in the near future and the data333/// caching can be optimized accordingly.334///335/// \headerfile <immintrin.h>336///337/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.338///339/// \param dst340/// A destination tile. Max size is 1024 Bytes.341/// \param base342/// A pointer to base address.343/// \param stride344/// The stride between the rows' data to be loaded in memory.345__DEFAULT_FN_ATTRS_TILE346static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,347__SIZE_TYPE__ stride) {348dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);349}350351/// Compute dot-product of bytes in tiles with a source/destination accumulator.352/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with353/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit354/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",355/// and store the 32-bit result back to tile "dst".356///357/// \headerfile <immintrin.h>358///359/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.360///361/// \param dst362/// The destination tile. Max size is 1024 Bytes.363/// \param src0364/// The 1st source tile. Max size is 1024 Bytes.365/// \param src1366/// The 2nd source tile. Max size is 1024 Bytes.367__DEFAULT_FN_ATTRS_INT8368static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,369__tile1024i src1) {370dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,371src0.tile, src1.tile);372}373374/// Compute dot-product of bytes in tiles with a source/destination accumulator.375/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with376/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate377/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer378/// in "dst", and store the 32-bit result back to tile "dst".379///380/// \headerfile <immintrin.h>381///382/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.383///384/// \param dst385/// The destination tile. Max size is 1024 Bytes.386/// \param src0387/// The 1st source tile. Max size is 1024 Bytes.388/// \param src1389/// The 2nd source tile. Max size is 1024 Bytes.390__DEFAULT_FN_ATTRS_INT8391static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,392__tile1024i src1) {393dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,394src0.tile, src1.tile);395}396397/// Compute dot-product of bytes in tiles with a source/destination accumulator.398/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with399/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit400/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",401/// and store the 32-bit result back to tile "dst".402///403/// \headerfile <immintrin.h>404///405/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.406///407/// \param dst408/// The destination tile. Max size is 1024 Bytes.409/// \param src0410/// The 1st source tile. Max size is 1024 Bytes.411/// \param src1412/// The 2nd source tile. Max size is 1024 Bytes.413__DEFAULT_FN_ATTRS_INT8414static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,415__tile1024i src1) {416dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,417src0.tile, src1.tile);418}419420/// Compute dot-product of bytes in tiles with a source/destination accumulator.421/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with422/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate423/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in424/// "dst", and store the 32-bit result back to tile "dst".425///426/// \headerfile <immintrin.h>427///428/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.429///430/// \param dst431/// The destination tile. Max size is 1024 Bytes.432/// \param src0433/// The 1st source tile. Max size is 1024 Bytes.434/// \param src1435/// The 2nd source tile. Max size is 1024 Bytes.436__DEFAULT_FN_ATTRS_INT8437static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,438__tile1024i src1) {439dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,440src0.tile, src1.tile);441}442443/// Store the tile specified by "src" to memory specifieid by "base" address and444/// "stride".445///446/// \headerfile <immintrin.h>447///448/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.449///450/// \param base451/// A pointer to base address.452/// \param stride453/// The stride between the rows' data to be stored in memory.454__DEFAULT_FN_ATTRS_TILE455static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,456__tile1024i src) {457_tile_stored_internal(src.row, src.col, base, stride, src.tile);458}459460/// Zero the tile specified by "dst".461///462/// \headerfile <immintrin.h>463///464/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.465///466/// \param dst467/// The destination tile to be zero. Max size is 1024 Bytes.468__DEFAULT_FN_ATTRS_TILE469static __inline__ void __tile_zero(__tile1024i *dst) {470dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);471}472473/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and474/// src1, accumulating the intermediate single-precision (32-bit) floating-point475/// elements with elements in "dst", and store the 32-bit result back to tile476/// "dst".477///478/// \headerfile <immintrin.h>479///480/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.481///482/// \param dst483/// The destination tile. Max size is 1024 Bytes.484/// \param src0485/// The 1st source tile. Max size is 1024 Bytes.486/// \param src1487/// The 2nd source tile. Max size is 1024 Bytes.488__DEFAULT_FN_ATTRS_BF16489static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,490__tile1024i src1) {491dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,492src0.tile, src1.tile);493}494495/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and496/// src1, accumulating the intermediate single-precision (32-bit) floating-point497/// elements with elements in "dst", and store the 32-bit result back to tile498/// "dst".499///500/// \headerfile <immintrin.h>501///502/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.503///504/// \param dst505/// The destination tile. Max size is 1024 Bytes.506/// \param src0507/// The 1st source tile. Max size is 1024 Bytes.508/// \param src1509/// The 2nd source tile. Max size is 1024 Bytes.510__DEFAULT_FN_ATTRS_FP16511static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,512__tile1024i src1) {513dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,514src0.tile, src1.tile);515}516517#undef __DEFAULT_FN_ATTRS_TILE518#undef __DEFAULT_FN_ATTRS_INT8519#undef __DEFAULT_FN_ATTRS_BF16520#undef __DEFAULT_FN_ATTRS_FP16521522#endif /* __x86_64__ */523#endif /* __AMXINTRIN_H */524525526