Path: blob/main/contrib/llvm-project/clang/lib/Headers/amxcomplextransposeintrin.h
213766 views
/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===1*2* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3* See https://llvm.org/LICENSE.txt for license information.4* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5*6*===------------------------------------------------------------------------===7*/89#ifndef __IMMINTRIN_H10#error \11"Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."12#endif // __IMMINTRIN_H1314#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H15#define __AMX_COMPLEXTRANSPOSEINTRIN_H16#ifdef __x86_64__1718#define __DEFAULT_FN_ATTRS \19__attribute__((__always_inline__, __nodebug__, \20__target__("amx-complex,amx-transpose")))2122/// Perform matrix multiplication of two tiles containing complex elements and23/// accumulate the results into a packed single precision tile. Each dword24/// element in input tiles \a a and \a b is interpreted as a complex number25/// with FP16 real part and FP16 imaginary part.26/// Calculates the imaginary part of the result. For each possible combination27/// of (transposed column of \a a, column of \a b), it performs a set of28/// multiplication and accumulations on all corresponding complex numbers29/// (one from \a a and one from \a b). The imaginary part of the \a a element30/// is multiplied with the real part of the corresponding \a b element, and31/// the real part of the \a a element is multiplied with the imaginary part32/// of the corresponding \a b elements. The two accumulated results are33/// added, and then accumulated into the corresponding row and column of34/// \a dst.35///36/// \headerfile <x86intrin.h>37///38/// \code39/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);40/// \endcode41///42/// \code{.operation}43/// FOR m := 0 TO dst.rows - 144/// tmp := dst.row[m]45/// FOR k := 0 TO a.rows - 146/// FOR n := 0 TO (dst.colsb / 4) - 147/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])48/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])49/// ENDFOR50/// ENDFOR51/// write_row_and_zero(dst, m, tmp, dst.colsb)52/// ENDFOR53/// zero_upper_rows(dst, dst.rows)54/// zero_tileconfig_start()55/// \endcode56///57/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.58///59/// \param dst60/// The destination tile. Max size is 1024 Bytes.61/// \param a62/// The 1st source tile. Max size is 1024 Bytes.63/// \param b64/// The 2nd source tile. Max size is 1024 Bytes.65#define _tile_tcmmimfp16ps(dst, a, b) \66__builtin_ia32_ttcmmimfp16ps((dst), (a), (b))6768/// Perform matrix multiplication of two tiles containing complex elements and69/// accumulate the results into a packed single precision tile. Each dword70/// element in input tiles \a a and \a b is interpreted as a complex number71/// with FP16 real part and FP16 imaginary part.72/// Calculates the real part of the result. For each possible combination73/// of (rtransposed colum of \a a, column of \a b), it performs a set of74/// multiplication and accumulations on all corresponding complex numbers75/// (one from \a a and one from \a b). The real part of the \a a element is76/// multiplied with the real part of the corresponding \a b element, and the77/// negated imaginary part of the \a a element is multiplied with the78/// imaginary part of the corresponding \a b elements. The two accumulated79/// results are added, and then accumulated into the corresponding row and80/// column of \a dst.81///82/// \headerfile <x86intrin.h>83///84/// \code85/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);86/// \endcode87///88/// \code{.operation}89/// FOR m := 0 TO dst.rows - 190/// tmp := dst.row[m]91/// FOR k := 0 TO a.rows - 192/// FOR n := 0 TO (dst.colsb / 4) - 193/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])94/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])95/// ENDFOR96/// ENDFOR97/// write_row_and_zero(dst, m, tmp, dst.colsb)98/// ENDFOR99/// zero_upper_rows(dst, dst.rows)100/// zero_tileconfig_start()101/// \endcode102///103/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.104///105/// \param dst106/// The destination tile. Max size is 1024 Bytes.107/// \param a108/// The 1st source tile. Max size is 1024 Bytes.109/// \param b110/// The 2nd source tile. Max size is 1024 Bytes.111#define _tile_tcmmrlfp16ps(dst, a, b) \112__builtin_ia32_ttcmmrlfp16ps((dst), (a), (b))113114/// Perform matrix conjugate transpose and multiplication of two tiles115/// containing complex elements and accumulate the results into a packed116/// single precision tile. Each dword element in input tiles \a a and \a b117/// is interpreted as a complex number with FP16 real part and FP16 imaginary118/// part.119/// Calculates the imaginary part of the result. For each possible combination120/// of (transposed column of \a a, column of \a b), it performs a set of121/// multiplication and accumulations on all corresponding complex numbers122/// (one from \a a and one from \a b). The negated imaginary part of the \a a123/// element is multiplied with the real part of the corresponding \a b124/// element, and the real part of the \a a element is multiplied with the125/// imaginary part of the corresponding \a b elements. The two accumulated126/// results are added, and then accumulated into the corresponding row and127/// column of \a dst.128///129/// \headerfile <x86intrin.h>130///131/// \code132/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);133/// \endcode134///135/// \code{.operation}136/// FOR m := 0 TO dst.rows - 1137/// tmp := dst.row[m]138/// FOR k := 0 TO a.rows - 1139/// FOR n := 0 TO (dst.colsb / 4) - 1140/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])141/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])142/// ENDFOR143/// ENDFOR144/// write_row_and_zero(dst, m, tmp, dst.colsb)145/// ENDFOR146/// zero_upper_rows(dst, dst.rows)147/// zero_tileconfig_start()148/// \endcode149///150/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.151///152/// \param dst153/// The destination tile. Max size is 1024 Bytes.154/// \param a155/// The 1st source tile. Max size is 1024 Bytes.156/// \param b157/// The 2nd source tile. Max size is 1024 Bytes.158#define _tile_conjtcmmimfp16ps(dst, a, b) \159__builtin_ia32_tconjtcmmimfp16ps((dst), (a), (b))160161/// Perform conjugate transpose of an FP16-pair of complex elements from \a a162/// and writes the result to \a dst.163///164/// \headerfile <x86intrin.h>165///166/// \code167/// void _tile_conjtfp16(__tile dst, __tile a);168/// \endcode169///170/// \code{.operation}171/// FOR i := 0 TO dst.rows - 1172/// FOR j := 0 TO (dst.colsb / 4) - 1173/// tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]174/// tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]175/// ENDFOR176/// write_row_and_zero(dst, i, tmp, dst.colsb)177/// ENDFOR178/// zero_upper_rows(dst, dst.rows)179/// zero_tileconfig_start()180/// \endcode181///182/// This intrinsic corresponds to the \c TCONJTFP16 instruction.183///184/// \param dst185/// The destination tile. Max size is 1024 Bytes.186/// \param a187/// The source tile. Max size is 1024 Bytes.188#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16((dst), (a))189190static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(191unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,192_tile1024i src1, _tile1024i src2) {193return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);194}195196static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(197unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,198_tile1024i src1, _tile1024i src2) {199return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);200}201202static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(203unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,204_tile1024i src1, _tile1024i src2) {205return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);206}207208static __inline__ _tile1024i __DEFAULT_FN_ATTRS209_tile_conjtfp16_internal(unsigned short m, unsigned short n, _tile1024i src) {210return __builtin_ia32_tconjtfp16_internal(m, n, src);211}212213/// Perform matrix multiplication of two tiles containing complex elements and214/// accumulate the results into a packed single precision tile. Each dword215/// element in input tiles src0 and src1 is interpreted as a complex number216/// with FP16 real part and FP16 imaginary part.217/// This function calculates the imaginary part of the result.218///219/// \headerfile <immintrin.h>220///221/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.222///223/// \param dst224/// The destination tile. Max size is 1024 Bytes.225/// \param src0226/// The 1st source tile. Max size is 1024 Bytes.227/// \param src1228/// The 2nd source tile. Max size is 1024 Bytes.229__DEFAULT_FN_ATTRS230static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,231__tile1024i src1) {232dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,233dst->tile, src0.tile, src1.tile);234}235236/// Perform matrix multiplication of two tiles containing complex elements and237/// accumulate the results into a packed single precision tile. Each dword238/// element in input tiles src0 and src1 is interpreted as a complex number239/// with FP16 real part and FP16 imaginary part.240/// This function calculates the real part of the result.241///242/// \headerfile <immintrin.h>243///244/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.245///246/// \param dst247/// The destination tile. Max size is 1024 Bytes.248/// \param src0249/// The 1st source tile. Max size is 1024 Bytes.250/// \param src1251/// The 2nd source tile. Max size is 1024 Bytes.252__DEFAULT_FN_ATTRS253static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,254__tile1024i src1) {255dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,256dst->tile, src0.tile, src1.tile);257}258259/// Perform matrix conjugate transpose and multiplication of two tiles260/// containing complex elements and accumulate the results into a packed261/// single precision tile. Each dword element in input tiles src0 and src1262/// is interpreted as a complex number with FP16 real part and FP16 imaginary263/// part.264/// This function calculates the imaginary part of the result.265///266/// \headerfile <immintrin.h>267///268/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.269///270/// \param dst271/// The destination tile. Max size is 1024 Bytes.272/// \param src0273/// The 1st source tile. Max size is 1024 Bytes.274/// \param src1275/// The 2nd source tile. Max size is 1024 Bytes.276__DEFAULT_FN_ATTRS277static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,278__tile1024i src1) {279dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,280dst->tile, src0.tile, src1.tile);281}282283/// Perform conjugate transpose of an FP16-pair of complex elements from src and284/// writes the result to dst.285///286/// \headerfile <immintrin.h>287///288/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.289///290/// \param dst291/// The destination tile. Max size is 1024 Bytes.292/// \param src293/// The source tile. Max size is 1024 Bytes.294__DEFAULT_FN_ATTRS295static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {296dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);297}298299#undef __DEFAULT_FN_ATTRS300301#endif // __x86_64__302#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H303304305