Path: blob/main/contrib/llvm-project/clang/lib/Headers/amxcomplexintrin.h
35233 views
/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===1*2* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3* See https://llvm.org/LICENSE.txt for license information.4* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5*6*===------------------------------------------------------------------------===7*/89#ifndef __IMMINTRIN_H10#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."11#endif // __IMMINTRIN_H1213#ifndef __AMX_COMPLEXINTRIN_H14#define __AMX_COMPLEXINTRIN_H15#ifdef __x86_64__1617#define __DEFAULT_FN_ATTRS_COMPLEX \18__attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))1920/// Perform matrix multiplication of two tiles containing complex elements and21/// accumulate the results into a packed single precision tile. Each dword22/// element in input tiles \a a and \a b is interpreted as a complex number23/// with FP16 real part and FP16 imaginary part.24/// Calculates the imaginary part of the result. For each possible combination25/// of (row of \a a, column of \a b), it performs a set of multiplication26/// and accumulations on all corresponding complex numbers (one from \a a27/// and one from \a b). The imaginary part of the \a a element is multiplied28/// with the real part of the corresponding \a b element, and the real part29/// of the \a a element is multiplied with the imaginary part of the30/// corresponding \a b elements. The two accumulated results are added, and31/// then accumulated into the corresponding row and column of \a dst.32///33/// \headerfile <x86intrin.h>34///35/// \code36/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);37/// \endcode38///39/// \code{.operation}40/// FOR m := 0 TO dst.rows - 141/// tmp := dst.row[m]42/// FOR k := 0 TO (a.colsb / 4) - 143/// FOR n := 0 TO (dst.colsb / 4) - 144/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])45/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])46/// ENDFOR47/// ENDFOR48/// write_row_and_zero(dst, m, tmp, dst.colsb)49/// ENDFOR50/// zero_upper_rows(dst, dst.rows)51/// zero_tileconfig_start()52/// \endcode53///54/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.55///56/// \param dst57/// The destination tile. Max size is 1024 Bytes.58/// \param a59/// The 1st source tile. Max size is 1024 Bytes.60/// \param b61/// The 2nd source tile. Max size is 1024 Bytes.62#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b)6364/// Perform matrix multiplication of two tiles containing complex elements and65/// accumulate the results into a packed single precision tile. Each dword66/// element in input tiles \a a and \a b is interpreted as a complex number67/// with FP16 real part and FP16 imaginary part.68/// Calculates the real part of the result. For each possible combination69/// of (row of \a a, column of \a b), it performs a set of multiplication70/// and accumulations on all corresponding complex numbers (one from \a a71/// and one from \a b). The real part of the \a a element is multiplied72/// with the real part of the corresponding \a b element, and the negated73/// imaginary part of the \a a element is multiplied with the imaginary74/// part of the corresponding \a b elements. The two accumulated results75/// are added, and then accumulated into the corresponding row and column76/// of \a dst.77///78/// \headerfile <x86intrin.h>79///80/// \code81/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);82/// \endcode83///84/// \code{.operation}85/// FOR m := 0 TO dst.rows - 186/// tmp := dst.row[m]87/// FOR k := 0 TO (a.colsb / 4) - 188/// FOR n := 0 TO (dst.colsb / 4) - 189/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])90/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])91/// ENDFOR92/// ENDFOR93/// write_row_and_zero(dst, m, tmp, dst.colsb)94/// ENDFOR95/// zero_upper_rows(dst, dst.rows)96/// zero_tileconfig_start()97/// \endcode98///99/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.100///101/// \param dst102/// The destination tile. Max size is 1024 Bytes.103/// \param a104/// The 1st source tile. Max size is 1024 Bytes.105/// \param b106/// The 2nd source tile. Max size is 1024 Bytes.107#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b)108109static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX110_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,111_tile1024i dst, _tile1024i src1, _tile1024i src2) {112return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);113}114115static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX116_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,117_tile1024i dst, _tile1024i src1, _tile1024i src2) {118return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);119}120121/// Perform matrix multiplication of two tiles containing complex elements and122/// accumulate the results into a packed single precision tile. Each dword123/// element in input tiles src0 and src1 is interpreted as a complex number with124/// FP16 real part and FP16 imaginary part.125/// This function calculates the imaginary part of the result.126///127/// \headerfile <immintrin.h>128///129/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.130///131/// \param dst132/// The destination tile. Max size is 1024 Bytes.133/// \param src0134/// The 1st source tile. Max size is 1024 Bytes.135/// \param src1136/// The 2nd source tile. Max size is 1024 Bytes.137__DEFAULT_FN_ATTRS_COMPLEX138static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0,139__tile1024i src1) {140dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,141dst->tile, src0.tile, src1.tile);142}143144/// Perform matrix multiplication of two tiles containing complex elements and145/// accumulate the results into a packed single precision tile. Each dword146/// element in input tiles src0 and src1 is interpreted as a complex number with147/// FP16 real part and FP16 imaginary part.148/// This function calculates the real part of the result.149///150/// \headerfile <immintrin.h>151///152/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.153///154/// \param dst155/// The destination tile. Max size is 1024 Bytes.156/// \param src0157/// The 1st source tile. Max size is 1024 Bytes.158/// \param src1159/// The 2nd source tile. Max size is 1024 Bytes.160__DEFAULT_FN_ATTRS_COMPLEX161static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0,162__tile1024i src1) {163dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,164dst->tile, src0.tile, src1.tile);165}166167#endif // __x86_64__168#endif // __AMX_COMPLEXINTRIN_H169170171