Path: blob/main/contrib/llvm-project/clang/lib/Headers/amxfp8intrin.h
213766 views
/*===------------- amxfp8intrin.h - AMX intrinsics -*- C++ -*----------------===1*2* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3* See https://llvm.org/LICENSE.txt for license information.4* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5*6*===------------------------------------------------------------------------===7*/89#ifndef __IMMINTRIN_H10#error "Never use <amxfp8intrin.h> directly; include <immintrin.h> instead."11#endif /* __IMMINTRIN_H */1213#ifndef __AMXFP8INTRIN_H14#define __AMXFP8INTRIN_H15#ifdef __x86_64__1617#define __DEFAULT_FN_ATTRS_FP8 \18__attribute__((__always_inline__, __nodebug__, __target__("amx-fp8")))1920static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP821_tile_dpbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,22_tile1024i dst, _tile1024i src1, _tile1024i src2) {23return __builtin_ia32_tdpbf8ps_internal(m, n, k, dst, src1, src2);24}2526/// Perform the dot product of a BF8 value \a src1 by a BF8 value \a src227/// accumulating into a Single Precision (FP32) source/dest \a dst.28///29/// \headerfile <immintrin.h>30///31/// \code32/// void __tile_dpbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)33/// \endcode34///35/// \code{.operation}36/// FOR m := 0 TO dst.rows - 137/// temp1[(dst.colsb / 4 - 1) : 0] = 038/// FOR k := 0 TO src1.colsb / 4 - 139/// FOR n := 0 TO dst.colsb / 4 - 140/// temp1[n] +=41/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])42/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])43/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])44/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])45/// ENDFOR46/// ENDFOR47/// FOR n := 0 TO dst.colsb / 4 - 148/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])49/// ENDFOR50/// write_row_and_zero(dst, m, tmp, dst.colsb)51/// zero_upper_rows(dst, dst.rows)52/// zero_tileconfig_start()53/// \endcode54///55/// This intrinsic corresponds to the \c TDPBF8PS instruction.56///57/// \param dst58/// The destination tile. Max size is 1024 Bytes.59/// \param src160/// The 1st source tile. Max size is 1024 Bytes.61/// \param src262/// The 2nd source tile. Max size is 1024 Bytes.63__DEFAULT_FN_ATTRS_FP8 static void64__tile_dpbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {65dst->tile = _tile_dpbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,66src1.tile, src2.tile);67}6869static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP870_tile_dpbhf8ps_internal(unsigned short m, unsigned short n, unsigned short k,71_tile1024i dst, _tile1024i src1, _tile1024i src2) {72return __builtin_ia32_tdpbhf8ps_internal(m, n, k, dst, src1, src2);73}7475/// Perform the dot product of a BF8 value \a src1 by an HF8 value \a src276/// accumulating into a Single Precision (FP32) source/dest \a dst.77///78/// \headerfile <immintrin.h>79///80/// \code81/// void __tile_dpbhf8ps (__tile1024i dst, __tile1024i src1, __tile1024i src2)82/// \endcode83///84/// \code{.operation}85/// FOR m := 0 TO dst.rows - 186/// temp1[(dst.colsb / 4 - 1) : 0] = 087/// FOR k := 0 TO src1.colsb / 4 - 188/// FOR n := 0 TO dst.colsb / 4 - 189/// temp1[n] +=90/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])91/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])92/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])93/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])94/// ENDFOR95/// ENDFOR96/// FOR n := 0 TO dst.colsb / 4 - 197/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])98/// ENDFOR99/// write_row_and_zero(dst, m, tmp, dst.colsb)100/// zero_upper_rows(dst, dst.rows)101/// zero_tileconfig_start()102/// \endcode103///104/// This intrinsic corresponds to the \c TDPBHF8PS instruction.105///106/// \param dst107/// The destination tile. Max size is 1024 Bytes.108/// \param src1109/// The 1st source tile. Max size is 1024 Bytes.110/// \param src2111/// The 2nd source tile. Max size is 1024 Bytes.112__DEFAULT_FN_ATTRS_FP8 static void113__tile_dpbhf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {114dst->tile = _tile_dpbhf8ps_internal(src1.row, src2.col, src1.col, dst->tile,115src1.tile, src2.tile);116}117118static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8119_tile_dphbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,120_tile1024i dst, _tile1024i src1, _tile1024i src2) {121return __builtin_ia32_tdphbf8ps_internal(m, n, k, dst, src1, src2);122}123124/// Perform the dot product of an HF8 value \a src1 by a BF8 value \a src2125/// accumulating into a Single Precision (FP32) source/dest \a dst.126///127/// \headerfile <immintrin.h>128///129/// \code130/// void __tile_dphbf8ps (__tile1024i dst, __tile1024i src1, __tile1024i src2)131/// \endcode132///133/// \code{.operation}134/// FOR m := 0 TO dst.rows - 1135/// temp1[(dst.colsb / 4 - 1) : 0] = 0136/// FOR k := 0 TO src1.colsb / 4 - 1137/// FOR n := 0 TO dst.colsb / 4 - 1138/// temp1[n] +=139/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])140/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])141/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])142/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])143/// ENDFOR144/// ENDFOR145/// FOR n := 0 TO dst.colsb / 4 - 1146/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])147/// ENDFOR148/// write_row_and_zero(dst, m, tmp, dst.colsb)149/// zero_upper_rows(dst, dst.rows)150/// zero_tileconfig_start()151/// \endcode152///153/// This intrinsic corresponds to the \c TDPHBF8PS instruction.154///155/// \param dst156/// The destination tile. Max size is 1024 Bytes.157/// \param src1158/// The 1st source tile. Max size is 1024 Bytes.159/// \param src2160/// The 2nd source tile. Max size is 1024 Bytes.161162__DEFAULT_FN_ATTRS_FP8 static void163__tile_dphbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {164dst->tile = _tile_dphbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,165src1.tile, src2.tile);166}167168static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8169_tile_dphf8ps_internal(unsigned short m, unsigned short n, unsigned short k,170_tile1024i dst, _tile1024i src1, _tile1024i src2) {171return __builtin_ia32_tdphf8ps_internal(m, n, k, dst, src1, src2);172}173174/// Perform the dot product of an HF8 value \a src1 by an HF8 value \a src2175/// accumulating into a Single Precision (FP32) source/dest \a dst.176///177/// \headerfile <immintrin.h>178///179/// \code180/// void __tile_dphf8ps (__tile1024i dst, __tile1024i src1, __tile1024i src2)181/// \endcode182///183/// \code{.operation}184/// FOR m := 0 TO dst.rows - 1185/// temp1[(dst.colsb / 4 - 1) : 0] = 0186/// FOR k := 0 TO src1.colsb / 4 - 1187/// FOR n := 0 TO dst.colsb / 4 - 1188/// temp1[n] +=189/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])190/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])191/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])192/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])193/// ENDFOR194/// ENDFOR195/// FOR n := 0 TO dst.colsb / 4 - 1196/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])197/// ENDFOR198/// write_row_and_zero(dst, m, tmp, dst.colsb)199/// zero_upper_rows(dst, dst.rows)200/// zero_tileconfig_start()201/// \endcode202///203/// This intrinsic corresponds to the \c TDPHF8PS instruction.204///205/// \param dst206/// The destination tile. Max size is 1024 Bytes.207/// \param src1208/// The 1st source tile. Max size is 1024 Bytes.209/// \param src2210/// The 2nd source tile. Max size is 1024 Bytes.211__DEFAULT_FN_ATTRS_FP8 static void212__tile_dphf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {213dst->tile = _tile_dphf8ps_internal(src1.row, src2.col, src1.col, dst->tile,214src1.tile, src2.tile);215}216217#define _tile_dpbf8ps(dst, src1, src2) \218__builtin_ia32_tdpbf8ps((dst), (src1), (src2))219#define _tile_dpbhf8ps(dst, src1, src2) \220__builtin_ia32_tdpbhf8ps((dst), (src1), (src2))221#define _tile_dphbf8ps(dst, src1, src2) \222__builtin_ia32_tdphbf8ps((dst), (src1), (src2))223#define _tile_dphf8ps(dst, src1, src2) \224__builtin_ia32_tdphf8ps((dst), (src1), (src2))225226#undef __DEFAULT_FN_ATTRS_FP8227228#endif /* __x86_64__ */229#endif /* __AMXFP8INTRIN_H */230231232