Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/clang/lib/Headers/amxavx512intrin.h
213766 views
1
/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------===
2
*
3
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
* See https://llvm.org/LICENSE.txt for license information.
5
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
*
7
*===------------------------------------------------------------------------===
8
*/
9
#ifndef __IMMINTRIN_H
10
#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
11
#endif // __IMMINTRIN_H
12
13
#ifndef __AMX_AVX512INTRIN_H
14
#define __AMX_AVX512INTRIN_H
15
#if defined(__x86_64__) && defined(__SSE2__)
16
17
// Attributes shared by the inline wrappers defined in this header: always
// inline, emit no debug info, and compile the function body with the
// "amx-avx512" and "avx10.2-512" target features enabled.
#define __DEFAULT_FN_ATTRS_AVX512                                              \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("amx-avx512,avx10.2-512")))
20
21
/// Moves a row from a tile register to a zmm destination register, converting
/// the int32 source elements to fp32. The row of the tile is selected by a
/// 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512 _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the data to move: bits 15:0 hold the row index of the source
///    tile and bits 31:16 hold the index of the 64-byte chunk within that row.
/// \returns
///    The converted fp32 elements; elements past the end of the tile row are
///    zero.
#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)
54
55
/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to bf16. It places the resulting bf16 elements
/// in the high 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512bh _tile_cvtrowps2bf16h(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.word[2*i+0] := 0
///     dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2BF16H instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the data to move: bits 15:0 hold the row index of the source
///    tile and bits 31:16 hold the index of the 64-byte chunk within that row.
/// \returns
///    For each source element, a dword whose upper 16 bits hold the bf16
///    result and whose lower 16 bits are zero.
#define _tile_cvtrowps2bf16h(tsrc, row)                                        \
  __builtin_ia32_tcvtrowps2bf16h(tsrc, row)
91
92
/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to bf16. It places the resulting bf16 elements
/// in the low 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512bh _tile_cvtrowps2bf16l(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.word[2*i+1] := 0
///     dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2BF16L instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the data to move: bits 15:0 hold the row index of the source
///    tile and bits 31:16 hold the index of the 64-byte chunk within that row.
/// \returns
///    For each source element, a dword whose lower 16 bits hold the bf16
///    result and whose upper 16 bits are zero.
#define _tile_cvtrowps2bf16l(tsrc, row)                                        \
  __builtin_ia32_tcvtrowps2bf16l(tsrc, row)
128
129
/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to fp16. It places the resulting fp16 elements
/// in the high 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512h _tile_cvtrowps2phh(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.word[2*i+0] := 0
///     dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the data to move: bits 15:0 hold the row index of the source
///    tile and bits 31:16 hold the index of the 64-byte chunk within that row.
/// \returns
///    For each source element, a dword whose upper 16 bits hold the fp16
///    result and whose lower 16 bits are zero.
#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row)
164
165
/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to fp16. It places the resulting fp16 elements
/// in the low 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512h _tile_cvtrowps2phl(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.word[2*i+1] := 0
///     dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    Selects the data to move: bits 15:0 hold the row index of the source
///    tile and bits 31:16 hold the index of the 64-byte chunk within that row.
/// \returns
///    For each source element, a dword whose lower 16 bits hold the fp16
///    result and whose upper 16 bits are zero.
#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row)
200
201
/// Move one row of a tile data to a v16i32 data.
/// The row of the tile is selected by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512i _tile_movrow(__tile a, unsigned b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
///
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source r32. Size is 4 Bytes. Bits 15:0 hold the row index and
///    bits 31:16 hold the index of the 64-byte chunk within that row.
/// \returns
///    The destination v16i32 data. Size is 64 Bytes.
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL>>3
/// row_index := b&0xffff
/// row_chunk := ((b>>16)&0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes-1)
///   IF (row_chunk + i >= a.colsb)
///     dst.byte[i] := 0
///   ELSE
///     dst.byte[i] := a.row[row_index].byte[row_chunk+i]
///   FI
/// ENDFOR
/// \endcode
#define _tile_movrow(a, b) ((__m512i)__builtin_ia32_tilemovrow(a, b))
232
233
/// These are internal intrinsics. C/C++ users should avoid calling them
/// directly.
234
235
// Forwards its arguments unchanged to the TCVTROWD2PS internal builtin.
// __tile_cvtrowd2ps passes the tile's row and col fields as m and n, the tile
// data as src, and the 32-bit row/chunk selector as u.
static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u);
}
239
240
static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
241
_tile_cvtrowps2bf16h_internal(unsigned short m, unsigned short n,
242
_tile1024i src, unsigned u) {
243
return __builtin_ia32_tcvtrowps2bf16h_internal(m, n, src, u);
244
}
245
246
static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
247
_tile_cvtrowps2bf16l_internal(unsigned short m, unsigned short n,
248
_tile1024i src, unsigned u) {
249
return __builtin_ia32_tcvtrowps2bf16l_internal(m, n, src, u);
250
}
251
252
// Forwards its arguments unchanged to the TCVTROWPS2PHH internal builtin.
// __tile_cvtrowps2phh passes the tile's row and col fields as m and n, the
// tile data as src, and the 32-bit row/chunk selector as u.
static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
}
256
257
// Forwards its arguments unchanged to the TCVTROWPS2PHL internal builtin.
// __tile_cvtrowps2phl passes the tile's row and col fields as m and n, the
// tile data as src, and the 32-bit row/chunk selector as u.
static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
}
261
262
// Forwards its arguments unchanged to the TILEMOVROW internal builtin and
// casts the builtin's result to __m512i. __tile_movrow passes the tile's row
// and col fields as m and n, the tile data as src, and the 32-bit row/chunk
// selector as u.
static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u);
}
266
267
/// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source
268
/// elements to fp32. No SIMD exceptions are generated. Rounding is done as if
269
/// MXCSR.RC=RNE. Embedded rounding is not supported.
270
/// The row and chunk elements of tile is fetched from 32bit src1.
271
///
272
/// \headerfile <immintrin.h>
273
///
274
/// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction.
275
///
276
/// \param src0
277
/// The 1st source tile. Max size is 1024 Bytes.
278
/// \param src1
279
/// The 2nd source r32. Size is 4 Bytes.
280
/// \returns
281
/// The destination v16f32 data. Size is 64 Bytes.
282
__DEFAULT_FN_ATTRS_AVX512
283
static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) {
284
return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1);
285
}
286
287
/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
288
/// elements to bf16 at high 16-bits of each dword.
289
/// The row and chunk elements of tile is fetched from 32bit src1.
290
///
291
/// \headerfile <immintrin.h>
292
///
293
/// This intrinsic corresponds to the <c> TCVTROWPS2BF16H </c> instruction.
294
///
295
/// \param src0
296
/// The 1st source tile. Max size is 1024 Bytes.
297
/// \param src1
298
/// The 2nd source r32. Size is 4 Bytes.
299
/// \returns
300
/// The destination v32bf16 data. Size is 64 Bytes.
301
__DEFAULT_FN_ATTRS_AVX512
302
static __m512bh __tile_cvtrowps2bf16h(__tile1024i src0, unsigned src1) {
303
return _tile_cvtrowps2bf16h_internal(src0.row, src0.col, src0.tile, src1);
304
}
305
306
/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
307
/// elements to bf16 at low 16-bits of each dword.
308
/// The row and chunk elements of tile is fetched from 32bit src1.
309
///
310
/// \headerfile <immintrin.h>
311
///
312
/// This intrinsic corresponds to the <c> TCVTROWPS2BF16L </c> instruction.
313
///
314
/// \param src0
315
/// The 1st source tile. Max size is 1024 Bytes.
316
/// \param src1
317
/// The 2nd source r32. Size is 4 Bytes.
318
/// \returns
319
/// The destination v32bf16 data. Size is 64 Bytes.
320
__DEFAULT_FN_ATTRS_AVX512
321
static __m512bh __tile_cvtrowps2bf16l(__tile1024i src0, unsigned src1) {
322
return _tile_cvtrowps2bf16l_internal(src0.row, src0.col, src0.tile, src1);
323
}
324
325
/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
326
/// elements to fp16 at high 16-bits of each dword.
327
/// The row and chunk elements of tile is fetched from 32bit src1.
328
///
329
/// \headerfile <immintrin.h>
330
///
331
/// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction.
332
///
333
/// \param src0
334
/// The 1st source tile. Max size is 1024 Bytes.
335
/// \param src1
336
/// The 2nd source r32. Size is 4 Bytes.
337
/// \returns
338
/// The destination v32fp16 data. Size is 64 Bytes.
339
__DEFAULT_FN_ATTRS_AVX512
340
static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) {
341
return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1);
342
}
343
344
/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
345
/// elements to fp16 at low 16-bits of each dword.
346
/// The row and chunk elements of tile is fetched from 32bit src1.
347
///
348
/// \headerfile <immintrin.h>
349
///
350
/// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction.
351
///
352
/// \param src0
353
/// The 1st source tile. Max size is 1024 Bytes.
354
/// \param src1
355
/// The 2nd source r32. Size is 4 Bytes.
356
/// \returns
357
/// The destination v32fp16 data. Size is 64 Bytes.
358
__DEFAULT_FN_ATTRS_AVX512
359
static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) {
360
return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1);
361
}
362
363
/// Move one row of a tile data to a v16f32 data.
364
/// The row of the tile is selected by a 32b GPR.
365
///
366
/// \headerfile <immintrin.h>
367
///
368
/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
369
///
370
/// \param src0
371
/// The 1st source tile. Max size is 1024 Bytes.
372
/// \param src1
373
/// The 2nd source r32. Size is 4 Bytes.
374
/// \returns
375
/// The destination v16i32 data. Size is 64 Bytes.
376
__DEFAULT_FN_ATTRS_AVX512
377
static __m512i __tile_movrow(__tile1024i src0, unsigned src1) {
378
return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1);
379
}
380
381
#endif // __x86_64__ && __SSE2__
382
#endif // __AMX_AVX512INTRIN_H
383
384