Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/clang/lib/Headers/avx512bf16intrin.h
35233 views
1
/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9
#ifndef __IMMINTRIN_H
10
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
11
#endif
12
13
#ifdef __SSE2__
14
15
#ifndef __AVX512BF16INTRIN_H
16
#define __AVX512BF16INTRIN_H
17
18
/* Vector of __bf16 elements, 64 bytes wide and 64-byte aligned. Used in this
 * header as the cast target for operands passed to the BF16 builtins. */
typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
19
/* Vector of __bf16 elements, 64 bytes wide and 64-byte aligned. This is the
 * type used in the signatures of the 512-bit BF16 intrinsics in this header. */
typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
20
/* Deprecated alias of __bf16; the attribute message directs users to __bf16. */
typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
21
22
/* Attribute set applied to the 512-bit intrinsics in this header: always
 * inlined, no debug info, compiled for the "avx512bf16,evex512" target with a
 * minimum vector width of 512. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__,                                            \
                 __nodebug__,                                                  \
                 __target__("avx512bf16,evex512"),                             \
                 __min_vector_width__(512)))
25
/* Attribute set applied to the scalar intrinsic in this header: always
 * inlined, no debug info, compiled for the "avx512bf16,no-evex512" target. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__,                                            \
                 __nodebug__,                                                  \
                 __target__("avx512bf16,no-evex512")))
28
29
/// Convert One BF16 Data to One Single Float Data.
30
///
31
/// \headerfile <x86intrin.h>
32
///
33
/// This intrinsic does not correspond to a specific instruction.
34
///
35
/// \param __A
36
/// A bfloat data.
37
/// \returns A float data whose sign field and exponent field keep unchanged,
38
/// and fraction field is extended to 23 bits.
39
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
40
return __builtin_ia32_cvtsbf162ss_32(__A);
41
}
42
43
/// Convert Two Packed Single Data to One Packed BF16 Data.
44
///
45
/// \headerfile <x86intrin.h>
46
///
47
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
48
///
49
/// \param __A
50
/// A 512-bit vector of [16 x float].
51
/// \param __B
52
/// A 512-bit vector of [16 x float].
53
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
54
/// conversion of __B, and higher 256 bits come from conversion of __A.
55
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
56
_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
57
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
58
(__v16sf) __B);
59
}
60
61
/// Convert Two Packed Single Data to One Packed BF16 Data.
62
///
63
/// \headerfile <x86intrin.h>
64
///
65
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
66
///
67
/// \param __A
68
/// A 512-bit vector of [16 x float].
69
/// \param __B
70
/// A 512-bit vector of [16 x float].
71
/// \param __W
72
/// A 512-bit vector of [32 x bfloat].
73
/// \param __U
74
/// A 32-bit mask value specifying what is chosen for each element.
75
/// A 1 means conversion of __A or __B. A 0 means element from __W.
76
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
77
/// conversion of __B, and higher 256 bits come from conversion of __A.
78
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
79
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
80
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
81
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
82
(__v32bf)__W);
83
}
84
85
/// Convert Two Packed Single Data to One Packed BF16 Data.
86
///
87
/// \headerfile <x86intrin.h>
88
///
89
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
90
///
91
/// \param __A
92
/// A 512-bit vector of [16 x float].
93
/// \param __B
94
/// A 512-bit vector of [16 x float].
95
/// \param __U
96
/// A 32-bit mask value specifying what is chosen for each element.
97
/// A 1 means conversion of __A or __B. A 0 means element is zero.
98
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
99
/// conversion of __B, and higher 256 bits come from conversion of __A.
100
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
101
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
102
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
103
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
104
(__v32bf)_mm512_setzero_si512());
105
}
106
107
/// Convert Packed Single Data to Packed BF16 Data.
108
///
109
/// \headerfile <x86intrin.h>
110
///
111
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
112
///
113
/// \param __A
114
/// A 512-bit vector of [16 x float].
115
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
116
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
117
_mm512_cvtneps_pbh(__m512 __A) {
118
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
119
(__v16bf)_mm256_undefined_si256(),
120
(__mmask16)-1);
121
}
122
123
/// Convert Packed Single Data to Packed BF16 Data.
124
///
125
/// \headerfile <x86intrin.h>
126
///
127
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
128
///
129
/// \param __A
130
/// A 512-bit vector of [16 x float].
131
/// \param __W
132
/// A 256-bit vector of [16 x bfloat].
133
/// \param __U
134
/// A 16-bit mask value specifying what is chosen for each element.
135
/// A 1 means conversion of __A. A 0 means element from __W.
136
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
137
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
138
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
139
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
140
(__v16bf)__W,
141
(__mmask16)__U);
142
}
143
144
/// Convert Packed Single Data to Packed BF16 Data.
145
///
146
/// \headerfile <x86intrin.h>
147
///
148
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
149
///
150
/// \param __A
151
/// A 512-bit vector of [16 x float].
152
/// \param __U
153
/// A 16-bit mask value specifying what is chosen for each element.
154
/// A 1 means conversion of __A. A 0 means element is zero.
155
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
156
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
157
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
158
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
159
(__v16bf)_mm256_setzero_si256(),
160
(__mmask16)__U);
161
}
162
163
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
164
///
165
/// \headerfile <x86intrin.h>
166
///
167
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
168
///
169
/// \param __A
170
/// A 512-bit vector of [32 x bfloat].
171
/// \param __B
172
/// A 512-bit vector of [32 x bfloat].
173
/// \param __D
174
/// A 512-bit vector of [16 x float].
175
/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
176
/// __A, __B and __D
177
static __inline__ __m512 __DEFAULT_FN_ATTRS512
178
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
179
return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
180
(__v32bf) __A,
181
(__v32bf) __B);
182
}
183
184
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
185
///
186
/// \headerfile <x86intrin.h>
187
///
188
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
189
///
190
/// \param __A
191
/// A 512-bit vector of [32 x bfloat].
192
/// \param __B
193
/// A 512-bit vector of [32 x bfloat].
194
/// \param __D
195
/// A 512-bit vector of [16 x float].
196
/// \param __U
197
/// A 16-bit mask value specifying what is chosen for each element.
198
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
199
/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
200
/// __A, __B and __D
201
static __inline__ __m512 __DEFAULT_FN_ATTRS512
202
_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
203
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
204
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
205
(__v16sf)__D);
206
}
207
208
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
209
///
210
/// \headerfile <x86intrin.h>
211
///
212
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
213
///
214
/// \param __A
215
/// A 512-bit vector of [32 x bfloat].
216
/// \param __B
217
/// A 512-bit vector of [32 x bfloat].
218
/// \param __D
219
/// A 512-bit vector of [16 x float].
220
/// \param __U
221
/// A 16-bit mask value specifying what is chosen for each element.
222
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
223
/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
224
/// __A, __B and __D
225
static __inline__ __m512 __DEFAULT_FN_ATTRS512
226
_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
227
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
228
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
229
(__v16sf)_mm512_setzero_si512());
230
}
231
232
/// Convert Packed BF16 Data to Packed float Data.
233
///
234
/// \headerfile <x86intrin.h>
235
///
236
/// \param __A
237
/// A 256-bit vector of [16 x bfloat].
238
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
239
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
240
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
241
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
242
}
243
244
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
245
///
246
/// \headerfile <x86intrin.h>
247
///
248
/// \param __U
249
/// A 16-bit mask. Elements are zeroed out when the corresponding mask
250
/// bit is not set.
251
/// \param __A
252
/// A 256-bit vector of [16 x bfloat].
253
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
254
static __inline__ __m512 __DEFAULT_FN_ATTRS512
255
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
256
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
257
(__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
258
}
259
260
/// Convert Packed BF16 Data to Packed float Data using merging mask.
261
///
262
/// \headerfile <x86intrin.h>
263
///
264
/// \param __S
265
/// A 512-bit vector of [16 x float]. Elements are copied from __S when
266
/// the corresponding mask bit is not set.
267
/// \param __U
268
/// A 16-bit mask.
269
/// \param __A
270
/// A 256-bit vector of [16 x bfloat].
271
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
272
static __inline__ __m512 __DEFAULT_FN_ATTRS512
273
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
274
return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
275
(__m512i)__S, (__mmask16)__U,
276
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
277
}
278
279
#undef __DEFAULT_FN_ATTRS
280
#undef __DEFAULT_FN_ATTRS512
281
282
#endif
283
#endif
284
285