CoCalc -- avx512bf16intrin.h

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/clang/lib/Headers/avx512bf16intrin.h
35233 views
1
/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===-----------------------------------------------------------------------===
8
 */
9
#ifndef __IMMINTRIN_H
10
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
11
#endif
12

13
#ifdef __SSE2__
14

15
#ifndef __AVX512BF16INTRIN_H
16
#define __AVX512BF16INTRIN_H
17

18
/* 64-byte, 64-byte-aligned vectors of __bf16 elements. In the intrinsics of
 * this header, __v32bf is the spelling used when casting operands and
 * __m512bh is the spelling used in the function signatures. */
typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
/* Deprecated alias of __bf16; the attribute message directs users to __bf16. */
typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));

/* Attribute set for the 512-bit intrinsics: always inlined, no debug info,
 * compiled for the "avx512bf16,evex512" target with a minimum vector width
 * of 512. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512bf16,evex512"),                             \
                 __min_vector_width__(512)))
/* Attribute set for the non-512-bit intrinsic: always inlined, no debug info,
 * compiled for the "avx512bf16,no-evex512" target. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512bf16,no-evex512")))
28

29
/// Convert One BF16 Data to One Single Float Data.
30
///
31
/// \headerfile <x86intrin.h>
32
///
33
/// This intrinsic does not correspond to a specific instruction.
34
///
35
/// \param __A
36
///    A bfloat data.
37
/// \returns A float data whose sign field and exponent field keep unchanged,
38
///    and fraction field is extended to 23 bits.
39
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
40
  return __builtin_ia32_cvtsbf162ss_32(__A);
41
}
42

43
/// Convert Two Packed Single Data to One Packed BF16 Data.
44
///
45
/// \headerfile <x86intrin.h>
46
///
47
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
48
///
49
/// \param __A
50
///    A 512-bit vector of [16 x float].
51
/// \param __B
52
///    A 512-bit vector of [16 x float].
53
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
54
///    conversion of __B, and higher 256 bits come from conversion of __A.
55
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
56
_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
57
  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
58
                                                    (__v16sf) __B);
59
}
60

61
/// Convert Two Packed Single Data to One Packed BF16 Data.
62
///
63
/// \headerfile <x86intrin.h>
64
///
65
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
66
///
67
/// \param __A
68
///    A 512-bit vector of [16 x float].
69
/// \param __B
70
///    A 512-bit vector of [16 x float].
71
/// \param __W
72
///    A 512-bit vector of [32 x bfloat].
73
/// \param __U
74
///    A 32-bit mask value specifying what is chosen for each element.
75
///    A 1 means conversion of __A or __B. A 0 means element from __W.
76
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
77
///    conversion of __B, and higher 256 bits come from conversion of __A.
78
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
79
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
80
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
81
                                        (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
82
                                        (__v32bf)__W);
83
}
84

85
/// Convert Two Packed Single Data to One Packed BF16 Data.
86
///
87
/// \headerfile <x86intrin.h>
88
///
89
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
90
///
91
/// \param __A
92
///    A 512-bit vector of [16 x float].
93
/// \param __B
94
///    A 512-bit vector of [16 x float].
95
/// \param __U
96
///    A 32-bit mask value specifying what is chosen for each element.
97
///    A 1 means conversion of __A or __B. A 0 means element is zero.
98
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
99
///    conversion of __B, and higher 256 bits come from conversion of __A.
100
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
101
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
102
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
103
                                        (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
104
                                        (__v32bf)_mm512_setzero_si512());
105
}
106

107
/// Convert Packed Single Data to Packed BF16 Data.
108
///
109
/// \headerfile <x86intrin.h>
110
///
111
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
112
///
113
/// \param __A
114
///    A 512-bit vector of [16 x float].
115
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
116
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
117
_mm512_cvtneps_pbh(__m512 __A) {
118
  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
119
                                              (__v16bf)_mm256_undefined_si256(),
120
                                              (__mmask16)-1);
121
}
122

123
/// Convert Packed Single Data to Packed BF16 Data.
124
///
125
/// \headerfile <x86intrin.h>
126
///
127
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
128
///
129
/// \param __A
130
///    A 512-bit vector of [16 x float].
131
/// \param __W
132
///    A 256-bit vector of [16 x bfloat].
133
/// \param __U
134
///    A 16-bit mask value specifying what is chosen for each element.
135
///    A 1 means conversion of __A. A 0 means element from __W.
136
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
137
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
138
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
139
  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
140
                                                        (__v16bf)__W,
141
                                                        (__mmask16)__U);
142
}
143

144
/// Convert Packed Single Data to Packed BF16 Data.
145
///
146
/// \headerfile <x86intrin.h>
147
///
148
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
149
///
150
/// \param __A
151
///    A 512-bit vector of [16 x float].
152
/// \param __U
153
///    A 16-bit mask value specifying what is chosen for each element.
154
///    A 1 means conversion of __A. A 0 means element is zero.
155
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
156
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
157
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
158
  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
159
                                                (__v16bf)_mm256_setzero_si256(),
160
                                                (__mmask16)__U);
161
}
162

163
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
164
///
165
/// \headerfile <x86intrin.h>
166
///
167
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
168
///
169
/// \param __A
170
///    A 512-bit vector of [32 x bfloat].
171
/// \param __B
172
///    A 512-bit vector of [32 x bfloat].
173
/// \param __D
174
///    A 512-bit vector of [16 x float].
175
/// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
176
///  __A, __B and __D
177
static __inline__ __m512 __DEFAULT_FN_ATTRS512
178
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
179
  return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
180
                                             (__v32bf) __A,
181
                                             (__v32bf) __B);
182
}
183

184
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
185
///
186
/// \headerfile <x86intrin.h>
187
///
188
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
189
///
190
/// \param __A
191
///    A 512-bit vector of [32 x bfloat].
192
/// \param __B
193
///    A 512-bit vector of [32 x bfloat].
194
/// \param __D
195
///    A 512-bit vector of [16 x float].
196
/// \param __U
197
///    A 16-bit mask value specifying what is chosen for each element.
198
///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
199
/// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
200
///  __A, __B and __D
201
static __inline__ __m512 __DEFAULT_FN_ATTRS512
202
_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
203
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
204
                                       (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
205
                                       (__v16sf)__D);
206
}
207

208
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
209
///
210
/// \headerfile <x86intrin.h>
211
///
212
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
213
///
214
/// \param __A
215
///    A 512-bit vector of [32 x bfloat].
216
/// \param __B
217
///    A 512-bit vector of [32 x bfloat].
218
/// \param __D
219
///    A 512-bit vector of [16 x float].
220
/// \param __U
221
///    A 16-bit mask value specifying what is chosen for each element.
222
///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
223
/// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
224
///  __A, __B and __D
225
static __inline__ __m512 __DEFAULT_FN_ATTRS512
226
_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
227
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
228
                                       (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
229
                                       (__v16sf)_mm512_setzero_si512());
230
}
231

232
/// Convert Packed BF16 Data to Packed float Data.
233
///
234
/// \headerfile <x86intrin.h>
235
///
236
/// \param __A
237
///    A 256-bit vector of [16 x bfloat].
238
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
239
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
240
  return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
241
      (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
242
}
243

244
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
245
///
246
/// \headerfile <x86intrin.h>
247
///
248
/// \param __U
249
///    A 16-bit mask. Elements are zeroed out when the corresponding mask
250
///    bit is not set.
251
/// \param __A
252
///    A 256-bit vector of [16 x bfloat].
253
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
254
static __inline__ __m512 __DEFAULT_FN_ATTRS512
255
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
256
  return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
257
      (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
258
}
259

260
/// Convert Packed BF16 Data to Packed float Data using merging mask.
261
///
262
/// \headerfile <x86intrin.h>
263
///
264
/// \param __S
265
///    A 512-bit vector of [16 x float]. Elements are copied from __S when
266
///     the corresponding mask bit is not set.
267
/// \param __U
268
///    A 16-bit mask.
269
/// \param __A
270
///    A 256-bit vector of [16 x bfloat].
271
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
272
static __inline__ __m512 __DEFAULT_FN_ATTRS512
273
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
274
  return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
275
      (__m512i)__S, (__mmask16)__U,
276
      (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
277
}
278

279
#undef __DEFAULT_FN_ATTRS
280
#undef __DEFAULT_FN_ATTRS512
281

282
#endif
283
#endif
284

285
Product

Resources

Company