Path: blob/main/contrib/bearssl/src/hash/ghash_pclmul.c
/*
 * Copyright (c) 2017 Thomas Pornin <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */

#if BR_AES_X86NI

/*
 * Test CPU support for PCLMULQDQ.
 */
static inline int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *    1   PCLMULQDQ support
	 */
	return br_cpuid(0, 0, 0x00000002, 0);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return pclmul_supported() ? &br_ghash_pclmul : 0;
}

BR_TARGETS_X86_UP

/*
 * GHASH is defined over elements of GF(2^128) with "full little-endian"
 * representation: leftmost byte is least significant, and, within each
 * byte, leftmost _bit_ is least significant. The natural ordering in
 * x86 is "mixed little-endian": bytes are ordered from least to most
 * significant, but bits within a byte are in most-to-least significant
 * order. Going to full little-endian representation would require
 * reversing bits within each byte, which is doable but expensive.
 *
 * Instead, we go to full big-endian representation, by swapping bytes
 * around, which is done with a single _mm_shuffle_epi8() opcode (it
 * comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
 * can use a full big-endian representation because in a carryless
 * multiplication, we have a nice bit reversal property:
 *
 *    rev_128(x) * rev_128(y) = rev_255(x * y)
 *
 * So by using full big-endian, we still get the right result, except
 * that it is right-shifted by 1 bit. The left-shift is relatively
 * inexpensive, and it can be mutualised.
 *
 * Since SSE2 opcodes do not have facilities for shifting full 128-bit
 * values with bit precision, we have to break down values into 64-bit
 * chunks. We number chunks from 0 to 3 in left to right order.
 */
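
/*
 * To see the reversal property at a small size (a 4-bit sketch of the
 * 128-bit identity above): take x = 0011 and y = 0101. Their carryless
 * product is 0011 ^ (0011 << 2) = 0001111 (7 bits). Multiplying the
 * reversed operands rev_4(x) = 1100 and rev_4(y) = 1010 instead yields
 * (1100 << 1) ^ (1100 << 3) = 1111000, which is exactly rev_7(0001111).
 * The same identity at 128 bits is what lets us compute on byte-swapped
 * values and fix the 1-bit offset with a single left shift afterwards.
 */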

/*
 * Byte-swap a complete 128-bit value. This normally uses
 * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
 * However, this crashes old Clang versions, so, for Clang before 3.8,
 * we use an alternate (and less efficient) version.
 */
#if BR_CLANG && !BR_CLANG_3_8
#define BYTESWAP_DECL
#define BYTESWAP_PREP   (void)0
#define BYTESWAP(x)   do { \
		__m128i byteswap1, byteswap2; \
		byteswap1 = (x); \
		byteswap2 = _mm_srli_epi16(byteswap1, 8); \
		byteswap1 = _mm_slli_epi16(byteswap1, 8); \
		byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
		byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
		byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
		(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
	} while (0)
#else
#define BYTESWAP_DECL   __m128i byteswap_index;
#define BYTESWAP_PREP   do { \
		byteswap_index = _mm_set_epi8( \
			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
	} while (0)
#define BYTESWAP(x)   do { \
		(x) = _mm_shuffle_epi8((x), byteswap_index); \
	} while (0)
#endif

/*
 * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
 * for that compiler, we use inline assembly. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * We use a target of "sse2" only, so that Clang may still handle the
 * '__m128i' type and allocate SSE2 registers.
 */
#if BR_CLANG
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
#else
#define pclmulqdq00(x, y)   _mm_clmulepi64_si128(x, y, 0x00)
#define pclmulqdq11(x, y)   _mm_clmulepi64_si128(x, y, 0x11)
#endif

/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the right half of kx; left half is unspecified).
 */
#define BK(kw, kx)   do { \
		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
	} while (0)

/*
 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
 * the XOR of the two values (kx).
 */
#define PBK(k0, k1, kw, kx)   do { \
		kw = _mm_unpacklo_epi64(k1, k0); \
		kx = _mm_xor_si128(k0, k1); \
	} while (0)

/*
 * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
 */
#define SL_256(x0, x1, x2, x3)   do { \
		x0 = _mm_or_si128( \
			_mm_slli_epi64(x0, 1), \
			_mm_srli_epi64(x1, 63)); \
		x1 = _mm_or_si128( \
			_mm_slli_epi64(x1, 1), \
			_mm_srli_epi64(x2, 63)); \
		x2 = _mm_or_si128( \
			_mm_slli_epi64(x2, 1), \
			_mm_srli_epi64(x3, 63)); \
		x3 = _mm_slli_epi64(x3, 1); \
	} while (0)
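
/*
 * Note on the constants in the reduction below: GHASH works modulo
 * the polynomial x^128 + x^7 + x^2 + x + 1. In the bit-reversed
 * (big-endian) convention used here, folding a high 64-bit word into
 * the lower words XORs in that word shifted right by 0, 1, 2 and 7
 * bits, plus (for the spill-over into the next word) shifted left by
 * the complements 63, 62 and 57; the shift counts are the exponents
 * 0, 1, 2 and 7 of the low-degree terms of the modulus.
 */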

/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		x1 = _mm_xor_si128( \
			x1, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x3, \
					_mm_srli_epi64(x3, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x3, 2), \
					_mm_srli_epi64(x3, 7)))); \
		x2 = _mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_slli_epi64(x3, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x3, 62), \
				_mm_slli_epi64(x3, 57))); \
		x0 = _mm_xor_si128( \
			x0, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x2, \
					_mm_srli_epi64(x2, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x2, 2), \
					_mm_srli_epi64(x2, 7)))); \
		x1 = _mm_xor_si128( \
			_mm_xor_si128( \
				x1, \
				_mm_slli_epi64(x2, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x2, 62), \
				_mm_slli_epi64(x2, 57))); \
	} while (0)

/*
 * Square value kw into (dw,dx).
 */
#define SQUARE_F128(kw, dw, dx)   do { \
		__m128i z0, z1, z2, z3; \
		z1 = pclmulqdq11(kw, kw); \
		z3 = pclmulqdq00(kw, kw); \
		z0 = _mm_shuffle_epi32(z1, 0x0E); \
		z2 = _mm_shuffle_epi32(z3, 0x0E); \
		SL_256(z0, z1, z2, z3); \
		REDUCE_F128(z0, z1, z2, z3); \
		PBK(z0, z1, dw, dx); \
	} while (0)

/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	unsigned char tmp[64];
	size_t num4, num1;
	__m128i yw, h1w, h1x;
	BYTESWAP_DECL

	/*
	 * We split data into two chunks. First chunk starts at buf1
	 * and contains num4 blocks of 64-byte values. Second chunk
	 * starts at buf2 and contains num1 blocks of 16-byte values.
	 * We want the first chunk to be as large as possible.
	 */
	buf1 = data;
	num4 = len >> 6;
	len &= 63;
	buf2 = buf1 + (num4 << 6);
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}

	/*
	 * Preparatory step for endian conversions.
	 */
	BYTESWAP_PREP;

	/*
	 * Load y and h.
	 */
	yw = _mm_loadu_si128(y);
	h1w = _mm_loadu_si128(h);
	BYTESWAP(yw);
	BYTESWAP(h1w);
	BK(h1w, h1x);

	if (num4 > 0) {
		__m128i h2w, h2x, h3w, h3x, h4w, h4x;
		__m128i t0, t1, t2, t3;

		/*
		 * Compute h2 = h^2.
		 */
		SQUARE_F128(h1w, h2w, h2x);

		/*
		 * Compute h3 = h^3 = h*(h^2).
		 */
		t1 = pclmulqdq11(h1w, h2w);
		t3 = pclmulqdq00(h1w, h2w);
		t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
			_mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		PBK(t0, t1, h3w, h3x);

		/*
		 * Compute h4 = h^4 = (h^2)^2.
		 */
		SQUARE_F128(h2w, h4w, h4x);
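
		/*
		 * Main loop: absorb four blocks a0..a3 per iteration,
		 * i.e. compute:
		 *   y <- ((y ^ a0)*h^4) ^ (a1*h^3) ^ (a2*h^2) ^ (a3*h)
		 * with a single shift+reduction at the end. Each 128x128
		 * carryless product uses the Karatsuba trick: the "x"
		 * values (XOR of the two 64-bit halves) provide the
		 * middle term from a single extra 64x64 multiply.
		 */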
		while (num4 -- > 0) {
			__m128i aw0, aw1, aw2, aw3;
			__m128i ax0, ax1, ax2, ax3;

			aw0 = _mm_loadu_si128((void *)(buf1 +  0));
			aw1 = _mm_loadu_si128((void *)(buf1 + 16));
			aw2 = _mm_loadu_si128((void *)(buf1 + 32));
			aw3 = _mm_loadu_si128((void *)(buf1 + 48));
			BYTESWAP(aw0);
			BYTESWAP(aw1);
			BYTESWAP(aw2);
			BYTESWAP(aw3);
			buf1 += 64;

			aw0 = _mm_xor_si128(aw0, yw);
			BK(aw1, ax1);
			BK(aw2, ax2);
			BK(aw3, ax3);
			BK(aw0, ax0);

			t1 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq11(aw0, h4w),
					pclmulqdq11(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq11(aw2, h2w),
					pclmulqdq11(aw3, h1w)));
			t3 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(aw0, h4w),
					pclmulqdq00(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq00(aw2, h2w),
					pclmulqdq00(aw3, h1w)));
			t2 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(ax0, h4x),
					pclmulqdq00(ax1, h3x)),
				_mm_xor_si128(
					pclmulqdq00(ax2, h2x),
					pclmulqdq00(ax3, h1x)));
			t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
			t0 = _mm_shuffle_epi32(t1, 0x0E);
			t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
			t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
			SL_256(t0, t1, t2, t3);
			REDUCE_F128(t0, t1, t2, t3);
			yw = _mm_unpacklo_epi64(t1, t0);
		}
	}

	while (num1 -- > 0) {
		__m128i aw, ax;
		__m128i t0, t1, t2, t3;

		aw = _mm_loadu_si128((void *)buf2);
		BYTESWAP(aw);
		buf2 += 16;

		aw = _mm_xor_si128(aw, yw);
		BK(aw, ax);

		t1 = pclmulqdq11(aw, h1w);
		t3 = pclmulqdq00(aw, h1w);
		t2 = pclmulqdq00(ax, h1x);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		yw = _mm_unpacklo_epi64(t1, t0);
	}

	BYTESWAP(yw);
	_mm_storeu_si128(y, yw);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return 0;
}

#endif
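
/*
 * Usage sketch (illustrative, not part of this file): a caller obtains
 * the vectorized implementation through the _get() function, which
 * returns 0 when the CPU lacks pclmulqdq, and falls back to a portable
 * GHASH in that case, e.g.:
 *
 *   br_ghash gh;
 *
 *   gh = br_ghash_pclmul_get();
 *   if (gh == 0) {
 *           gh = &br_ghash_ctmul;
 *   }
 *   gh(y, h, data, len);
 */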