freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/aesni/aesni_ghash.c
/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 */

/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 * Intel® Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * and as such are:
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

static inline int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}

#ifdef __i386__
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif
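
/*
 * On i386 there is no native _mm_insert_epi64() (the underlying pinsrq
 * instruction is only encodable in 64-bit mode), so the wrapper above
 * builds the selected 64-bit half of the vector out of two 32-bit inserts.
 */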
/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
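
/*
 * Illustrative sketch (not used by the driver): a single GHASH block update
 * is Y = (Y ^ X) * H in GF(2^128), which is how the AES_GCM_* routines
 * below drive gfmul().  The helper name here is an illustration only.
 */
#if 0
static void
ghash_update1(__m128i *Y, __m128i H, const unsigned char *block)
{
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X;

	X = _mm_loadu_si128((const __m128i *)block);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);	/* byte-swap, as the callers below do */
	*Y = _mm_xor_si128(*Y, X);
	gfmul(*Y, H, Y);			/* Y = (Y ^ X) * H */
}
#endif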
/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);
	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);
	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);
	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
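
/*
 * What the callers below rely on (illustration): reduce4() computes
 * X1*H1 ^ X2*H2 ^ X3*H3 ^ X4*H4 with a single reduction, so one call over
 * four blocks C1..C4 equals four serial GHASH steps:
 *
 *	X' = (((((((X ^ C1)*H) ^ C2)*H) ^ C3)*H) ^ C4)*H
 *	   = (X ^ C1)*H^4 ^ C2*H^3 ^ C3*H^2 ^ C4*H
 *
 * which is why the running hash X is folded into the first block and the
 * calls take the form reduce4(H, H2, H3, H4, C4, C3, C2, X ^ C1, &X).
 */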
/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^32-256*8*16 bytes.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j <nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
		tmp8 =_mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	// If one incomplete block remains
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k],
		    nbytes % 16);
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
		for ((void)j; j<16; j++)
			((unsigned char*)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i*)tag, T);
}
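
/*
 * Illustrative call (not part of the driver), assuming AES-128 with a
 * 96-bit IV.  The round-key schedule must be produced elsewhere (the
 * driver's key-expansion code, for example) before such a call is valid;
 * the helper name and all buffers below are placeholders.
 */
#if 0
static void
example_aes_gcm_encrypt(void)
{
	/* nr+1 = 11 round keys for AES-128; the schedule is loaded as
	 * __m128i, so keep it 16-byte aligned. */
	unsigned char key_schedule[11 * 16] __attribute__((__aligned__(16)));
	unsigned char iv[12] = { 0 };	/* 96-bit IV */
	unsigned char aad[16] = { 0 };	/* additional authenticated data */
	unsigned char pt[64] = { 0 };	/* plaintext in */
	unsigned char ct[64];		/* ciphertext out */
	unsigned char tag[16];		/* 128-bit tag out */

	/* fill key_schedule with the expanded round keys first ... */
	AES_GCM_encrypt(pt, ct, aad, iv, tag,
	    sizeof(pt), sizeof(aad), sizeof(iv), key_schedule, 10);
}
#endif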
/* My modification of _encrypt to be _decrypt */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j <nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	/* This is where we validate the cipher text before decrypt */
	for (i = 0; i<nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i<nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<nbytes%16; j++)
			((unsigned char*)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
		return 0; // authentication failed

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
		tmp8 =_mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
	}
	// If one incomplete block remains
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
		tmp1 = _mm_xor_si128(tmp1, last_block);
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
	}
	return 1; // successful
}
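
/*
 * Caller-side note (illustration): AES_GCM_decrypt() verifies the tag over
 * the additional data and the ciphertext before any plaintext is produced,
 * so a zero return means authentication failed and "out" was left untouched;
 * a return of 1 means the tag matched and the plaintext was written.
 */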