freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/aesni/aesni_ghash.c
/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 */

/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 * Intel® Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * and as such are:
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

static inline int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}

#ifdef __i386__
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif
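
/*
 * On i386 there is no native _mm_insert_epi64() (the underlying pinsrq
 * instruction is only encodable in 64-bit mode), so the wrapper above
 * builds the selected 64-bit half of the vector out of two 32-bit inserts.
 */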
/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
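
/*
 * Illustrative sketch (not used by the driver): a single GHASH block update
 * is Y = (Y ^ X) * H in GF(2^128), which is how the AES_GCM_* routines
 * below drive gfmul().  The helper name here is an illustration only.
 */
#if 0
static void
ghash_update1(__m128i *Y, __m128i H, const unsigned char *block)
{
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X;

	X = _mm_loadu_si128((const __m128i *)block);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);	/* byte-swap, as the callers below do */
	*Y = _mm_xor_si128(*Y, X);
	gfmul(*Y, H, Y);			/* Y = (Y ^ X) * H */
}
#endif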
/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);
	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);
	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);
	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
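
/*
 * What the callers below rely on (illustration): reduce4() computes
 * X1*H1 ^ X2*H2 ^ X3*H3 ^ X4*H4 with a single reduction, so one call over
 * four blocks C1..C4 equals four serial GHASH steps:
 *
 *	X' = (((((((X ^ C1)*H) ^ C2)*H) ^ C3)*H) ^ C4)*H
 *	   = (X ^ C1)*H^4 ^ C2*H^3 ^ C3*H^2 ^ C4*H
 *
 * which is why the running hash X is folded into the first block and the
 * calls take the form reduce4(H, H2, H3, H4, C4, C3, C2, X ^ C1, &X).
 */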
/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^32-256*8*16 bytes.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j <nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
		tmp8 =_mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	// If one incomplete block remains
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k],
		    nbytes % 16);
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
		for ((void)j; j<16; j++)
			((unsigned char*)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i*)tag, T);
}
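
/*
 * Illustrative call (not part of the driver), assuming AES-128 with a
 * 96-bit IV.  The round-key schedule must be produced elsewhere (the
 * driver's key-expansion code, for example) before such a call is valid;
 * the helper name and all buffers below are placeholders.
 */
#if 0
static void
example_aes_gcm_encrypt(void)
{
	/* nr+1 = 11 round keys for AES-128; the schedule is loaded as
	 * __m128i, so keep it 16-byte aligned. */
	unsigned char key_schedule[11 * 16] __attribute__((__aligned__(16)));
	unsigned char iv[12] = { 0 };	/* 96-bit IV */
	unsigned char aad[16] = { 0 };	/* additional authenticated data */
	unsigned char pt[64] = { 0 };	/* plaintext in */
	unsigned char ct[64];		/* ciphertext out */
	unsigned char tag[16];		/* 128-bit tag out */

	/* fill key_schedule with the expanded round keys first ... */
	AES_GCM_encrypt(pt, ct, aad, iv, tag,
	    sizeof(pt), sizeof(aad), sizeof(iv), key_schedule, 10);
}
#endif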
/* My modification of _encrypt to be _decrypt */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j <nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	/* This is where we validate the cipher text before decrypt */
	for (i = 0; i<nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i<nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<nbytes%16; j++)
			((unsigned char*)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
		return 0; // authentication failed

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
		tmp8 =_mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
	}
	// If one incomplete block remains
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
		tmp1 = _mm_xor_si128(tmp1, last_block);
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
	}
	return 1; // successful
}
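
/*
 * Caller-side note (illustration): AES_GCM_decrypt() verifies the tag over
 * the additional data and the ciphertext before any plaintext is produced,
 * so a zero return means authentication failed and "out" was left untouched;
 * a return of 1 means the tag matched and the plaintext was written.
 */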