/* FreeBSD src: contrib/bearssl/src/symcipher/aes_x86ni_ctrcbc.c (branch main) */
1
/*
 * Copyright (c) 2017 Thomas Pornin <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
25
#define BR_ENABLE_INTRINSICS 1
26
#include "inner.h"
27
28
#if BR_AES_X86NI
29
30
/* see bearssl_block.h */
31
const br_block_ctrcbc_class *
32
br_aes_x86ni_ctrcbc_get_vtable(void)
33
{
34
return br_aes_x86ni_supported() ? &br_aes_x86ni_ctrcbc_vtable : NULL;
35
}
36
37
/* see bearssl_block.h */
38
void
39
br_aes_x86ni_ctrcbc_init(br_aes_x86ni_ctrcbc_keys *ctx,
40
const void *key, size_t len)
41
{
42
ctx->vtable = &br_aes_x86ni_ctrcbc_vtable;
43
ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
44
}
45
46
BR_TARGETS_X86_UP
47
48
/* see bearssl_block.h */
/*
 * CTR encryption/decryption: XOR the data (in place) with the AES-CTR
 * keystream derived from the 16-byte counter block 'ctr'. Four blocks
 * are processed in parallel to exploit AES-NI pipelining. The counter
 * block is updated in place. NOTE(review): the final 'switch' only
 * advances the stored counter for residues of 16/32/48 bytes, which
 * suggests callers pass a multiple of 16 bytes — confirm against the
 * bearssl_block.h contract.
 */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_ctr(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *ctr, void *data, size_t len)
{
	unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15];			/* round keys (15 max, for AES-256) */
	__m128i ivx0, ivx1, ivx2, ivx3;	/* 4 counters, little-endian lanes */
	__m128i erev, zero, one, four, notthree;
	unsigned u;

	buf = data;
	num_rounds = ctx->num_rounds;
	/* Load the expanded round keys (16 bytes each) into registers. */
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}

	/*
	 * Some SSE2 constants.
	 * erev reverses the 16 bytes of a register (big-endian <-> native),
	 * notthree = 0 - 4 = ~3 (all bits set except the low two).
	 */
	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15);
	zero = _mm_setzero_si128();
	one = _mm_set_epi64x(0, 1);
	four = _mm_set_epi64x(0, 4);
	notthree = _mm_sub_epi64(zero, four);

	/*
	 * Decode the counter in big-endian and pre-increment the other
	 * three counters. The _mm_sub_epi64(_mm_slli_si128(cmpeq...))
	 * construction propagates the carry: if the low 64-bit lane
	 * wrapped to zero, the comparison yields all-ones there, which
	 * is shifted into the high lane and subtracted (i.e. +1).
	 */
	ivx0 = _mm_shuffle_epi8(_mm_loadu_si128((void *)ctr), erev);
	ivx1 = _mm_add_epi64(ivx0, one);
	ivx1 = _mm_sub_epi64(ivx1,
		_mm_slli_si128(_mm_cmpeq_epi64(ivx1, zero), 8));
	ivx2 = _mm_add_epi64(ivx1, one);
	ivx2 = _mm_sub_epi64(ivx2,
		_mm_slli_si128(_mm_cmpeq_epi64(ivx2, zero), 8));
	ivx3 = _mm_add_epi64(ivx2, one);
	ivx3 = _mm_sub_epi64(ivx3,
		_mm_slli_si128(_mm_cmpeq_epi64(ivx3, zero), 8));
	while (len > 0) {
		__m128i x0, x1, x2, x3;

		/*
		 * Load counter values; we need to byteswap them because
		 * the specification says that they use big-endian.
		 */
		x0 = _mm_shuffle_epi8(ivx0, erev);
		x1 = _mm_shuffle_epi8(ivx1, erev);
		x2 = _mm_shuffle_epi8(ivx2, erev);
		x3 = _mm_shuffle_epi8(ivx3, erev);

		/*
		 * AES-encrypt the four counter blocks: initial AddRoundKey,
		 * then 9 common rounds, then 1/3/5 extra rounds depending
		 * on the key size (num_rounds = 10, 12 or 14).
		 */
		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x2 = _mm_xor_si128(x2, sk[0]);
		x3 = _mm_xor_si128(x3, sk[0]);
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x2 = _mm_aesenc_si128(x2, sk[1]);
		x3 = _mm_aesenc_si128(x3, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x2 = _mm_aesenc_si128(x2, sk[2]);
		x3 = _mm_aesenc_si128(x3, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x2 = _mm_aesenc_si128(x2, sk[3]);
		x3 = _mm_aesenc_si128(x3, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x2 = _mm_aesenc_si128(x2, sk[4]);
		x3 = _mm_aesenc_si128(x3, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x2 = _mm_aesenc_si128(x2, sk[5]);
		x3 = _mm_aesenc_si128(x3, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x2 = _mm_aesenc_si128(x2, sk[6]);
		x3 = _mm_aesenc_si128(x3, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x2 = _mm_aesenc_si128(x2, sk[7]);
		x3 = _mm_aesenc_si128(x3, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x2 = _mm_aesenc_si128(x2, sk[8]);
		x3 = _mm_aesenc_si128(x3, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		x2 = _mm_aesenc_si128(x2, sk[9]);
		x3 = _mm_aesenc_si128(x3, sk[9]);
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
			x2 = _mm_aesenclast_si128(x2, sk[10]);
			x3 = _mm_aesenclast_si128(x3, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x2 = _mm_aesenc_si128(x2, sk[10]);
			x3 = _mm_aesenc_si128(x3, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x2 = _mm_aesenc_si128(x2, sk[11]);
			x3 = _mm_aesenc_si128(x3, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
			x2 = _mm_aesenclast_si128(x2, sk[12]);
			x3 = _mm_aesenclast_si128(x3, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x2 = _mm_aesenc_si128(x2, sk[10]);
			x3 = _mm_aesenc_si128(x3, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x2 = _mm_aesenc_si128(x2, sk[11]);
			x3 = _mm_aesenc_si128(x3, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x2 = _mm_aesenc_si128(x2, sk[12]);
			x3 = _mm_aesenc_si128(x3, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x2 = _mm_aesenc_si128(x2, sk[13]);
			x3 = _mm_aesenc_si128(x3, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
			x2 = _mm_aesenclast_si128(x2, sk[14]);
			x3 = _mm_aesenclast_si128(x3, sk[14]);
		}
		if (len >= 64) {
			/* Full 64-byte chunk: XOR the keystream in place. */
			x0 = _mm_xor_si128(x0,
				_mm_loadu_si128((void *)(buf + 0)));
			x1 = _mm_xor_si128(x1,
				_mm_loadu_si128((void *)(buf + 16)));
			x2 = _mm_xor_si128(x2,
				_mm_loadu_si128((void *)(buf + 32)));
			x3 = _mm_xor_si128(x3,
				_mm_loadu_si128((void *)(buf + 48)));
			_mm_storeu_si128((void *)(buf + 0), x0);
			_mm_storeu_si128((void *)(buf + 16), x1);
			_mm_storeu_si128((void *)(buf + 32), x2);
			_mm_storeu_si128((void *)(buf + 48), x3);
			buf += 64;
			len -= 64;
		} else {
			/*
			 * Final partial chunk: dump the keystream to a
			 * stack buffer and XOR only 'len' bytes. The
			 * switch selects which counter corresponds to
			 * the next unused block, so that the value
			 * written back below is correct.
			 */
			unsigned char tmp[64];

			_mm_storeu_si128((void *)(tmp + 0), x0);
			_mm_storeu_si128((void *)(tmp + 16), x1);
			_mm_storeu_si128((void *)(tmp + 32), x2);
			_mm_storeu_si128((void *)(tmp + 48), x3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			switch (len) {
			case 16:
				ivx0 = ivx1;
				break;
			case 32:
				ivx0 = ivx2;
				break;
			case 48:
				ivx0 = ivx3;
				break;
			}
			break;
		}

		/*
		 * Add 4 to each counter value. For carry propagation
		 * into the upper 64-bit words, we would need to compare
		 * the results with 4, but SSE2+ has only _signed_
		 * comparisons. Instead, we mask out the low two bits,
		 * and check whether the remaining bits are zero.
		 */
		ivx0 = _mm_add_epi64(ivx0, four);
		ivx1 = _mm_add_epi64(ivx1, four);
		ivx2 = _mm_add_epi64(ivx2, four);
		ivx3 = _mm_add_epi64(ivx3, four);
		ivx0 = _mm_sub_epi64(ivx0,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx0, notthree), zero), 8));
		ivx1 = _mm_sub_epi64(ivx1,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx1, notthree), zero), 8));
		ivx2 = _mm_sub_epi64(ivx2,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx2, notthree), zero), 8));
		ivx3 = _mm_sub_epi64(ivx3,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx3, notthree), zero), 8));
	}

	/*
	 * Write back new counter value. The loop took care to put the
	 * right counter value in ivx0.
	 */
	_mm_storeu_si128((void *)ctr, _mm_shuffle_epi8(ivx0, erev));
}
253
254
/* see bearssl_block.h */
255
BR_TARGET("sse2,sse4.1,aes")
256
void
257
br_aes_x86ni_ctrcbc_mac(const br_aes_x86ni_ctrcbc_keys *ctx,
258
void *cbcmac, const void *data, size_t len)
259
{
260
const unsigned char *buf;
261
unsigned num_rounds;
262
__m128i sk[15], ivx;
263
unsigned u;
264
265
buf = data;
266
ivx = _mm_loadu_si128(cbcmac);
267
num_rounds = ctx->num_rounds;
268
for (u = 0; u <= num_rounds; u ++) {
269
sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
270
}
271
while (len > 0) {
272
__m128i x;
273
274
x = _mm_xor_si128(_mm_loadu_si128((void *)buf), ivx);
275
x = _mm_xor_si128(x, sk[0]);
276
x = _mm_aesenc_si128(x, sk[1]);
277
x = _mm_aesenc_si128(x, sk[2]);
278
x = _mm_aesenc_si128(x, sk[3]);
279
x = _mm_aesenc_si128(x, sk[4]);
280
x = _mm_aesenc_si128(x, sk[5]);
281
x = _mm_aesenc_si128(x, sk[6]);
282
x = _mm_aesenc_si128(x, sk[7]);
283
x = _mm_aesenc_si128(x, sk[8]);
284
x = _mm_aesenc_si128(x, sk[9]);
285
if (num_rounds == 10) {
286
x = _mm_aesenclast_si128(x, sk[10]);
287
} else if (num_rounds == 12) {
288
x = _mm_aesenc_si128(x, sk[10]);
289
x = _mm_aesenc_si128(x, sk[11]);
290
x = _mm_aesenclast_si128(x, sk[12]);
291
} else {
292
x = _mm_aesenc_si128(x, sk[10]);
293
x = _mm_aesenc_si128(x, sk[11]);
294
x = _mm_aesenc_si128(x, sk[12]);
295
x = _mm_aesenc_si128(x, sk[13]);
296
x = _mm_aesenclast_si128(x, sk[14]);
297
}
298
ivx = x;
299
buf += 16;
300
len -= 16;
301
}
302
_mm_storeu_si128(cbcmac, ivx);
303
}
304
305
/* see bearssl_block.h */
/*
 * Combined CTR encryption + CBC-MAC (over the produced ciphertext).
 * The two AES computations are pipelined: while block i is being
 * CTR-encrypted (x0), the CBC-MAC encryption for block i-1 runs in
 * parallel (x1). The CBC-MAC state kept in 'cmx' between iterations is
 * therefore "pending": it holds mac XOR ciphertext, awaiting its AES
 * encryption, which happens either in the next iteration or in the
 * extra encryption after the last block. NOTE(review): the loop
 * consumes exactly 16 bytes per iteration, so 'len' is presumably a
 * multiple of 16 — confirm against the bearssl_block.h contract.
 */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_encrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *ctr, void *cbcmac, void *data, size_t len)
{
	unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15];		/* round keys */
	__m128i ivx, cmx;	/* counter (LE lanes) and CBC-MAC state */
	__m128i erev, zero, one;
	unsigned u;
	int first_iter;

	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}

	/*
	 * Some SSE2 constants.
	 * erev reverses the 16 bytes of a register (big-endian <-> native).
	 */
	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15);
	zero = _mm_setzero_si128();
	one = _mm_set_epi64x(0, 1);

	/*
	 * Decode the counter in big-endian.
	 */
	ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
	cmx = _mm_loadu_si128(cbcmac);

	buf = data;
	first_iter = 1;
	while (len > 0) {
		__m128i dx, x0, x1;

		/*
		 * Load initial values:
		 *   dx   next plaintext block (to be CTR-encrypted)
		 *   x0   counter (for CTR encryption)
		 *   x1   input for CBC-MAC (pending state from the
		 *        previous iteration; garbage on first iteration,
		 *        where the x1 result is discarded below)
		 */
		dx = _mm_loadu_si128((void *)buf);
		x0 = _mm_shuffle_epi8(ivx, erev);
		x1 = cmx;

		/* Two AES encryptions in parallel (interleaved rounds). */
		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
		}

		/* x0 is now the ciphertext block. */
		x0 = _mm_xor_si128(x0, dx);
		if (first_iter) {
			/*
			 * On the first block there was no pending MAC
			 * input, so x1 (an encryption of the raw initial
			 * state) is discarded; the new pending state is
			 * initial-state XOR ciphertext.
			 */
			cmx = _mm_xor_si128(cmx, x0);
			first_iter = 0;
		} else {
			/* New pending state: E(prev pending) XOR c_i. */
			cmx = _mm_xor_si128(x1, x0);
		}
		_mm_storeu_si128((void *)buf, x0);

		buf += 16;
		len -= 16;

		/*
		 * Increment the counter value (with carry propagation
		 * into the upper 64-bit lane).
		 */
		ivx = _mm_add_epi64(ivx, one);
		ivx = _mm_sub_epi64(ivx,
			_mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));

		/*
		 * If this was the last iteration, then compute the
		 * extra block encryption to complete CBC-MAC.
		 */
		if (len == 0) {
			cmx = _mm_xor_si128(cmx, sk[0]);
			cmx = _mm_aesenc_si128(cmx, sk[1]);
			cmx = _mm_aesenc_si128(cmx, sk[2]);
			cmx = _mm_aesenc_si128(cmx, sk[3]);
			cmx = _mm_aesenc_si128(cmx, sk[4]);
			cmx = _mm_aesenc_si128(cmx, sk[5]);
			cmx = _mm_aesenc_si128(cmx, sk[6]);
			cmx = _mm_aesenc_si128(cmx, sk[7]);
			cmx = _mm_aesenc_si128(cmx, sk[8]);
			cmx = _mm_aesenc_si128(cmx, sk[9]);
			if (num_rounds == 10) {
				cmx = _mm_aesenclast_si128(cmx, sk[10]);
			} else if (num_rounds == 12) {
				cmx = _mm_aesenc_si128(cmx, sk[10]);
				cmx = _mm_aesenc_si128(cmx, sk[11]);
				cmx = _mm_aesenclast_si128(cmx, sk[12]);
			} else {
				cmx = _mm_aesenc_si128(cmx, sk[10]);
				cmx = _mm_aesenc_si128(cmx, sk[11]);
				cmx = _mm_aesenc_si128(cmx, sk[12]);
				cmx = _mm_aesenc_si128(cmx, sk[13]);
				cmx = _mm_aesenclast_si128(cmx, sk[14]);
			}
			break;
		}
	}

	/*
	 * Write back new counter value and CBC-MAC value.
	 */
	_mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
	_mm_storeu_si128(cbcmac, cmx);
}
453
454
/* see bearssl_block.h */
/*
 * Combined CTR decryption + CBC-MAC (over the received ciphertext).
 * Unlike the encryption direction, the ciphertext of each block is
 * available before processing, so the CBC-MAC encryption of block i
 * (x1) can run fully in parallel with its CTR decryption (x0) — no
 * pending state or extra final encryption is needed. NOTE(review):
 * the loop consumes exactly 16 bytes per iteration, so 'len' is
 * presumably a multiple of 16 — confirm against the bearssl_block.h
 * contract.
 */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_decrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *ctr, void *cbcmac, void *data, size_t len)
{
	unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15];		/* round keys */
	__m128i ivx, cmx;	/* counter (LE lanes) and CBC-MAC state */
	__m128i erev, zero, one;
	unsigned u;

	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}

	/*
	 * Some SSE2 constants.
	 * erev reverses the 16 bytes of a register (big-endian <-> native).
	 */
	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15);
	zero = _mm_setzero_si128();
	one = _mm_set_epi64x(0, 1);

	/*
	 * Decode the counter in big-endian.
	 */
	ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
	cmx = _mm_loadu_si128(cbcmac);

	buf = data;
	while (len > 0) {
		__m128i dx, x0, x1;

		/*
		 * Load initial values:
		 *   dx   encrypted block of data
		 *   x0   counter (for CTR encryption)
		 *   x1   input for CBC-MAC (state XOR ciphertext)
		 */
		dx = _mm_loadu_si128((void *)buf);
		x0 = _mm_shuffle_epi8(ivx, erev);
		x1 = _mm_xor_si128(cmx, dx);

		/* Two AES encryptions in parallel (interleaved rounds). */
		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
		}
		/* x0: plaintext block; x1: updated CBC-MAC state. */
		x0 = _mm_xor_si128(x0, dx);
		cmx = x1;
		_mm_storeu_si128((void *)buf, x0);

		buf += 16;
		len -= 16;

		/*
		 * Increment the counter value (with carry propagation
		 * into the upper 64-bit lane).
		 */
		ivx = _mm_add_epi64(ivx, one);
		ivx = _mm_sub_epi64(ivx,
			_mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));
	}

	/*
	 * Write back new counter value and CBC-MAC value.
	 */
	_mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
	_mm_storeu_si128(cbcmac, cmx);
}
563
564
BR_TARGETS_X86_DOWN
565
566
/* see bearssl_block.h */
/*
 * Vtable for the AES-NI CTR+CBC-MAC implementation. The function
 * pointers are cast to the generic class signatures (first parameter
 * becomes a pointer to the vtable field inside the context).
 */
const br_block_ctrcbc_class br_aes_x86ni_ctrcbc_vtable = {
	sizeof(br_aes_x86ni_ctrcbc_keys),
	16,	/* presumably block size in bytes — matches AES */
	4,	/* presumably log2(block size) — confirm in bearssl_block.h */
	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
	&br_aes_x86ni_ctrcbc_init,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
	&br_aes_x86ni_ctrcbc_encrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
	&br_aes_x86ni_ctrcbc_decrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, size_t))
	&br_aes_x86ni_ctrcbc_ctr,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, const void *, size_t))
	&br_aes_x86ni_ctrcbc_mac
};
586
587
#else
588
589
/* see bearssl_block.h */
const br_block_ctrcbc_class *
br_aes_x86ni_ctrcbc_get_vtable(void)
{
	/*
	 * AES-NI support was not compiled in (BR_AES_X86NI is unset);
	 * report that this implementation is unavailable.
	 */
	return NULL;
}
595
596
#endif
597
598