GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/icp/algs/modes/gcm.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/cmn_err.h>
#include <modes/modes.h>
#include <sys/crypto/common.h>
#include <sys/crypto/icp.h>
#include <sys/crypto/impl.h>
#include <sys/byteorder.h>
#include <sys/simd.h>
#include <modes/gcm_impl.h>
#ifdef CAN_USE_GCM_ASM
#include <aes/aes_impl.h>
#endif

#define	GHASH(c, d, t, o) \
	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
	(uint64_t *)(void *)(t));

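/*
 * Editor's illustration (not part of the upstream file): for a single
 * 16-byte block d, the GHASH macro above performs the usual GHASH chaining
 * step Y = (Y ^ d) * H in GF(2^128), writing the product into t.  Callers
 * normally pass t = (c)->gcm_ghash so the product becomes the new running
 * hash.  The locals below are hypothetical and exist only for this sketch.
 */
#if 0
	gcm_ctx_t *c;		/* an already initialized GCM context */
	uint64_t block[2];	/* d in the macro, one ciphertext block */
	const gcm_impl_ops_t *ops = gcm_impl_get_ops();

	GHASH(c, block, c->gcm_ghash, ops);
	/*
	 * ...which expands to:
	 *   xor_block((uint8_t *)block, (uint8_t *)c->gcm_ghash);
	 *   ops->mul((uint64_t *)(void *)c->gcm_ghash, c->gcm_H,
	 *       (uint64_t *)(void *)c->gcm_ghash);
	 */
#endif
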
/* Select GCM implementation */
#define IMPL_FASTEST (UINT32_MAX)
#define IMPL_CYCLE (UINT32_MAX-1)
#ifdef CAN_USE_GCM_ASM
#define IMPL_AVX (UINT32_MAX-2)
#if CAN_USE_GCM_ASM >= 2
#define IMPL_AVX2 (UINT32_MAX-3)
#endif
#endif
#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
static uint32_t icp_gcm_impl = IMPL_FASTEST;
static uint32_t user_sel_impl = IMPL_FASTEST;

#ifdef CAN_USE_GCM_ASM
/* Does the architecture we run on support the MOVBE instruction? */
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
 * Whether to use the optimized openssl gcm and ghash implementations.
 */
static gcm_impl gcm_impl_used = GCM_IMPL_GENERIC;
#define GCM_IMPL_USED (*(volatile gcm_impl *)&gcm_impl_used)

extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);

static inline boolean_t gcm_avx_will_work(void);
static inline boolean_t gcm_avx2_will_work(void);
static inline void gcm_use_impl(gcm_impl impl);
static inline gcm_impl gcm_toggle_impl(void);

static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
    crypto_data_t *, size_t);

static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
    size_t, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */

/*
 * Encrypt multiple blocks of data in GCM mode. Decryption for GCM mode
 * is handled in a separate function.
 */
int
gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->impl != GCM_IMPL_GENERIC)
		return (gcm_mode_encrypt_contiguous_blocks_avx(
		    ctx, data, length, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t remainder = length;
	size_t need = 0;
	uint8_t *datap = (uint8_t *)data;
	uint8_t *blockp;
	uint8_t *lastp;
	void *iov_or_mp;
	offset_t offset;
	uint8_t *out_data_1;
	uint8_t *out_data_2;
	size_t out_data_1_len;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);

	if (length + ctx->gcm_remainder_len < block_size) {
		/* accumulate bytes here and return */
		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
		    datap,
		    length);
		ctx->gcm_remainder_len += length;
		if (ctx->gcm_copy_to == NULL) {
			ctx->gcm_copy_to = datap;
		}
		return (CRYPTO_SUCCESS);
	}

	crypto_init_ptrs(out, &iov_or_mp, &offset);

	gops = gcm_impl_get_ops();
	do {
		/* Unprocessed data from last call. */
		if (ctx->gcm_remainder_len > 0) {
			need = block_size - ctx->gcm_remainder_len;

			if (need > remainder)
				return (CRYPTO_DATA_LEN_RANGE);

			memcpy(&((uint8_t *)ctx->gcm_remainder)
			    [ctx->gcm_remainder_len], datap, need);

			blockp = (uint8_t *)ctx->gcm_remainder;
		} else {
			blockp = datap;
		}

		/*
		 * Increment counter. Counter bits are confined
		 * to the bottom 32 bits of the counter block.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);
		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);

		lastp = (uint8_t *)ctx->gcm_tmp;

		ctx->gcm_processed_data_len += block_size;

		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
		    &out_data_1_len, &out_data_2, block_size);

		/* copy block to where it belongs */
		if (out_data_1_len == block_size) {
			copy_block(lastp, out_data_1);
		} else {
			memcpy(out_data_1, lastp, out_data_1_len);
			if (out_data_2 != NULL) {
				memcpy(out_data_2,
				    lastp + out_data_1_len,
				    block_size - out_data_1_len);
			}
		}
		/* update offset */
		out->cd_offset += block_size;

		/* add ciphertext to the hash */
		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);

		/* Update pointer to next block of data to be processed. */
		if (ctx->gcm_remainder_len != 0) {
			datap += need;
			ctx->gcm_remainder_len = 0;
		} else {
			datap += block_size;
		}

		remainder = (size_t)&data[length] - (size_t)datap;

		/* Incomplete last block. */
		if (remainder > 0 && remainder < block_size) {
			memcpy(ctx->gcm_remainder, datap, remainder);
			ctx->gcm_remainder_len = remainder;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		ctx->gcm_copy_to = NULL;

	} while (remainder > 0);
out:
	return (CRYPTO_SUCCESS);
}

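/*
 * Editor's note (illustrative, not part of the upstream file): the remainder
 * handling above lets callers pass data in arbitrarily sized pieces.  With
 * block_size = 16, a first call with 10 bytes only accumulates them in
 * ctx->gcm_remainder (gcm_remainder_len = 10) and returns; a second call
 * with 6 bytes completes the block (need = 6), which is then encrypted,
 * hashed and written out, and gcm_remainder_len is reset to 0.
 */
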
int
gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) copy_block;
#ifdef CAN_USE_GCM_ASM
	if (ctx->impl != GCM_IMPL_GENERIC)
		return (gcm_encrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint8_t *ghash, *macp = NULL;
	int i, rv;

	if (out->cd_length <
	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;

	if (ctx->gcm_remainder_len > 0) {
		uint64_t counter;
		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;

		/*
		 * Here is where we deal with data that is not a
		 * multiple of the block size.
		 */

		/*
		 * Increment counter.
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
		    (uint8_t *)ctx->gcm_tmp);

		macp = (uint8_t *)ctx->gcm_remainder;
		memset(macp + ctx->gcm_remainder_len, 0,
		    block_size - ctx->gcm_remainder_len);

		/* XOR with counter block */
		for (i = 0; i < ctx->gcm_remainder_len; i++) {
			macp[i] ^= tmpp[i];
		}

		/* add ciphertext to the hash */
		GHASH(ctx, macp, ghash, gops);

		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
	}

	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	if (ctx->gcm_remainder_len > 0) {
		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += ctx->gcm_remainder_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);
	out->cd_offset += ctx->gcm_tag_len;

	return (CRYPTO_SUCCESS);
}

/*
 * This only deals with decrypting the last block of the input, which
 * might not be a multiple of the block length.
 */
static void
gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	uint8_t *datap, *outp, *counterp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int i;

	/*
	 * Increment counter.
	 * Counter bits are confined to the bottom 32 bits
	 */
	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
	counter = htonll(counter + 1);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

	datap = (uint8_t *)ctx->gcm_remainder;
	outp = &((ctx->gcm_pt_buf)[index]);
	counterp = (uint8_t *)ctx->gcm_tmp;

	/* authentication tag */
	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);

	/* add ciphertext to the hash */
	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());

	/* decrypt remaining ciphertext */
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);

	/* XOR with counter block */
	for (i = 0; i < ctx->gcm_remainder_len; i++) {
		outp[i] = datap[i] ^ counterp[i];
	}
}

int
gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
    crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
	    (void) xor_block;
	size_t new_len;
	uint8_t *new;

	/*
	 * Copy contiguous ciphertext input blocks to the plaintext buffer.
	 * The ciphertext will be decrypted in the final call.
	 */
	if (length > 0) {
		new_len = ctx->gcm_pt_buf_len + length;
		new = vmem_alloc(new_len, KM_SLEEP);
		if (new == NULL) {
			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
			ctx->gcm_pt_buf = NULL;
			return (CRYPTO_HOST_MEMORY);
		}

		if (ctx->gcm_pt_buf != NULL) {
			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
		} else {
			ASSERT0(ctx->gcm_pt_buf_len);
		}

		ctx->gcm_pt_buf = new;
		ctx->gcm_pt_buf_len = new_len;
		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
		    length);
		ctx->gcm_processed_data_len += length;
	}

	ctx->gcm_remainder_len = 0;
	return (CRYPTO_SUCCESS);
}

int
gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
#ifdef CAN_USE_GCM_ASM
	if (ctx->impl != GCM_IMPL_GENERIC)
		return (gcm_decrypt_final_avx(ctx, out, block_size));
#endif

	const gcm_impl_ops_t *gops;
	size_t pt_len;
	size_t remainder;
	uint8_t *ghash;
	uint8_t *blockp;
	uint8_t *cbp;
	uint64_t counter;
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	int processed = 0, rv;

	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);

	gops = gcm_impl_get_ops();
	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	ghash = (uint8_t *)ctx->gcm_ghash;
	blockp = ctx->gcm_pt_buf;
	remainder = pt_len;
	while (remainder > 0) {
		/* Incomplete last block */
		if (remainder < block_size) {
			memcpy(ctx->gcm_remainder, blockp, remainder);
			ctx->gcm_remainder_len = remainder;
			/*
			 * We are not expecting any more ciphertext, just
			 * compute the plaintext for the remaining input.
			 */
			gcm_decrypt_incomplete_block(ctx, block_size,
			    processed, encrypt_block, xor_block);
			ctx->gcm_remainder_len = 0;
			goto out;
		}
		/* add ciphertext to the hash */
		GHASH(ctx, blockp, ghash, gops);

		/*
		 * Increment counter.
		 * Counter bits are confined to the bottom 32 bits
		 */
		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
		counter = htonll(counter + 1);
		counter &= counter_mask;
		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;

		cbp = (uint8_t *)ctx->gcm_tmp;
		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);

		/* XOR with ciphertext */
		xor_block(cbp, blockp);

		processed += block_size;
		blockp += block_size;
		remainder -= block_size;
	}
out:
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
	    (uint8_t *)ctx->gcm_J0);
	xor_block((uint8_t *)ctx->gcm_J0, ghash);

	/* compare the input authentication tag with what we calculated */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match */
		return (CRYPTO_INVALID_MAC);
	} else {
		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
		out->cd_offset += pt_len;
	}
	return (CRYPTO_SUCCESS);
}

static int
gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
{
	size_t tag_len;

	/*
	 * Check the length of the authentication tag (in bits).
	 */
	tag_len = gcm_param->ulTagBits;
	switch (tag_len) {
	case 32:
	case 64:
	case 96:
	case 104:
	case 112:
	case 120:
	case 128:
		break;
	default:
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	if (gcm_param->ulIvLen == 0)
		return (CRYPTO_MECHANISM_PARAM_INVALID);

	return (CRYPTO_SUCCESS);
}

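/*
 * Editor's illustration (not part of the upstream file): a parameter block
 * that passes gcm_validate_args().  The field values are hypothetical; the
 * structure and field names are the ones used by gcm_init_ctx() below.
 */
#if 0
	static uint8_t iv[12] = { 0 };
	CK_AES_GCM_PARAMS params = {
		.pIv = iv,
		.ulIvLen = sizeof (iv),	/* must be > 0 */
		.pAAD = NULL,
		.ulAADLen = 0,
		.ulTagBits = 128	/* one of 32,64,96,104,112,120,128 */
	};

	VERIFY0(gcm_validate_args(&params));
#endif
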
static void
gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
    gcm_ctx_t *ctx, size_t block_size,
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *cb;
	ulong_t remainder = iv_len;
	ulong_t processed = 0;
	uint8_t *datap, *ghash;
	uint64_t len_a_len_c[2];

	gops = gcm_impl_get_ops();
	ghash = (uint8_t *)ctx->gcm_ghash;
	cb = (uint8_t *)ctx->gcm_cb;
	if (iv_len == 12) {
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* J0 will be used again in the final */
		copy_block(cb, (uint8_t *)ctx->gcm_J0);
	} else {
		/* GHASH the IV */
		do {
			if (remainder < block_size) {
				memset(cb, 0, block_size);
				memcpy(cb, &(iv[processed]), remainder);
				datap = (uint8_t *)cb;
				remainder = 0;
			} else {
				datap = (uint8_t *)(&(iv[processed]));
				processed += block_size;
				remainder -= block_size;
			}
			GHASH(ctx, datap, ghash, gops);
		} while (remainder > 0);

		len_a_len_c[0] = 0;
		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);

		/* J0 will be used again in the final */
		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
	}
}

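/*
 * Editor's note (illustrative, not part of the upstream file): for the
 * common 96-bit IV case above, the pre-counter block is simply
 * J0 = IV || 0x00000001.  E.g. with iv = 00 11 22 33 44 55 66 77 88 99 aa bb
 * the counter block becomes
 * 00 11 22 33 44 55 66 77 88 99 aa bb 00 00 00 01, and a copy is kept in
 * ctx->gcm_J0 for computing the tag in the final call.  For other IV lengths
 * J0 = GHASH(IV || padding || [0]64 || [len(IV)]64), which is what the else
 * branch computes.
 */
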
static int
gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
    int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
    void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	const gcm_impl_ops_t *gops;
	uint8_t *ghash, *datap, *authp;
	size_t remainder, processed;

	/* encrypt zero block to get subkey H */
	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
	    (uint8_t *)ctx->gcm_H);

	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
	    copy_block, xor_block);

	gops = gcm_impl_get_ops();
	authp = (uint8_t *)ctx->gcm_tmp;
	ghash = (uint8_t *)ctx->gcm_ghash;
	memset(authp, 0, block_size);
	memset(ghash, 0, block_size);

	processed = 0;
	remainder = auth_data_len;
	do {
		if (remainder < block_size) {
			/*
			 * There isn't a full block of data; pad the rest
			 * of the buffer with zeros.
			 */

			if (auth_data != NULL) {
				memset(authp, 0, block_size);
				memcpy(authp, &(auth_data[processed]),
				    remainder);
			} else {
				ASSERT0(remainder);
			}

			datap = (uint8_t *)authp;
			remainder = 0;
		} else {
			datap = (uint8_t *)(&(auth_data[processed]));
			processed += block_size;
			remainder -= block_size;
		}

		/* add auth data to the hash */
		GHASH(ctx, datap, ghash, gops);

	} while (remainder > 0);

	return (CRYPTO_SUCCESS);
}

/*
 * Init the GCM context struct. Handle the cycle and avx implementations here.
 */
int
gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
    size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
    uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
    void (*xor_block)(uint8_t *, uint8_t *))
{
	CK_AES_GCM_PARAMS *gcm_param;
	int rv = CRYPTO_SUCCESS;
	size_t tag_len, iv_len;

	if (param != NULL) {
		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;

		/* GCM mode. */
		if ((rv = gcm_validate_args(gcm_param)) != 0) {
			return (rv);
		}
		gcm_ctx->gcm_flags |= GCM_MODE;

		size_t tbits = gcm_param->ulTagBits;
		tag_len = CRYPTO_BITS2BYTES(tbits);
		iv_len = gcm_param->ulIvLen;

		gcm_ctx->gcm_tag_len = tag_len;
		gcm_ctx->gcm_processed_data_len = 0;

		/* these values are in bits */
		gcm_ctx->gcm_len_a_len_c[0]
		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
	} else {
		return (CRYPTO_MECHANISM_PARAM_INVALID);
	}

	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
	size_t aad_len = gcm_param->ulAADLen;

#ifdef CAN_USE_GCM_ASM
	boolean_t needs_bswap =
	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;

	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
		gcm_ctx->impl = GCM_IMPL_USED;
	} else {
		/*
		 * Handle the "cycle" implementation by creating different
		 * contexts, one per implementation.
		 */
		gcm_ctx->impl = gcm_toggle_impl();

		/* The AVX impl. doesn't handle byte swapped key schedules. */
		if (needs_bswap == B_TRUE) {
			gcm_ctx->impl = GCM_IMPL_GENERIC;
		}
		/*
		 * If this is an AVX context, use the MOVBE and the BSWAP
		 * variants alternately.
		 */
		if (gcm_ctx->impl == GCM_IMPL_AVX &&
		    zfs_movbe_available() == B_TRUE) {
			(void) atomic_toggle_boolean_nv(
			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
		}
	}
	/*
	 * We don't handle byte swapped key schedules in the avx code path,
	 * but they could still be created by the generic aes implementation.
	 * Make sure not to use them, since we would corrupt data if we did.
	 */
	if (gcm_ctx->impl != GCM_IMPL_GENERIC && needs_bswap == B_TRUE) {
		gcm_ctx->impl = GCM_IMPL_GENERIC;

		cmn_err_once(CE_WARN,
		    "ICP: Can't use the aes generic or cycle implementations "
		    "in combination with the gcm avx or avx2-vaes "
		    "implementation!");
		cmn_err_once(CE_WARN,
		    "ICP: Falling back to a compatible implementation, "
		    "aes-gcm performance will likely be degraded.");
		cmn_err_once(CE_WARN,
		    "ICP: Choose at least the x86_64 aes implementation to "
		    "restore performance.");
	}

	/*
	 * The AVX implementations use an Htable whose size depends on the
	 * implementation.
	 */
	if (gcm_ctx->impl != GCM_IMPL_GENERIC) {
		rv = gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
		    block_size);
	}
	else
#endif /* ifdef CAN_USE_GCM_ASM */
	if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
	    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
		rv = CRYPTO_MECHANISM_PARAM_INVALID;
	}

	return (rv);
}

void *
gcm_alloc_ctx(int kmflag)
{
	gcm_ctx_t *gcm_ctx;

	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
		return (NULL);

	gcm_ctx->gcm_flags = GCM_MODE;
	return (gcm_ctx);
}

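/*
 * Editor's sketch (not part of the upstream file): how a caller might obtain
 * and initialize a GCM context with the two functions above.  The AES
 * callbacks (aes_encrypt_block, aes_copy_block, aes_xor_block), AES_BLOCK_LEN
 * and the key-schedule setup are assumptions borrowed from the ICP aes
 * module and are shown only to make the calling convention concrete.
 */
#if 0
	void *keysched;			/* hypothetical AES key schedule */
	CK_AES_GCM_PARAMS params;	/* filled in as sketched earlier */
	gcm_ctx_t *ctx = gcm_alloc_ctx(KM_SLEEP);

	ctx->gcm_keysched = keysched;
	if (gcm_init_ctx(ctx, (char *)&params, AES_BLOCK_LEN,
	    aes_encrypt_block, aes_copy_block, aes_xor_block) !=
	    CRYPTO_SUCCESS) {
		/* handle CRYPTO_MECHANISM_PARAM_INVALID, ... */
	}
#endif
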
/* GCM implementation that contains the fastest methods */
static gcm_impl_ops_t gcm_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
static const gcm_impl_ops_t *gcm_all_impl[] = {
	&gcm_generic_impl,
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	&gcm_pclmulqdq_impl,
#endif
};

/* Indicate that benchmark has been completed */
static boolean_t gcm_impl_initialized = B_FALSE;

/* Hold all supported implementations */
static size_t gcm_supp_impl_cnt = 0;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];

/*
 * Returns the GCM operations for encrypt/decrypt/key setup. When a
 * SIMD implementation is not allowed in the current context, fall
 * back to the generic implementation.
 */
const gcm_impl_ops_t *
gcm_impl_get_ops(void)
{
	if (!kfpu_allowed())
		return (&gcm_generic_impl);

	const gcm_impl_ops_t *ops = NULL;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	switch (impl) {
	case IMPL_FASTEST:
		ASSERT(gcm_impl_initialized);
		ops = &gcm_fastest_impl;
		break;
	case IMPL_CYCLE:
		/* Cycle through supported implementations */
		ASSERT(gcm_impl_initialized);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		static size_t cycle_impl_idx = 0;
		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
		ops = gcm_supp_impl[idx];
		break;
#ifdef CAN_USE_GCM_ASM
	case IMPL_AVX:
#if CAN_USE_GCM_ASM >= 2
	case IMPL_AVX2:
#endif
		/*
		 * Make sure that we return a valid implementation while
		 * switching to the avx implementation, since there may
		 * still be unfinished non-avx contexts around.
		 */
		ops = &gcm_generic_impl;
		break;
#endif
	default:
		ASSERT3U(impl, <, gcm_supp_impl_cnt);
		ASSERT3U(gcm_supp_impl_cnt, >, 0);
		if (impl < ARRAY_SIZE(gcm_all_impl))
			ops = gcm_supp_impl[impl];
		break;
	}

	ASSERT3P(ops, !=, NULL);

	return (ops);
}

/*
 * Initialize all supported implementations.
 */
void
gcm_impl_init(void)
{
	gcm_impl_ops_t *curr_impl;
	int i, c;

	/* Move supported implementations into gcm_supp_impl */
	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];

		if (curr_impl->is_supported())
			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
	}
	gcm_supp_impl_cnt = c;

	/*
	 * Set the fastest implementation given the assumption that the
	 * hardware accelerated version is the fastest.
	 */
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
	if (gcm_pclmulqdq_impl.is_supported()) {
		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
		    sizeof (gcm_fastest_impl));
	} else
#endif
	{
		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
		    sizeof (gcm_fastest_impl));
	}

	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);

#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if it's available and the implementation
	 * hasn't changed from its default value of fastest on module load.
	 */
#if CAN_USE_GCM_ASM >= 2
	if (gcm_avx2_will_work()) {
		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
			gcm_use_impl(GCM_IMPL_AVX2);
		}
	} else
#endif
	if (gcm_avx_will_work()) {
#ifdef HAVE_MOVBE
		if (zfs_movbe_available() == B_TRUE) {
			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
		}
#endif
		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
			gcm_use_impl(GCM_IMPL_AVX);
		}
	}
#endif
	/* Finish initialization */
	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
	gcm_impl_initialized = B_TRUE;
}

static const struct {
	const char *name;
	uint32_t sel;
} gcm_impl_opts[] = {
		{ "cycle",	IMPL_CYCLE },
		{ "fastest",	IMPL_FASTEST },
#ifdef CAN_USE_GCM_ASM
		{ "avx",	IMPL_AVX },
		{ "avx2-vaes",	IMPL_AVX2 },
#endif
};

/*
 * This function sets the desired gcm implementation.
 *
 * If we are called before init(), the user preference will be saved in
 * user_sel_impl and applied in a later init() call. This occurs when the
 * module parameter is specified on module load. Otherwise, directly update
 * icp_gcm_impl.
 *
 * @val Name of gcm implementation to use
 * @param Unused.
 */
int
gcm_impl_set(const char *val)
{
	int err = -EINVAL;
	char req_name[GCM_IMPL_NAME_MAX];
	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
	size_t i;

	/* sanitize input */
	i = strnlen(val, GCM_IMPL_NAME_MAX);
	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
		return (err);

	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
	while (i > 0 && isspace(req_name[i-1]))
		i--;
	req_name[i] = '\0';

	/* Check mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
#if CAN_USE_GCM_ASM >= 2
		/* Ignore avx implementation if it won't work. */
		if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
		    !gcm_avx2_will_work()) {
			continue;
		}
#endif
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
			impl = gcm_impl_opts[i].sel;
			err = 0;
			break;
		}
	}

	/* check all supported impl if init() was already called */
	if (err != 0 && gcm_impl_initialized) {
		/* check all supported implementations */
		for (i = 0; i < gcm_supp_impl_cnt; i++) {
			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
				impl = i;
				err = 0;
				break;
			}
		}
	}
#ifdef CAN_USE_GCM_ASM
	/*
	 * Use the avx implementation if available and the requested one is
	 * avx or fastest.
	 */
#if CAN_USE_GCM_ASM >= 2
	if (gcm_avx2_will_work() == B_TRUE &&
	    (impl == IMPL_AVX2 || impl == IMPL_FASTEST)) {
		gcm_use_impl(GCM_IMPL_AVX2);
	} else
#endif
	if (gcm_avx_will_work() == B_TRUE &&
	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
		gcm_use_impl(GCM_IMPL_AVX);
	} else {
		gcm_use_impl(GCM_IMPL_GENERIC);
	}
#endif

	if (err == 0) {
		if (gcm_impl_initialized)
			atomic_swap_32(&icp_gcm_impl, impl);
		else
			atomic_swap_32(&user_sel_impl, impl);
	}

	return (err);
}

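/*
 * Editor's illustration (not part of the upstream file): gcm_impl_set()
 * trims trailing whitespace before matching, so the newline that arrives
 * with a module-parameter write is accepted.
 */
#if 0
	VERIFY0(gcm_impl_set("fastest\n"));	/* selects IMPL_FASTEST */
	VERIFY3S(gcm_impl_set("nonsense"), ==, -EINVAL);
#endif
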
#if defined(_KERNEL) && defined(__linux__)

static int
icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
{
	return (gcm_impl_set(val));
}

static int
icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
{
	int i, cnt = 0;
	char *fmt;
	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);

	/* list mandatory options */
	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
#ifdef CAN_USE_GCM_ASM
		/* Ignore avx implementation if it won't work. */
#if CAN_USE_GCM_ASM >= 2
		if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
		    !gcm_avx2_will_work()) {
			continue;
		}
#endif
		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
			continue;
		}
#endif
		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    gcm_impl_opts[i].name);
	}

	/* list all supported implementations */
	for (i = 0; i < gcm_supp_impl_cnt; i++) {
		fmt = (i == impl) ? "[%s] " : "%s ";
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    gcm_supp_impl[i]->name);
	}

	return (cnt);
}

module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
    NULL, 0644);
MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(_KERNEL) && defined(__linux__) */

#ifdef CAN_USE_GCM_ASM
#define GCM_BLOCK_LEN 16
/*
 * The openssl asm routines are 6x aggregated and need at least that many
 * bytes as input.
 */
#define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
#define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
/*
 * Ensure the chunk size is reasonable since we are allocating a
 * GCM_AVX_MAX_CHUNK_SIZE-sized buffer and disabling preemption and
 * interrupts.
 */
#define GCM_AVX_MAX_CHUNK_SIZE \
	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)

/* Clear the FPU registers since they hold sensitive internal state. */
#define clear_fpu_regs() clear_fpu_regs_avx()

#define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)

/* Get the chunk size module parameter. */
#define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size

/*
 * Module parameter: number of bytes to process at once while owning the FPU.
 * It is rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and
 * guaranteed to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
 */
static uint32_t gcm_avx_chunk_size =
	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;

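/*
 * Editor's note (illustrative, not part of the upstream file): with
 * GCM_AVX_MIN_DECRYPT_BYTES = 96, the default above works out to
 * (32768 / 96) * 96 = 341 * 96 = 32736 bytes, and GCM_AVX_MAX_CHUNK_SIZE
 * is (131072 / 96) * 96 = 1365 * 96 = 131040 bytes.
 */
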
/*
 * GCM definitions: uint128_t is copied from include/crypto/modes.h
 * Avoiding u128 because it is already defined in kernel sources.
 */
typedef struct {
	uint64_t hi, lo;
} uint128_t;

extern void ASMABI clear_fpu_regs_avx(void);
extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
    const uint32_t pt[4], uint32_t ct[4]);

extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI gcm_init_vpclmulqdq_avx2(uint128_t Htable[16],
    const uint64_t H[2]);
#endif
extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
    const uint8_t *in, size_t len);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI gcm_ghash_vpclmulqdq_avx2(uint64_t ghash[2],
    const uint64_t *Htable, const uint8_t *in, size_t len);
#endif
static inline void GHASH_AVX(gcm_ctx_t *ctx, const uint8_t *in, size_t len)
{
	switch (ctx->impl) {
#if CAN_USE_GCM_ASM >= 2
	case GCM_IMPL_AVX2:
		gcm_ghash_vpclmulqdq_avx2(ctx->gcm_ghash,
		    (const uint64_t *)ctx->gcm_Htable, in, len);
		break;
#endif

	case GCM_IMPL_AVX:
		gcm_ghash_avx(ctx->gcm_ghash,
		    (const uint64_t *)ctx->gcm_Htable, in, len);
		break;

	default:
		VERIFY(B_FALSE);
	}
}

typedef size_t ASMABI aesni_gcm_encrypt_impl(const uint8_t *, uint8_t *,
    size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI aes_gcm_enc_update_vaes_avx2(const uint8_t *in,
    uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
    const uint128_t Htable[16], uint8_t Xi[16]);
#endif

typedef size_t ASMABI aesni_gcm_decrypt_impl(const uint8_t *, uint8_t *,
    size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
    const void *, uint64_t *, uint64_t *);
#if CAN_USE_GCM_ASM >= 2
extern void ASMABI aes_gcm_dec_update_vaes_avx2(const uint8_t *in,
    uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
    const uint128_t Htable[16], uint8_t Xi[16]);
#endif

static inline boolean_t
gcm_avx2_will_work(void)
{
	return (kfpu_allowed() &&
	    zfs_avx2_available() && zfs_vaes_available() &&
	    zfs_vpclmulqdq_available());
}

static inline boolean_t
gcm_avx_will_work(void)
{
	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
	return (kfpu_allowed() &&
	    zfs_avx_available() && zfs_aes_available() &&
	    zfs_pclmulqdq_available());
}

static inline void
gcm_use_impl(gcm_impl impl)
{
	switch (impl) {
#if CAN_USE_GCM_ASM >= 2
	case GCM_IMPL_AVX2:
		if (gcm_avx2_will_work() == B_TRUE) {
			atomic_swap_32(&gcm_impl_used, impl);
			return;
		}

		zfs_fallthrough;
#endif

	case GCM_IMPL_AVX:
		if (gcm_avx_will_work() == B_TRUE) {
			atomic_swap_32(&gcm_impl_used, impl);
			return;
		}

		zfs_fallthrough;

	default:
		atomic_swap_32(&gcm_impl_used, GCM_IMPL_GENERIC);
	}
}

static inline boolean_t
gcm_impl_will_work(gcm_impl impl)
{
	switch (impl) {
#if CAN_USE_GCM_ASM >= 2
	case GCM_IMPL_AVX2:
		return (gcm_avx2_will_work());
#endif

	case GCM_IMPL_AVX:
		return (gcm_avx_will_work());

	default:
		return (B_TRUE);
	}
}

static inline gcm_impl
gcm_toggle_impl(void)
{
	gcm_impl current_impl, new_impl;
	do { /* handle races */
		current_impl = atomic_load_32(&gcm_impl_used);
		new_impl = current_impl;
		while (B_TRUE) { /* handle incompatible implementations */
			new_impl = (new_impl + 1) % GCM_IMPL_MAX;
			if (gcm_impl_will_work(new_impl)) {
				break;
			}
		}

	} while (atomic_cas_32(&gcm_impl_used, current_impl, new_impl) !=
	    current_impl);

	return (new_impl);
}


/* Increment the GCM counter block by n. */
static inline void
gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
{
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);

	counter = htonll(counter + n);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
}

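/*
 * Editor's note (illustrative, not part of the upstream file): only the last
 * 32 bits of the counter block are incremented, and they wrap around without
 * carrying into the rest of the block.  E.g. if the low 32 bits of gcm_cb[1]
 * hold the big-endian value 0xffffffff, gcm_incr_counter_block(ctx) wraps
 * them to 0x00000000 and leaves the upper 96 bits of the counter block
 * untouched.
 */
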
static size_t aesni_gcm_encrypt_avx(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	(void) Htable;
	return (aesni_gcm_encrypt(in, out, len, key, iv, Xip));
}

#if CAN_USE_GCM_ASM >= 2
// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
// bits of a |size_t|.
// This is from boringssl/crypto/fipsmodule/aes/gcm.cc.inc
static const size_t kSizeTWithoutLower4Bits = (size_t)-16;

/* The following CRYPTO methods are from boringssl/crypto/internal.h */
static inline uint32_t CRYPTO_bswap4(uint32_t x) {
	return (__builtin_bswap32(x));
}

static inline uint32_t CRYPTO_load_u32_be(const void *in) {
	uint32_t v;
	memcpy(&v, in, sizeof (v));
	return (CRYPTO_bswap4(v));
}

static inline void CRYPTO_store_u32_be(void *out, uint32_t v) {
	v = CRYPTO_bswap4(v);
	memcpy(out, &v, sizeof (v));
}

static size_t aesni_gcm_encrypt_avx2(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	uint8_t *ivec = (uint8_t *)iv;
	len &= kSizeTWithoutLower4Bits;
	aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec,
	    (const uint128_t *)Htable, (uint8_t *)Xip);
	CRYPTO_store_u32_be(&ivec[12],
	    CRYPTO_load_u32_be(&ivec[12]) + len / 16);
	return (len);
}
#endif /* if CAN_USE_GCM_ASM >= 2 */

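/*
 * Editor's note (illustrative, not part of the upstream file): the AVX2
 * wrapper above only processes whole 16-byte blocks.  E.g. a 100-byte
 * request is masked down to 96 bytes (6 blocks); after the BoringSSL routine
 * returns, the wrapper advances the stored big-endian counter in
 * ivec[12..15] by len / 16 = 6 so later calls continue from the right
 * counter value, and the caller deals with the 4 leftover bytes.
 */
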
/*
 * Encrypt multiple blocks of data in GCM mode.
 * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
 * if possible. While processing a chunk the FPU is "locked".
 */
static int
gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
    size_t length, crypto_data_t *out, size_t block_size)
{
	size_t bleft = length;
	size_t need = 0;
	size_t done = 0;
	uint8_t *datap = (uint8_t *)data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	aesni_gcm_encrypt_impl *encrypt_blocks =
#if CAN_USE_GCM_ASM >= 2
	    ctx->impl == GCM_IMPL_AVX2 ?
	    aesni_gcm_encrypt_avx2 :
#endif
	    aesni_gcm_encrypt_avx;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint64_t *ghash = ctx->gcm_ghash;
	uint64_t *htable = ctx->gcm_Htable;
	uint64_t *cb = ctx->gcm_cb;
	uint8_t *ct_buf = NULL;
	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);
	/*
	 * If the last call left an incomplete block, try to fill
	 * it first.
	 */
	if (ctx->gcm_remainder_len > 0) {
		need = block_size - ctx->gcm_remainder_len;
		if (length < need) {
			/* Accumulate bytes here and return. */
			memcpy((uint8_t *)ctx->gcm_remainder +
			    ctx->gcm_remainder_len, datap, length);

			ctx->gcm_remainder_len += length;
			if (ctx->gcm_copy_to == NULL) {
				ctx->gcm_copy_to = datap;
			}
			return (CRYPTO_SUCCESS);
		} else {
			/* Complete incomplete block. */
			memcpy((uint8_t *)ctx->gcm_remainder +
			    ctx->gcm_remainder_len, datap, need);

			ctx->gcm_copy_to = NULL;
		}
	}

	/* Allocate a buffer to encrypt to if there is enough input. */
	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
		if (ct_buf == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}

	/* If we completed an incomplete block, encrypt and write it out. */
	if (ctx->gcm_remainder_len > 0) {
		kfpu_begin();
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		clear_fpu_regs();
		kfpu_end();
		rv = crypto_put_output_data(tmp, out, block_size);
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		bleft -= need;
		datap += need;
		ctx->gcm_remainder_len = 0;
	}

	/* Do the bulk encryption in chunk_size blocks. */
	for (; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = encrypt_blocks(
		    datap, ct_buf, chunk_size, key, cb, htable, ghash);

		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			rv = CRYPTO_FAILED;
			goto out_nofpu;
		}
		rv = crypto_put_output_data(ct_buf, out, chunk_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out_nofpu;
		}
		out->cd_offset += chunk_size;
		datap += chunk_size;
		ctx->gcm_processed_data_len += chunk_size;
	}
	/* Check if we are already done. */
	if (bleft == 0) {
		goto out_nofpu;
	}
	/* Bulk encrypt the remaining data. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
		done = encrypt_blocks(datap, ct_buf, bleft, key, cb, htable,
		    ghash);
		if (done == 0) {
			rv = CRYPTO_FAILED;
			goto out;
		}
		rv = crypto_put_output_data(ct_buf, out, done);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += done;
		ctx->gcm_processed_data_len += done;
		datap += done;
		bleft -= done;

	}
	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
	while (bleft > 0) {
		if (bleft < block_size) {
			memcpy(ctx->gcm_remainder, datap, bleft);
			ctx->gcm_remainder_len = bleft;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		/* Encrypt, hash and write out. */
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx(datap, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		rv = crypto_put_output_data(tmp, out, block_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		datap += block_size;
		bleft -= block_size;
	}
out:
	clear_fpu_regs();
	kfpu_end();
out_nofpu:
	if (ct_buf != NULL) {
		vmem_free(ct_buf, chunk_size);
	}
	return (rv);
}

/*
 * Finalize the encryption: Zero fill, encrypt, hash and write out any
 * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
 */
static int
gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
	size_t rem_len = ctx->gcm_remainder_len;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)keysched)->nr;
	int rv;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	kfpu_begin();
	/* Pad last incomplete block with zeros, encrypt and hash. */
	if (rem_len > 0) {
		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;

		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
		memset(remainder + rem_len, 0, block_size - rem_len);
		for (int i = 0; i < rem_len; i++) {
			remainder[i] ^= tmp[i];
		}
		GHASH_AVX(ctx, remainder, block_size);
		ctx->gcm_processed_data_len += rem_len;
		/* No need to increment counter_block, it's the last block. */
	}
	/* Finish tag. */
	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(keysched, aes_rounds, J0, J0);

	gcm_xor_avx((uint8_t *)J0, ghash);
	clear_fpu_regs();
	kfpu_end();

	/* Output remainder. */
	if (rem_len > 0) {
		rv = crypto_put_output_data(remainder, out, rem_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += rem_len;
	ctx->gcm_remainder_len = 0;
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	out->cd_offset += ctx->gcm_tag_len;
	return (CRYPTO_SUCCESS);
}

static size_t aesni_gcm_decrypt_avx(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	(void) Htable;
	return (aesni_gcm_decrypt(in, out, len, key, iv, Xip));
}

#if CAN_USE_GCM_ASM >= 2
static size_t aesni_gcm_decrypt_avx2(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	uint8_t *ivec = (uint8_t *)iv;
	len &= kSizeTWithoutLower4Bits;
	aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec,
	    (const uint128_t *)Htable, (uint8_t *)Xip);
	CRYPTO_store_u32_be(&ivec[12],
	    CRYPTO_load_u32_be(&ivec[12]) + len / 16);
	return (len);
}
#endif /* if CAN_USE_GCM_ASM >= 2 */

/*
 * Finalize decryption: We have only accumulated ciphertext so far, so now we
 * decrypt it here in place.
 */
static int
gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
	ASSERT3U(block_size, ==, 16);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	aesni_gcm_decrypt_impl *decrypt_blocks =
#if CAN_USE_GCM_ASM >= 2
	    ctx->impl == GCM_IMPL_AVX2 ?
	    aesni_gcm_decrypt_avx2 :
#endif
	    aesni_gcm_decrypt_avx;
	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	uint8_t *datap = ctx->gcm_pt_buf;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
	uint64_t *htable = ctx->gcm_Htable;
	uint64_t *ghash = ctx->gcm_ghash;
	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;
	size_t bleft, done;

	/*
	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple
	 * of GCM_AVX_MIN_DECRYPT_BYTES.
	 */
	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = decrypt_blocks(datap, datap, chunk_size,
		    (const void *)key, ctx->gcm_cb, htable, ghash);
		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			return (CRYPTO_FAILED);
		}
		datap += done;
	}
	/* Decrypt remainder, which is less than chunk size, in one go. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
		done = decrypt_blocks(datap, datap, bleft,
		    (const void *)key, ctx->gcm_cb, htable, ghash);
		if (done == 0) {
			clear_fpu_regs();
			kfpu_end();
			return (CRYPTO_FAILED);
		}
		datap += done;
		bleft -= done;
	}
	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);

	/*
	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
	 * decrypt them block by block.
	 */
	while (bleft > 0) {
		/* Incomplete last block. */
		if (bleft < block_size) {
			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;

			memset(lastb, 0, block_size);
			memcpy(lastb, datap, bleft);
			/* The GCM processing. */
			GHASH_AVX(ctx, lastb, block_size);
			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
			for (size_t i = 0; i < bleft; i++) {
				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
			}
			break;
		}
		/* The GCM processing. */
		GHASH_AVX(ctx, datap, block_size);
		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
		gcm_xor_avx((uint8_t *)tmp, datap);
		gcm_incr_counter_block(ctx);

		datap += block_size;
		bleft -= block_size;
	}
	if (rv != CRYPTO_SUCCESS) {
		clear_fpu_regs();
		kfpu_end();
		return (rv);
	}
	/* Decryption done, finish the tag. */
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
	    (uint32_t *)ctx->gcm_J0);

	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);

	/* We are done with the FPU, restore its state. */
	clear_fpu_regs();
	kfpu_end();

	/* Compare the input authentication tag with what we calculated. */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match. */
		return (CRYPTO_INVALID_MAC);
	}
	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
	if (rv != CRYPTO_SUCCESS) {
		return (rv);
	}
	out->cd_offset += pt_len;
	return (CRYPTO_SUCCESS);
}

/*
 * Initialize the GCM params H, Htable and the counter block. Save the
 * initial counter block.
 */
static int
gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
{
	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
	uint64_t *H = ctx->gcm_H;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
	const uint8_t *datap = auth_data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	size_t bleft;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	size_t htab_len = 0;
#if CAN_USE_GCM_ASM >= 2
	if (ctx->impl == GCM_IMPL_AVX2) {
		/*
		 * BoringSSL's API specifies uint128_t[16] for htab; but only
		 * uint128_t[12] are used.
		 * See https://github.com/google/boringssl/blob/
		 * 813840dd094f9e9c1b00a7368aa25e656554221f1/crypto/fipsmodule/
		 * modes/asm/aes-gcm-avx2-x86_64.pl#L198-L200
		 */
		htab_len = (2 * 8 * sizeof (uint128_t));
	} else
#endif /* CAN_USE_GCM_ASM >= 2 */
	{
		htab_len = (2 * 6 * sizeof (uint128_t));
	}

	ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP);
	if (ctx->gcm_Htable == NULL) {
		return (CRYPTO_HOST_MEMORY);
	}

	/* Init H (encrypt zero block) and create the initial counter block. */
	memset(H, 0, sizeof (ctx->gcm_H));
	kfpu_begin();
	aes_encrypt_intel(keysched, aes_rounds,
	    (const uint32_t *)H, (uint32_t *)H);

#if CAN_USE_GCM_ASM >= 2
	if (ctx->impl == GCM_IMPL_AVX2) {
		gcm_init_vpclmulqdq_avx2((uint128_t *)ctx->gcm_Htable, H);
	} else
#endif /* if CAN_USE_GCM_ASM >= 2 */
	{
		gcm_init_htab_avx(ctx->gcm_Htable, H);
	}

	if (iv_len == 12) {
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* We need the ICB later. */
		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
	} else {
		/*
		 * Most consumers use 12-byte IVs, so it's OK to use the
		 * original routines for other IV sizes, just avoid nesting
		 * kfpu_begin calls.
		 */
		clear_fpu_regs();
		kfpu_end();
		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
		    aes_copy_block, aes_xor_block);
		kfpu_begin();
	}

	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));

	/* Openssl post-increments the counter, adjust for that. */
	gcm_incr_counter_block(ctx);

	/* Ghash AAD in chunk_size blocks. */
	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
		GHASH_AVX(ctx, datap, chunk_size);
		datap += chunk_size;
		clear_fpu_regs();
		kfpu_end();
		kfpu_begin();
	}
	/* Ghash the remainder and handle possible incomplete GCM block. */
	if (bleft > 0) {
		size_t incomp = bleft % block_size;

		bleft -= incomp;
		if (bleft > 0) {
			GHASH_AVX(ctx, datap, bleft);
			datap += bleft;
		}
		if (incomp > 0) {
			/* Zero pad and hash incomplete last block. */
			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;

			memset(authp, 0, block_size);
			memcpy(authp, datap, incomp);
			GHASH_AVX(ctx, authp, block_size);
		}
	}
	clear_fpu_regs();
	kfpu_end();
	return (CRYPTO_SUCCESS);
}

#if defined(_KERNEL)
static int
icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
{
	unsigned long val;
	char val_rounded[16];
	int error = 0;

	error = kstrtoul(buf, 0, &val);
	if (error)
		return (error);

	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;

	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
		return (-EINVAL);

	snprintf(val_rounded, 16, "%u", (uint32_t)val);
	error = param_set_uint(val_rounded, kp);
	return (error);
}

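/*
 * Editor's note (illustrative, not part of the upstream file): a value
 * written to icp_gcm_avx_chunk_size is rounded down to a multiple of
 * GCM_AVX_MIN_DECRYPT_BYTES (96) and then range checked.  E.g. writing
 * 100000 stores (100000 / 96) * 96 = 99936, while writing 100 is rejected
 * with -EINVAL because the rounded value (96) is below
 * GCM_AVX_MIN_ENCRYPT_BYTES (288).
 */
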
module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
    param_get_uint, &gcm_avx_chunk_size, 0644);

MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
    "How many bytes to process while owning the FPU");

#endif /* defined(_KERNEL) */
#endif /* ifdef CAN_USE_GCM_ASM */