Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/gf128hash.c
170838 views
1
// SPDX-License-Identifier: GPL-2.0-or-later
2
/*
3
* GF(2^128) polynomial hashing: GHASH and POLYVAL
4
*
5
* Copyright 2025 Google LLC
6
*/
7
8
#include <crypto/gf128hash.h>
9
#include <linux/export.h>
10
#include <linux/module.h>
11
#include <linux/string.h>
12
#include <linux/unaligned.h>
13
14
/*
15
* GHASH and POLYVAL are almost-XOR-universal hash functions. They interpret
16
* the message as the coefficients of a polynomial in the finite field GF(2^128)
17
* and evaluate that polynomial at a secret point.
18
*
19
* Neither GHASH nor POLYVAL is a cryptographic hash function. They should be
20
* used only by algorithms that are specifically designed to use them.
21
*
22
* GHASH is the older variant, defined as part of GCM in NIST SP 800-38D
23
* (https://nvlpubs.nist.gov/nistpubs/legacy/sp/nistspecialpublication800-38d.pdf).
24
* GHASH is hard to implement directly, due to its backwards mapping between
25
* bits and polynomial coefficients. GHASH implementations typically pre and
26
* post-process the inputs and outputs (mainly by byte-swapping) to convert the
27
* GHASH computation into an equivalent computation over a different,
28
* easier-to-use representation of GF(2^128).
29
*
30
* POLYVAL is a newer GF(2^128) polynomial hash, originally defined as part of
31
* AES-GCM-SIV (https://datatracker.ietf.org/doc/html/rfc8452) and also used by
32
* HCTR2 (https://eprint.iacr.org/2021/1441.pdf). It uses that easier-to-use
33
* field representation directly, eliminating the data conversion steps.
34
*
35
* This file provides library APIs for GHASH and POLYVAL. These APIs can
36
* delegate to either a generic implementation or an architecture-optimized
37
* implementation. Due to the mathematical relationship between GHASH and
38
* POLYVAL, in some cases code for one is reused with the other.
39
*
40
* For the generic implementation, we don't use the traditional table approach
41
* to GF(2^128) multiplication. That approach is not constant-time and requires
42
* a lot of memory. Instead, we use a different approach which emulates
43
* carryless multiplication using standard multiplications by spreading the data
44
* bits apart using "holes". This allows the carries to spill harmlessly. This
45
* approach is borrowed from BoringSSL, which in turn credits BearSSL's
46
* documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
47
* "holes" trick and a presentation by Shay Gueron
48
* (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
49
* 256-bit => 128-bit reduction algorithm.
50
*/
51
52
#ifdef CONFIG_ARCH_SUPPORTS_INT128
53
54
/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	/*
	 * Emulate carryless multiplication with ordinary multiplies by
	 * spreading the data bits apart with "holes": keep only every 4th
	 * bit of each operand per partial product, so carries spill into
	 * the zero gaps instead of corrupting neighboring terms.
	 *
	 * With 64-bit multiplicands, a column of the schoolbook sum can
	 * contain up to 64 / 4 = 16 one bits, which is 1 too many to fit in
	 * a 4-bit gap. Masking the low 4 bits off of @a caps that at 15;
	 * those 4 bits are then folded in separately below.
	 */
	const u64 ha0 = a & 0x1111111111111110;
	const u64 ha1 = a & 0x2222222222222220;
	const u64 ha2 = a & 0x4444444444444440;
	const u64 ha3 = a & 0x8888888888888880;

	const u64 hb0 = b & 0x1111111111111111;
	const u64 hb1 = b & 0x2222222222222222;
	const u64 hb2 = b & 0x4444444444444444;
	const u64 hb3 = b & 0x8888888888888888;

	/* Multiply the high 60 bits of @a by all of @b. */
	u128 col0 = (ha0 * (u128)hb0) ^ (ha1 * (u128)hb3) ^
		    (ha2 * (u128)hb2) ^ (ha3 * (u128)hb1);
	u128 col1 = (ha0 * (u128)hb1) ^ (ha1 * (u128)hb0) ^
		    (ha2 * (u128)hb3) ^ (ha3 * (u128)hb2);
	u128 col2 = (ha0 * (u128)hb2) ^ (ha1 * (u128)hb1) ^
		    (ha2 * (u128)hb0) ^ (ha3 * (u128)hb3);
	u128 col3 = (ha0 * (u128)hb3) ^ (ha1 * (u128)hb2) ^
		    (ha2 * (u128)hb1) ^ (ha3 * (u128)hb0);

	/* Fold in the low 4 bits of @a, one bit at a time. */
	u64 extra_lo = 0;
	u64 extra_hi = 0;

	for (int i = 0; i < 4; i++) {
		/* All of @b if bit i of @a is set, else zero. */
		u64 m = -((a >> i) & 1) & b;

		extra_lo ^= m << i;
		if (i)
			extra_hi ^= m >> (64 - i);
	}

	/* Combine the partial products, keeping the carry-free positions. */
	*out_lo = ((u64)col0 & 0x1111111111111111) ^
		  ((u64)col1 & 0x2222222222222222) ^
		  ((u64)col2 & 0x4444444444444444) ^
		  ((u64)col3 & 0x8888888888888888) ^ extra_lo;
	*out_hi = ((u64)(col0 >> 64) & 0x1111111111111111) ^
		  ((u64)(col1 >> 64) & 0x2222222222222222) ^
		  ((u64)(col2 >> 64) & 0x4444444444444444) ^
		  ((u64)(col3 >> 64) & 0x8888888888888888) ^ extra_hi;
}
108
109
#else /* CONFIG_ARCH_SUPPORTS_INT128 */
110
111
/* Do a 32 x 32 => 64 bit carryless multiplication. */
static u64 clmul32(u32 a, u32 b)
{
	/*
	 * Same "holes" technique as for 64-bit operands, but with 32-bit
	 * inputs each column sums at most 32 / 4 = 8 one bits, which fits
	 * in a 4-bit gap, so no bits need to be handled separately.
	 */
	u32 x0 = a & 0x11111111, x1 = a & 0x22222222;
	u32 x2 = a & 0x44444444, x3 = a & 0x88888888;
	u32 y0 = b & 0x11111111, y1 = b & 0x22222222;
	u32 y2 = b & 0x44444444, y3 = b & 0x88888888;
	u64 z0, z1, z2, z3;

	z0 = (x0 * (u64)y0) ^ (x1 * (u64)y3) ^
	     (x2 * (u64)y2) ^ (x3 * (u64)y1);
	z1 = (x0 * (u64)y1) ^ (x1 * (u64)y0) ^
	     (x2 * (u64)y3) ^ (x3 * (u64)y2);
	z2 = (x0 * (u64)y2) ^ (x1 * (u64)y1) ^
	     (x2 * (u64)y0) ^ (x3 * (u64)y3);
	z3 = (x0 * (u64)y3) ^ (x1 * (u64)y2) ^
	     (x2 * (u64)y1) ^ (x3 * (u64)y0);

	/* Combine the partial products, keeping the carry-free positions. */
	return (z0 & 0x1111111111111111) ^
	       (z1 & 0x2222222222222222) ^
	       (z2 & 0x4444444444444444) ^
	       (z3 & 0x8888888888888888);
}
145
146
/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	u32 al = (u32)a, ah = a >> 32;
	u32 bl = (u32)b, bh = b >> 32;

	/*
	 * Karatsuba: three 32-bit carryless multiplies instead of four.
	 * In GF(2), both addition and subtraction are XOR, so the middle
	 * term is (al + ah)(bl + bh) - lo - hi computed entirely with XOR.
	 */
	u64 lo_prod = clmul32(al, bl);
	u64 hi_prod = clmul32(ah, bh);
	u64 mid = clmul32(al ^ ah, bl ^ bh) ^ lo_prod ^ hi_prod;

	/* The middle term straddles the 64-bit boundary of the result. */
	*out_lo = lo_prod ^ (mid << 32);
	*out_hi = hi_prod ^ (mid >> 32);
}
162
#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
163
164
/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
static void __maybe_unused
polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
{
	u64 c0, c1, c2, c3, mi0, mi1;

	/*
	 * Carryless-multiply @a by @b using Karatsuba multiplication. Store
	 * the 256-bit product in @c0 (low) through @c3 (high).
	 */
	clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
	clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
	clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
		&mi0, &mi1);
	/* Form the Karatsuba middle term; addition in GF(2) is XOR. */
	mi0 ^= c0 ^ c2;
	mi1 ^= c1 ^ c3;
	/* The middle term straddles the middle of the 256-bit product. */
	c1 ^= mi0;
	c2 ^= mi1;

	/*
	 * Cancel out the low 128 bits of the product by adding multiples of
	 * G(x) = x^128 + x^127 + x^126 + x^121 + 1. Do this in two steps, each
	 * of which cancels out 64 bits. Note that we break G(x) into three
	 * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
	 */

	/*
	 * First, add G(x) times c0 as follows:
	 *
	 *	(c0, c1, c2) = (0,
	 *			c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
	 *			c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
	c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);

	/*
	 * Second, add G(x) times the new c1:
	 *
	 *	(c1, c2, c3) = (0,
	 *			c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
	 *			c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
	c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);

	/* Return (c2, c3). This implicitly multiplies by x^-128. */
	a->lo = cpu_to_le64(c2);
	a->hi = cpu_to_le64(c3);
}
214
215
/*
 * Generic GHASH block processing: fold @nblocks 16-byte blocks from @data
 * into the accumulator @acc and multiply by @key after each block.
 * GHASH blocks are read big-endian with the first 8 bytes going into the
 * high half of the accumulator, converting them into the little-endian
 * POLYVAL representation the generic code works in. @nblocks must be >= 1.
 */
static void __maybe_unused ghash_blocks_generic(struct polyval_elem *acc,
						const struct polyval_elem *key,
						const u8 *data, size_t nblocks)
{
	for (size_t i = 0; i < nblocks; i++, data += GHASH_BLOCK_SIZE) {
		acc->hi ^= cpu_to_le64(get_unaligned_be64((__be64 *)data));
		acc->lo ^=
			cpu_to_le64(get_unaligned_be64((__be64 *)(data + 8)));
		polyval_mul_generic(acc, key);
	}
}
227
228
static void __maybe_unused
229
polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
230
const u8 *data, size_t nblocks)
231
{
232
do {
233
acc->lo ^= get_unaligned((__le64 *)data);
234
acc->hi ^= get_unaligned((__le64 *)(data + 8));
235
polyval_mul_generic(acc, key);
236
data += POLYVAL_BLOCK_SIZE;
237
} while (--nblocks);
238
}
239
240
/* Convert the key from GHASH format to POLYVAL format. */
241
static void __maybe_unused ghash_key_to_polyval(const u8 in[GHASH_BLOCK_SIZE],
242
struct polyval_elem *out)
243
{
244
u64 hi = get_unaligned_be64(&in[0]);
245
u64 lo = get_unaligned_be64(&in[8]);
246
u64 mask = (s64)hi >> 63;
247
248
hi = (hi << 1) ^ (lo >> 63) ^ (mask & ((u64)0xc2 << 56));
249
lo = (lo << 1) ^ (mask & 1);
250
out->lo = cpu_to_le64(lo);
251
out->hi = cpu_to_le64(hi);
252
}
253
254
/* Convert the accumulator from POLYVAL format to GHASH format. */
255
static void polyval_acc_to_ghash(const struct polyval_elem *in,
256
u8 out[GHASH_BLOCK_SIZE])
257
{
258
put_unaligned_be64(le64_to_cpu(in->hi), &out[0]);
259
put_unaligned_be64(le64_to_cpu(in->lo), &out[8]);
260
}
261
262
/* Convert the accumulator from GHASH format to POLYVAL format. */
263
static void __maybe_unused ghash_acc_to_polyval(const u8 in[GHASH_BLOCK_SIZE],
264
struct polyval_elem *out)
265
{
266
out->lo = cpu_to_le64(get_unaligned_be64(&in[8]));
267
out->hi = cpu_to_le64(get_unaligned_be64(&in[0]));
268
}
269
270
#ifdef CONFIG_CRYPTO_LIB_GF128HASH_ARCH
271
#include "gf128hash.h" /* $(SRCARCH)/gf128hash.h */
272
#endif
273
274
/*
 * ghash_preparekey() - Prepare a GHASH key for use with ghash_update() etc.
 * @key: the prepared-key structure to fill in
 * @raw_key: the raw GHASH_BLOCK_SIZE-byte hash key
 *
 * Uses the architecture-specific key setup when one is defined; otherwise
 * converts the key into the POLYVAL field representation that the generic
 * implementation operates in.
 */
void ghash_preparekey(struct ghash_key *key, const u8 raw_key[GHASH_BLOCK_SIZE])
{
#ifdef ghash_preparekey_arch
	ghash_preparekey_arch(key, raw_key);
#else
	ghash_key_to_polyval(raw_key, &key->h);
#endif
}
EXPORT_SYMBOL_GPL(ghash_preparekey);
283
284
/* Multiply the accumulator by the hash key once (a single GHASH step). */
static void ghash_mul(struct ghash_ctx *ctx)
{
#ifdef ghash_mul_arch
	ghash_mul_arch(&ctx->acc, ctx->key);
#elif defined(ghash_blocks_arch)
	/*
	 * No dedicated single-multiplication arch helper: process one
	 * all-zeroes block instead, which XORs nothing into the accumulator
	 * and therefore just performs the multiplication by the key.
	 */
	static const u8 zeroes[GHASH_BLOCK_SIZE];

	ghash_blocks_arch(&ctx->acc, ctx->key, zeroes, 1);
#else
	polyval_mul_generic(&ctx->acc, &ctx->key->h);
#endif
}
296
297
/*
 * Process @nblocks full blocks of message data, using the arch-optimized
 * implementation if one is defined, else the generic one.
 * nblocks is always >= 1.
 */
static void ghash_blocks(struct ghash_ctx *ctx, const u8 *data, size_t nblocks)
{
#ifdef ghash_blocks_arch
	ghash_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#else
	ghash_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#endif
}
306
307
/*
 * ghash_update() - Feed message data into a GHASH computation.
 * @ctx: the hash context, with the key already prepared
 * @data: the message bytes
 * @len: number of bytes in @data
 *
 * Partial blocks are buffered by XOR-ing the bytes directly into the
 * accumulator; the field multiplication for a block is deferred until the
 * block is complete. The accumulator is kept in the little-endian POLYVAL
 * representation while GHASH consumes blocks big-endian first, so message
 * byte i of a block lands at accumulator byte GHASH_BLOCK_SIZE - 1 - i.
 */
void ghash_update(struct ghash_ctx *ctx, const u8 *data, size_t len)
{
	if (unlikely(ctx->partial)) {
		/* Try to complete the previously buffered partial block. */
		size_t n = min(len, GHASH_BLOCK_SIZE - ctx->partial);

		len -= n;
		while (n--)
			ctx->acc.bytes[GHASH_BLOCK_SIZE - 1 - ctx->partial++] ^=
				*data++;
		if (ctx->partial < GHASH_BLOCK_SIZE)
			return;
		/* The block is now full: do its deferred multiplication. */
		ghash_mul(ctx);
	}
	if (len >= GHASH_BLOCK_SIZE) {
		/* Process all remaining full blocks. */
		size_t nblocks = len / GHASH_BLOCK_SIZE;

		ghash_blocks(ctx, data, nblocks);
		data += len & ~(GHASH_BLOCK_SIZE - 1);
		len &= GHASH_BLOCK_SIZE - 1;
	}
	/* Buffer any trailing partial block for later. */
	for (size_t i = 0; i < len; i++)
		ctx->acc.bytes[GHASH_BLOCK_SIZE - 1 - i] ^= data[i];
	ctx->partial = len;
}
EXPORT_SYMBOL_GPL(ghash_update);
332
333
/*
 * ghash_final() - Finish a GHASH computation.
 * @ctx: the hash context; wiped before returning
 * @out: the resulting GHASH_BLOCK_SIZE-byte digest
 *
 * Flushes the deferred multiplication for any buffered partial block (the
 * unfilled bytes were never XOR'd in, so the block is effectively
 * zero-padded), then converts the accumulator from the internal POLYVAL
 * representation back to GHASH byte order. The context is cleared with
 * memzero_explicit() so the wipe of key-dependent data cannot be
 * optimized away.
 */
void ghash_final(struct ghash_ctx *ctx, u8 out[GHASH_BLOCK_SIZE])
{
	if (unlikely(ctx->partial))
		ghash_mul(ctx);
	polyval_acc_to_ghash(&ctx->acc, out);
	memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(ghash_final);
341
342
/*
 * polyval_preparekey() - Prepare a POLYVAL key for use with
 * polyval_update() etc.
 * @key: the prepared-key structure to fill in
 * @raw_key: the raw POLYVAL_BLOCK_SIZE-byte hash key
 *
 * Uses the architecture-specific key setup when one is defined; otherwise
 * the raw key is already the POLYVAL field element and is simply copied.
 */
void polyval_preparekey(struct polyval_key *key,
			const u8 raw_key[POLYVAL_BLOCK_SIZE])
{
#ifdef polyval_preparekey_arch
	polyval_preparekey_arch(key, raw_key);
#else
	memcpy(key->h.bytes, raw_key, POLYVAL_BLOCK_SIZE);
#endif
}
EXPORT_SYMBOL_GPL(polyval_preparekey);
352
353
/*
354
* polyval_mul_generic() and polyval_blocks_generic() take the key as a
355
* polyval_elem rather than a polyval_key, so that arch-optimized
356
* implementations with a different key format can use it as a fallback (if they
357
* have H^1 stored somewhere in their struct). Thus, the following dispatch
358
* code is needed to pass the appropriate key argument.
359
*/
360
361
/* Multiply the accumulator by the hash key once (a single POLYVAL step). */
static void polyval_mul(struct polyval_ctx *ctx)
{
#ifdef polyval_mul_arch
	polyval_mul_arch(&ctx->acc, ctx->key);
#elif defined(polyval_blocks_arch)
	/*
	 * No dedicated single-multiplication arch helper: process one
	 * all-zeroes block instead, which XORs nothing into the accumulator
	 * and therefore just performs the multiplication by the key.
	 */
	static const u8 zeroes[POLYVAL_BLOCK_SIZE];

	polyval_blocks_arch(&ctx->acc, ctx->key, zeroes, 1);
#else
	polyval_mul_generic(&ctx->acc, &ctx->key->h);
#endif
}
373
374
/*
 * Process @nblocks full blocks of message data, using the arch-optimized
 * implementation if one is defined, else the generic one.
 * nblocks is always >= 1.
 */
static void polyval_blocks(struct polyval_ctx *ctx,
			   const u8 *data, size_t nblocks)
{
#ifdef polyval_blocks_arch
	polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#else
	polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#endif
}
384
385
/*
 * polyval_update() - Feed message data into a POLYVAL computation.
 * @ctx: the hash context, with the key already prepared
 * @data: the message bytes
 * @len: number of bytes in @data
 *
 * Partial blocks are buffered by XOR-ing the bytes directly into the
 * accumulator; the field multiplication for a block is deferred until the
 * block is complete. POLYVAL consumes blocks in natural little-endian
 * order, so message byte i of a block lands at accumulator byte i.
 */
void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
{
	if (unlikely(ctx->partial)) {
		/* Try to complete the previously buffered partial block. */
		size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);

		len -= n;
		while (n--)
			ctx->acc.bytes[ctx->partial++] ^= *data++;
		if (ctx->partial < POLYVAL_BLOCK_SIZE)
			return;
		/* The block is now full: do its deferred multiplication. */
		polyval_mul(ctx);
	}
	if (len >= POLYVAL_BLOCK_SIZE) {
		/* Process all remaining full blocks. */
		size_t nblocks = len / POLYVAL_BLOCK_SIZE;

		polyval_blocks(ctx, data, nblocks);
		data += len & ~(POLYVAL_BLOCK_SIZE - 1);
		len &= POLYVAL_BLOCK_SIZE - 1;
	}
	/* Buffer any trailing partial block for later. */
	for (size_t i = 0; i < len; i++)
		ctx->acc.bytes[i] ^= data[i];
	ctx->partial = len;
}
EXPORT_SYMBOL_GPL(polyval_update);
409
410
/*
 * polyval_final() - Finish a POLYVAL computation.
 * @ctx: the hash context; wiped before returning
 * @out: the resulting POLYVAL_BLOCK_SIZE-byte digest
 *
 * Flushes the deferred multiplication for any buffered partial block (the
 * unfilled bytes were never XOR'd in, so the block is effectively
 * zero-padded), then copies out the accumulator, which is already in
 * POLYVAL byte order. The context is cleared with memzero_explicit() so
 * the wipe of key-dependent data cannot be optimized away.
 */
void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
{
	if (unlikely(ctx->partial))
		polyval_mul(ctx);
	memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
	memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(polyval_final);
418
419
#ifdef gf128hash_mod_init_arch
420
static int __init gf128hash_mod_init(void)
421
{
422
gf128hash_mod_init_arch();
423
return 0;
424
}
425
subsys_initcall(gf128hash_mod_init);
426
427
static void __exit gf128hash_mod_exit(void)
428
{
429
}
430
module_exit(gf128hash_mod_exit);
431
#endif
432
433
MODULE_DESCRIPTION("GF(2^128) polynomial hashing: GHASH and POLYVAL");
434
MODULE_LICENSE("GPL");
435
436