// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * POLYVAL library functions
 *
 * Copyright 2025 Google LLC
 */

#include <crypto/polyval.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/unaligned.h>

/*
 * POLYVAL is an almost-XOR-universal hash function. Similar to GHASH, POLYVAL
 * interprets the message as the coefficients of a polynomial in GF(2^128) and
 * evaluates that polynomial at a secret point. POLYVAL has a simple
 * mathematical relationship with GHASH, but it uses a better field convention
 * which makes it easier and faster to implement.
 *
 * POLYVAL is not a cryptographic hash function, and it should be used only by
 * algorithms that are specifically designed to use it.
 *
 * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated
 * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452).
 *
 * POLYVAL is also used by HCTR2. See "Length-preserving encryption with HCTR2"
 * (https://eprint.iacr.org/2021/1441.pdf).
 *
 * This file provides a library API for POLYVAL. This API can delegate to
 * either a generic implementation or an architecture-optimized implementation.
 *
 * For the generic implementation, we don't use the traditional table approach
 * to GF(2^128) multiplication. That approach is not constant-time and requires
 * a lot of memory. Instead, we use a different approach which emulates
 * carryless multiplication using standard multiplications by spreading the
 * data bits apart using "holes". This allows the carries to spill harmlessly.
 * This approach is borrowed from BoringSSL, which in turn credits BearSSL's
 * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
 * "holes" trick and a presentation by Shay Gueron
 * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
 * 256-bit => 128-bit reduction algorithm.
 */

#ifdef CONFIG_ARCH_SUPPORTS_INT128

/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	/*
	 * With 64-bit multiplicands and one term every 4 bits, there would be
	 * up to 64 / 4 = 16 one bits per column when each multiplication is
	 * written out as a series of additions in the schoolbook manner.
	 * Unfortunately, that doesn't work since the value 16 is 1 too large
	 * to fit in 4 bits. Carries would sometimes overflow into the next
	 * term.
	 *
	 * Using one term every 5 bits would work. However, that would cost
	 * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
	 *
	 * Instead, mask off 4 bits from one multiplicand, giving a max of 15
	 * one bits per column. Then handle those 4 bits separately.
	 */
	u64 a0 = a & 0x1111111111111110;
	u64 a1 = a & 0x2222222222222220;
	u64 a2 = a & 0x4444444444444440;
	u64 a3 = a & 0x8888888888888880;

	u64 b0 = b & 0x1111111111111111;
	u64 b1 = b & 0x2222222222222222;
	u64 b2 = b & 0x4444444444444444;
	u64 b3 = b & 0x8888888888888888;

	/* Multiply the high 60 bits of @a by @b. */
	u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
		  (a2 * (u128)b2) ^ (a3 * (u128)b1);
	u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
		  (a2 * (u128)b3) ^ (a3 * (u128)b2);
	u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
		  (a2 * (u128)b0) ^ (a3 * (u128)b3);
	u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
		  (a2 * (u128)b1) ^ (a3 * (u128)b0);

	/* Multiply the low 4 bits of @a by @b. */
	u64 e0 = -(a & 1) & b;
	u64 e1 = -((a >> 1) & 1) & b;
	u64 e2 = -((a >> 2) & 1) & b;
	u64 e3 = -((a >> 3) & 1) & b;
	u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
	u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);

	/* Add all the intermediate products together. */
	*out_lo = (((u64)c0) & 0x1111111111111111) ^
		  (((u64)c1) & 0x2222222222222222) ^
		  (((u64)c2) & 0x4444444444444444) ^
		  (((u64)c3) & 0x8888888888888888) ^ extra_lo;
	*out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
		  (((u64)(c1 >> 64)) & 0x2222222222222222) ^
		  (((u64)(c2 >> 64)) & 0x4444444444444444) ^
		  (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
}
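
/*
 * A miniature example of the "holes" trick used above (for illustration only):
 * with one term every 4 bits, the column sums in an ordinary multiplication
 * stay within their own 4-bit slot, and the low bit of each slot is the GF(2)
 * (i.e. XOR) sum of that column.  For instance:
 *
 *	0x11 * 0x11 = 0x121, and 0x121 & 0x111 = 0x101
 *
 * which matches the carryless product (x^4 + 1) * (x^4 + 1) = x^8 + 1.  The
 * carry from the doubled x^4 term spilled harmlessly into bit 5, a "hole" that
 * the mask then discards.  The 0x1111... masks applied to @c0 through @c3
 * above extract the carry-free bits in exactly the same way.
 */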

#else /* CONFIG_ARCH_SUPPORTS_INT128 */

/* Do a 32 x 32 => 64 bit carryless multiplication. */
static u64 clmul32(u32 a, u32 b)
{
	/*
	 * With 32-bit multiplicands and one term every 4 bits, there are up to
	 * 32 / 4 = 8 one bits per column when each multiplication is written
	 * out as a series of additions in the schoolbook manner. The value 8
	 * fits in 4 bits, so the carries don't overflow into the next term.
	 */
	u32 a0 = a & 0x11111111;
	u32 a1 = a & 0x22222222;
	u32 a2 = a & 0x44444444;
	u32 a3 = a & 0x88888888;

	u32 b0 = b & 0x11111111;
	u32 b1 = b & 0x22222222;
	u32 b2 = b & 0x44444444;
	u32 b3 = b & 0x88888888;

	u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
		 (a2 * (u64)b2) ^ (a3 * (u64)b1);
	u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
		 (a2 * (u64)b3) ^ (a3 * (u64)b2);
	u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
		 (a2 * (u64)b0) ^ (a3 * (u64)b3);
	u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
		 (a2 * (u64)b1) ^ (a3 * (u64)b0);

	/* Add all the intermediate products together. */
	return (c0 & 0x1111111111111111) ^
	       (c1 & 0x2222222222222222) ^
	       (c2 & 0x4444444444444444) ^
	       (c3 & 0x8888888888888888);
}

/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	u32 a_lo = (u32)a;
	u32 a_hi = a >> 32;
	u32 b_lo = (u32)b;
	u32 b_hi = b >> 32;

	/* Karatsuba multiplication */
	u64 lo = clmul32(a_lo, b_lo);
	u64 hi = clmul32(a_hi, b_hi);
	u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;

	*out_lo = lo ^ (mi << 32);
	*out_hi = hi ^ (mi >> 32);
}
#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
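
/*
 * A note on the Karatsuba step used by the fallback clmul64() above and by
 * polyval_mul_generic() below: in GF(2), addition is XOR, so splitting each
 * operand into halves gives
 *
 *	(ah*X + al) * (bh*X + bl)
 *		= ah*bh*X^2
 *		  + ((ah ^ al) * (bh ^ bl) ^ ah*bh ^ al*bl) * X
 *		  + al*bl
 *
 * where X = x^32 (or x^64 in polyval_mul_generic()).  The middle term is
 * derived from the two end products, so only 3 half-width carryless
 * multiplications are needed instead of 4.
 */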

/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
static void __maybe_unused
polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
{
	u64 c0, c1, c2, c3, mi0, mi1;

	/*
	 * Carryless-multiply @a by @b using Karatsuba multiplication. Store
	 * the 256-bit product in @c0 (low) through @c3 (high).
	 */
	clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
	clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
	clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
		&mi0, &mi1);
	mi0 ^= c0 ^ c2;
	mi1 ^= c1 ^ c3;
	c1 ^= mi0;
	c2 ^= mi1;

	/*
	 * Cancel out the low 128 bits of the product by adding multiples of
	 * G(x) = x^128 + x^127 + x^126 + x^121 + 1. Do this in two steps, each
	 * of which cancels out 64 bits. Note that we break G(x) into three
	 * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
	 */

	/*
	 * First, add G(x) times c0 as follows:
	 *
	 *	(c0, c1, c2) = (0,
	 *			c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
	 *			c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
	c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);

	/*
	 * Second, add G(x) times the new c1:
	 *
	 *	(c1, c2, c3) = (0,
	 *			c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
	 *			c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
	c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);

	/* Return (c2, c3). This implicitly multiplies by x^-128. */
	a->lo = cpu_to_le64(c2);
	a->hi = cpu_to_le64(c3);
}
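
/*
 * Why returning (c2, c3) implicitly multiplies by x^-128 (explanatory note):
 * the two reduction steps above XOR c0 * G(x), and then the updated
 * c1 * x^64 * G(x), into the 256-bit product P(x).  That leaves the value
 * modulo G(x) unchanged but forces the low 128 bits to zero, so the result is
 * R(x) * x^128 with R(x) = c2 + c3*x^64, and therefore
 *
 *	R(x) = P(x) * x^-128  (mod G(x))
 *
 * which is exactly the x^-128 factor that the POLYVAL field multiplication
 * requires.
 */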

static void __maybe_unused
polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
		       const u8 *data, size_t nblocks)
{
	do {
		acc->lo ^= get_unaligned((const __le64 *)data);
		acc->hi ^= get_unaligned((const __le64 *)(data + 8));
		polyval_mul_generic(acc, key);
		data += POLYVAL_BLOCK_SIZE;
	} while (--nblocks);
}

/* Include the arch-optimized implementation of POLYVAL, if one is available. */
#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
#include "polyval.h" /* $(SRCARCH)/polyval.h */
void polyval_preparekey(struct polyval_key *key,
			const u8 raw_key[POLYVAL_BLOCK_SIZE])
{
	polyval_preparekey_arch(key, raw_key);
}
EXPORT_SYMBOL_GPL(polyval_preparekey);
#endif /* Else, polyval_preparekey() is an inline function. */

/*
 * polyval_mul_generic() and polyval_blocks_generic() take the key as a
 * polyval_elem rather than a polyval_key, so that arch-optimized
 * implementations with a different key format can use them as a fallback (if
 * they have H^1 stored somewhere in their struct). Thus, the following
 * dispatch code is needed to pass the appropriate key argument.
 */

static void polyval_mul(struct polyval_ctx *ctx)
{
#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
	polyval_mul_arch(&ctx->acc, ctx->key);
#else
	polyval_mul_generic(&ctx->acc, &ctx->key->h);
#endif
}

static void polyval_blocks(struct polyval_ctx *ctx,
			   const u8 *data, size_t nblocks)
{
#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
	polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#else
	polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#endif
}

void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
{
	/* Finish filling a previously buffered partial block, if any. */
	if (unlikely(ctx->partial)) {
		size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);

		len -= n;
		while (n--)
			ctx->acc.bytes[ctx->partial++] ^= *data++;
		if (ctx->partial < POLYVAL_BLOCK_SIZE)
			return;
		polyval_mul(ctx);
	}
	/* Process as many whole blocks as possible. */
	if (len >= POLYVAL_BLOCK_SIZE) {
		size_t nblocks = len / POLYVAL_BLOCK_SIZE;

		polyval_blocks(ctx, data, nblocks);
		data += len & ~(POLYVAL_BLOCK_SIZE - 1);
		len &= POLYVAL_BLOCK_SIZE - 1;
	}
	/* Buffer any remaining partial block. */
	for (size_t i = 0; i < len; i++)
		ctx->acc.bytes[i] ^= data[i];
	ctx->partial = len;
}
EXPORT_SYMBOL_GPL(polyval_update);

void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
{
	/* Process any final partial block. */
	if (unlikely(ctx->partial))
		polyval_mul(ctx);
	memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
	memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(polyval_final);

#ifdef polyval_mod_init_arch
static int __init polyval_mod_init(void)
{
	polyval_mod_init_arch();
	return 0;
}
subsys_initcall(polyval_mod_init);

static void __exit polyval_mod_exit(void)
{
}
module_exit(polyval_mod_exit);
#endif

MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function");
MODULE_LICENSE("GPL");
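
/*
 * Example usage of this library (an illustrative sketch; it assumes that
 * polyval_init() is the context-initialization helper declared along with
 * polyval_update() and polyval_final() in <crypto/polyval.h>, and that
 * raw_key, message, and message_len are supplied by the caller):
 *
 *	struct polyval_key key;
 *	struct polyval_ctx ctx;
 *	u8 digest[POLYVAL_BLOCK_SIZE];
 *
 *	polyval_preparekey(&key, raw_key);
 *	polyval_init(&ctx, &key);
 *	polyval_update(&ctx, message, message_len);
 *	polyval_final(&ctx, digest);
 *
 * Note that polyval_final() also zeroizes the context, as seen above.
 */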