CoCalc -- gf128mul.c

GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/gf128mul.c
⁵⁰³⁴⁶ views
1
/* gf128mul.c - GF(2^128) multiplication functions
2
 *
3
 * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
4
 * Copyright (c) 2006, Rik Snel <[email protected]>
5
 *
6
 * Based on Dr Brian Gladman's (GPL'd) work published at
7
 * http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
8
 * See the original copyright notice below.
9
 *
10
 * This program is free software; you can redistribute it and/or modify it
11
 * under the terms of the GNU General Public License as published by the Free
12
 * Software Foundation; either version 2 of the License, or (at your option)
13
 * any later version.
14
 */
15

16
/*
17
 ---------------------------------------------------------------------------
18
 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
19

20
 LICENSE TERMS
21

22
 The free distribution and use of this software in both source and binary
23
 form is allowed (with or without changes) provided that:
24

25
   1. distributions of this source code include the above copyright
26
      notice, this list of conditions and the following disclaimer;
27

28
   2. distributions in binary form include the above copyright
29
      notice, this list of conditions and the following disclaimer
30
      in the documentation and/or other associated materials;
31

32
   3. the copyright holder's name is not used to endorse products
33
      built using this software without specific written permission.
34

35
 ALTERNATIVELY, provided that this notice is retained in full, this product
36
 may be distributed under the terms of the GNU General Public License (GPL),
37
 in which case the provisions of the GPL apply INSTEAD OF those given above.
38

39
 DISCLAIMER
40

41
 This software is provided 'as is' with no explicit or implied warranties
42
 in respect of its properties, including, but not limited to, correctness
43
 and/or fitness for purpose.
44
 ---------------------------------------------------------------------------
45
 Issue 31/01/2006
46

47
 This file provides fast multiplication in GF(2^128) as required by several
48
 cryptographic authentication modes
49
*/
50

51
#include <crypto/gf128mul.h>
52
#include <linux/export.h>
53
#include <linux/kernel.h>
54
#include <linux/module.h>
55
#include <linux/slab.h>
56

57
/*
 * gf128mul_dat(q) expands to a brace-enclosed initializer list containing
 * q(0x00), q(0x01), ..., q(0xff) in ascending order, i.e. one entry for
 * every possible byte value.
 *
 * gf128mul_row(q, hi) emits the 16 entries that share the high nibble 'hi'.
 * The literals are assembled by token pasting, so each q() still receives a
 * single plain hex constant such as 0x3f.
 */
#define gf128mul_row(q, hi) \
	q(0x##hi##0), q(0x##hi##1), q(0x##hi##2), q(0x##hi##3), \
	q(0x##hi##4), q(0x##hi##5), q(0x##hi##6), q(0x##hi##7), \
	q(0x##hi##8), q(0x##hi##9), q(0x##hi##a), q(0x##hi##b), \
	q(0x##hi##c), q(0x##hi##d), q(0x##hi##e), q(0x##hi##f)

#define gf128mul_dat(q) { \
	gf128mul_row(q, 0), gf128mul_row(q, 1), gf128mul_row(q, 2), \
	gf128mul_row(q, 3), gf128mul_row(q, 4), gf128mul_row(q, 5), \
	gf128mul_row(q, 6), gf128mul_row(q, 7), gf128mul_row(q, 8), \
	gf128mul_row(q, 9), gf128mul_row(q, a), gf128mul_row(q, b), \
	gf128mul_row(q, c), gf128mul_row(q, d), gf128mul_row(q, e), \
	gf128mul_row(q, f) \
}
91

92
/*
93
 * Given a value i in 0..255 as the byte overflow when a field element
94
 * in GF(2^128) is multiplied by x^8, the following macro returns the
95
 * 16-bit value that must be XOR-ed into the low-degree end of the
96
 * product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1.
97
 *
98
 * There are two versions of the macro, and hence two tables: one for
99
 * the "be" convention where the highest-order bit is the coefficient of
100
 * the highest-degree polynomial term, and one for the "le" convention
101
 * where the highest-order bit is the coefficient of the lowest-degree
102
 * polynomial term.  In both cases the values are stored in CPU byte
103
 * endianness such that the coefficients are ordered consistently across
104
 * bytes, i.e. in the "be" table bits 15..0 of the stored value
105
 * correspond to the coefficients of x^15..x^0, and in the "le" table
106
 * bits 15..0 correspond to the coefficients of x^0..x^15.
107
 *
108
 * Therefore, provided that the appropriate byte endianness conversions
109
 * are done by the multiplication functions (and these must be in place
110
 * anyway to support both little endian and big endian CPUs), the "be"
111
 * table can be used for multiplications of both "bbe" and "ble"
112
 * elements, and the "le" table can be used for multiplications of both
113
 * "lle" and "lbe" elements.
114
 */
115

116
/*
 * Reduction term for the "be" bit convention: every set bit k (0..7) of the
 * overflow byte i contributes 0x0087 << k, and the contributions are XOR-ed
 * together.
 *
 * The argument is parenthesized on every use so that an expression argument
 * such as (a | b) is masked as a whole.  It is still evaluated several
 * times, so it must not have side effects.  The expansion is a constant
 * expression whenever i is, which the static table initializer relies on.
 */
#define xda_be(i) ( \
	((i) & 0x80 ? 0x4380 : 0) ^ ((i) & 0x40 ? 0x21c0 : 0) ^ \
	((i) & 0x20 ? 0x10e0 : 0) ^ ((i) & 0x10 ? 0x0870 : 0) ^ \
	((i) & 0x08 ? 0x0438 : 0) ^ ((i) & 0x04 ? 0x021c : 0) ^ \
	((i) & 0x02 ? 0x010e : 0) ^ ((i) & 0x01 ? 0x0087 : 0) \
)
122

123
/*
 * Reduction term for the "le" bit convention: bit 0x80 of the overflow byte
 * i contributes 0xe100, and each lower bit contributes that value shifted
 * right by one more position (0x7080, 0x3840, ..., 0x01c2).  The
 * contributions are XOR-ed together.
 *
 * The argument is parenthesized on every use so that an expression argument
 * such as (a | b) is masked as a whole.  It is still evaluated several
 * times, so it must not have side effects.  The expansion is a constant
 * expression whenever i is, which the static table initializer relies on.
 */
#define xda_le(i) ( \
	((i) & 0x80 ? 0xe100 : 0) ^ ((i) & 0x40 ? 0x7080 : 0) ^ \
	((i) & 0x20 ? 0x3840 : 0) ^ ((i) & 0x10 ? 0x1c20 : 0) ^ \
	((i) & 0x08 ? 0x0e10 : 0) ^ ((i) & 0x04 ? 0x0708 : 0) ^ \
	((i) & 0x02 ? 0x0384 : 0) ^ ((i) & 0x01 ? 0x01c2 : 0) \
)
129

130
static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
131
static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
132

133
/*
134
 * The following functions multiply a field element by x^8 in
135
 * the polynomial field representation.  They use 64-bit word operations
136
 * to gain speed but compensate for machine endianness and hence work
137
 * correctly on both styles of machine.
138
 */
139

140
/*
 * Multiply *x by x^8 in place ("lle" convention), using the lookup table
 * for the reduction term.
 */
static void gf128mul_x8_lle(be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	/* 16-bit reduction term for the byte that is shifted out of 'lo' */
	const u64 fold = gf128mul_table_le[lo & 0xff];

	/* the reduction term lands in the top 16 bits of the high word */
	x->a = cpu_to_be64((hi >> 8) ^ (fold << 48));
	/* the low byte of 'hi' carries into the top byte of the low word */
	x->b = cpu_to_be64((lo >> 8) | (hi << 56));
}
149

150
/* time invariant version of gf128mul_x8_lle */
151
static void gf128mul_x8_lle_ti(be128 *x)
152
{
153
	u64 a = be64_to_cpu(x->a);
154
	u64 b = be64_to_cpu(x->b);
155
	u64 _tt = xda_le(b & 0xff); /* avoid table lookup */
156

157
	x->b = cpu_to_be64((b >> 8) | (a << 56));
158
	x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
159
}
160

161
/*
 * Multiply *x by x^8 in place ("bbe" convention), using the lookup table
 * for the reduction term.
 */
static void gf128mul_x8_bbe(be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	/* 16-bit reduction term for the byte that is shifted out of 'hi' */
	const u64 fold = gf128mul_table_be[hi >> 56];

	/* the reduction term is folded into the bottom of the low word */
	x->b = cpu_to_be64((lo << 8) ^ fold);
	/* the top byte of 'lo' carries into the bottom byte of the high word */
	x->a = cpu_to_be64((hi << 8) | (lo >> 56));
}
170

171
/*
 * Compute *r = *x multiplied by x^8 ("ble" convention).  Both words of *x
 * are loaded before *r is written.
 */
void gf128mul_x8_ble(le128 *r, const le128 *x)
{
	const u64 hi = le64_to_cpu(x->a);
	const u64 lo = le64_to_cpu(x->b);
	/* 16-bit reduction term for the byte that is shifted out of 'hi' */
	const u64 fold = gf128mul_table_be[hi >> 56];

	r->b = cpu_to_le64((lo << 8) ^ fold);
	r->a = cpu_to_le64((hi << 8) | (lo >> 56));
}
EXPORT_SYMBOL(gf128mul_x8_ble);
181

182
/*
 * Multiply *r by *b ("lle" convention) and store the product in *r, without
 * branching on or table-indexing by the bytes of *b beyond the paired
 * even/odd slot selection described below.
 */
void gf128mul_lle(be128 *r, const be128 *b)
{
	/*
	 * p[] is aligned to twice the size of a be128 so that each even/odd
	 * pair of entries is guaranteed to sit in one cacheline (assuming
	 * cachelines of 32 bytes or more, which is by far the most common
	 * case).  Every be128_xor() in the loop below therefore takes the
	 * same time whatever the value of 'ch'.  'ch' comes from parameter
	 * 'b', which is commonly a key (e.g. for GHASH), so this is meant to
	 * avoid leaking key information through execution time variances.
	 *
	 * The odd entries stay zero, which turns the be128_xor() into a NOP
	 * when the matching bit of 'ch' is clear; this is equivalent to
	 * calling be128_xor() conditionally.
	 *
	 * __aligned(16) or higher does not work on x86 for variables on the
	 * stack, hence the alignment is done by hand.
	 */
	be128 array[16 + 3] = {};
	be128 *p = PTR_ALIGN(&array[0], 2 * sizeof(be128));
	const u8 *key = (const u8 *)b;
	int i, bit;

	/* even slots: p[0] = *r, and each following even slot is the previous one times x */
	p[0] = *r;
	for (i = 0; i < 7; ++i)
		gf128mul_x_lle(&p[2 * i + 2], &p[2 * i]);

	memset(r, 0, sizeof(*r));
	/* consume the bytes of *b from index 15 down to index 0 */
	for (i = 15; ; --i) {
		u8 ch = key[i];

		/* bit 0x80 selects slot pair 0/1, bit 0x40 pair 2/3, ... */
		for (bit = 0; bit < 8; ++bit)
			be128_xor(r, r, &p[2 * bit + !(ch & (0x80 >> bit))]);

		if (i == 0)
			break;

		gf128mul_x8_lle_ti(r); /* use the time invariant version */
	}
}
EXPORT_SYMBOL(gf128mul_lle);
228

229
/*      This version uses 64k bytes of table space.
230
    A 16 byte buffer has to be multiplied by a 16 byte key
231
    value in GF(2^128).  If we consider a GF(2^128) value in
232
    the buffer's lowest byte, we can construct a table of
233
    the 256 16 byte values that result from the 256 values
234
    of this byte.  This requires 4096 bytes. But we also
235
    need tables for each of the 15 higher bytes in the
236
    buffer as well, which makes 64 kbytes in total.
237
*/
238
/* additional explanation
239
 * t[0][BYTE] contains g*BYTE
240
 * t[1][BYTE] contains g*x^8*BYTE
241
 *  ..
242
 * t[15][BYTE] contains g*x^120*BYTE */
243
struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g)
244
{
245
	struct gf128mul_64k *t;
246
	int i, j, k;
247

248
	t = kzalloc(sizeof(*t), GFP_KERNEL);
249
	if (!t)
250
		goto out;
251

252
	for (i = 0; i < 16; i++) {
253
		t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
254
		if (!t->t[i]) {
255
			gf128mul_free_64k(t);
256
			t = NULL;
257
			goto out;
258
		}
259
	}
260

261
	t->t[0]->t[1] = *g;
262
	for (j = 1; j <= 64; j <<= 1)
263
		gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]);
264

265
	for (i = 0;;) {
266
		for (j = 2; j < 256; j += j)
267
			for (k = 1; k < j; ++k)
268
				be128_xor(&t->t[i]->t[j + k],
269
					  &t->t[i]->t[j], &t->t[i]->t[k]);
270

271
		if (++i >= 16)
272
			break;
273

274
		for (j = 128; j > 0; j >>= 1) {
275
			t->t[i]->t[j] = t->t[i - 1]->t[j];
276
			gf128mul_x8_bbe(&t->t[i]->t[j]);
277
		}
278
	}
279

280
out:
281
	return t;
282
}
283
EXPORT_SYMBOL(gf128mul_init_64k_bbe);
284

285
void gf128mul_free_64k(struct gf128mul_64k *t)
286
{
287
	int i;
288

289
	for (i = 0; i < 16; i++)
290
		kfree_sensitive(t->t[i]);
291
	kfree_sensitive(t);
292
}
293
EXPORT_SYMBOL(gf128mul_free_64k);
294

295
/*
 * Replace *a by its product with the value the table set 't' was built for:
 * table i is indexed with byte 15 - i of *a and the 16 looked-up entries
 * are XOR-ed together.
 */
void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc = t->t[0]->t[bytes[15]];
	int i;

	for (i = 1; i < 16; ++i)
		be128_xor(&acc, &acc, &t->t[i]->t[bytes[15 - i]]);

	/* *a is only overwritten once all of its bytes have been consumed */
	*a = acc;
}
EXPORT_SYMBOL(gf128mul_64k_bbe);
307

308
/*      This version uses 4k bytes of table space.
309
    A 16 byte buffer has to be multiplied by a 16 byte key
310
    value in GF(2^128).  If we consider a GF(2^128) value in a
311
    single byte, we can construct a table of the 256 16 byte
312
    values that result from the 256 values of this byte.
313
    This requires 4096 bytes. If we take the highest byte in
314
    the buffer and use this table to get the result, we then
315
    have to multiply by x^120 to get the final value. For the
316
    next highest byte the result has to be multiplied by x^112
317
    and so on. But we can do this by accumulating the result
318
    in an accumulator starting with the result for the top
319
    byte.  We repeatedly multiply the accumulator value by
320
    x^8 and then add in (i.e. xor) the 16 bytes of the next
321
    lower byte in the buffer, stopping when we reach the
322
    lowest byte. This requires a 4096 byte table.
323
*/
324
struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g)
325
{
326
	struct gf128mul_4k *t;
327
	int j, k;
328

329
	t = kzalloc(sizeof(*t), GFP_KERNEL);
330
	if (!t)
331
		goto out;
332

333
	t->t[128] = *g;
334
	for (j = 64; j > 0; j >>= 1)
335
		gf128mul_x_lle(&t->t[j], &t->t[j+j]);
336

337
	for (j = 2; j < 256; j += j)
338
		for (k = 1; k < j; ++k)
339
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
340

341
out:
342
	return t;
343
}
344
EXPORT_SYMBOL(gf128mul_init_4k_lle);
345

346
/*
 * Replace *a by its product with the value table 't' was built for.  The
 * accumulator starts with the table entry for byte 15 of *a; for each
 * remaining byte, from index 14 down to 0, it is multiplied by x^8 and the
 * table entry for that byte is XOR-ed in.
 */
void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc = t->t[bytes[15]];
	int i;

	for (i = 14; i >= 0; --i) {
		gf128mul_x8_lle(&acc);
		be128_xor(&acc, &acc, &t->t[bytes[i]]);
	}

	/* *a is only overwritten once all of its bytes have been consumed */
	*a = acc;
}
EXPORT_SYMBOL(gf128mul_4k_lle);
360

361
/* Module metadata */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
363

364
Product

Resources

Company