Path: blob/main/contrib/bearssl/src/hash/ghash_pwr8.c
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_POWER_ASM_MACROS   1
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the POWER8 opcodes.
 */

#if BR_POWER8

/*
 * Some symbolic names for registers.
 *   HB0 = 16 bytes of value 0
 *   HB1 = 16 bytes of value 1
 *   HB2 = 16 bytes of value 2
 *   HB6 = 16 bytes of value 6
 *   HB7 = 16 bytes of value 7
 *   TT0, TT1 and TT2 are temporaries
 *
 * BSW holds the pattern for byteswapping 32-bit words; this is set only
 * on little-endian systems. XBSW is the same register with the +32 offset
 * for access with the VSX opcodes.
 */
#define HB0     0
#define HB1     1
#define HB2     2
#define HB6     3
#define HB7     4
#define TT0     5
#define TT1     6
#define TT2     7

#define BSW     8
#define XBSW   40

/*
 * Macro to initialise the constants.
 */
#define INIT \
		vxor(HB0, HB0, HB0) \
		vspltisb(HB1, 1) \
		vspltisb(HB2, 2) \
		vspltisb(HB6, 6) \
		vspltisb(HB7, 7) \
		INIT_BSW

/*
 * Fix endianness of a value after reading it or before writing it, if
 * necessary.
 */
#if BR_POWER8_LE
#define INIT_BSW         lxvw4x(XBSW, 0, %[idx2be])
#define FIX_ENDIAN(xx)   vperm(xx, xx, xx, BSW)
#else
#define INIT_BSW
#define FIX_ENDIAN(xx)
#endif

/*
 * Left-shift x0:x1 by one bit. This is a corrective action needed
 * because GHASH is defined in full little-endian specification, while
 * the opcodes use full big-endian convention, so the 255-bit product
 * ends up one bit to the right.
 */
#define SL_256(x0, x1) \
		vsldoi(TT0, HB0, x1, 1) \
		vsl(x0, x0, HB1) \
		vsr(TT0, TT0, HB7) \
		vsl(x1, x1, HB1) \
		vxor(x0, x0, TT0)
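/*
 * For illustration only (not part of the original file): a portable C
 * sketch of the operation SL_256 performs, assuming the 256-bit value
 * x0:x1 is viewed as four 64-bit limbs with w[0] most significant.
 * The bit shifted out of each limb becomes the bottom bit of the next
 * more-significant limb, which is the carry that the vsldoi/vsr pair
 * above reconstructs. The helper name is hypothetical; the block is
 * disabled from the build.
 */
#if 0
static void
sl_256_sketch(uint64_t w[4])
{
	int i;

	for (i = 0; i < 3; i ++) {
		/* Shift this limb and pull in the top bit of the next one. */
		w[i] = (w[i] << 1) | (w[i + 1] >> 63);
	}
	w[3] <<= 1;
}
#endif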
/*
 * Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same
 * as x0 or x1, or a different register). x0 and x1 are modified.
 */
#define REDUCE_F128(xd, x0, x1) \
		vxor(x0, x0, x1) \
		vsr(TT0, x1, HB1) \
		vsr(TT1, x1, HB2) \
		vsr(TT2, x1, HB7) \
		vxor(x0, x0, TT0) \
		vxor(TT1, TT1, TT2) \
		vxor(x0, x0, TT1) \
		vsldoi(x1, x1, HB0, 15) \
		vsl(TT1, x1, HB6) \
		vsl(TT2, x1, HB1) \
		vxor(x1, TT1, TT2) \
		vsr(TT0, x1, HB1) \
		vsr(TT1, x1, HB2) \
		vsr(TT2, x1, HB7) \
		vxor(x0, x0, x1) \
		vxor(x0, x0, TT0) \
		vxor(TT1, TT1, TT2) \
		vxor(xd, x0, TT1)
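/*
 * For illustration only (not part of the original file): REDUCE_F128
 * folds a 256-bit carryless product back into GF(2^128) modulo the
 * GHASH polynomial x^128 + x^7 + x^2 + x + 1. The classic bitwise
 * multiply from NIST SP 800-38D below shows the same arithmetic in
 * GHASH's bit-reflected convention, where the reduction appears as a
 * conditional XOR with the byte 0xE1. The helper name is hypothetical;
 * the block is disabled from the build.
 */
#if 0
static void
gf128_mul_sketch(unsigned char r[16],
	const unsigned char x[16], const unsigned char y[16])
{
	unsigned char z[16], v[16];
	int i, j, carry;

	memset(z, 0, sizeof z);     /* Z = 0 */
	memcpy(v, x, sizeof v);     /* V = X */
	for (i = 0; i < 128; i ++) {
		/* If bit i of Y is set (bits numbered MSB-first), Z ^= V. */
		if ((y[i >> 3] >> (7 - (i & 7))) & 1) {
			for (j = 0; j < 16; j ++) {
				z[j] ^= v[j];
			}
		}
		/*
		 * V <- V*x: a one-bit right shift in the reflected
		 * convention; the dropped bit is folded back in with
		 * R = 0xE1 || 0^120.
		 */
		carry = v[15] & 1;
		for (j = 15; j > 0; j --) {
			v[j] = (unsigned char)((v[j] >> 1) | (v[j - 1] << 7));
		}
		v[0] >>= 1;
		if (carry) {
			v[0] ^= 0xE1;
		}
	}
	memcpy(r, z, sizeof z);
}
#endif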
/* see bearssl_hash.h */
void
br_ghash_pwr8(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	size_t num4, num1;
	unsigned char tmp[64];
	long cc0, cc1, cc2, cc3;

#if BR_POWER8_LE
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif

	buf1 = data;

	/*
	 * The assembly code requires the data split into two chunks;
	 * the first chunk must contain a number of blocks which is a
	 * multiple of 4. Since the processing for the first chunk is
	 * faster, we want to make it as big as possible.
	 *
	 * For the remainder, there are two possibilities:
	 * -- if the remainder size is a multiple of 16, then use it
	 *    in place;
	 * -- otherwise, copy it to the tmp[] array and pad it with
	 *    zeros.
	 */
	num4 = len >> 6;
	buf2 = buf1 + (num4 << 6);
	len &= 63;
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}

	cc0 = 0;
	cc1 = 16;
	cc2 = 32;
	cc3 = 48;
	asm volatile (
		INIT

		/*
		 * Load current h (denoted hereafter h1) in v9.
		 */
		lxvw4x(41, 0, %[h])
		FIX_ENDIAN(9)

		/*
		 * Load current y into v28.
		 */
		lxvw4x(60, 0, %[y])
		FIX_ENDIAN(28)

		/*
		 * Split h1 into three registers:
		 *   v17 = h1_1:h1_0
		 *   v18 = 0:h1_0
		 *   v19 = h1_1:0
		 */
		xxpermdi(49, 41, 41, 2)
		vsldoi(18, HB0, 9, 8)
		vsldoi(19, 9, HB0, 8)

		/*
		 * If num4 is 0, skip directly to the second chunk.
		 */
		cmpldi(%[num4], 0)
		beq(chunk1)

		/*
		 * Compute h2 = h*h in v10.
		 */
		vpmsumd(10, 18, 18)
		vpmsumd(11, 19, 19)
		SL_256(10, 11)
		REDUCE_F128(10, 10, 11)

		/*
		 * Compute h3 = h*h*h in v11.
		 * We first split h2 into:
		 *   v10 = h2_0:h2_1
		 *   v11 = 0:h2_0
		 *   v12 = h2_1:0
		 * Then we do the product with h1, and reduce into v11.
		 */
		vsldoi(11, HB0, 10, 8)
		vsldoi(12, 10, HB0, 8)
		vpmsumd(13, 10, 17)
		vpmsumd(11, 11, 18)
		vpmsumd(12, 12, 19)
		vsldoi(14, HB0, 13, 8)
		vsldoi(15, 13, HB0, 8)
		vxor(11, 11, 14)
		vxor(12, 12, 15)
		SL_256(11, 12)
		REDUCE_F128(11, 11, 12)

		/*
		 * Compute h4 = h*h*h*h in v12. This is done by
		 * squaring h2.
		 */
		vsldoi(12, HB0, 10, 8)
		vsldoi(13, 10, HB0, 8)
		vpmsumd(12, 12, 12)
		vpmsumd(13, 13, 13)
		SL_256(12, 13)
		REDUCE_F128(12, 12, 13)

		/*
		 * Repack h1, h2, h3 and h4:
		 *   v13 = h4_0:h3_0
		 *   v14 = h4_1:h3_1
		 *   v15 = h2_0:h1_0
		 *   v16 = h2_1:h1_1
		 */
		xxpermdi(45, 44, 43, 0)
		xxpermdi(46, 44, 43, 3)
		xxpermdi(47, 42, 41, 0)
		xxpermdi(48, 42, 41, 3)

		/*
		 * Loop for each group of four blocks.
		 */
		mtctr(%[num4])
	label(loop4)
		/*
		 * Read the next four blocks.
		 *   v20 = y + a0 = b0
		 *   v21 = a1 = b1
		 *   v22 = a2 = b2
		 *   v23 = a3 = b3
		 */
		lxvw4x(52, %[cc0], %[buf1])
		lxvw4x(53, %[cc1], %[buf1])
		lxvw4x(54, %[cc2], %[buf1])
		lxvw4x(55, %[cc3], %[buf1])
		FIX_ENDIAN(20)
		FIX_ENDIAN(21)
		FIX_ENDIAN(22)
		FIX_ENDIAN(23)
		addi(%[buf1], %[buf1], 64)
		vxor(20, 20, 28)

		/*
		 * Repack the blocks into v9, v10, v11 and v12.
		 *   v9  = b0_0:b1_0
		 *   v10 = b0_1:b1_1
		 *   v11 = b2_0:b3_0
		 *   v12 = b2_1:b3_1
		 */
		xxpermdi(41, 52, 53, 0)
		xxpermdi(42, 52, 53, 3)
		xxpermdi(43, 54, 55, 0)
		xxpermdi(44, 54, 55, 3)

		/*
		 * Compute the products.
		 *   v20 = b0_0*h4_0 + b1_0*h3_0
		 *   v21 = b0_1*h4_0 + b1_1*h3_0
		 *   v22 = b0_0*h4_1 + b1_0*h3_1
		 *   v23 = b0_1*h4_1 + b1_1*h3_1
		 *   v24 = b2_0*h2_0 + b3_0*h1_0
		 *   v25 = b2_1*h2_0 + b3_1*h1_0
		 *   v26 = b2_0*h2_1 + b3_0*h1_1
		 *   v27 = b2_1*h2_1 + b3_1*h1_1
		 */
		vpmsumd(20, 13, 9)
		vpmsumd(21, 13, 10)
		vpmsumd(22, 14, 9)
		vpmsumd(23, 14, 10)
		vpmsumd(24, 15, 11)
		vpmsumd(25, 15, 12)
		vpmsumd(26, 16, 11)
		vpmsumd(27, 16, 12)

		/*
		 * Sum products into a single 256-bit result in v11:v12.
		 */
		vxor(11, 20, 24)
		vxor(12, 23, 27)
		vxor( 9, 21, 22)
		vxor(10, 25, 26)
		vxor(20,  9, 10)
		vsldoi( 9, HB0, 20, 8)
		vsldoi(10, 20, HB0, 8)
		vxor(11, 11, 9)
		vxor(12, 12, 10)

		/*
		 * Fix and reduce in GF(2^128); this is the new y (in v28).
		 */
		SL_256(11, 12)
		REDUCE_F128(28, 11, 12)

		/*
		 * Loop for next group of four blocks.
		 */
		bdnz(loop4)

		/*
		 * Process second chunk, one block at a time.
		 */
	label(chunk1)
		cmpldi(%[num1], 0)
		beq(done)

		mtctr(%[num1])
	label(loop1)
		/*
		 * Load next data block and XOR it into y.
		 */
		lxvw4x(41, 0, %[buf2])
#if BR_POWER8_LE
		FIX_ENDIAN(9)
#endif
		addi(%[buf2], %[buf2], 16)
		vxor(9, 28, 9)

		/*
		 * Split y into doublewords:
		 *   v9  = y_0:y_1
		 *   v10 = 0:y_0
		 *   v11 = y_1:0
		 */
		vsldoi(10, HB0, 9, 8)
		vsldoi(11, 9, HB0, 8)

		/*
		 * Compute products with h:
		 *   v12 = y_0 * h_0
		 *   v13 = y_1 * h_1
		 *   v14 = y_1 * h_0 + y_0 * h_1
		 */
		vpmsumd(14, 9, 17)
		vpmsumd(12, 10, 18)
		vpmsumd(13, 11, 19)

		/*
		 * Propagate v14 into v12:v13 to finalise the product.
		 */
		vsldoi(10, HB0, 14, 8)
		vsldoi(11, 14, HB0, 8)
		vxor(12, 12, 10)
		vxor(13, 13, 11)

		/*
		 * Fix result and reduce into v28 (next value for y).
		 */
		SL_256(12, 13)
		REDUCE_F128(28, 12, 13)
		bdnz(loop1)

	label(done)
		/*
		 * Write back the new y.
		 */
		FIX_ENDIAN(28)
		stxvw4x(60, 0, %[y])

		: [buf1] "+b" (buf1), [buf2] "+b" (buf2)
		: [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1),
		  [cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3)
#if BR_POWER8_LE
		, [idx2be] "b" (idx2be)
#endif
		: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
		  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
		  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
		  "ctr", "memory"
	);
}
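/*
 * Usage sketch (illustrative, not part of the original file): compute
 * the GHASH of a message under a 16-byte key h, starting from the
 * all-zero state. A trailing partial block is zero-padded internally,
 * via the tmp[] handling above. The helper name is hypothetical; the
 * block is disabled from the build.
 */
#if 0
static void
ghash_pwr8_example(unsigned char out[16],
	const unsigned char h[16], const void *data, size_t len)
{
	memset(out, 0, 16);     /* GHASH starts from y = 0 */
	br_ghash_pwr8(out, h, data, len);
	/* out now holds the updated GHASH state. */
}
#endif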
/* see bearssl_hash.h */
br_ghash
br_ghash_pwr8_get(void)
{
	return &br_ghash_pwr8;
}

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pwr8_get(void)
{
	return 0;
}

#endif
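/*
 * Caller-side sketch (illustrative, not part of the original file):
 * br_ghash_pwr8_get() returns 0 when this translation unit was built
 * without POWER8 support, so a caller selects a portable fallback such
 * as br_ghash_ctmul in that case. The helper name is hypothetical; the
 * block is disabled from the build.
 */
#if 0
static br_ghash
choose_ghash(void)
{
	br_ghash gh;

	gh = br_ghash_pwr8_get();
	if (gh == 0) {
		gh = &br_ghash_ctmul;   /* portable constant-time GHASH */
	}
	return gh;
}
#endif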