/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas ([email protected])
 *
 * Severely hacked about by Paul Mackerras ([email protected]).
 */

#include <linux/export.h>
#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
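/*
 * As a rough C model of what this routine computes (an illustrative
 * sketch only, using plain C99 types rather than the kernel's __wsum,
 * and assuming len is a multiple of 8; the tail cases below extend the
 * same idea to arbitrary lengths):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static uint32_t fold64(uint64_t s)
 *	{
 *		s = (s >> 32) + (s & 0xffffffffULL);	// may leave a carry in bit 32
 *		s = (s >> 32) + (s & 0xffffffffULL);	// fold that carry back in
 *		return (uint32_t)s;
 *	}
 *
 *	static uint32_t csum_partial_model(const void *buff, int len, uint32_t sum)
 *	{
 *		const unsigned char *p = buff;
 *		uint64_t acc = sum;
 *
 *		while (len >= 8) {
 *			uint64_t w;
 *
 *			memcpy(&w, p, sizeof(w));	// native-endian load, like "ld"
 *			acc += w;
 *			acc += (acc < w);		// end-around carry, like "adde"
 *			p += 8;
 *			len -= 8;
 *		}
 *		return fold64(acc);	// same folding as .Lcsum_finish below
 *	}
 */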
_GLOBAL(__csum_partial)
        addic   r0,r5,0                 /* clear carry */

        srdi.   r6,r4,3                 /* less than 8 bytes? */
        beq     .Lcsum_tail_word

        /*
         * If only halfword aligned, align to a double word. Since odd
         * aligned addresses should be rare and they would require more
         * work to calculate the correct checksum, we ignore that case
         * and take the potential slowdown of unaligned loads.
         */
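        /*
         * For example, if buff ends in ...010 (2 mod 8, i.e. halfword
         * aligned), then (buff >> 1) & 0x3 == 1 and the loop below
         * consumes 4 - 1 = 3 halfwords to reach the next doubleword
         * boundary.
         */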
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcsum_aligned

        li      r7,4
        sub     r6,r7,r6
        mtctr   r6

1:
        lhz     r6,0(r3)                /* align to doubleword */
        subi    r4,r4,2
        addi    r3,r3,2
        adde    r0,r0,r6
        bdnz    1b

.Lcsum_aligned:
        /*
         * We unroll the loop such that each iteration is 64 bytes with an
         * entry and exit limb of 64 bytes, meaning a minimum size of
         * 128 bytes.
         */
        srdi.   r6,r4,7
        beq     .Lcsum_tail_doublewords /* len < 128 */

        srdi    r6,r4,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        ld      r6,0(r3)
        ld      r9,8(r3)

        ld      r10,16(r3)
        ld      r11,24(r3)

        /*
         * On POWER6 and POWER7 back to back adde instructions take 2 cycles
         * because of the XER dependency. This means the fastest this loop can
         * go is 16 cycles per iteration. The scheduling of the loop below has
         * been shown to hit this on both POWER6 and POWER7.
         */
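        /*
         * That is, 8 adde instructions per 64-byte iteration at 2 cycles
         * each gives the 16-cycle figure, i.e. an upper bound of roughly
         * 4 bytes of input per cycle for this loop.
         */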
        .align  5
2:
        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10

        adde    r0,r0,r11

        adde    r0,r0,r12

        adde    r0,r0,r14

        adde    r0,r0,r15
        ld      r6,0(r3)
        ld      r9,8(r3)

        adde    r0,r0,r16
        ld      r10,16(r3)
        ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
        adde    r0,r0,r11
        adde    r0,r0,r12
        adde    r0,r0,r14
        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r4,r4,63

.Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r4,3
        beq     .Lcsum_tail_word

        mtctr   r6
3:
        ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
        bdnz    3b

        andi.   r4,r4,7

.Lcsum_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r4,2
        beq     .Lcsum_tail_halfword

        lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
        subi    r4,r4,4

.Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r4,1
        beq     .Lcsum_tail_byte

        lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
        subi    r4,r4,2

.Lcsum_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r4,1
        beq     .Lcsum_finish

        lbz     r6,0(r3)
#ifdef __BIG_ENDIAN__
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
#else
        adde    r0,r0,r6
#endif

.Lcsum_finish:
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr
EXPORT_SYMBOL(__csum_partial)


.macro srcnr
100:
        EX_TABLE(100b,.Lerror_nr)
.endm

.macro source
150:
        EX_TABLE(150b,.Lerror)
.endm

.macro dstnr
200:
        EX_TABLE(200b,.Lerror_nr)
.endm

.macro dest
250:
        EX_TABLE(250b,.Lerror)
.endm
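
/*
 * Each macro above plants a numbered label on the load or store that
 * follows it on the same line and records an exception table entry for
 * that address.  A fault on a "source"/"dest" access (used once the
 * stack frame has been created and non-volatile registers saved) is
 * redirected to .Lerror so those registers can be restored first; a
 * fault on a "srcnr"/"dstnr" access goes straight to .Lerror_nr.  Both
 * paths make csum_partial_copy_generic return 0.
 */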

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
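/*
 * The intended semantics, sketched in C (illustrative only: the real
 * error handling relies on the exception-table macros above rather than
 * an explicit check, the sum is seeded with an all-ones value as the
 * header comment describes, and csum_partial_model() refers to the
 * sketch above __csum_partial, reusing its #includes):
 *
 *	static uint32_t copy_and_csum_model(const void *src, void *dst, int len)
 *	{
 *		memcpy(dst, src, len);				// copy the block
 *		return csum_partial_model(src, len, ~0u);	// sum it, seeded with 0xffffffff
 *	}
 */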
_GLOBAL(csum_partial_copy_generic)
        li      r6,-1
        addic   r0,r6,0                 /* clear carry */

        srdi.   r6,r5,3                 /* less than 8 bytes? */
        beq     .Lcopy_tail_word

        /*
         * If only halfword aligned, align to a double word. Since odd
         * aligned addresses should be rare and they would require more
         * work to calculate the correct checksum, we ignore that case
         * and take the potential slowdown of unaligned loads.
         *
         * If the source and destination are relatively unaligned we only
         * align the source. This keeps things simple.
         */
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcopy_aligned

        li      r9,4
        sub     r6,r9,r6
        mtctr   r6

1:
srcnr;  lhz     r6,0(r3)                /* align to doubleword */
        subi    r5,r5,2
        addi    r3,r3,2
        adde    r0,r0,r6
dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        bdnz    1b

.Lcopy_aligned:
        /*
         * We unroll the loop such that each iteration is 64 bytes with an
         * entry and exit limb of 64 bytes, meaning a minimum size of
         * 128 bytes.
         */
        srdi.   r6,r5,7
        beq     .Lcopy_tail_doublewords /* len < 128 */

        srdi    r6,r5,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

source; ld      r6,0(r3)
source; ld      r9,8(r3)

source; ld      r10,16(r3)
source; ld      r11,24(r3)

        /*
         * On POWER6 and POWER7 back to back adde instructions take 2 cycles
         * because of the XER dependency. This means the fastest this loop can
         * go is 16 cycles per iteration. The scheduling of the loop below has
         * been shown to hit this on both POWER6 and POWER7.
         */
        .align  5
2:
        adde    r0,r0,r6
source; ld      r12,32(r3)
source; ld      r14,40(r3)

        adde    r0,r0,r9
source; ld      r15,48(r3)
source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
dest;   std     r6,0(r4)
dest;   std     r9,8(r4)

        adde    r0,r0,r11
dest;   std     r10,16(r4)
dest;   std     r11,24(r4)

        adde    r0,r0,r12
dest;   std     r12,32(r4)
dest;   std     r14,40(r4)

        adde    r0,r0,r14
dest;   std     r15,48(r4)
dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
source; ld      r6,0(r3)
source; ld      r9,8(r3)

        adde    r0,r0,r16
source; ld      r10,16(r3)
source; ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
source; ld      r12,32(r3)
source; ld      r14,40(r3)

        adde    r0,r0,r9
source; ld      r15,48(r3)
source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
dest;   std     r6,0(r4)
dest;   std     r9,8(r4)

        adde    r0,r0,r11
dest;   std     r10,16(r4)
dest;   std     r11,24(r4)

        adde    r0,r0,r12
dest;   std     r12,32(r4)
dest;   std     r14,40(r4)

        adde    r0,r0,r14
dest;   std     r15,48(r4)
dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r5,r5,63

.Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r5,3
        beq     .Lcopy_tail_word

        mtctr   r6
3:
srcnr;  ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
dstnr;  std     r6,0(r4)
        addi    r4,r4,8
        bdnz    3b

        andi.   r5,r5,7

.Lcopy_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r5,2
        beq     .Lcopy_tail_halfword

srcnr;  lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
dstnr;  stw     r6,0(r4)
        addi    r4,r4,4
        subi    r5,r5,4

.Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r5,1
        beq     .Lcopy_tail_byte

srcnr;  lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        subi    r5,r5,2

.Lcopy_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r5,1
        beq     .Lcopy_finish

srcnr;  lbz     r6,0(r3)
#ifdef __BIG_ENDIAN__
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
#else
        adde    r0,r0,r6
#endif
dstnr;  stb     r6,0(r4)

.Lcopy_finish:
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr

.Lerror:
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE
.Lerror_nr:
        li      r3,0
        blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *                         const struct in6_addr *daddr,
 *                         __u32 len, __u8 proto, __wsum sum)
 */
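/*
 * A rough C model of the IPv6 pseudo-header checksum this implements
 * (illustrative sketch only: it treats every argument as a plain
 * host-order quantity and so glosses over the __wsum byte-order
 * convention that the rotldi below exists to handle):
 *
 *	#include <stdint.h>
 *
 *	static uint16_t csum_ipv6_magic_model(const uint8_t saddr[16],
 *					      const uint8_t daddr[16],
 *					      uint32_t len, uint8_t proto,
 *					      uint32_t sum)
 *	{
 *		uint64_t acc = sum;
 *		int i;
 *
 *		for (i = 0; i < 16; i += 2) {		// addresses as 16-bit words
 *			acc += (uint32_t)(saddr[i] << 8 | saddr[i + 1]);
 *			acc += (uint32_t)(daddr[i] << 8 | daddr[i + 1]);
 *		}
 *		acc += len;				// upper-layer packet length
 *		acc += proto;				// next-header value
 *
 *		while (acc >> 16)			// fold with end-around carry
 *			acc = (acc >> 16) + (acc & 0xffff);
 *
 *		return (uint16_t)~acc;			// ones' complement of the total
 *	}
 */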

_GLOBAL(csum_ipv6_magic)
        ld      r8, 0(r3)
        ld      r9, 8(r3)
        add     r5, r5, r6
        addc    r0, r8, r9
        ld      r10, 0(r4)
        ld      r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
        rotldi  r5, r5, 8
#endif
        adde    r0, r0, r10
        add     r5, r5, r7
        adde    r0, r0, r11
        adde    r0, r0, r5
        addze   r0, r0
        rotldi  r3, r0, 32              /* fold two 32 bit halves together */
        add     r3, r0, r3
        srdi    r0, r3, 32
        rotlwi  r3, r0, 16              /* fold two 16 bit halves together */
        add     r3, r0, r3
        not     r3, r3
        rlwinm  r3, r3, 16, 16, 31
        blr
EXPORT_SYMBOL(csum_ipv6_magic)