/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas ([email protected])
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras ([email protected]).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in 32-bit words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2	/* first two words consumed above */
	addc	r0,r0,r5
	mtctr	r4
	blelr-			/* len was <= 2 words; nothing left to add */
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Compute checksum of TCP or UDP pseudo-header:
 *   csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 *
 * There is no real gain in doing this specially for 64 bit, but the
 * 32-bit additions may spill into the upper bits of the doubleword,
 * so we still have to fold the result down from 64 bits.
 */
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
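
/*
 * For illustration only (not part of the build): a C sketch of the
 * fold-and-complement sequence both routines above end with.  The
 * function name is hypothetical and a 64-bit unsigned long is assumed.
 *
 *	static inline unsigned int fold64(unsigned long sum)
 *	{
 *		// rldicl ...,32,0 rotates the doubleword by 32 bits
 *		unsigned long rot = (sum << 32) | (sum >> 32);
 *
 *		// add + srdi 32: the high word now holds lo + hi plus
 *		// the carry out of the low-word add (end-around carry)
 *		unsigned int s = (unsigned int)((sum + rot) >> 32);
 *
 *		// rlwinm ...,16,0,31 repeats the trick at 16-bit width
 *		unsigned int r = s + ((s << 16) | (s >> 16));
 *
 *		// not + srwi 16: complement, keep the folded 16 bits
 *		return (~r) >> 16;
 *	}
 */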
#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6		/* halfwords needed to reach alignment */
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back addes take 2 cycles because
	 * of the XER dependency. This means the fastest this loop can go
	 * is 16 cycles per iteration. The scheduling of the loop below
	 * has been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	/* exit limb: the first half of the final 64-byte chunk was
	 * already loaded by the last loop iteration */
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
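
/*
 * For illustration only (not part of the build): a hedged C model of
 * what csum_partial computes, assuming a 64-bit big-endian machine
 * like this file's target.  The adde chain above defers carries in
 * the XER; the model folds each carry in immediately, which yields
 * the same 1's complement sum.  All names are hypothetical.
 *
 *	#include <string.h>
 *
 *	static unsigned long csum_add(unsigned long sum, unsigned long v)
 *	{
 *		sum += v;
 *		return sum + (sum < v);	// end-around carry, like adde
 *	}
 *
 *	static unsigned long csum_partial_model(const unsigned char *p,
 *						unsigned long len,
 *						unsigned long sum)
 *	{
 *		unsigned long v;
 *		unsigned int w;
 *
 *		while (len >= 8) {		// the ld/adde loops
 *			memcpy(&v, p, 8);
 *			sum = csum_add(sum, v);
 *			p += 8;
 *			len -= 8;
 *		}
 *		if (len >= 4) {			// lwz tail
 *			memcpy(&w, p, 4);
 *			sum = csum_add(sum, w);
 *			p += 4;
 *			len -= 4;
 *		}
 *		if (len >= 2) {			// lhz tail
 *			sum = csum_add(sum, (p[0] << 8) | p[1]);
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len)			// lbz tail, pad to 16 bits
 *			sum = csum_add(sum, (unsigned long)p[0] << 8);
 *
 *		// final 64 -> 32 fold, as at .Lcsum_finish
 *		return (unsigned int)
 *			((sum + ((sum << 32) | (sum >> 32))) >> 32);
 *	}
 */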

/*
 * The source/dest macros tag the load or store that follows them with
 * an __ex_table entry, so a fault in that instruction branches to
 * .Lsrc_error or .Ldest_error instead of oopsing.
 */
	.macro source
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error
	.previous
	.endm

	.macro dest
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating the partial
 * checksum, etc.).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcopy_aligned

	li	r7,4
	sub	r6,r7,r6		/* halfwords needed to reach alignment */
	mtctr	r6

1:
source;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back addes take 2 cycles because
	 * of the XER dependency. This means the fastest this loop can go
	 * is 16 cycles per iteration. The scheduling of the loop below
	 * has been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	/* exit limb: the first half of the final 64-byte chunk was
	 * already loaded by the last loop iteration */
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
source;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dest;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

source;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dest;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

source;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

source;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dest;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	cmpdi	0,r7,0			/* was a src_err pointer supplied? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	cmpdi	0,r8,0			/* was a dst_err pointer supplied? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
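
/*
 * For illustration only (not part of the build): a hedged sketch of a
 * caller honouring the error contract above.  The C prototype is
 * inferred from the register comment (r3..r8); treat the names, types
 * and recovery policy shown here as assumptions, not the kernel API.
 *
 *	int src_err = 0, dst_err = 0;
 *	unsigned int csum;
 *
 *	csum = csum_partial_copy_generic(src, dst, len, sum,
 *					 &src_err, &dst_err);
 *	if (src_err) {
 *		// fault while reading src: as the comment above says,
 *		// the caller chooses the recovery, e.g. zero what was
 *		// copied and report the fault
 *		memset(dst, 0, len);
 *		return -EFAULT;
 *	}
 *	if (dst_err)
 *		return -EFAULT;
 */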