/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas ([email protected])
 *
 * Severely hacked about by Paul Mackerras ([email protected]).
 */

#include <linux/export.h>
#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
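/*
 * As a rough C model of what this routine computes (an illustrative
 * sketch only, using plain C99 types rather than the kernel's __wsum,
 * and assuming len is a multiple of 8; the tail cases below extend the
 * same idea to arbitrary lengths):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static uint32_t fold64(uint64_t s)
 *	{
 *		s = (s >> 32) + (s & 0xffffffffULL);	// may leave a carry in bit 32
 *		s = (s >> 32) + (s & 0xffffffffULL);	// fold that carry back in
 *		return (uint32_t)s;
 *	}
 *
 *	static uint32_t csum_partial_model(const void *buff, int len, uint32_t sum)
 *	{
 *		const unsigned char *p = buff;
 *		uint64_t acc = sum;
 *
 *		while (len >= 8) {
 *			uint64_t w;
 *
 *			memcpy(&w, p, sizeof(w));	// native-endian load, like "ld"
 *			acc += w;
 *			acc += (acc < w);		// end-around carry, like "adde"
 *			p += 8;
 *			len -= 8;
 *		}
 *		return fold64(acc);	// same folding as .Lcsum_finish below
 *	}
 */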
_GLOBAL(__csum_partial)
        addic   r0,r5,0                 /* clear carry */

        srdi.   r6,r4,3                 /* less than 8 bytes? */
        beq     .Lcsum_tail_word

        /*
         * If only halfword aligned, align to a double word. Since odd
         * aligned addresses should be rare and they would require more
         * work to calculate the correct checksum, we ignore that case
         * and take the potential slowdown of unaligned loads.
         */
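        /*
         * For example, if buff ends in ...010 (2 mod 8, i.e. halfword
         * aligned), then (buff >> 1) & 0x3 == 1 and the loop below
         * consumes 4 - 1 = 3 halfwords to reach the next doubleword
         * boundary.
         */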
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcsum_aligned

        li      r7,4
        sub     r6,r7,r6
        mtctr   r6

1:
        lhz     r6,0(r3)                /* align to doubleword */
        subi    r4,r4,2
        addi    r3,r3,2
        adde    r0,r0,r6
        bdnz    1b

.Lcsum_aligned:
        /*
         * We unroll the loop such that each iteration is 64 bytes with an
         * entry and exit limb of 64 bytes, meaning a minimum size of
         * 128 bytes.
         */
        srdi.   r6,r4,7
        beq     .Lcsum_tail_doublewords /* len < 128 */

        srdi    r6,r4,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        ld      r6,0(r3)
        ld      r9,8(r3)

        ld      r10,16(r3)
        ld      r11,24(r3)

        /*
         * On POWER6 and POWER7 back to back adde instructions take 2 cycles
         * because of the XER dependency. This means the fastest this loop can
         * go is 16 cycles per iteration. The scheduling of the loop below has
         * been shown to hit this on both POWER6 and POWER7.
         */
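        /*
         * That is, 8 adde instructions per 64-byte iteration at 2 cycles
         * each gives the 16-cycle figure, i.e. an upper bound of roughly
         * 4 bytes of input per cycle for this loop.
         */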
        .align  5
2:
        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10

        adde    r0,r0,r11

        adde    r0,r0,r12

        adde    r0,r0,r14

        adde    r0,r0,r15
        ld      r6,0(r3)
        ld      r9,8(r3)

        adde    r0,r0,r16
        ld      r10,16(r3)
        ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
        ld      r12,32(r3)
        ld      r14,40(r3)

        adde    r0,r0,r9
        ld      r15,48(r3)
        ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
        adde    r0,r0,r11
        adde    r0,r0,r12
        adde    r0,r0,r14
        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r4,r4,63

.Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r4,3
        beq     .Lcsum_tail_word

        mtctr   r6
3:
        ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
        bdnz    3b

        andi.   r4,r4,7

.Lcsum_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r4,2
        beq     .Lcsum_tail_halfword

        lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
        subi    r4,r4,4

.Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r4,1
        beq     .Lcsum_tail_byte

        lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
        subi    r4,r4,2

.Lcsum_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r4,1
        beq     .Lcsum_finish

        lbz     r6,0(r3)
#ifdef __BIG_ENDIAN__
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
#else
        adde    r0,r0,r6
#endif

.Lcsum_finish:
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr
EXPORT_SYMBOL(__csum_partial)


.macro srcnr
100:
        EX_TABLE(100b,.Lerror_nr)
.endm

.macro source
150:
        EX_TABLE(150b,.Lerror)
.endm

.macro dstnr
200:
        EX_TABLE(200b,.Lerror_nr)
.endm

.macro dest
250:
        EX_TABLE(250b,.Lerror)
.endm
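
/*
 * Each macro above plants a numbered label on the load or store that
 * follows it on the same line and records an exception table entry for
 * that address.  A fault on a "source"/"dest" access (used once the
 * stack frame has been created and non-volatile registers saved) is
 * redirected to .Lerror so those registers can be restored first; a
 * fault on a "srcnr"/"dstnr" access goes straight to .Lerror_nr.  Both
 * paths make csum_partial_copy_generic return 0.
 */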

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
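/*
 * The intended semantics, sketched in C (illustrative only: the real
 * error handling relies on the exception-table macros above rather than
 * an explicit check, the sum is seeded with an all-ones value as the
 * header comment describes, and csum_partial_model() refers to the
 * sketch above __csum_partial, reusing its #includes):
 *
 *	static uint32_t copy_and_csum_model(const void *src, void *dst, int len)
 *	{
 *		memcpy(dst, src, len);				// copy the block
 *		return csum_partial_model(src, len, ~0u);	// sum it, seeded with 0xffffffff
 *	}
 */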
_GLOBAL(csum_partial_copy_generic)
        li      r6,-1
        addic   r0,r6,0                 /* clear carry */

        srdi.   r6,r5,3                 /* less than 8 bytes? */
        beq     .Lcopy_tail_word

        /*
         * If only halfword aligned, align to a double word. Since odd
         * aligned addresses should be rare and they would require more
         * work to calculate the correct checksum, we ignore that case
         * and take the potential slowdown of unaligned loads.
         *
         * If the source and destination are relatively unaligned we only
         * align the source. This keeps things simple.
         */
        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
        beq     .Lcopy_aligned

        li      r9,4
        sub     r6,r9,r6
        mtctr   r6

1:
srcnr;  lhz     r6,0(r3)                /* align to doubleword */
        subi    r5,r5,2
        addi    r3,r3,2
        adde    r0,r0,r6
dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        bdnz    1b

.Lcopy_aligned:
        /*
         * We unroll the loop such that each iteration is 64 bytes with an
         * entry and exit limb of 64 bytes, meaning a minimum size of
         * 128 bytes.
         */
        srdi.   r6,r5,7
        beq     .Lcopy_tail_doublewords /* len < 128 */

        srdi    r6,r5,6
        subi    r6,r6,1
        mtctr   r6

        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

source; ld      r6,0(r3)
source; ld      r9,8(r3)

source; ld      r10,16(r3)
source; ld      r11,24(r3)

        /*
         * On POWER6 and POWER7 back to back adde instructions take 2 cycles
         * because of the XER dependency. This means the fastest this loop can
         * go is 16 cycles per iteration. The scheduling of the loop below has
         * been shown to hit this on both POWER6 and POWER7.
         */
        .align  5
2:
        adde    r0,r0,r6
source; ld      r12,32(r3)
source; ld      r14,40(r3)

        adde    r0,r0,r9
source; ld      r15,48(r3)
source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
dest;   std     r6,0(r4)
dest;   std     r9,8(r4)

        adde    r0,r0,r11
dest;   std     r10,16(r4)
dest;   std     r11,24(r4)

        adde    r0,r0,r12
dest;   std     r12,32(r4)
dest;   std     r14,40(r4)

        adde    r0,r0,r14
dest;   std     r15,48(r4)
dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
source; ld      r6,0(r3)
source; ld      r9,8(r3)

        adde    r0,r0,r16
source; ld      r10,16(r3)
source; ld      r11,24(r3)
        bdnz    2b


        adde    r0,r0,r6
source; ld      r12,32(r3)
source; ld      r14,40(r3)

        adde    r0,r0,r9
source; ld      r15,48(r3)
source; ld      r16,56(r3)
        addi    r3,r3,64

        adde    r0,r0,r10
dest;   std     r6,0(r4)
dest;   std     r9,8(r4)

        adde    r0,r0,r11
dest;   std     r10,16(r4)
dest;   std     r11,24(r4)

        adde    r0,r0,r12
dest;   std     r12,32(r4)
dest;   std     r14,40(r4)

        adde    r0,r0,r14
dest;   std     r15,48(r4)
dest;   std     r16,56(r4)
        addi    r4,r4,64

        adde    r0,r0,r15
        adde    r0,r0,r16

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE

        andi.   r5,r5,63

.Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
        srdi.   r6,r5,3
        beq     .Lcopy_tail_word

        mtctr   r6
3:
srcnr;  ld      r6,0(r3)
        addi    r3,r3,8
        adde    r0,r0,r6
dstnr;  std     r6,0(r4)
        addi    r4,r4,8
        bdnz    3b

        andi.   r5,r5,7

.Lcopy_tail_word:                       /* Up to 7 bytes to go */
        srdi.   r6,r5,2
        beq     .Lcopy_tail_halfword

srcnr;  lwz     r6,0(r3)
        addi    r3,r3,4
        adde    r0,r0,r6
dstnr;  stw     r6,0(r4)
        addi    r4,r4,4
        subi    r5,r5,4

.Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
        srdi.   r6,r5,1
        beq     .Lcopy_tail_byte

srcnr;  lhz     r6,0(r3)
        addi    r3,r3,2
        adde    r0,r0,r6
dstnr;  sth     r6,0(r4)
        addi    r4,r4,2
        subi    r5,r5,2

.Lcopy_tail_byte:                       /* Up to 1 byte to go */
        andi.   r6,r5,1
        beq     .Lcopy_finish

srcnr;  lbz     r6,0(r3)
#ifdef __BIG_ENDIAN__
        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
        adde    r0,r0,r9
#else
        adde    r0,r0,r6
#endif
dstnr;  stb     r6,0(r4)

.Lcopy_finish:
        addze   r0,r0                   /* add in final carry */
        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
        add     r3,r4,r0
        srdi    r3,r3,32
        blr

.Lerror:
        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        addi    r1,r1,STACKFRAMESIZE
.Lerror_nr:
        li      r3,0
        blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *                         const struct in6_addr *daddr,
 *                         __u32 len, __u8 proto, __wsum sum)
 */
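/*
 * A rough C model of the IPv6 pseudo-header checksum this implements
 * (illustrative sketch only: it treats every argument as a plain
 * host-order quantity and so glosses over the __wsum byte-order
 * convention that the rotldi below exists to handle):
 *
 *	#include <stdint.h>
 *
 *	static uint16_t csum_ipv6_magic_model(const uint8_t saddr[16],
 *					      const uint8_t daddr[16],
 *					      uint32_t len, uint8_t proto,
 *					      uint32_t sum)
 *	{
 *		uint64_t acc = sum;
 *		int i;
 *
 *		for (i = 0; i < 16; i += 2) {		// addresses as 16-bit words
 *			acc += (uint32_t)(saddr[i] << 8 | saddr[i + 1]);
 *			acc += (uint32_t)(daddr[i] << 8 | daddr[i + 1]);
 *		}
 *		acc += len;				// upper-layer packet length
 *		acc += proto;				// next-header value
 *
 *		while (acc >> 16)			// fold with end-around carry
 *			acc = (acc >> 16) + (acc & 0xffff);
 *
 *		return (uint16_t)~acc;			// ones' complement of the total
 *	}
 */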

_GLOBAL(csum_ipv6_magic)
        ld      r8, 0(r3)
        ld      r9, 8(r3)
        add     r5, r5, r6
        addc    r0, r8, r9
        ld      r10, 0(r4)
        ld      r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
        rotldi  r5, r5, 8
#endif
        adde    r0, r0, r10
        add     r5, r5, r7
        adde    r0, r0, r11
        adde    r0, r0, r5
        addze   r0, r0
        rotldi  r3, r0, 32              /* fold two 32 bit halves together */
        add     r3, r0, r3
        srdi    r0, r3, 32
        rotlwi  r3, r0, 16              /* fold two 16 bit halves together */
        add     r3, r0, r3
        not     r3, r3
        rlwinm  r3, r3, 16, 16, 31
        blr
EXPORT_SYMBOL(csum_ipv6_magic)