GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/powerpc/lib/checksum_64.S
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas ([email protected])
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras ([email protected]).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
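/*
 * For reference, a rough C sketch of what this routine computes
 * ("ip_fast_csum_ref" is a hypothetical name, not the kernel's C
 * fallback; the folding below mirrors the rldicl/rlwinm sequence):
 *
 *	static u16 ip_fast_csum_ref(const u32 *buf, unsigned int len)
 *	{
 *		u64 sum = 0;
 *
 *		while (len--)
 *			sum += *buf++;
 *		sum = (sum & 0xffffffff) + (sum >> 32);	-- fold 64 -> 32
 *		sum = (sum & 0xffffffff) + (sum >> 32);	-- fold the carry in
 *		sum = (sum & 0xffff) + (sum >> 16);	-- fold 32 -> 16
 *		sum = (sum & 0xffff) + (sum >> 16);	-- fold the carry in
 *		return ~sum & 0xffff;
 *	}
 */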
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31		/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Compute checksum of TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 */
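/*
 * In C terms, the three carry-chained adds below compute (a sketch; the
 * names mirror the register comments above, and len is assumed to fit
 * in 16 bits as usual for the pseudo-header):
 *
 *	u64 s = (u64)saddr + daddr + sum + ((proto << 16) | len);
 *
 * after which the tail folds s down to a complemented 16-bit value
 * exactly as in ip_fast_csum above.
 */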
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)
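/*
 * 112 is the 48-byte ppc64 ELFv1 stack-frame header plus the 64-byte
 * parameter save area, i.e. the first free slot in the frame, so
 * STK_REG(r14)..STK_REG(r16) name three doubleword slots used below to
 * save the non-volatile registers r14-r16 (e.g. STK_REG(r15) = 112 +
 * (15-14)*8 = 120).
 */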

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
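/*
 * A rough C sketch of the strategy below (illustrative only, ignoring
 * the alignment prologue and the unrolled POWER6/7 scheduling;
 * "csum_partial_ref" is a hypothetical name):
 *
 *	static u32 csum_partial_ref(const void *buff, size_t len, u32 sum)
 *	{
 *		const u64 *p = buff;
 *		u64 s = sum, v;
 *
 *		while (len >= 8) {
 *			v = *p++;
 *			s += v;
 *			if (s < v)	-- end-around carry, like adde
 *				s++;
 *			len -= 8;
 *		}
 *		-- 4-, 2- and 1-byte tails as in .Lcsum_tail_*, then:
 *		s = (s & 0xffffffff) + (s >> 32);
 *		s = (s & 0xffffffff) + (s >> 32);
 *		return s;
 *	}
 */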
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.macro source
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error
	.previous
.endm

.macro dest
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error
	.previous
.endm
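/*
 * Every "source;" / "dest;" prefix in csum_partial_copy_generic below
 * plants a local label in front of the next load or store and records a
 * (faulting instruction, fixup) address pair in __ex_table. On an
 * access fault at a marked instruction, the exception handler looks up
 * that table and resumes at .Lsrc_error or .Ldest_error instead of
 * treating the fault as fatal.
 */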

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
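/*
 * Hedged caller-side sketch (not a call site from this tree; "serr"
 * and "derr" are hypothetical locals):
 *
 *	int serr = 0, derr = 0;
 *
 *	sum = csum_partial_copy_generic(src, dst, len, sum, &serr, &derr);
 *	if (serr || derr)
 *		-- a marked load/store faulted: the caller zeroes dst,
 *		-- recomputes, or propagates -EFAULT as appropriate
 */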
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4			/* use r9 as scratch: r7/r8 must */
	sub	r6,r9,r6		/* keep src_err/dst_err for the */
	mtctr	r6			/* fault handlers below */

1:
source;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
source;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dest;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

source;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dest;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

source;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

source;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dest;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr