GitHub Repository: torvalds/linux
Path: blob/master/arch/xtensa/lib/checksum.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *		Optimized by Joe Taylor
 */

#include <linux/errno.h>
#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *			     unsigned int sum);
 *	a2 = buf
 *	a3 = len
 *	a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */
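
/*
 * Illustrative sketch only (not part of the original file): roughly what
 * the 4-byte-aligned fast path of csum_partial computes, written in C.
 * The helper name ones_add() is hypothetical; the ONES_ADD macro defined
 * below is the real implementation of the end-around-carry add.
 *
 *	unsigned int csum_partial_sketch(const unsigned char *buf, int len,
 *					 unsigned int sum)
 *	{
 *		const unsigned int *p = (const unsigned int *)buf;
 *
 *		while (len >= 4) {			// whole 32-bit words
 *			sum = ones_add(sum, *p++);
 *			len -= 4;
 *		}
 *		// the 2-byte and 1-byte tails are folded in the same way below
 *		return sum;
 *	}
 */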

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;
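
/*
 * Illustrative note (not part of the original file): in C the macro above
 * behaves like the hypothetical helper
 *
 *	static inline unsigned int ones_add(unsigned int sum, unsigned int val)
 *	{
 *		sum += val;
 *		if (sum < val)		// the 32-bit add wrapped around
 *			sum += 1;	// fold the carry back in
 *		return sum;
 *	}
 *
 * e.g. 0xffffffff + 0x00000002 wraps to 0x00000001, and adding the carry
 * back in gives 0x00000002, the correct ones-complement sum.
 */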

.text
ENTRY(csum_partial)

	/*
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	abi_entry_default
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if 2-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop2
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4
	abi_ret_default

	/* uncommon case, buf is 2-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
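
/*
 * Illustrative note (not part of the original file): on a little-endian
 * core, the l8ui/l16ui/l8ui loads in the loop below reassemble one 32-bit
 * word from an odd address, roughly
 *
 *	word = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
 *
 * while on a big-endian core the two single-byte values land in the
 * opposite ends of the word.
 */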
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0..7  */
#ifdef	__XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */

ENDPROC(csum_partial)
EXPORT_SYMBOL(csum_partial)

/*
 * Copy from ds while checksumming, otherwise like csum_partial
 */

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a8  = temp
	a9  = temp
	a10 = temp

   This function is optimized for 4-byte aligned addresses.  Other
   alignments work, but not nearly as efficiently.
 */
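
/*
 * Illustrative sketch only (not part of the original file): the aligned
 * fast path of csum_partial_copy_generic is roughly the following C, with
 * src, dst and len standing for the a2/a3/a4 arguments above, and
 * load_from_src(), store_to_dst() and ones_add() as hypothetical helpers
 * standing in for the EX()-wrapped loads/stores and the ONES_ADD macro.
 *
 *	unsigned int sum = ~0U;		// movi a5, -1
 *
 *	while (len >= 4) {
 *		unsigned int v = load_from_src(src);	// may fault
 *		store_to_dst(dst, v);			// may fault
 *		sum = ones_add(sum, v);
 *		src += 4;
 *		dst += 4;
 *		len -= 4;
 *	}
 *	return sum;	// a fault takes the fixup path at the end and returns 0
 */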

ENTRY(csum_partial_copy_generic)

	abi_entry_default
	movi	a5, -1
	or	a10, a2, a3

	/* We optimize the following alignment tests for the 4-byte
	   aligned case.  Two bbsi.l instructions might seem more optimal
	   (commented out below).  However, both labels 5: and 3: are out
	   of the imm8 range, so the assembler relaxes them into
	   equivalent bbci.l, j combinations, which is actually
	   slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	l32i	a8, a2, 4
EX(10f)	s32i	a9, a3, 0
EX(10f)	s32i	a8, a3, 4
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 8
EX(10f)	l32i	a8, a2, 12
EX(10f)	s32i	a9, a3, 8
EX(10f)	s32i	a8, a3, 12
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 16
EX(10f)	l32i	a8, a2, 20
EX(10f)	s32i	a9, a3, 16
EX(10f)	s32i	a8, a3, 20
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 24
EX(10f)	l32i	a8, a2, 28
EX(10f)	s32i	a9, a3, 24
EX(10f)	s32i	a8, a3, 28
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	s32i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop6
#endif
3:
	/*
	   Control comes to here in two cases: (1) It may fall through
	   to here from the 4-byte alignment case to process, at most,
	   one 2-byte chunk.  (2) It branches to here from above if
	   either src or dst is 2-byte aligned, and we process all bytes
	   here, except for perhaps a trailing odd byte.  It's
	   inefficient, so align your addresses to 4-byte boundaries.

	   a2 = src
	   a3 = dst
	   a4 = len
	   a5 = sum
	 */
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f)	l16ui	a9, a2, 0
EX(10f)	s16i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop7
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
EX(10f)	l8ui	a9, a2, 0
EX(10f)	s8i	a9, a3, 0
#ifdef	__XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5
	abi_ret_default

5:
	/* Control branches to here when either src or dst is odd.  We
	   process all bytes using 8-bit accesses.  Grossly inefficient,
	   so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f)	l8ui	a9, a2, 0
EX(10f)	l8ui	a8, a2, 1
EX(10f)	s8i	a9, a3, 0
EX(10f)	s8i	a8, a3, 1
#ifdef	__XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)
EXPORT_SYMBOL(csum_partial_copy_generic)


# Exception handler:
.section .fixup, "ax"
10:
	movi	a2, 0
	abi_ret_default

.previous