CoCalc -- checksum.S

GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/xtensa/lib/checksum.S
¹⁰⁸¹⁷ views
1
/*
2
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3
 *		operating system.  INET is implemented using the  BSD Socket
4
 *		interface as the means of communication with the user level.
5
 *
6
 *		IP/TCP/UDP checksumming routines
7
 *
8
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
9
 *                  Optimized by Joe Taylor
10
 *
11
 *		This program is free software; you can redistribute it and/or
12
 *		modify it under the terms of the GNU General Public License
13
 *		as published by the Free Software Foundation; either version
14
 *		2 of the License, or (at your option) any later version.
15
 */
16

17
#include <asm/errno.h>
18
#include <linux/linkage.h>
19
#include <variant/core.h>
20

21
/*
22
 * computes a partial checksum, e.g. for TCP/UDP fragments
23
 */
24

25
/*
26
 * unsigned int csum_partial(const unsigned char *buf, int len,
27
 *                           unsigned int sum);
28
 *    a2 = buf
29
 *    a3 = len
30
 *    a4 = sum
31
 *
32
 * This function is optimized for 2- or 4-byte aligned buffers.  Odd
 * addresses are handled as well, but through a much slower path.
33
 */
34

35
/*
 * ONES_ADD(acc, addend): ones-complement accumulate, acc += addend.
 *
 * A plain 32-bit add is done first.  If the result wrapped around, it is
 * (unsigned) smaller than the addend, and the lost carry is folded back
 * in by adding 1.  Local label 99 marks the end of the sequence.
 */
#define ONES_ADD(acc, addend)		  \
	add	acc, acc, addend	; \
	bgeu	acc, addend, 99f	; \
	addi	acc, acc, 1		; \
99:					;
41

42
.text
43
ENTRY(csum_partial)
	/*
	 * Register use:
	 *    a2 = buf, advanced as data is consumed; holds the result on return
	 *    a3 = len (remaining length once the leading halfword is peeled off)
	 *    a4 = running ones-complement sum
	 *    a5..a8 = scratch
	 *
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	entry	sp, 32
	extui	a5, a2, 0, 2	/* a5 = buf & 3 */
	bnez	a5, 8f		/* branch if buf is not 4-byte aligned */
	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop1	/* unsigned: a2 and a5 are addresses */
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop2	/* unsigned: a2 and a5 are addresses */
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4		/* return the accumulated sum */
	retw

	/* uncommon case, buf is not 4-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	/* buf is 2-byte aligned: consume one halfword to reach 4-byte alignment */
	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 *
	 * Each 32-bit word is assembled from a byte, a halfword and a byte,
	 * placed so that the word matches the byte order in memory.
	 */
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* 1st byte:        bits 24..31 (EB) or 0..7 (EL) */
	l16ui	a7, a2, 1	/* 2nd + 3rd bytes: bits  8..23 */
	l8ui	a8, a2, 3	/* 4th byte:        bits  0..7 (EB) or 24..31 (EL) */
#ifdef	__XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a5, .Loop3	/* unsigned: a2 and a5 are addresses */
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef	__XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */
172

173

174

175
/*
176
 * Copy from src to dst while checksumming, otherwise like csum_partial
177
 *
178
 * The macros SRC and DST specify the type of access for the instruction.
179
 * Thus a separate exception fixup handler can be used for each access type.
180
 */
181

182
/*
 * SRC(insn): emit insn at local label 9999 and add an __ex_table entry
 * that pairs the address of that instruction with fixup label 6001.
 */
#define SRC(insn...)				\
9999:	insn				;	\
	.section __ex_table, "a"	;	\
	.long	9999b, 6001f		;	\
	.previous
187

188
/*
 * DST(insn): emit insn at local label 9999 and add an __ex_table entry
 * that pairs the address of that instruction with fixup label 6002.
 */
#define DST(insn...)				\
9999:	insn				;	\
	.section __ex_table, "a"	;	\
	.long	9999b, 6002f		;	\
	.previous
193

194
/*
195
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
196
					int sum, int *src_err_ptr, int *dst_err_ptr)
197
	a2  = src
198
	a3  = dst
199
	a4  = len
200
	a5  = sum
201
	a6  = src_err_ptr
202
	a7  = dst_err_ptr
203
	a8  = temp
204
	a9  = temp
205
	a10 = temp
206
	a11 = original len for exception handling
207
	a12 = original dst for exception handling
208

209
    This function is optimized for 4-byte aligned addresses.  Other
210
    alignments work, but not nearly as efficiently.
211
 */
212

213
ENTRY(csum_partial_copy_generic)
	/*
	 * Copies len bytes from src (a2) to dst (a3) while accumulating the
	 * ones-complement sum in a5; the sum is returned in a2.
	 * Loads are wrapped in SRC() and stores in DST() so that each one
	 * gets an exception-table entry.
	 */
	entry	sp, 32
	mov	a12, a3		/* a12 = original dst, kept for exception handling */
	mov	a11, a4		/* a11 = original len, kept for exception handling */
	or	a10, a2, a3	/* low bits of a10 = combined src/dst alignment */

	/* We optimize the following alignment tests for the 4-byte
	aligned case.  Two bbsi.l instructions might seem more optimal
	(commented out below).  However, both labels 5: and 3: are out
	of the imm8 range, so the assembler relaxes them into
	equivalent bbci.l, j combinations, which is actually
	slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(	l32i	a9, a2, 0	)
SRC(	l32i	a8, a2, 4	)
DST(	s32i	a9, a3, 0	)
DST(	s32i	a8, a3, 4	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 8	)
SRC(	l32i	a8, a2, 12	)
DST(	s32i	a9, a3, 8	)
DST(	s32i	a8, a3, 12	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 16	)
SRC(	l32i	a8, a2, 20	)
DST(	s32i	a9, a3, 16	)
DST(	s32i	a8, a3, 20	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 24	)
SRC(	l32i	a8, a2, 28	)
DST(	s32i	a9, a3, 24	)
DST(	s32i	a8, a3, 28	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop5	/* unsigned: a2 and a10 are addresses */
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(	l32i	a9, a2, 0	)
DST(	s32i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop6	/* unsigned: a2 and a10 are addresses */
#endif
3:
	/*
	Control comes to here in two cases: (1) It may fall through
	to here from the 4-byte alignment case to process, at most,
	one 2-byte chunk.  (2) It branches to here from above if
	either src or dst is 2-byte aligned, and we process all bytes
	here, except for perhaps a trailing odd byte.  It's
	inefficient, so align your addresses to 4-byte boundaries.

	a2 = src
	a3 = dst
	a4 = len
	a5 = sum
	*/
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(	l16ui	a9, a2, 0	)
DST(	s16i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop7	/* unsigned: a2 and a10 are addresses */
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
SRC(	l8ui	a9, a2, 0	)
DST(	s8i	a9, a3, 0	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5		/* return the accumulated sum */
	retw

5:
	/* Control branch to here when either src or dst is odd.  We
	process all bytes using 8-bit accesses.  Grossly inefficient,
	so don't feed us an odd address. */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	bltu	a2, a10, .Loop8	/* unsigned: a2 and a10 are addresses */
#endif
6:
	j	4b		/* process the possible trailing odd byte */
369

370

371
/*
 * Exception handlers for csum_partial_copy_generic.
 * 6001 is the fixup target recorded by SRC(), 6002 the one recorded by DST().
 */
.section .fixup, "ax"
/*
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a11 = original len for exception handling
	a12 = original dst for exception handling
*/

6001:
	_movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	/* clear the complete destination - computing the rest
	   is too much work */
	movi	a2, 0		/* a2 = fill byte, and the value returned */
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	bltu	a12, a11, .Leloop	/* unsigned: a12 and a11 are addresses */
#endif
2:
	retw

6002:
	movi	a2, -EFAULT
	s32i	a2, a7, 0	/* dst_err_ptr */
	movi	a2, 0		/* return 0 */
	retw

.previous
409

410

411
Product

Resources

Company