/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *		    Optimized by Joe Taylor
 */

#include <linux/errno.h>
#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *			     unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;

.text
ENTRY(csum_partial)

	/*
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	abi_entry_default
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if not 4-byte aligned */
	/* Fall through on the common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop2
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
7:
	mov	a2, a4
	abi_ret_default

	/* uncommon case, buf is 2-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0.. 7 */
#ifdef __XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef __XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */

ENDPROC(csum_partial)
EXPORT_SYMBOL(csum_partial)
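
/*
 * Reference model (an illustrative sketch, not assembled into the
 * kernel): the word loops above accumulate a 32-bit ones-complement
 * sum, and the add/bgeu/addi triple in ONES_ADD is the branchy form of
 * "add, then wrap the carry back into bit 0".  In hypothetical C, for
 * a 4-byte-aligned buffer holding a whole number of words:
 *
 *	unsigned int ones_sum32(const unsigned int *p, int words,
 *				unsigned int sum)
 *	{
 *		while (words--) {
 *			unsigned int v = *p++;
 *
 *			sum += v;
 *			if (sum < v)
 *				sum++;
 *		}
 *		return sum;
 *	}
 *
 * The name and signature are ours for illustration only; the real
 * entry point is csum_partial() above.
 */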

/*
 * Copy from src to dst while checksumming, otherwise like csum_partial.
 */

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst, int len)
	a2  = src
	a3  = dst
	a4  = len
	a5  = sum
	a8  = temp
	a9  = temp
	a10 = temp

    This function is optimized for 4-byte aligned addresses.  Other
    alignments work, but not nearly as efficiently.
 */

ENTRY(csum_partial_copy_generic)

	abi_entry_default
	movi	a5, -1
	or	a10, a2, a3

	/* We optimize the following alignment tests for the 4-byte
	   aligned case.  Two bbsi.l instructions might seem more optimal
	   (commented out below).  However, both labels 5: and 3: are out
	   of the imm8 range, so the assembler relaxes them into
	   equivalent bbci.l, j combinations, which is actually
	   slower.  */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	l32i	a8, a2, 4
EX(10f)	s32i	a9, a3, 0
EX(10f)	s32i	a8, a3, 4
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 8
EX(10f)	l32i	a8, a2, 12
EX(10f)	s32i	a9, a3, 8
EX(10f)	s32i	a8, a3, 12
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 16
EX(10f)	l32i	a8, a2, 20
EX(10f)	s32i	a9, a3, 16
EX(10f)	s32i	a8, a3, 20
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
EX(10f)	l32i	a9, a2, 24
EX(10f)	l32i	a8, a2, 28
EX(10f)	s32i	a9, a3, 24
EX(10f)	s32i	a8, a3, 28
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
EX(10f)	l32i	a9, a2, 0
EX(10f)	s32i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop6
#endif
3:
	/*
	   Control comes to here in two cases: (1) It may fall through
	   to here from the 4-byte alignment case to process, at most,
	   one 2-byte chunk.  (2) It branches to here from above if
	   either src or dst is 2-byte aligned, and we process all bytes
	   here, except for perhaps a trailing odd byte.  It's
	   inefficient, so align your addresses to 4-byte boundaries.

	   a2 = src
	   a3 = dst
	   a4 = len
	   a5 = sum
	 */
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
EX(10f)	l16ui	a9, a2, 0
EX(10f)	s16i	a9, a3, 0
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop7
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
EX(10f)	l8ui	a9, a2, 0
EX(10f)	s8i	a9, a3, 0
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5
	abi_ret_default

5:
	/* Control branches to here when either src or dst is odd.  We
	   process all bytes using 8-bit accesses.  Grossly inefficient,
	   so don't feed us an odd address.  */

	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
EX(10f)	l8ui	a9, a2, 0
EX(10f)	l8ui	a8, a2, 1
EX(10f)	s8i	a9, a3, 0
EX(10f)	s8i	a8, a3, 1
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)
EXPORT_SYMBOL(csum_partial_copy_generic)
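
/*
 * Behavioral sketch (illustrative only, not assembled): on the success
 * path this routine copies len bytes from src to dst while folding each
 * copied chunk into the same ones-complement sum as csum_partial, with
 * the sum seeded to all ones (movi a5, -1 above).  If any load or store
 * faults, the fixup handler below returns 0 instead.  A hypothetical C
 * model of the aligned, word-multiple case (names are ours, and the
 * fault path is omitted):
 *
 *	unsigned int copy_and_sum32(const unsigned int *src,
 *				    unsigned int *dst, int words)
 *	{
 *		unsigned int sum = ~0U;
 *
 *		while (words--) {
 *			unsigned int v = *src++;
 *
 *			*dst++ = v;
 *			sum += v;
 *			if (sum < v)
 *				sum++;
 *		}
 *		return sum;
 *	}
 */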

# Exception handler:
.section .fixup, "ax"
10:
	movi	a2, 0
	abi_ret_default

.previous
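
/*
 * Usage note (illustrative): callers do not consume the 32-bit value
 * returned by these routines directly; generic checksum code folds it
 * down to the final 16-bit Internet checksum.  A hypothetical C fold,
 * equivalent in effect to the kernel's csum_fold():
 *
 *	unsigned short fold16(unsigned int sum)
 *	{
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		return (unsigned short)~sum;
 *	}
 */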