/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas ([email protected])
 *
 * Severely hacked about by Paul Mackerras ([email protected]).
 */

#include <linux/export.h>
#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)
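
/*
 * For reference, a minimal C sketch of the final fold in .Lcsum_finish
 * above, using kernel-style u32/u64 types. This is an illustrative
 * equivalent, not code taken from the kernel's generic checksum
 * implementation, and the helper name is made up:
 *
 *	static u32 csum_fold64_sketch(u64 sum)
 *	{
 *		u64 t = (sum & 0xffffffffULL) + (sum >> 32);
 *
 *		return (u32)t + (u32)(t >> 32);	// add back the end-around carry
 *	}
 *
 * The rldicl/add/srdi sequence computes the same value in three
 * instructions: rotating by 32 and adding sums the two halves into the
 * upper word, with the carry out of the lower word folded in for free.
 */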

.macro srcnr
100:
	EX_TABLE(100b,.Lerror_nr)
.endm

.macro source
150:
	EX_TABLE(150b,.Lerror)
.endm

.macro dstnr
200:
	EX_TABLE(200b,.Lerror_nr)
.endm

.macro dest
250:
	EX_TABLE(250b,.Lerror)
.endm
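
/*
 * Each macro above plants a numbered local label on the memory access
 * that follows it and registers that label with EX_TABLE(), so a fault
 * on that access branches to the fixup code instead of oopsing.
 * "source" and "dest" are used inside the unrolled copy loop, where
 * r14-r16 have been saved on the stack, so their fixup (.Lerror)
 * restores those registers and the stack frame before failing;
 * "srcnr" and "dstnr" ("no restore") are used outside it and branch
 * straight to .Lerror_nr, which just returns 0.
 */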

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
_GLOBAL(csum_partial_copy_generic)
	li	r6,-1
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
#ifdef __BIG_ENDIAN__
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
#else
	adde	r0,r0,r6
#endif
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lerror:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lerror_nr:
	li	r3,0
	blr

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 */

_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)
	ld	r9, 8(r3)
	add	r5, r5, r6
	addc	r0, r8, r9
	ld	r10, 0(r4)
	ld	r11, 8(r4)
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	rotldi	r5, r5, 8
#endif
	adde	r0, r0, r10
	add	r5, r5, r7
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0
	rotldi	r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32
	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3
	not	r3, r3
	rlwinm	r3, r3, 16, 16, 31
	blr
EXPORT_SYMBOL(csum_ipv6_magic)
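
/*
 * For reference, an illustrative C sketch of the fold-and-complement
 * sequence at the end of csum_ipv6_magic above (the rotldi/add/srdi
 * and rotlwi/add/not/rlwinm groups), using kernel-style u16/u32/u64
 * types. It is not taken from the kernel's generic code and the helper
 * name is made up:
 *
 *	static u16 ipv6_csum_fold_sketch(u64 sum)
 *	{
 *		sum = (sum & 0xffffffffULL) + (sum >> 32);
 *		sum = (u32)sum + (u32)(sum >> 32);	// 64 -> 32 bits
 *		sum = (sum & 0xffff) + (sum >> 16);
 *		sum = (sum & 0xffff) + (sum >> 16);	// 32 -> 16 bits
 *		return (u16)~sum;			// complement
 *	}
 *
 * The accumulation of saddr, daddr, len, proto and the incoming sum is
 * the straight-line addc/adde sequence above and is not repeated here.
 */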