/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas ([email protected])
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras ([email protected]).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in 32-bit words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2	/* first two words consumed above */
	addc	r0,r0,r5
	mtctr	r4
	blelr-			/* len was <= 2 words; nothing left to add */
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Compute checksum of TCP or UDP pseudo-header:
 *   csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 *
 * There is no real gain in doing this specially for 64 bit, but the
 * 32-bit additions may spill into the upper bits of the doubleword,
 * so we still have to fold the result down from 64 bits.
 */
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
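
/*
 * For illustration only (not part of the build): a C sketch of the
 * fold-and-complement sequence both routines above end with.  The
 * function name is hypothetical and a 64-bit unsigned long is assumed.
 *
 *	static inline unsigned int fold64(unsigned long sum)
 *	{
 *		// rldicl ...,32,0 rotates the doubleword by 32 bits
 *		unsigned long rot = (sum << 32) | (sum >> 32);
 *
 *		// add + srdi 32: the high word now holds lo + hi plus
 *		// the carry out of the low-word add (end-around carry)
 *		unsigned int s = (unsigned int)((sum + rot) >> 32);
 *
 *		// rlwinm ...,16,0,31 repeats the trick at 16-bit width
 *		unsigned int r = s + ((s << 16) | (s >> 16));
 *
 *		// not + srwi 16: complement, keep the folded 16 bits
 *		return (~r) >> 16;
 *	}
 */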
#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6		/* halfwords needed to reach alignment */
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back addes take 2 cycles because
	 * of the XER dependency. This means the fastest this loop can go
	 * is 16 cycles per iteration. The scheduling of the loop below
	 * has been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	/* exit limb: the first half of the final 64-byte chunk was
	 * already loaded by the last loop iteration */
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
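
/*
 * For illustration only (not part of the build): a hedged C model of
 * what csum_partial computes, assuming a 64-bit big-endian machine
 * like this file's target.  The adde chain above defers carries in
 * the XER; the model folds each carry in immediately, which yields
 * the same 1's complement sum.  All names are hypothetical.
 *
 *	#include <string.h>
 *
 *	static unsigned long csum_add(unsigned long sum, unsigned long v)
 *	{
 *		sum += v;
 *		return sum + (sum < v);	// end-around carry, like adde
 *	}
 *
 *	static unsigned long csum_partial_model(const unsigned char *p,
 *						unsigned long len,
 *						unsigned long sum)
 *	{
 *		unsigned long v;
 *		unsigned int w;
 *
 *		while (len >= 8) {		// the ld/adde loops
 *			memcpy(&v, p, 8);
 *			sum = csum_add(sum, v);
 *			p += 8;
 *			len -= 8;
 *		}
 *		if (len >= 4) {			// lwz tail
 *			memcpy(&w, p, 4);
 *			sum = csum_add(sum, w);
 *			p += 4;
 *			len -= 4;
 *		}
 *		if (len >= 2) {			// lhz tail
 *			sum = csum_add(sum, (p[0] << 8) | p[1]);
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len)			// lbz tail, pad to 16 bits
 *			sum = csum_add(sum, (unsigned long)p[0] << 8);
 *
 *		// final 64 -> 32 fold, as at .Lcsum_finish
 *		return (unsigned int)
 *			((sum + ((sum << 32) | (sum >> 32))) >> 32);
 *	}
 */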

/*
 * The source/dest macros tag the load or store that follows them with
 * an __ex_table entry, so a fault in that instruction branches to
 * .Lsrc_error or .Ldest_error instead of oopsing.
 */
	.macro source
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error
	.previous
	.endm

	.macro dest
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating the partial
 * checksum, etc.).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcopy_aligned

	li	r7,4
	sub	r6,r7,r6		/* halfwords needed to reach alignment */
	mtctr	r6

1:
source;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back-to-back addes take 2 cycles because
	 * of the XER dependency. This means the fastest this loop can go
	 * is 16 cycles per iteration. The scheduling of the loop below
	 * has been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	/* exit limb: the first half of the final 64-byte chunk was
	 * already loaded by the last loop iteration */
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
source;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dest;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

source;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dest;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

source;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

source;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dest;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	cmpdi	0,r7,0			/* was a src_err pointer supplied? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	cmpdi	0,r8,0			/* was a dst_err pointer supplied? */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
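
/*
 * For illustration only (not part of the build): a hedged sketch of a
 * caller honouring the error contract above.  The C prototype is
 * inferred from the register comment (r3..r8); treat the names, types
 * and recovery policy shown here as assumptions, not the kernel API.
 *
 *	int src_err = 0, dst_err = 0;
 *	unsigned int csum;
 *
 *	csum = csum_partial_copy_generic(src, dst, len, sum,
 *					 &src_err, &dst_err);
 *	if (src_err) {
 *		// fault while reading src: as the comment above says,
 *		// the caller chooses the recovery, e.g. zero what was
 *		// copied and report the fault
 *		memset(dst, 0, len);
 *		return -EFAULT;
 *	}
 *	if (dst_err)
 *		return -EFAULT;
 */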