Path: blob/master/lib/crc/powerpc/crc-vpmsum-template.S
26289 views
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME
 * Then include this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in this 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * https://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <[email protected]>, IBM
 */

#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define MAX_SIZE	32768

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

/* Symbolic names for the GPRs that hold constant lvx offsets throughout */
#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

#define const1		v24
#define const2		v25

#define byteswap	v26
#define mask_32bit	v27
#define mask_64bit	v28
#define zeroes		v29

/* VPERM is only needed when the data endianness doesn't match REFLECT */
#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm	A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC_FUNCTION_NAME)
	/*
	 * Save the non-volatile GPRs we use below the stack pointer.
	 * NOTE(review): presumably relies on the ELFv1/ELFv2 ABI's protected
	 * zone below r1 since no stack frame is established — confirm.
	 */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0		/* r0 = 0: first pass, v16-v23 not yet live */

	/* Enough room for saving 10 non volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	mr	r10,r3		/* preserve the initial CRC for the len==0 case */

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	LOAD_REG_ADDR(r3, .byteswap_constant)
	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	rldicr	r6,r5,0,56	/* r6 = len rounded down to a multiple of 128 */

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6

2:	subf	r6,r7,r6

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	LOAD_REG_ADDR(r3, .constants)

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0		/* no-op; NOTE(review): looks like a dispatch-group spacer — confirm */

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3	/* const for the second half of this iteration */
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	/* r0 = 1: the next MAX_SIZE block reuses v16-v23 as its seed */
	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	LOAD_REG_ADDR(r3, .barrett_constants)

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif

	vand	v0,v0,mask_64bit
#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
	 * the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	 */
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)	/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits of */
#endif

	/* Get it into r3 */
	MFVRD(R3, v0)

.Lout:
	/* Restore the saved non-volatile VMX and GPR registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	/* Exactly one 128-byte block: multiply it and go straight to the xor */
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	/* len < 256: process 16-byte chunks against .short_constants */
	cmpdi	r5,0
	beq	.Lzero

	LOAD_REG_ADDR(r3, .short_constants)

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	/* v19/v20 accumulate alternate chunks to break the dependency chain */
	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

	/* Fall through the label ladder, xoring only the chunks we produced */
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	/* len == 0: return the initial CRC unchanged */
	mr	r3,r10
	b	.Lout

FUNC_END(CRC_FUNCTION_NAME)