/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <[email protected]>
 * Copyright 2015 IBM Corporation.
 */
#include <linux/export.h>
#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

#define VMX_THRESH 4096
#define ENTER_VMX_OPS	\
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	CFUNC(enter_vmx_ops); \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	CFUNC(exit_vmx_ops); \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                 ^
 *                                 _vaddr
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset from the 8-byte boundary. The handlers
 *    are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets from the 8-byte boundary. The handlers
 *    are named like .Ldiffoffset_xxxx
 */
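/*
 * Illustrative C sketch of the entry dispatch below. This is not generated
 * code, and byte_loop()/sameoffset()/diffoffset() are just names for the
 * labelled assembly paths in this file, not real functions:
 *
 *	int memcmp(const void *s1, const void *s2, size_t n)
 *	{
 *		if (n == 0)
 *			return 0;				// .Lzero
 *		if (n <= 7)
 *			return byte_loop(s1, s2, n);		// .Lshort
 *		if (((unsigned long)s1 ^ (unsigned long)s2) & 7)
 *			return diffoffset(s1, s2, n);		// .Ldiffoffset_*
 *		return sameoffset(s1, s2, n);			// .Lsameoffset_*
 *	}
 */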
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Check (in cr0) whether the src/dst addresses have the same
	 * offset from an 8-byte alignment boundary; the result is
	 * consumed at .Lno_short below.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Use the byte-by-byte short loop when comparing fewer than
	 * 8 bytes.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start

.Lsameoffset_8bytes_make_align_start:
	/* Compare the leading bytes that are not 8-byte aligned so that
	 * the rest of the comparison can run on 8-byte aligned data.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word which is not 8 bytes aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate bits before the comparison.
	 */
	rlwinm	r6,r3,3,26,28	/* r6 = (r3 & 7) << 3 */
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are aligned to 8 bytes.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61	/* r5 = r5 & 7 */
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

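/*
 * The 32-bytes-per-iteration loop below is software pipelined: rA..rH hold
 * four doubleword pairs, and the four compares are spread across cr0, cr1,
 * cr6 and cr7 so that the loads for the next iteration can be issued before
 * the current results are consumed.  .Lfirst32 and .Lsecond32 drain the
 * pipeline when the counter runs out after the first or second load block,
 * and .Ltail restores the non-volatile GPRs and falls back to .Lshort for
 * the remaining 0-31 bytes.
 */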
.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use the vmx loop if the length is 4K or more */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least the s1 addr is aligned to 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31	/* r5 = bytes left after the 32B loop */

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip the NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter with src/dst addrs that have the same offset from an
	 * 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before using VMX instructions, which incur the penalty of
	 * saving/restoring 32 x 128-bit VMX registers, compare the first
	 * 32 bytes so that the ~80% of calls that fail early are caught.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

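	/*
	 * ENTER_VMX_OPS compares the return value of enter_vmx_ops()
	 * against zero in cr1; a zero return means VMX cannot be used in
	 * the current context, so the beq above falls back to the integer
	 * .Llong_novmx_cmp path.  Otherwise the code below aligns r3 to
	 * 16 bytes and then compares 32 bytes per iteration with
	 * lvx/vcmpequd.
	 */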
3:
	/* need to check whether r4 has the same offset as r3
	 * from a 16-byte boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is at least 4KB. Need to further align to 16 bytes.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes for each loop */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59	/* r5 = r5 & 31 */
	li	off16,16

.balign	16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* diff the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to 8 bytes */
	rlwinm	r6,r3,3,26,28	/* r6 = (r3 & 7) << 3 */
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4		/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned to 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is 4K bytes or more */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif

	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before
	 * enabling VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned to 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5		/* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* in any case, the difference lies within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)