vperm _VRT,_VRB,_VRA,_VRC	/* variant 1: _VRT = vperm of (_VRB,_VRA) under control vector _VRC; sources swapped vs. the line below. NOTE(review): presumably one of two alternative VPERM() bodies chosen by a conditional not visible here -- confirm */
vperm _VRT,_VRA,_VRB,_VRC	/* variant 2: same permute with the sources in natural (_VRA,_VRB) order */
mflr r0; /* Call enter_vmx_ops() while preserving r3-r5 and LR; result is left in cr1. r0 = LR, which the bl below overwrites */ \
std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); /* stash r3 in the frame about to be created (same address as STK_REG(R31)(r1) after the stdu) */ \
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); /* stash r4 likewise */ \
std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); /* stash r5 likewise */ \
std r0,16(r1); /* save LR at offset 16 from the current (pre-frame) stack pointer */ \
stdu r1,-STACKFRAMESIZE(r1); /* allocate the frame: store old r1 and move r1 down by STACKFRAMESIZE */ \
bl CFUNC(enter_vmx_ops); /* call the helper; its return value arrives in r3 */ \
cmpwi cr1,r3,0; /* cr1.eq set when the helper returned 0; callers branch on cr1 afterwards */ \
ld r0,STACKFRAMESIZE+16(r1); /* reload the saved LR (offset adjusted for the new frame) */ \
ld r3,STK_REG(R31)(r1); /* restore r3, discarding the helper's return value */ \
ld r4,STK_REG(R30)(r1); /* restore r4 */ \
ld r5,STK_REG(R29)(r1); /* restore r5 */ \
addi r1,r1,STACKFRAMESIZE; /* pop the frame */ \
mtlr r0	/* put the original return address back in LR */
mflr r0; /* Call exit_vmx_ops() while preserving r3-r5 and LR; the return value is ignored. r0 = LR, which the bl below overwrites */ \
std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); /* stash r3 in the frame about to be created */ \
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); /* stash r4 likewise */ \
std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); /* stash r5 likewise */ \
std r0,16(r1); /* save LR at offset 16 from the current (pre-frame) stack pointer */ \
stdu r1,-STACKFRAMESIZE(r1); /* allocate the frame */ \
bl CFUNC(exit_vmx_ops); /* call the helper; unlike the enter sequence no condition field is set from r3 here */ \
ld r0,STACKFRAMESIZE+16(r1); /* reload the saved LR */ \
ld r3,STK_REG(R31)(r1); /* restore r3 */ \
ld r4,STK_REG(R30)(r1); /* restore r4 */ \
ld r5,STK_REG(R29)(r1); /* restore r5 */ \
addi r1,r1,STACKFRAMESIZE; /* pop the frame */ \
mtlr r0	/* put the original return address back in LR */
lvx _v2nd_qw,_vaddr,off16; /* load the quadword at _vaddr + off16 into _v2nd_qw (callers set off16 to 16 beforehand) */ \
VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)	/* _v_res = permute of the caller-supplied first quadword and the one just loaded, under _vmask; used to assemble 16 bytes that straddle a 16-byte boundary */
/*
 * memcmp
 *
 * In:   r3 = s1, r4 = s2, r5 = number of bytes to compare.
 * Out:  r3 = 0 when the ranges are equal.  Otherwise nonzero:
 *       - byte-loop path (.Lshort): (s1 byte) - (s2 byte) at the first
 *         difference;
 *       - doubleword paths: 1 if the s1 doubleword compares (unsigned)
 *         greater than the s2 one, else -1.
 *
 * Scratch used here: r0, r5, r6, CTR, cr0/cr1/cr5/cr6/cr7, v0-v10 (subset),
 * plus the aliases rA-rH and off8/off16/off24, whose register assignments
 * are defined outside this block.  r27-r31 are spilled below r1 (no frame is
 * allocated) around the 32-byte loop.
 *
 * NOTE(review): LD, LVS, VCMPEQUD_RC, VCMPEQUB_RC, ENTER_VMX_OPS,
 * EXIT_VMX_OPS, LD_VSR_CROSS16B, VMX_THRESH and the feature-section macros
 * are defined outside this block.  The shift-based masking below assumes LD
 * yields the first byte in memory as the most significant byte of the
 * register -- confirm against the LD definition.
 *
 * Handlers named .Lsameoffset_* deal with s1/s2 sharing the same offset
 * within an 8-byte word; .Ldiffoffset_* deal with differing offsets.
 */
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0		/* cr1: length == 0 ? */

	/* cr0.eq <=> s1 and s2 have the same offset within 8 bytes */
	xor	r6,r3,r4
	andi.	r6,r6,7

	cmpdi	cr6,r5,7		/* cr6: length > 7 ? */

	beq	cr1,.Lzero		/* nothing to compare */
	bgt	cr6,.Lno_short		/* 8 bytes or more: word-wise paths */

	/*
	 * Byte-at-a-time loop, unrolled four times.  CTR counts remaining
	 * bytes; each bdz/bdnz decrements it after a byte compared equal.
	 */
.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA		/* rC = s1 byte - s2 byte */
	bne	.Lnon_zero
	bdz	.Lzero
	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero
	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero
	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	addi	r3,r3,4
	addi	r4,r4,4
	bdnz	1b

	/* All bytes equal (also reached by fall-through from the loop). */
.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3			/* prefetch hints for both buffers */
	dcbt	0,r4
	/* cr0 is still the same-offset test from function entry */
	bne	.Ldiffoffset_8bytes_make_align_start

	/*
	 * Same offset: compare the leading bytes up to the next 8-byte
	 * boundary so the remainder runs on aligned doublewords.
	 */
.Lsameoffset_8bytes_make_align_start:
	andi.	r6,r3,7			/* cr0.eq <=> s1 already 8-byte aligned */
	rlwinm	r6,r3,3,26,28		/* r6 = (s1 & 7) * 8 = offset in bits; cr0 untouched */
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3			/* round both pointers down to 8 bytes */
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6		/* shift out the bytes that precede the start */
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3			/* back to a byte offset */
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8			/* r6 = bytes just compared = 8 - offset */
	subf.	r5,r6,r5		/* remaining length; cr0 feeds the beq below */
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* 32 bytes or more go to the unrolled loop */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

	/* 1..31 bytes left; compare whole doublewords first */
.Lcmp_lt32bytes:
	cmpdi	cr5,r5,7
	srdi	r0,r5,3			/* r0 = number of whole doublewords */
	ble	cr5,.Lcmp_rest_lt8bytes

	clrldi	r5,r5,61		/* r5 = length & 7 (tail bytes) */
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

	/*
	 * Fewer than 8 bytes left.  A full 8-byte load is used and the
	 * excess bytes are shifted out, unless such a load from r4 would
	 * cross a 4 KiB boundary (offset within 4 KiB > 0xff8); in that
	 * case fall back to the byte loop so nothing beyond the buffer's
	 * 4 KiB block is touched.
	 */
.Lcmp_rest_lt8bytes:
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3			/* r6 = (8 - remaining) * 8 bits to discard */
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6		/* drop the bytes past the end */
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

	/* Byte loop found a difference: return it. */
.Lnon_zero:
	mr	r3,rC
	blr

.Llong:
	/*
	 * Inside the feature section (keyed on CPU_FTR_ARCH_207S): lengths
	 * of at least VMX_THRESH take the vector path.
	 */
BEGIN_FTR_SECTION
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

	/*
	 * Scalar loop comparing 32 bytes per iteration.  Loads for the next
	 * block are interleaved with compares of the previous one; the four
	 * doubleword pairs use cr0 (A/B), cr1 (C/D), cr6 (E/F), cr7 (G/H).
	 */
.Llong_novmx_cmp:
	li	off8,8
	li	off16,16
	li	off24,24

	/* spill r27-r31 below the stack pointer; no frame is allocated */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5			/* r0 = number of 32-byte blocks */
	mtctr	r0
	andi.	r5,r5,31		/* r5 = leftover bytes for .Ltail */

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32		/* only one block: just finish its compares */

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

	/* Drain: finish the compares still pending from the last loads. */
.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	/* All whole blocks equal: restore r27-r31, then handle leftovers. */
.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

	/* Exactly one 32-byte block was loaded: compare the last three pairs. */
.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

	/*
	 * Mismatch exits for the 32-byte loop: return 1 if the s1
	 * doubleword was greater in the relevant CR field, otherwise -1.
	 */
.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

	/* Restore the spilled r27-r31 and return. */
.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

	/*
	 * Mismatch exit for paths that did not spill r27-r31: cr0 holds the
	 * compare of rA against rB; return 1 if greater, else -1.
	 */
.LcmpAB_lightweight:
	li	r3,1
	bgtlr
	li	r3,-1
	blr

	/*
	 * Vector path, same 8-byte offset.  First compare 4 doublewords
	 * (32 bytes) with scalar code, so an early difference is found
	 * before ENTER_VMX_OPS is invoked.
	 */
.Lsameoffset_vmx_cmp:
	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	/* cr1.eq after ENTER_VMX_OPS: fall back to the scalar 32-byte loop */
	beq	cr1,.Llong_novmx_cmp

3:
	/* if s1/s2 differ in their offset within 16 bytes, use the permute path */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* compare one doubleword if needed to reach 16-byte alignment */
	andi.	rA,r3,8			/* cr0.eq <=> bit 3 of s1 clear */
	LD	rA,0,r3			/* load does not disturb cr0 */
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/*
	 * Mismatch: carry the cr0 field across EXIT_VMX_OPS in r5 (field
	 * mask 128 selects cr0); relies on EXIT_VMX_OPS preserving r5.
	 */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* vector loop: 32 bytes per iteration, as two 16-byte compares */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59		/* r5 = length & 31 */
	li	off16,16

	.balign	16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f			/* cr6 "lt" clear: first 16 bytes differ */
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f			/* second 16 bytes differ */
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	/* difference was in the second quadword: step to it */
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* re-compare the differing 16 bytes as two doublewords */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

	/*
	 * Different offsets: align s1 to 8 bytes; s2 is then read with
	 * unaligned loads.
	 *
	 * cr0 on entry still holds the "offsets differ" result from function
	 * entry (always "ne" here), and rlwinm without a record bit does not
	 * update it.  Test s1's alignment explicitly, as the same-offset
	 * block does, so that the beq below is taken when s1 is already
	 * aligned.  r6 is overwritten by the rlwinm right after.
	 */
.Ldiffoffset_8bytes_make_align_start:
	andi.	r6,r3,7			/* cr0.eq <=> s1 already 8-byte aligned */
	rlwinm	r6,r3,3,26,28		/* r6 = (s1 & 7) * 8 = offset in bits */
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4			/* s2 load starts at the unrounded address */
	sld	rA,rA,r6		/* s1: clear the bytes preceding the start ... */
	srd	rA,rA,r6		/* ... and move the rest back down */
	srd	rB,rB,r6		/* s2: keep only the first (8 - offset) bytes */
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8			/* r6 = bytes just compared */
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* s1 is 8-byte aligned from here on */
BEGIN_FTR_SECTION
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

	/*
	 * NOTE(review): the second branch below is unreachable as written;
	 * the two look like alternatives of a conditional build whose
	 * directives are not visible here -- confirm.
	 */
	b	.Llong_novmx_cmp
	b	.Llong

	/* Vector path, different offsets: scalar pre-check of 32 bytes first. */
.Ldiffoffset_vmx_cmp:
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	/* cr1.eq after ENTER_VMX_OPS: use the scalar code instead */
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* first bring s1 to a 16-byte boundary */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3			/* permute controls for s1 and s2 */
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)		/* v9  = 16 bytes starting at s1 */
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)	/* v10 = 16 bytes starting at s2 */

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16		/* r6 = bytes up to s1's next 16-byte boundary */
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* s1 is 16-byte aligned; s2 is assembled from two quadwords */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5			/* 32-byte iterations */
	clrldi	r5,r5,59		/* r5 = length & 31 */
	mtctr	r6

	.balign	16
.Ldiffoffset_vmx_32bytesloop:
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8		/* reuse the 2nd quadword as next step's 1st */
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* redo the current 16 bytes with the scalar code to get the result */
	li	r5,16
	b	.Lcmp_lt32bytes

EXPORT_SYMBOL(memcmp)