.weak strlcpy
.set strlcpy, __strlcpy
.text
/*
 * size_t __strlcpy(char *dst, const char *src, size_t size)
 *
 * Copy up to size-1 bytes from src to dst, always NUL-terminating dst
 * when size > 0, and return strlen(src) (so truncation can be detected
 * by the caller as return >= size).
 *
 * ABI:	AAPCS64.  In: x0 = dst, x1 = src, x2 = size.  Out: x0 = strlen(src).
 * Clobbers: x4-x8, x10-x12, x16, x17, v1-v3, flags (all caller-saved).
 *
 * Strategy: process src in 16-byte chunks aligned to its own address
 * (reading a whole aligned chunk never faults, as it cannot cross a page
 * boundary).  NUL bytes are located by cmeq + shrn #4, which compresses
 * the per-byte match mask into one nibble per byte in a 64-bit register;
 * rbit + clz then emulates x86 tzcnt to find the first match.
 *
 * Register roles in the bulk path:
 *   x9  = saved dst		x10 = aligned src chunk cursor
 *   x11 = src % 16		x2  = remaining space (excl. terminator)
 *   x7  = stashed NUL mask for the trailing strlen pass
 *
 * NOTE(review): this file had truncated trailing operands throughout;
 * all immediates below were reconstructed from the algorithm's
 * invariants — verify against the upstream FreeBSD strlcpy.S.
 */
ENTRY(__strlcpy)
	subs	x2, x2, #1		// reserve space for the NUL terminator
	b.lo	.L0			// size == 0: just return strlen(src)
	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src rounded down to 16-byte chunk
	and	x11, x1, #0xf		// offset of src within its chunk

	/* check for a NUL in the part of the first chunk inside the string */
	ldr	q1, [x10]
	cmeq	v1.16b, v1.16b, #0	// per-byte NUL match mask
	mov	x8, #-1
	lsl	x12, x11, #2		// 4 mask bits per byte after shrn #4
	lsl	x8, x8, x12		// mask of bytes in the string
	shrn	v1.8b, v1.8h, #4	// compress match mask to nibbles
	fmov	x5, d1
	ands	x5, x5, x8		// NUL inside the string's head?
	b.ne	.Lhead_nul

	ldr	q3, [x10, #16]		// load second chunk
	ldr	q2, [x1]		// load true head
	mov	x8, #32
	sub	x8, x8, x11		// string bytes in the first two chunks
	cmeq	v1.16b, v3.16b, #0	// NUL byte in second chunk?
	subs	x2, x2, x8		// enough space for both chunks?
	b.ls	.Lhead_buf_end
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1
	cbnz	x5, .Lsecond_nul

	ldr	q1, [x10, #32]		// load third chunk
	str	q2, [x0]		// deposit head into buffer
	sub	x0, x0, x11		// move dst into chunk coordinates
	str	q3, [x0, #16]		// deposit second chunk
	add	x10, x10, #32		// advance src to third chunk
	add	x0, x0, #32		// advance dst to third chunk
	subs	x2, x2, #16		// bias: x2 = space from [x10] minus 16
	b.ls	1f

	/*
	 * Main loop, two chunks per iteration.
	 * Invariant at 0: q1 = chunk at [x10], its dst is x0,
	 * x2 = free space starting at that chunk, minus 16.
	 */
	.p2align 4
0:	cmeq	v2.16b, v1.16b, #0	// NUL byte present?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 3f
	str	q1, [x0]		// deposit chunk
	ldr	q1, [x10, #16]		// load next chunk
	cmp	x2, #16			// buffer ends within the next chunk?
	b.ls	2f
	add	x10, x10, #32
	add	x0, x0, #32
	cmeq	v2.16b, v1.16b, #0	// NUL byte present?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match
	str	q1, [x0, #-16]		// deposit chunk
	ldr	q1, [x10]		// load next chunk
	subs	x2, x2, #32		// account for the two chunks stored
	b.hi	0b

1:	sub	x10, x10, #16		// roll back so chunk is at [x10+16]
	add	x2, x2, #16		// undo the loop-entry bias
	sub	x0, x0, #16

	/*
	 * Buffer ends in chunk [x10+16]; q1 holds that chunk, its dst is
	 * x0+16, and x2 (1..16) is the free space left excl. terminator.
	 */
2:	cmeq	v2.16b, v1.16b, #0	// NUL byte present?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2
	mov	x6, #0xf
	mov	x7, x4			// stash NUL mask for the strlen pass
	lsl	x5, x2, #2		// nibble position of the limit
	lsl	x5, x6, x5		// 0xf at the limit (shift is mod 64!)
	cmp	x2, #16			// limit within this chunk?
	csel	x5, x5, xzr, lo		// drop limit marker if not (x2 == 16)
	orr	x8, x4, x5		// treat limit as if terminator present
	rbit	x8, x8			// simulate x86 tzcnt
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2		// i = min(NUL index, limit) in chunk
	add	x0, x0, x8
	ldr	q1, [x10, x8]		// 16 string bytes ending at the cut
	str	q1, [x0]		// overlapping store of the tail
	strb	wzr, [x0, #16]		// write NUL terminator at the cut
	cbnz	x7, 1f			// NUL in this chunk: length is known

	/* keep scanning for the NUL to compute the return value */
	.p2align 4
0:	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, 2f
	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, 0b
1:	sub	x10, x10, #16		// NUL is in [x10+16], not [x10+32]
2:	rbit	x8, x7
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2
	sub	x10, x10, x1
	add	x0, x10, #32		// NUL is at src + x10 + 32 + x8
	add	x0, x0, x8		// return strlen(src)
	ret

	/* NUL found in the main loop */
4:	sub	x10, x10, #16		// roll back the mid-loop advancement
	sub	x0, x0, #16
3:	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2
	add	x0, x0, x8		// x0 = &dst[len], location of the NUL
	add	x10, x10, x8		// x10 = address of NUL in src
	ldur	q1, [x10, #-15]		// last 16 bytes of string incl. NUL
	stur	q1, [x0, #-15]		// overlapping store of the tail
	sub	x0, x10, x1		// return strlen(src)
	ret

	/* buffer ends within the string's first two chunks */
.Lhead_buf_end:
	shrn	v1.8b, v1.8h, #4	// compress second-chunk NUL mask
	fmov	x8, d1
	add	x2, x2, #32		// limit, in chunk coordinates
	mov	x7, x8			// stash NUL mask for the strlen pass
	rbit	x8, x8			// simulate x86 tzcnt
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2
	add	x8, x8, #16		// NUL index in chunk coords (32 if none)
	cmp	x8, x2
	csel	x8, x8, x2, lo		// copy min(buflen, srclen) bytes
	sub	x8, x8, x11		// cnt: number of bytes to copy
	strb	wzr, [x9, x8]		// write NUL terminator
	cbnz	x7, 1f			// NUL in second chunk: length known

	/* keep scanning for the NUL to compute the return value */
	.p2align 4
0:	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, 2f
	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, 0b
1:	sub	x10, x10, #16		// NUL is in [x10+16], not [x10+32]
2:	rbit	x6, x7
	clz	x6, x6			// index of mismatch
	lsr	x6, x6, #2
	sub	x10, x10, x1
	add	x0, x10, #32		// NUL is at src + x10 + 32 + x6
	add	x0, x0, x6		// return strlen(src)
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	b	.L1732

	/* NUL in the second chunk, buffer large enough for both chunks */
.Lsecond_nul:
	add	x2, x2, x8		// restore limit (x8 is still 32 - x11)
	rbit	x8, x5			// simulate x86 tzcnt
	clz	x8, x8			// index of mismatch
	lsr	x5, x8, #2		// NUL index within the second chunk
	sub	x8, x11, #16
	sub	x0, x5, x8		// string length: 16 - x11 + index
	cmp	x0, x2			// did we match or hit limit first?
	csel	x8, x2, x0, hi		// cnt = min(limit, strlen)
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	strb	wzr, [x4]		// write NUL terminator

	/* copy 16--32 bytes: two overlapping 16-byte pairs */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	ldp	x16, x17, [x1]
	ldp	x12, x1, [x5, #-16]
	stp	x16, x17, [x9]
	stp	x12, x1, [x4, #-16]
	ret

	/* NUL byte found within the head of the string */
.Lhead_nul:
	rbit	x8, x5			// simulate x86 tzcnt
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2
	sub	x0, x8, x11		// string length
	cmp	x0, x2			// did we match or hit limit first?
	csel	x8, x2, x0, hi		// cnt = min(limit, strlen)
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	strb	wzr, [x4]		// write NUL terminator

	/* copy 8--15 bytes: two overlapping 8-byte words */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-8]
	str	x16, [x9]
	str	x17, [x4, #-8]
	ret

	/* copy 4--7 bytes: two overlapping 4-byte words */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.ls	.L0203
	ldr	w16, [x1]
	ldr	w17, [x5, #-4]	// w17 used instead of platform reg w18
	str	w16, [x9]
	str	w17, [x4, #-4]
	ret

	/* copy 2--3 bytes: two overlapping halfwords */
.L0203:
	tbz	x8, 1, .L0001
	ldrh	w16, [x1]
	ldrh	w17, [x5, #-2]
	strh	w16, [x9]
	strh	w17, [x4, #-2]
	ret

	/* copy 0--1 bytes (size >= 1 here, so dst[0] is writable) */
.L0001:
	ldrb	w16, [x1]
	strb	w16, [x9]
	strb	wzr, [x4]		// terminator overwrites byte if cnt == 0
	ret

	/* size == 0: nothing may be written; return strlen(src) */
.L0:
	mov	x0, x1
	b	strlen			// tail call
END(__strlcpy)