// Declare memccpy as a weak symbol and define it as an alias of __memccpy,
// so the routine below is reachable under both names.
.weak memccpy
.set memccpy, __memccpy
.text
// __memccpy -- register inputs read below: x0, x1, w2, x3.
// Presumably memccpy(dst = x0, src = x1, c = w2, len = x3): x1 is only
// loaded from, x0/x9 are only stored to, and w2 is the byte searched for.
// Roles set up here: v0 = search byte in every lane, x9 = original x0,
// x10 = x1 with the bic mask bits cleared, x11 = x1 masked by the and,
// x6/x8 = mask constants, x5 = match mask of the first chunk.
// NOTE(review): several instructions in this copy end in a dangling comma,
// i.e. their last (immediate) operand is missing.  They are reproduced
// unchanged; restore the operands from the upstream file before assembling.
ENTRY(__memccpy)
// Adjust the length (immediate missing; presumably #1 -- TODO confirm) and
// leave through .L0 if the subtraction borrows.
subs x3, x3,
b.lo .L0
dup v0.16b, w2
mov x9, x0 // stash copy of dst pointer (x0 itself is advanced below)
// Split x1 into a base (x10) and an offset (x11); both masks are missing,
// presumably #0xf for a 16-byte chunk -- TODO confirm.
bic x10, x1,
and x11, x1,
ldr q1, [x10]
cmeq v1.16b, v1.16b, v0.16b // compare every byte against the search char
mov x8,
mov x6,
lsl x12, x11,
lsl x8, x8, x12 // mask of bytes in the string
// Narrow the 128-bit compare result and move it into x5 as a 64-bit mask.
shrn v1.8b, v1.8h,
fmov x5, d1
sub x12, x11,
adds x12, x12, x3 // distance from alignment boundary - 32
b.cc .Lrunt // branch if buffer length is 32 or less
// Keep only the match bits selected by the string mask in x8.
ands x8, x8, x5
b.eq 0f
// Match in the first chunk: rbit + clz yields the lowest set bit.
rbit x8, x8
clz x8, x8 // bit index of first match
lsr x8, x8,
sub x8, x8, x11 // ... from beginning of the string
add x0, x0, x8
add x4, x9, x8 // dst + cnt
add x5, x1, x8 // src + cnt
add x0, x0,
b .L0816
// No usable match in the first chunk: test the next chunk at x10 + offset.
// NOTE(review): operands after the dangling commas are missing in this
// copy; the lines are kept verbatim.
0:
ldr q3, [x10,
ldr q2, [x1] // load true head
cmeq v1.16b, v3.16b, v0.16b // char found in second chunk?
shrn v1.8b, v1.8h,
fmov x5, d1
cbz x5, 0f
// Match in the second chunk: convert the lowest set mask bit to an index,
// rebase it on the start of the string, and finish in .L1732.
rbit x8, x5
clz x8, x8 // bit index of first match
lsr x8, x8,
sub x11, x11,
sub x8, x8, x11 // adjust for alignment offset
add x0, x0, x8 // return value
add x0, x0,
add x4, x9, x8
add x5, x1, x8
b .L1732
// No match in the first two chunks: store them and set up the main loop.
// q2 (the head loaded from x1) goes to x0 as-is; x0 is then moved back by
// x11 before q3 is stored relative to it.
// NOTE(review): operands after the dangling commas are missing in this
// copy; the lines are kept verbatim.
0:
ldr q1, [x10,
str q2, [x0] // deposit head into buffer
sub x0, x0, x11 // adjust x0
mov x3, x12
str q3, [x0,
add x10, x10,
add x0, x0,
subs x3, x3,
b.lo 1f
.p2align 4
// Main loop, unrolled twice.  Each half tests the chunk held in q1 for the
// search byte (leaving to 3f / 4f on a hit), stores it, and loads the next
// chunk.  x3 is the running length counter; it decides when to leave for
// the limit handling at 2f / 1f.
0:
cmeq v2.16b, v1.16b, v0.16b // search char present in this chunk?
shrn v2.8b, v2.8h,
fmov x5, d2
cbnz x5, 3f
str q1, [x0]
ldr q1, [x10,
cmp x3,
b.lo 2f
add x10, x10,
add x0, x0,
cmeq v2.16b, v1.16b, v0.16b // search char present in this chunk?
shrn v2.8b, v2.8h,
fmov x5, d2
cbnz x5, 4f // process chunk if match
str q1, [x0,
ldr q1, [x10] // load next chunk
subs x3, x3,
b.hs 0b
// Counter ran out: step x10 / x0 back and x3 forward again (amounts
// missing) and fall through into the limit handling at label 2.
1:
sub x10, x10,
add x3, x3,
sub x0, x0,
// The length limit falls inside the chunk held in q1.  Build the match mask
// (x4), OR in an artificial match at the limit position derived from x3,
// and locate whichever comes first.  One final 16-byte vector is copied;
// the result is a pointer if the hit was a real match, otherwise NULL.
// NOTE(review): operands after the dangling commas are missing in this
// copy; the lines are kept verbatim.
2:
cmeq v2.16b, v1.16b, v0.16b // search char present in this chunk?
shrn v2.8b, v2.8h,
fmov x4, d2
lsl x5, x3,
lsl x5, x6, x5
orr x8, x4, x5 // insert match in mask at limit
rbit x8, x8 // simulate x86 tzcnt
clz x7, x8 // bit index of first match or of the limit
lsr x8, x7,
lsl x5, x6, x7 // simulate x86 bt with shifted 0xf
add x8, x8,
add x0, x0, x8
ldr q1, [x10, x8] // load tail
str q1, [x0] // store tail
add x0, x0,
tst x4, x5 // terminator encountered inside buffer?
csel x0, x0, xzr, ne // if yes, return pointer, else NULL
ret
// Search byte found inside the main loop.  Entry 4 (taken from the second
// loop half) first steps x10 and x0 back, presumably so both entries reach
// label 3 with equivalent pointers -- confirm once the operands are restored.
// NOTE(review): operands after the dangling commas are missing in this
// copy; the lines are kept verbatim.
4:
sub x10, x10,
sub x0, x0,
// x5 = match mask of the current chunk (non-zero here).
3:
rbit x8, x5
clz x8, x8 // bit index of first match
lsr x3, x8,
add x0, x0, x3 // restore dst pointer
add x10, x10, x3
// Copy one final 16-byte vector relative to the match position, then
// adjust x0 to form the return value.
ldr q1, [x10,
str q1, [x0,
add x0, x0,
ret
// Short-buffer path, taken from the entry code when the adds on x12 did not
// carry ("buffer length is 32 or less").  Live on entry: x5 = match mask of
// the first chunk, x8 = mask of bytes in the string, x11 = source offset,
// x12 = distance value computed at entry, x3 = adjusted length, x6 = mask
// constant, x9 = original x0, x10 = chunk base.
// Falls through into .L1732 with x8 rebased on the start of the string,
// x4 = x9 + x8, x5 = x1 + x8 and x0 = return value.
// NOTE(review): operands after the dangling commas are missing in this
// copy; the lines are kept verbatim.
.Lrunt:
add x13, x11, x3
mov x7, x5 // keep a copy of original match mask
lsl x4, x12,
lsl x4, x6, x4
// Keep the limit bit only when x13 is below the (missing) bound.
cmp x13,
csel x4, x4, xzr, lo
orr x5, x5, x4 // insert match in mask at limit
ands x8, x8, x5 // if match always fall through
b.ne 0f
// Neither match nor limit in the first chunk: examine the following one.
ldr q4, [x10,
cmeq v1.16b, v4.16b, v0.16b // char found in second chunk?
shrn v1.8b, v1.8h,
fmov x8, d1
mov x7, x8
lsl x4, x12,
lsl x4, x6, x4
orr x8, x8, x4 // induce match in upper bytes of mask
rbit x8, x8
clz x4, x8 // bit index of first match or of the limit
lsr x8, x4,
add x8, x8,
b 1f
0:
rbit x8, x8
clz x4, x8 // bit index of first match or of the limit
lsr x8, x4,
// Common exit: x8 = index within the chunk data, x4 = bit index, x7 = mask
// of real matches (without the artificial limit bit).
1:
add x0, x0, x8 // return value if terminator not found
sub x0, x0, x11
add x0, x0,
lsl x5, x6, x4
ands x7, x7, x5 // was the terminator present?
csel x0, xzr, x0, eq // return value based on what we matched
sub x8, x8, x11
add x4, x9, x8 // dst + cnt
add x5, x1, x8 // src + cnt
// Copy x8 + 1 bytes from x1 to x9 for x8 >= 16 (17 bytes or more).
// On entry: x8 = byte count minus one, x5 = x1 + x8 (last source byte),
// x4 = x9 + x8 (last destination byte); x0 already holds the return value
// and is not touched.  Shorter copies are handed to .L0816.
// The copy is two possibly overlapping 16-byte moves -- the first 16 bytes
// and the last 16 bytes -- so callers must guarantee at most 32 bytes.
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	add	x5, x5, #1		// one past the last source byte
	add	x4, x4, #1		// one past the last destination byte
	ldp	x16, x17, [x1]		// first 16 bytes
	ldp	x12, x13, [x5, #-16]	// last 16 bytes
	stp	x16, x17, [x9]
	stp	x12, x13, [x4, #-16]
	ret
// Copy x8 + 1 bytes from x1 to x9 for x8 < 16.
// On entry: x8 = byte count minus one, x5 = x1 + x8 (last source byte),
// x4 = x9 + x8 (last destination byte).  Bit 3 of x8 clear means at most
// 8 bytes, which .L0407 handles.  Otherwise (9..16 bytes) the copy is two
// possibly overlapping 8-byte moves: the first 8 and the last 8 bytes.
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]		// first 8 bytes
	ldr	x17, [x5, #-7]		// last 8 bytes, ending at the final byte
	str	x16, [x9]
	str	x17, [x4, #-7]
	ret
.p2align 4
.L0407:
cmp x8,
b.lo .L0103
ldr w16, [x1]
ldr w18, [x5,
str w16, [x9]
str w18, [x4,
ret
.p2align 4
.L0103:
lsr x14, x8,
ldrb w16, [x1]
ldrb w15, [x5]
ldrb w18, [x1, x14]
strb w16, [x9]
strb w18, [x9, x14]
strb w15, [x4]
ret
// Reached from the entry check when the length adjustment borrows:
// nothing is copied and the function returns a null pointer.
.L0:
	mov	x0, xzr			// return NULL
	ret
END(__memccpy)