Path: blob/main/lib/libc/aarch64/string/strlcpy.S
104963 views
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <[email protected]>
 */

#include <machine/asm.h>

	.weak	strlcpy
	.set	strlcpy, __strlcpy
	.text

/*
 * size_t __strlcpy(char *dst, const char *src, size_t dstsize)
 *
 * In:   x0 = dst, x1 = src, x2 = dstsize
 * Out:  x0 = length of src (computed even when the copy is truncated)
 *
 * Copies at most dstsize - 1 bytes from src to dst and NUL-terminates dst
 * whenever dstsize != 0.  With dstsize == 0 nothing is stored and the
 * function tail-calls strlen(src).
 *
 * Register roles after the prologue:
 *   x9  = original dst pointer
 *   x10 = src rounded down to a 16-byte boundary (chunk cursor)
 *   x11 = src & 0xf (offset of the string inside the first chunk)
 *   x2  = remaining copy budget, starting at dstsize - 1
 *
 * NUL detection: cmeq yields 0xff for each zero byte, and shrn #4 narrows
 * that to a 64-bit mask holding 4 bits per source byte.  rbit + clz then
 * counts trailing zeros, and a right shift by 2 turns the bit index into a
 * byte index.
 *
 * Scratch registers written: x0-x2, x4-x12, x16, x17, v1-v3, flags.
 * x18 (the AAPCS64 platform register) is deliberately not used.
 */
ENTRY(__strlcpy)
	subs	x2, x2, #1		// x2 = dstsize - 1
	b.lo	.L0			// dstsize == 0: only compute strlen

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]
	cmeq	v1.16b, v1.16b, #0	// NUL found in head?

	mov	x8, #-1			// fill register with 0xfff..fff
	lsl	x12, x11, #2		// 4 mask bits per byte
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	ands	x5, x5, x8		// ignore bytes preceding src
	b.ne	.Lhead_nul

	ldr	q3, [x10, #16]		// load second string chunk
	ldr	q2, [x1]		// load true head
	mov	x8, #32
	sub	x8, x8, x11		// x8 = string bytes in first two chunks

	cmeq	v1.16b, v3.16b, #0	// NUL found in second chunk?

	subs	x2, x2, x8
	b.ls	.Lhead_buf_end		// buffer ends within first two chunks

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1
	cbnz	x5, .Lsecond_nul

	/* string didn't end in second chunk and neither did buffer */
	ldr	q1, [x10, #32]		// load next string chunk
	str	q2, [x0]		// deposit head into buffer
	sub	x0, x0, x11		// adjust x0 so dst mirrors aligned src
	str	q3, [x0, #16]		// deposit second chunk
	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x2, x2, #16		// enough left for another round?
	b.ls	1f

	/* main loop unrolled twice */
	.p2align 4
0:
	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, 3f

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x2, #16			// more than a full chunk left?
	b.ls	2f

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x2, x2, #32
	b.hi	0b

1:
	sub	x10, x10, #16		// undo second advancement
	add	x2, x2, #16
	sub	x0, x0, #16

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:
	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2

	mov	x6, #0xf
	mov	x7, x4			// keep the real NUL mask for strlen

	lsl	x5, x2, #2		// shift 0xf to the limit's position
	lsl	x5, x6, x5
	cmp	x2, #16			// don't induce match if limit >= 16
	csel	x5, x5, xzr, lo
	orr	x8, x4, x5		// treat limit as if terminator present

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail (16 bytes ending at the cut)
	str	q1, [x0]		// store tail
	strb	wzr, [x0, #16]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, 1f

	/*
	 * we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
0:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, 2f

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, 0b

1:	sub	x10, x10, #16
2:	rbit	x8, x7
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x10, x10, x1
	add	x0, x10, #32
	add	x0, x0, x8		// x0 = strlen(src)

	ret

4:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

	/* string has ended but buffer has not */
3:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8		// dst position of the terminator
	add	x10, x10, x8		// src position of the terminator

	ldr	q1, [x10, #-15]		// last 16 bytes including the NUL
	str	q1, [x0, #-15]
	sub	x0, x10, x1		// x0 = strlen(src)

	ret

.Lhead_buf_end:
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1			// NUL mask of second chunk

	add	x2, x2, #32		// restore limit (relative to x10)

	mov	x7, x8			// keep the NUL mask for strlen

	cmp	x2, #16			// should we induce a match or not
	b.lo	0f

	rbit	x8, x8
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2
	add	x8, x8, #16

	cmp	x8, x2
	csel	x8, x8, x2, lo		// copy min(buflen, srclen) bytes
	b	1f
0:
	/* limit lies in the head chunk, which holds no NUL */
	mov	x8, x2
1:

	sub	x8, x8, x11		// x8 = number of bytes to copy
	strb	wzr, [x9, x8]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, 1f

	/*
	 * we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
0:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, 2f

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, 0b

1:	sub	x10, x10, #16
2:	rbit	x6, x7
	clz	x6, x6			// index of mismatch
	lsr	x6, x6, #2

	sub	x10, x10, x1
	add	x0, x10, #32
	add	x0, x0, x6		// x0 = strlen(src)

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	b	.L1732

.Lsecond_nul:
	add	x2, x2, x8		// restore x2 = dstsize - 1

	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x5, x8, #2

	sub	x8, x11, #16
	sub	x0, x5, x8		// string length

	cmp	x0, x2			// did we match or hit limit first?
	csel	x8, x2, x0, hi		// x8 = min(length, limit)

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	strb	wzr, [x4]		// terminate dst

	/*
	 * The small copies below move x8 bytes using two possibly
	 * overlapping accesses: one anchored at the start and one
	 * anchored at the end (x4 = dst + cnt, x5 = src + cnt).
	 */

	/* copy 16-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	ldp	x16, x17, [x1]
	ldp	x12, x1, [x5, #-16]
	stp	x16, x17, [x9]
	stp	x12, x1, [x4, #-16]
	ret

.Lhead_nul:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x0, x8, x11		// string length
	cmp	x0, x2
	csel	x8, x2, x0, hi		// x8 = min(length, limit)

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	strb	wzr, [x4]		// terminate dst

	/* Copy 8-15 bytes */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-8]
	str	x16, [x9]
	str	x17, [x4, #-8]
	ret

	/* Copy 4-7 bytes */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.ls	.L0203
	ldr	w16, [x1]
	ldr	w17, [x5, #-4]		// w17, not w18: x18 is the platform register
	str	w16, [x9]
	str	w17, [x4, #-4]
	ret

	/* Copy 2-3 bytes */
.L0203:
	tbz	x8, 1, .L0001
	ldrh	w16, [x1]
	ldrh	w17, [x5, #-2]
	strh	w16, [x9]
	strh	w17, [x4, #-2]
	ret

	/* Copy 0-1 bytes; the terminator is rewritten in case cnt == 0 */
.L0001:
	ldrb	w16, [x1]
	strb	w16, [x9]
	strb	wzr, [x4]
	ret

	/* dstsize == 0: store nothing, return strlen(src) via tail call */
.L0:
	mov	x0, x1
	b	strlen
END(__strlcpy)