Path: blob/main/lib/libc/aarch64/string/memccpy.S
103589 views
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <[email protected]>
*/

#include <machine/asm.h>

	.weak	memccpy
	.set	memccpy, __memccpy
	.text

/*
 * void *memccpy(void *restrict dst, const void *restrict src, int c,
 *     size_t len)
 *
 * Copy bytes from src to dst, stopping once the first byte equal to
 * (unsigned char)c has been copied or once len bytes have been copied,
 * whichever happens first.
 *
 * In:      x0 = dst, x1 = src, w2 = c, x3 = len
 * Out:     x0 = dst + (index of c) + 1 if c was found within len bytes,
 *          0 (NULL) otherwise
 * Scratch: x4-x17, v0-v4, condition flags.  x18 (the AAPCS64 platform
 *          register) is deliberately left untouched.
 *
 * Match masks: cmeq yields 0xff per matching byte; shrn #4 narrows the
 * 128 bit result to 64 bits, leaving one 4 bit nibble per source byte.
 * rbit + clz then gives the bit index of the first set nibble, and a
 * right shift by 2 turns that bit index into a byte index.
 *
 * Register roles on the common paths:
 *   x9  = original dst
 *   x10 = src rounded down to a 16 byte boundary (current aligned chunk)
 *   x11 = src & 15 (offset of src within its first aligned chunk)
 *   x6  = 0xf, the nibble that is shifted to a byte position in a mask
 *   v0  = c replicated into all 16 byte lanes
 */
ENTRY(__memccpy)
	subs	x3, x3, #1		// x3 = len - 1 (index of last byte)
	b.lo	.L0			// len == 0: nothing to copy

	dup	v0.16b, w2		// broadcast c to every byte lane

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]		// aligned load of first chunk
	cmeq	v1.16b, v1.16b, v0.16b	// bytewise compare against c

	mov	x8, #-1			// prepare a 0xfff..fff register
	mov	x6, #0xf

	lsl	x12, x11, #2		// 4 mask bits per byte
	lsl	x8, x8, x12		// mask of bytes at or after src

	shrn	v1.8b, v1.8h, #4	// narrow to nibble mask
	fmov	x5, d1			// x5 = match mask of first chunk

	sub	x12, x11, #32
	adds	x12, x12, x3		// x12 = offset + len - 1 - 32
	b.cc	.Lrunt			// buffer ends within first two chunks

	ands	x8, x8, x5		// matches at or after src only
	b.eq	0f

	/* match in first chunk */
	rbit	x8, x8
	clz	x8, x8			// bit index of first match
	lsr	x8, x8, #2		// byte index within the chunk

	sub	x8, x8, x11		// ... from beginning of the string

	add	x0, x0, x8
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	add	x0, x0, #1		// return value: one past the match

	b	.L0816			// x8 < 16 here

0:
	ldr	q3, [x10, #16]		// load second string chunk
	ldr	q2, [x1]		// load true head
	cmeq	v1.16b, v3.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	cbz	x5, 0f

	/* match in second chunk */
	rbit	x8, x5
	clz	x8, x8			// bit index of first match
	lsr	x8, x8, #2		// byte index within second chunk

	sub	x11, x11, #16
	sub	x8, x8, x11		// adjust for alignment offset
	add	x0, x0, x8		// return value
	add	x0, x0, #1

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	b	.L1732

0:
	/* string didn't end in second chunk and neither did buffer */
	ldr	q1, [x10, #32]		// load next string chunk
	str	q2, [x0]		// deposit head into buffer
	sub	x0, x0, x11		// make x0 track x10 chunk for chunk
	mov	x3, x12			// last byte index relative to x10 + 32
	str	q3, [x0, #16]		// deposit second chunk

	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x3, x3, #16		// enough left for another round?
	b.lo	1f

	/* main loop unrolled twice */
	.p2align 4
0:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, 3f

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x3, #16			// more than a full chunk left?
	b.lo	2f

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, v0.16b	// char found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x3, x3, #32
	b.hs	0b

1:
	sub	x10, x10, #16		// undo second advancement
	add	x3, x3, #16
	sub	x0, x0, #16

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in final chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2			// x4 = genuine match mask

	lsl	x5, x3, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	orr	x8, x4, x5		// insert match in mask at limit

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x7, x8			// bit index of match or limit
	lsr	x8, x7, #2		// byte index of match or limit

	lsl	x5, x6, x7		// simulate x86 bt with shifted 0xf

	add	x8, x8, #1
	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail (16 bytes ending at stop)
	str	q1, [x0]		// store tail

	add	x0, x0, #16		// one past the last byte written

	tst	x4, x5			// terminator encountered inside buffer?
	csel	x0, x0, xzr, ne		// if yes, return pointer, else NULL
	ret

4:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

3:
	rbit	x8, x5
	clz	x8, x8			// bit index of first match
	lsr	x3, x8, #2		// byte index within the chunk

	add	x0, x0, x3		// dst position of the match
	add	x10, x10, x3		// src position of the match
	ldr	q1, [x10, #-15]		// 16 bytes ending at the match
	str	q1, [x0, #-15]
	add	x0, x0, #1		// return value: one past the match
	ret

	/* buffer ends within the first two aligned chunks */
.Lrunt:
	add	x13, x11, x3		// index of last byte in first chunk

	mov	x7, x5			// keep a copy of original match mask

	lsl	x4, x12, #2		// shift 0xf to the limits position
	lsl	x4, x6, x4

	cmp	x13, #16		// dont induce match if limit >=16
	csel	x4, x4, xzr, lo
	orr	x5, x5, x4		// insert match in mask at limit

	ands	x8, x8, x5		// if match always fall through
	b.ne	0f

	ldr	q4, [x10, #16]		// load second string chunk
	cmeq	v1.16b, v4.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1
	mov	x7, x8			// genuine match mask of second chunk

	lsl	x4, x12, #2
	lsl	x4, x6, x4
	orr	x8, x8, x4		// induce match at the limit position

	rbit	x8, x8
	clz	x4, x8			// bit index of match or limit
	lsr	x8, x4, #2
	add	x8, x8, #16		// no match in first chunk
	b	1f

0:
	rbit	x8, x8
	clz	x4, x8			// bit index of match or limit
	lsr	x8, x4, #2
1:
	add	x0, x0, x8		// return value if terminator found
	sub	x0, x0, x11
	add	x0, x0, #1

	/* check if we encountered a match or the limit first */
	lsl	x5, x6, x4
	ands	x7, x7, x5		// was the terminator present?
	csel	x0, xzr, x0, eq		// NULL if we only hit the limit

	sub	x8, x8, x11		// index of last byte to copy
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	/*
	 * Short copies.  On entry: x1 = src, x9 = dst, x8 = index of the last
	 * byte to copy, x5 = src + x8, x4 = dst + x8.  Each size class copies
	 * a head and a (possibly overlapping) tail; all loads are issued
	 * before the stores.
	 */

	/* x8 = 16..31: copy 17-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	add	x5, x5, #1		// ldp offsets are powers of 2
	add	x4, x4, #1
	ldp	x16, x17, [x1]
	ldp	x12, x13, [x5, #-16]
	stp	x16, x17, [x9]
	stp	x12, x13, [x4, #-16]
	ret

	/* x8 = 8..15: copy 9-16 bytes */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-7]
	str	x16, [x9]
	str	x17, [x4, #-7]
	ret

	/* x8 = 3..7: copy 4-8 bytes */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.lo	.L0103
	ldr	w16, [x1]
	ldr	w17, [x5, #-3]		// w17, not w18: x18 is reserved
	str	w16, [x9]
	str	w17, [x4, #-3]
	ret

	/* x8 = 0..2: copy 1-3 bytes */
	.p2align 4
.L0103:
	lsr	x14, x8, #1		// index of the middle byte
	ldrb	w16, [x1]
	ldrb	w15, [x5]
	ldrb	w17, [x1, x14]		// w17, not w18: x18 is reserved
	strb	w16, [x9]
	strb	w17, [x9, x14]
	strb	w15, [x4]
	ret

	/* len == 0: return NULL without touching memory */
.L0:
	mov	x0, xzr
	ret

END(__memccpy)