Path: blob/main/lib/libc/aarch64/string/strncmp.S
103373 views
/*-1* SPDX-License-Identifier: BSD-2-Clause2*3* Copyright (c) 2024 Getz Mikalsen <[email protected]>4*/56#include <machine/asm.h>7#include <machine/param.h>89.weak strncmp10.set strncmp, __strncmp11.text1213ENTRY(__strncmp)1415bic x8, x0, #0xf // x0 aligned to the boundary16and x9, x0, #0xf // x9 is the offset17bic x10, x1, #0xf // x1 aligned to the boundary18and x11, x1, #0xf // x11 is the offset1920subs x2, x2, #121b.lo .Lempty2223mov x13, #-1 // save constants for later24mov x16, #0xf2526/*27* Check if either string is located at end of page to avoid crossing28* into unmapped page. If so, we load 16 bytes from the nearest29* alignment boundary and shift based on the offset.30*/3132add x3, x0, #16 // end of head33add x4, x1, #1634eor x3, x3, x035eor x4, x4, x1 // bits that changed36orr x3, x3, x4 // in either str1 or str237cmp x2,#1638b.lo .Llt1639tbz w3, #PAGE_SHIFT, .Lbegin4041ldr q0, [x8] // load aligned head42ldr q1, [x10]4344lsl x14, x9, #245lsl x15, x11, #246lsl x3, x13, x14 // string head47lsl x4, x13, x154849cmeq v5.16b, v0.16b, #050cmeq v6.16b, v1.16b, #05152shrn v5.8b, v5.8h, #453shrn v6.8b, v6.8h, #454fmov x5, d555fmov x6, d65657adrp x14, shift_data58add x14, x14, :lo12:shift_data5960/* heads may cross page boundary, avoid unmapped loads */61tst x5, x362b.eq 0f6364ldr q4, [x14, x9] // load permutation table65tbl v0.16b, {v0.16b}, v4.16b6667b 1f68.p2align 4690:70ldr q0, [x0] // load true head711:72tst x6, x473b.eq 0f7475ldr q4, [x14, x11]76tbl v4.16b, {v1.16b}, v4.16b7778b 1f7980.p2align 481.Lbegin:82ldr q0, [x0] // load true heads830:84ldr q4, [x1]851:86cmeq v2.16b, v0.16b, #0 // NUL byte present?87cmeq v4.16b, v0.16b, v4.16b // which bytes match?8889orn v2.16b, v2.16b, v4.16b // mismatch or NUL byte?9091shrn v2.8b, v2.8h, #492fmov x5, d29394cbnz x5, .Lhead_mismatch95/* load head and second chunk */96ldr q2, [x8, #16] // load second chunk97ldr q3, [x10, #16]9899add x2, x2, x11100sub x2, x2, #16101102subs x9, x9, x11 // is a&0xf >= b&0xf103b.lo .Lswapped // if not swap operands104b .Lnormal105106.p2align 4107.Llt16:108/*109* Check if either string is located at end of page to avoid crossing110* into unmapped page. If so, we load 16 bytes from the nearest111* alignment boundary and shift based on the offset.112*/113tbz w3, #PAGE_SHIFT, 2f114115ldr q0, [x8] // load aligned head116ldr q1, [x10]117118lsl x14, x9, #2119lsl x15, x11, #2120lsl x3, x13, x14 // string head121lsl x4, x13, x15122123/* Introduce a null byte match if the limit is within the aligned chunk */124add x14, x2, x9125add x15, x2, x11126lsl x14, x14, #2127lsl x15, x15, #2128lsl x14, x16, x14129lsl x15, x16, x15130131cmeq v5.16b, v0.16b, #0132cmeq v6.16b, v1.16b, #0133134shrn v5.8b, v5.8h, #4135shrn v6.8b, v6.8h, #4136fmov x5, d5137fmov x6, d6138139orr x5, x5, x14 // insert match at limit140orr x6, x6, x15141142adrp x14, shift_data143add x14, x14, :lo12:shift_data144145/* heads may cross page boundary, avoid unmapped loads */146tst x5, x3147b.eq 0f148149ldr q4, [x14, x9] // load permutation table150tbl v0.16b, {v0.16b}, v4.16b151152b 1f153.p2align 41540:155ldr q0, [x0] // load true head1561:157tst x6, x4158b.eq 0f159160ldr q4, [x14, x11]161tbl v4.16b, {v1.16b}, v4.16b162163b 1f164165.p2align 41662:167ldr q0, [x0] // load true heads1680:169ldr q4, [x1]1701:171172cmeq v2.16b, v0.16b, #0 // NUL byte present?173cmeq v4.16b, v0.16b, v4.16b // which bytes match?174175bic v2.16b, v4.16b, v2.16b // match and not NUL byte176177shrn v2.8b, v2.8h, #4178fmov x5, d2179lsl x4, x2, #2180lsl x4, x13, x4181orn x5, x4, x5 // mismatch or NUL byte?182183.Lhead_mismatch:184rbit x3, x5185clz x3, x3 // index of mismatch186lsr x3, x3, #2187ldrb w4, [x0, x3]188ldrb w5, [x1, x3]189sub w0, w4, w5190ret191192.p2align 4193.Lnormal:194sub x12, x10, x9195ldr q0, [x12, #16]!196sub x10, x10, x8197sub x11, x10, x9198199cmeq v1.16b, v3.16b, #0 // NUL present?200cmeq v0.16b, v0.16b, v2.16b // Mismatch between chunks?201shrn v1.8b, v1.8h, #4202shrn v0.8b, v0.8h, #4203fmov x6, d1204fmov x5, d0205206add x8, x8, #32 // advance to next iteration207208lsl x4, x2, #2209lsl x4, x13, x4210orr x3, x6, x4 // introduce a null byte match211cmp x2, #16 // does the buffer end within x2212csel x6, x3, x6, lo213cbnz x6, .Lnulfound2 // NUL or end of buffer found?214mvn x5, x5215cbnz x5, .Lmismatch2216sub x2, x2, #16217cmp x2, #32 // end of buffer?218b.lo .Ltail219/*220* During the main loop, the layout of the two strings is something like:221*222* v ------1------ v ------2------ v223* X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...224* X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...225*226* where v indicates the alignment boundaries and corresponding chunks227* of the strings have the same letters. Chunk A has been checked in228* the previous iteration. This iteration, we first check that string229* X1 doesn't end within region 2, then we compare chunk B between the230* two strings. As X1 is known not to hold a NUL byte in regions 1231* and 2 at this point, this also ensures that x0 has not ended yet.232*/233.p2align 42340:235ldr q0, [x8, x11]236ldr q1, [x8, x10]237ldr q2, [x8]238239cmeq v1.16b, v1.16b, #0 // end of string?240cmeq v0.16b, v0.16b, v2.16b // do the chunks match?241242shrn v1.8b, v1.8h, #4243shrn v0.8b, v0.8h, #4244fmov x6, d1245fmov x5, d0246cbnz x6, .Lnulfound247mvn x5, x5 // any mismatches?248cbnz x5, .Lmismatch249250add x8, x8, #16251252/* main loop unrolled twice */253ldr q0, [x8, x11]254ldr q1, [x8, x10]255ldr q2, [x8]256257add x8, x8, #16258cmeq v1.16b, v1.16b, #0259cmeq v0.16b, v0.16b, v2.16b260261shrn v1.8b, v1.8h, #4262shrn v0.8b, v0.8h, #4263fmov x6, d1264fmov x5, d0265cbnz x6, .Lnulfound2266mvn x5, x5267cbnz x5, .Lmismatch2268sub x2, x2, #32269cmp x2, #32 // end of buffer?270b.hs 0b // if yes, process tail271272/* end of buffer will occur in next 32 bytes */273.Ltail:274ldr q0, [x8, x11]275ldr q1, [x8, x10]276ldr q2, [x8]277278cmeq v1.16b, v1.16b, #0 // end of string?279cmeq v0.16b, v0.16b, v2.16b // do the chunks match?280281shrn v1.8b, v1.8h, #4282shrn v0.8b, v0.8h, #4283fmov x6, d1284fmov x5, d0285286/*287* If x2 <= 16 then we introduce a NUL byte in the288* result from CMEQ to avoid comparing further!289*/290291lsl x4, x2, #2292lsl x4, x13, x4293orr x3, x6, x4 // introduce a null byte match294cmp x2, #16 // does the buffer end within x2295csel x6, x3, x6, lo296297cbnz x6, .Lnulfound // NUL or end of string found298mvn x5, x5299cbnz x5, .Lmismatch300301add x8, x8, #16302303/* main loop unrolled twice */304ldr q0, [x8, x11]305ldr q1, [x8, x10]306ldr q2, [x8]307308add x8, x8, #16309cmeq v1.16b, v1.16b, #0310cmeq v0.16b, v0.16b, v2.16b311312shrn v1.8b, v1.8h, #4313shrn v0.8b, v0.8h, #4314fmov x6, d1315fmov x5, d0316317ubfiz x4, x2, #2, #4 // (x2 - 16) << 2318lsl x4, x13, x4 // take first half into account319orr x6, x6, x4 // introduce a null byte match320321.Lnulfound2:322sub x8, x8, #16323324.Lnulfound:325mov x4, x6326327ubfiz x7, x9, #2, #4328lsl x6, x6, x7 // adjust NUL mask to indices329330orn x5, x6, x5331cbnz x5, .Lmismatch332333/*334* (x0) == (x1) and NUL is past the string.335* Compare (x1) with the corresponding part336* of the other string until the NUL byte.337*/338ldr q0, [x8, x9]339ldr q1, [x8, x10]340341cmeq v1.16b, v0.16b, v1.16b342shrn v1.8b, v1.8h, #4343fmov x5, d1344345orn x5, x4, x5346347rbit x3, x5348clz x3, x3349lsr x5, x3, #2350351add x10, x10, x8 // restore x10 pointer352add x8, x8, x9 // point to corresponding chunk353354ldrb w4, [x8, x5]355ldrb w5, [x10, x5]356sub w0, w4, w5357ret358359.p2align 4360.Lmismatch2:361sub x8, x8, #16 // roll back second increment362.Lmismatch:363rbit x3, x5364clz x3, x3 // index of mismatch365lsr x3, x3, #2366add x11, x8, x11367368ldrb w4, [x8, x3]369ldrb w5, [x11, x3]370sub w0, w4, w5 // byte difference371ret372373/*374* If (a&0xf) < (b&0xf), we do the same thing but with swapped375* operands. I found that this performs slightly better than376* using conditional moves to do the swap branchless.377*/378.p2align 4379.Lswapped:380add x12, x8, x9381ldr q0, [x12, #16]!382sub x8, x8, x10383add x11, x8, x9384add x2,x2,x9385neg x9, x9386387cmeq v1.16b, v2.16b, #0388cmeq v0.16b, v0.16b, v3.16b389shrn v1.8b, v1.8h, #4390shrn v0.8b, v0.8h, #4391fmov x6, d1392fmov x5, d0393394add x10, x10, #32395396lsl x4, x2, #2397lsl x4, x13, x4398orr x3,x6,x4 // introduce a null byte match399cmp x2,#16400csel x6, x3, x6, lo401cbnz x6, .Lnulfound2s402mvn x5, x5403cbnz x5, .Lmismatch2s404405sub x2, x2, #16406cmp x2, #32407b.lo .Ltails408409/*410* During the main loop, the layout of the two strings is something like:411*412* v ------1------ v ------2------ v413* X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...414* X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...415*416* where v indicates the alignment boundaries and corresponding chunks417* of the strings have the same letters. Chunk A has been checked in418* the previous iteration. This iteration, we first check that string419* X0 doesn't end within region 2, then we compare chunk B between the420* two strings. As X0 is known not to hold a NUL byte in regions 1421* and 2 at this point, this also ensures that X1 has not ended yet.422*/423.p2align 44240:425ldr q0, [x10, x11]426ldr q1, [x10, x8]427ldr q2, [x10]428429cmeq v1.16b, v1.16b, #0430cmeq v0.16b, v0.16b, v2.16b431432shrn v1.8b, v1.8h, #4433shrn v0.8b, v0.8h, #4434fmov x6, d1435fmov x5, d0436cbnz x6, .Lnulfounds437mvn x5, x5438cbnz x5, .Lmismatchs439440add x10, x10, #16441442/* main loop unrolled twice */443ldr q0, [x10, x11]444ldr q1, [x10, x8]445ldr q2, [x10]446447add x10, x10, #16448cmeq v1.16b, v1.16b, #0449cmeq v0.16b, v0.16b, v2.16b450451shrn v1.8b, v1.8h, #4452shrn v0.8b, v0.8h, #4453fmov x6, d1454fmov x5, d0455cbnz x6, .Lnulfound2s456mvn x5, x5457cbnz x5, .Lmismatch2s458sub x2, x2, #32459cmp x2, #32460b.hs 0b461462.Ltails:463ldr q0, [x10, x11]464ldr q1, [x10, x8]465ldr q2, [x10]466467cmeq v1.16b, v1.16b, #0468cmeq v0.16b, v0.16b, v2.16b469470shrn v1.8b, v1.8h, #4471shrn v0.8b, v0.8h, #4472fmov x6, d1473fmov x5, d0474475/*476* If x2 <= 16 then we introduce a NUL byte in the477* result from CMEQ to avoid comparing further!478*/479480lsl x4, x2, #2481lsl x4, x13, x4482orr x3, x6, x4 // introduce a null byte match483cmp x2, #16484csel x6, x3, x6, lo485486cbnz x6, .Lnulfounds487mvn x5, x5488cbnz x5, .Lmismatchs489490add x10, x10, #16491492ldr q0, [x10, x11]493ldr q1, [x10, x8]494ldr q2, [x10]495496add x10, x10, #16497cmeq v1.16b, v1.16b, #0498cmeq v0.16b, v0.16b, v2.16b499500shrn v1.8b, v1.8h, #4501shrn v0.8b, v0.8h, #4502fmov x6, d1503fmov x5, d0504505ubfiz x4, x2, #2, #4506lsl x4, x13, x4507orr x6, x6, x4 // introduce a null byte match508509.Lnulfound2s:510sub x10, x10, #16511.Lnulfounds:512mov x4, x6513514ubfiz x7, x9, #2, #4515lsl x6, x6, x7516517orn x5, x6, x5518519cbnz x5, .Lmismatchs520521ldr q0, [x10, x9]522ldr q1, [x10, x8]523524cmeq v1.16b, v0.16b, v1.16b525shrn v1.8b, v1.8h, #4526fmov x5, d1527528orn x5, x4, x5529530rbit x3, x5531clz x3, x3532lsr x5, x3, #2533534add x11, x10, x8535add x10, x10, x9536537ldrb w4, [x10, x5]538ldrb w5, [x11, x5]539sub w0, w5, w4540ret541542.p2align 4543.Lmismatch2s:544sub x10, x10, #16545.Lmismatchs:546rbit x3, x5547clz x3, x3548lsr x3, x3, #2549add x11, x10, x11550551ldrb w4, [x10, x3]552ldrb w5, [x11, x3]553sub w0, w5, w4554ret555556.p2align 4557.Lempty:558eor x0, x0, x0559ret560561END(__strncmp)562563.section .rodata564.p2align 4565shift_data:566.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15567.fill 16, 1, -1568.size shift_data, .-shift_data569570571