/* Path: blob/main/contrib/bionic-x86_64-string/ssse3-strcmp-slm.S */
/* 39475 views */
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * Syntax: GNU as, AT&T operand order (source, destination), run through the
 * C preprocessor.
 *
 * Build-time configuration visible in this file:
 *   USE_AS_STRNCMP  - when defined, the length-limited variant is built and
 *                     %r11 carries the remaining-byte counter (it is loaded
 *                     from %rdx at function entry).
 *   STRCMP          - name of the emitted symbol.  It defaults to strcmp only
 *                     when USE_AS_STRNCMP is NOT defined; a USE_AS_STRNCMP
 *                     build must define STRCMP itself.
 */

#ifdef USE_AS_STRNCMP
/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
   if the new counter > the old one or is 0.

   Expansion: %r9 = %r11 + %rcx - 16.  If that wrapped around (unsigned
   %r11 < %r9) or is zero, control goes to L(strcmp_exitz); otherwise %r11
   is replaced by the new value.  Clobbers %r9 and the flags.  */
#define UPDATE_STRNCMP_COUNTER                          \
        /* calculate left number to compare */          \
        lea     -16(%rcx, %r11), %r9;                   \
        cmp     %r9, %r11;                              \
        jb      L(strcmp_exitz);                        \
        test    %r9, %r9;                               \
        je      L(strcmp_exitz);                        \
        mov     %r9, %r11

#else
/* Plain strcmp build: there is no counter, so the update expands to nothing. */
#define UPDATE_STRNCMP_COUNTER
#ifndef STRCMP
#define STRCMP          strcmp
#endif
#endif

/* L(name) pastes ".L" in front of name, giving an assembler-local label. */
#ifndef L
# define L(label)       .L##label
#endif

/* Fallbacks that map the cfi_* spellings onto the assembler's .cfi_* directives. */
#ifndef cfi_startproc
# define cfi_startproc                  .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc                    .cfi_endproc
#endif

/* ENTRY(name): declare name as a global function symbol, align it to
   2^4 = 16 bytes, place the label and open the CFI region.  */
#ifndef ENTRY
# define ENTRY(name)                    \
        .type name,  @function;         \
        .globl name;                    \
        .p2align 4;                     \
name:                                   \
        cfi_startproc
#endif

/* END(name): close the CFI region and record the symbol size. */
#ifndef END
# define END(name)                      \
        cfi_endproc;                    \
        .size name, .-name
#endif
#define RETURN ret
        .section .text.ssse3,"ax",@progbits
ENTRY (STRCMP)
/*
 * This implementation uses SSE to compare up to 16 bytes at a time.
 */
#ifdef USE_AS_STRNCMP
        /* Length-limited build: %rdx is the byte count. */
        test    %rdx, %rdx
        je      L(strcmp_exitz)         /* count == 0 */
        cmp     $1, %rdx
        je      L(Byte0)                /* count == 1 */
        mov     %rdx, %r11              /* %r11 = remaining-byte counter */
#endif
        /* Only the low 6 bits survive the masks below, so 32-bit moves suffice. */
        mov     %esi, %ecx
        mov     %edi, %eax
/* Use 64bit AND here to avoid long NOP padding.  */
        and     $0x3f, %rcx             /* rsi alignment in cache line */
        and     $0x3f, %rax             /* rdi alignment in cache line */
        /* An offset above 0x30 (48) leaves fewer than 16 bytes in the 64-byte line. */
        cmp     $0x30, %ecx
        ja      L(crosscache)   /* rsi: 16-byte load will cross cache line */
        cmp     $0x30, %eax
        ja      L(crosscache)   /* rdi: 16-byte load will cross cache line */
        /* Assemble the first 16 bytes of each string from two 8-byte halves. */
        movlpd  (%rdi), %xmm1
        movlpd  (%rsi), %xmm2
        movhpd  8(%rdi), %xmm1
        movhpd  8(%rsi), %xmm2
        pxor    %xmm0, %xmm0            /* clear %xmm0 for null char checks */
        pcmpeqb %xmm1, %xmm0            /* Any null chars? 
*/103pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */104psubb %xmm0, %xmm1 /* packed sub of comparison results*/105pmovmskb %xmm1, %edx106sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */107jnz L(less16bytes) /* If not, find different value or null char */108#ifdef USE_AS_STRNCMP109sub $16, %r11110jbe L(strcmp_exitz) /* finish comparision */111#endif112add $16, %rsi /* prepare to search next 16 bytes */113add $16, %rdi /* prepare to search next 16 bytes */114115/*116* Determine source and destination string offsets from 16-byte alignment.117* Use relative offset difference between the two to determine which case118* below to use.119*/120.p2align 4121L(crosscache):122and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */123and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */124mov $0xffff, %edx /* for equivalent offset */125xor %r8d, %r8d126and $0xf, %ecx /* offset of rsi */127and $0xf, %eax /* offset of rdi */128cmp %eax, %ecx129je L(ashr_0) /* rsi and rdi relative offset same */130ja L(bigger)131mov %edx, %r8d /* r8d is offset flag for exit tail */132xchg %ecx, %eax133xchg %rsi, %rdi134L(bigger):135lea 15(%rax), %r9136sub %rcx, %r9137lea L(unaligned_table)(%rip), %r10138movslq (%r10, %r9,4), %r9139lea (%r10, %r9), %r10140jmp *%r10 /* jump to corresponding case */141142/*143* The following cases will be handled by ashr_0144* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case145* n(0~15) n(0~15) 15(15+ n-n) ashr_0146*/147.p2align 4148L(ashr_0):149150movdqa (%rsi), %xmm1151pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */152pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/153pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */154psubb %xmm0, %xmm1 /* packed sub of comparison results*/155pmovmskb %xmm1, %r9d156shr %cl, %edx /* adjust 0xffff for offset */157shr %cl, %r9d /* adjust for 16-byte offset */158sub %r9d, %edx159/*160* edx must be the same with r9d if in left byte (16-rcx) is equal to161* the start from (16-rax) and no null char was seen.162*/163jne L(less32bytes) /* mismatch or null char */164UPDATE_STRNCMP_COUNTER165mov $16, %rcx166mov $16, %r9167pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */168169/*170* Now both strings are aligned at 16-byte boundary. Loop over strings171* checking 32-bytes per iteration.172*/173.p2align 4174L(loop_ashr_0):175movdqa (%rsi, %rcx), %xmm1176movdqa (%rdi, %rcx), %xmm2177178pcmpeqb %xmm1, %xmm0179pcmpeqb %xmm2, %xmm1180psubb %xmm0, %xmm1181pmovmskb %xmm1, %edx182sub $0xffff, %edx183jnz L(exit) /* mismatch or null char seen */184185#ifdef USE_AS_STRNCMP186sub $16, %r11187jbe L(strcmp_exitz)188#endif189add $16, %rcx190movdqa (%rsi, %rcx), %xmm1191movdqa (%rdi, %rcx), %xmm2192193pcmpeqb %xmm1, %xmm0194pcmpeqb %xmm2, %xmm1195psubb %xmm0, %xmm1196pmovmskb %xmm1, %edx197sub $0xffff, %edx198jnz L(exit)199#ifdef USE_AS_STRNCMP200sub $16, %r11201jbe L(strcmp_exitz)202#endif203add $16, %rcx204jmp L(loop_ashr_0)205206/*207* The following cases will be handled by ashr_1208* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case209* n(15) n -15 0(15 +(n-15) - n) ashr_1210*/211.p2align 4212L(ashr_1):213pxor %xmm0, %xmm0214movdqa (%rdi), %xmm2215movdqa (%rsi), %xmm1216pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/217pslldq $15, %xmm2 /* shift first string to align with second */218pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */219psubb %xmm0, %xmm2 /* packed sub of comparison results*/220pmovmskb %xmm2, %r9d221shr %cl, %edx /* adjust 0xffff for offset */222shr %cl, %r9d /* adjust for 16-byte offset */223sub %r9d, %edx224jnz L(less32bytes) /* mismatch or null char seen */225movdqa (%rdi), %xmm3226UPDATE_STRNCMP_COUNTER227228pxor %xmm0, %xmm0229mov $16, %rcx /* index for loads*/230mov $1, %r9d /* byte position left over from less32bytes case */231/*232* Setup %r10 value allows us to detect crossing a page boundary.233* When %r10 goes positive we have crossed a page boundary and234* need to do a nibble.235*/236lea 1(%rdi), %r10237and $0xfff, %r10 /* offset into 4K page */238sub $0x1000, %r10 /* subtract 4K pagesize */239240.p2align 4241L(loop_ashr_1):242add $16, %r10243jg L(nibble_ashr_1) /* cross page boundary */244245L(gobble_ashr_1):246movdqa (%rsi, %rcx), %xmm1247movdqa (%rdi, %rcx), %xmm2248movdqa %xmm2, %xmm4 /* store for next cycle */249250palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */251252pcmpeqb %xmm1, %xmm0253pcmpeqb %xmm2, %xmm1254psubb %xmm0, %xmm1255pmovmskb %xmm1, %edx256sub $0xffff, %edx257jnz L(exit)258259#ifdef USE_AS_STRNCMP260sub $16, %r11261jbe L(strcmp_exitz)262#endif263add $16, %rcx264movdqa %xmm4, %xmm3265266add $16, %r10267jg L(nibble_ashr_1) /* cross page boundary */268269movdqa (%rsi, %rcx), %xmm1270movdqa (%rdi, %rcx), %xmm2271movdqa %xmm2, %xmm4 /* store for next cycle */272273palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */274275pcmpeqb %xmm1, %xmm0276pcmpeqb %xmm2, %xmm1277psubb %xmm0, %xmm1278pmovmskb %xmm1, %edx279sub $0xffff, %edx280jnz L(exit)281282#ifdef USE_AS_STRNCMP283sub $16, %r11284jbe L(strcmp_exitz)285#endif286add $16, %rcx287movdqa %xmm4, %xmm3288jmp L(loop_ashr_1)289290/*291* Nibble avoids loads across page boundary. 
This is to avoid a potential292* access into unmapped memory.293*/294.p2align 4295L(nibble_ashr_1):296pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/297pmovmskb %xmm0, %edx298test $0xfffe, %edx299jnz L(ashr_1_exittail) /* find null char*/300301#ifdef USE_AS_STRNCMP302cmp $14, %r11303jbe L(ashr_1_exittail)304#endif305306pxor %xmm0, %xmm0307sub $0x1000, %r10 /* substract 4K from %r10 */308jmp L(gobble_ashr_1)309310/*311* Once find null char, determine if there is a string mismatch312* before the null char.313*/314.p2align 4315L(ashr_1_exittail):316movdqa (%rsi, %rcx), %xmm1317psrldq $1, %xmm0318psrldq $1, %xmm3319jmp L(aftertail)320321/*322* The following cases will be handled by ashr_2323* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case324* n(14~15) n -14 1(15 +(n-14) - n) ashr_2325*/326.p2align 4327L(ashr_2):328pxor %xmm0, %xmm0329movdqa (%rdi), %xmm2330movdqa (%rsi), %xmm1331pcmpeqb %xmm1, %xmm0332pslldq $14, %xmm2333pcmpeqb %xmm1, %xmm2334psubb %xmm0, %xmm2335pmovmskb %xmm2, %r9d336shr %cl, %edx337shr %cl, %r9d338sub %r9d, %edx339jnz L(less32bytes)340movdqa (%rdi), %xmm3341UPDATE_STRNCMP_COUNTER342343pxor %xmm0, %xmm0344mov $16, %rcx /* index for loads */345mov $2, %r9d /* byte position left over from less32bytes case */346/*347* Setup %r10 value allows us to detect crossing a page boundary.348* When %r10 goes positive we have crossed a page boundary and349* need to do a nibble.350*/351lea 2(%rdi), %r10352and $0xfff, %r10 /* offset into 4K page */353sub $0x1000, %r10 /* subtract 4K pagesize */354355.p2align 4356L(loop_ashr_2):357add $16, %r10358jg L(nibble_ashr_2)359360L(gobble_ashr_2):361movdqa (%rsi, %rcx), %xmm1362movdqa (%rdi, %rcx), %xmm2363movdqa %xmm2, %xmm4364365palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */366367pcmpeqb %xmm1, %xmm0368pcmpeqb %xmm2, %xmm1369psubb %xmm0, %xmm1370pmovmskb %xmm1, %edx371sub $0xffff, %edx372jnz L(exit)373374#ifdef USE_AS_STRNCMP375sub $16, %r11376jbe L(strcmp_exitz)377#endif378379add 
$16, %rcx380movdqa %xmm4, %xmm3381382add $16, %r10383jg L(nibble_ashr_2) /* cross page boundary */384385movdqa (%rsi, %rcx), %xmm1386movdqa (%rdi, %rcx), %xmm2387movdqa %xmm2, %xmm4388389palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */390391pcmpeqb %xmm1, %xmm0392pcmpeqb %xmm2, %xmm1393psubb %xmm0, %xmm1394pmovmskb %xmm1, %edx395sub $0xffff, %edx396jnz L(exit)397398#ifdef USE_AS_STRNCMP399sub $16, %r11400jbe L(strcmp_exitz)401#endif402403add $16, %rcx404movdqa %xmm4, %xmm3405jmp L(loop_ashr_2)406407.p2align 4408L(nibble_ashr_2):409pcmpeqb %xmm3, %xmm0 /* check nibble for null char */410pmovmskb %xmm0, %edx411test $0xfffc, %edx412jnz L(ashr_2_exittail)413414#ifdef USE_AS_STRNCMP415cmp $13, %r11416jbe L(ashr_2_exittail)417#endif418419pxor %xmm0, %xmm0420sub $0x1000, %r10421jmp L(gobble_ashr_2)422423.p2align 4424L(ashr_2_exittail):425movdqa (%rsi, %rcx), %xmm1426psrldq $2, %xmm0427psrldq $2, %xmm3428jmp L(aftertail)429430/*431* The following cases will be handled by ashr_3432* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case433* n(13~15) n -13 2(15 +(n-13) - n) ashr_3434*/435.p2align 4436L(ashr_3):437pxor %xmm0, %xmm0438movdqa (%rdi), %xmm2439movdqa (%rsi), %xmm1440pcmpeqb %xmm1, %xmm0441pslldq $13, %xmm2442pcmpeqb %xmm1, %xmm2443psubb %xmm0, %xmm2444pmovmskb %xmm2, %r9d445shr %cl, %edx446shr %cl, %r9d447sub %r9d, %edx448jnz L(less32bytes)449movdqa (%rdi), %xmm3450451UPDATE_STRNCMP_COUNTER452453pxor %xmm0, %xmm0454mov $16, %rcx /* index for loads */455mov $3, %r9d /* byte position left over from less32bytes case */456/*457* Setup %r10 value allows us to detect crossing a page boundary.458* When %r10 goes positive we have crossed a page boundary and459* need to do a nibble.460*/461lea 3(%rdi), %r10462and $0xfff, %r10 /* offset into 4K page */463sub $0x1000, %r10 /* subtract 4K pagesize */464465.p2align 4466L(loop_ashr_3):467add $16, %r10468jg L(nibble_ashr_3)469470L(gobble_ashr_3):471movdqa (%rsi, %rcx), %xmm1472movdqa (%rdi, %rcx), 
%xmm2473movdqa %xmm2, %xmm4474475palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */476477pcmpeqb %xmm1, %xmm0478pcmpeqb %xmm2, %xmm1479psubb %xmm0, %xmm1480pmovmskb %xmm1, %edx481sub $0xffff, %edx482jnz L(exit)483484#ifdef USE_AS_STRNCMP485sub $16, %r11486jbe L(strcmp_exitz)487#endif488489add $16, %rcx490movdqa %xmm4, %xmm3491492add $16, %r10493jg L(nibble_ashr_3) /* cross page boundary */494495movdqa (%rsi, %rcx), %xmm1496movdqa (%rdi, %rcx), %xmm2497movdqa %xmm2, %xmm4498499palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */500501pcmpeqb %xmm1, %xmm0502pcmpeqb %xmm2, %xmm1503psubb %xmm0, %xmm1504pmovmskb %xmm1, %edx505sub $0xffff, %edx506jnz L(exit)507508#ifdef USE_AS_STRNCMP509sub $16, %r11510jbe L(strcmp_exitz)511#endif512513add $16, %rcx514movdqa %xmm4, %xmm3515jmp L(loop_ashr_3)516517.p2align 4518L(nibble_ashr_3):519pcmpeqb %xmm3, %xmm0 /* check nibble for null char */520pmovmskb %xmm0, %edx521test $0xfff8, %edx522jnz L(ashr_3_exittail)523524#ifdef USE_AS_STRNCMP525cmp $12, %r11526jbe L(ashr_3_exittail)527#endif528529pxor %xmm0, %xmm0530sub $0x1000, %r10531jmp L(gobble_ashr_3)532533.p2align 4534L(ashr_3_exittail):535movdqa (%rsi, %rcx), %xmm1536psrldq $3, %xmm0537psrldq $3, %xmm3538jmp L(aftertail)539540/*541* The following cases will be handled by ashr_4542* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case543* n(12~15) n -12 3(15 +(n-12) - n) ashr_4544*/545.p2align 4546L(ashr_4):547pxor %xmm0, %xmm0548movdqa (%rdi), %xmm2549movdqa (%rsi), %xmm1550pcmpeqb %xmm1, %xmm0551pslldq $12, %xmm2552pcmpeqb %xmm1, %xmm2553psubb %xmm0, %xmm2554pmovmskb %xmm2, %r9d555shr %cl, %edx556shr %cl, %r9d557sub %r9d, %edx558jnz L(less32bytes)559movdqa (%rdi), %xmm3560561UPDATE_STRNCMP_COUNTER562563pxor %xmm0, %xmm0564mov $16, %rcx /* index for loads */565mov $4, %r9d /* byte position left over from less32bytes case */566/*567* Setup %r10 value allows us to detect crossing a page boundary.568* When %r10 goes positive we have crossed a page 
boundary and569* need to do a nibble.570*/571lea 4(%rdi), %r10572and $0xfff, %r10 /* offset into 4K page */573sub $0x1000, %r10 /* subtract 4K pagesize */574575.p2align 4576L(loop_ashr_4):577add $16, %r10578jg L(nibble_ashr_4)579580L(gobble_ashr_4):581movdqa (%rsi, %rcx), %xmm1582movdqa (%rdi, %rcx), %xmm2583movdqa %xmm2, %xmm4584585palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */586587pcmpeqb %xmm1, %xmm0588pcmpeqb %xmm2, %xmm1589psubb %xmm0, %xmm1590pmovmskb %xmm1, %edx591sub $0xffff, %edx592jnz L(exit)593594#ifdef USE_AS_STRNCMP595sub $16, %r11596jbe L(strcmp_exitz)597#endif598599add $16, %rcx600movdqa %xmm4, %xmm3601602add $16, %r10603jg L(nibble_ashr_4) /* cross page boundary */604605movdqa (%rsi, %rcx), %xmm1606movdqa (%rdi, %rcx), %xmm2607movdqa %xmm2, %xmm4608609palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */610611pcmpeqb %xmm1, %xmm0612pcmpeqb %xmm2, %xmm1613psubb %xmm0, %xmm1614pmovmskb %xmm1, %edx615sub $0xffff, %edx616jnz L(exit)617618#ifdef USE_AS_STRNCMP619sub $16, %r11620jbe L(strcmp_exitz)621#endif622623add $16, %rcx624movdqa %xmm4, %xmm3625jmp L(loop_ashr_4)626627.p2align 4628L(nibble_ashr_4):629pcmpeqb %xmm3, %xmm0 /* check nibble for null char */630pmovmskb %xmm0, %edx631test $0xfff0, %edx632jnz L(ashr_4_exittail)633634#ifdef USE_AS_STRNCMP635cmp $11, %r11636jbe L(ashr_4_exittail)637#endif638639pxor %xmm0, %xmm0640sub $0x1000, %r10641jmp L(gobble_ashr_4)642643.p2align 4644L(ashr_4_exittail):645movdqa (%rsi, %rcx), %xmm1646psrldq $4, %xmm0647psrldq $4, %xmm3648jmp L(aftertail)649650/*651* The following cases will be handled by ashr_5652* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case653* n(11~15) n - 11 4(15 +(n-11) - n) ashr_5654*/655.p2align 4656L(ashr_5):657pxor %xmm0, %xmm0658movdqa (%rdi), %xmm2659movdqa (%rsi), %xmm1660pcmpeqb %xmm1, %xmm0661pslldq $11, %xmm2662pcmpeqb %xmm1, %xmm2663psubb %xmm0, %xmm2664pmovmskb %xmm2, %r9d665shr %cl, %edx666shr %cl, %r9d667sub %r9d, %edx668jnz 
L(less32bytes)669movdqa (%rdi), %xmm3670671UPDATE_STRNCMP_COUNTER672673pxor %xmm0, %xmm0674mov $16, %rcx /* index for loads */675mov $5, %r9d /* byte position left over from less32bytes case */676/*677* Setup %r10 value allows us to detect crossing a page boundary.678* When %r10 goes positive we have crossed a page boundary and679* need to do a nibble.680*/681lea 5(%rdi), %r10682and $0xfff, %r10 /* offset into 4K page */683sub $0x1000, %r10 /* subtract 4K pagesize */684685.p2align 4686L(loop_ashr_5):687add $16, %r10688jg L(nibble_ashr_5)689690L(gobble_ashr_5):691movdqa (%rsi, %rcx), %xmm1692movdqa (%rdi, %rcx), %xmm2693movdqa %xmm2, %xmm4694695palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */696697pcmpeqb %xmm1, %xmm0698pcmpeqb %xmm2, %xmm1699psubb %xmm0, %xmm1700pmovmskb %xmm1, %edx701sub $0xffff, %edx702jnz L(exit)703704#ifdef USE_AS_STRNCMP705sub $16, %r11706jbe L(strcmp_exitz)707#endif708709add $16, %rcx710movdqa %xmm4, %xmm3711712add $16, %r10713jg L(nibble_ashr_5) /* cross page boundary */714715movdqa (%rsi, %rcx), %xmm1716movdqa (%rdi, %rcx), %xmm2717movdqa %xmm2, %xmm4718719palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */720721pcmpeqb %xmm1, %xmm0722pcmpeqb %xmm2, %xmm1723psubb %xmm0, %xmm1724pmovmskb %xmm1, %edx725sub $0xffff, %edx726jnz L(exit)727728#ifdef USE_AS_STRNCMP729sub $16, %r11730jbe L(strcmp_exitz)731#endif732733add $16, %rcx734movdqa %xmm4, %xmm3735jmp L(loop_ashr_5)736737.p2align 4738L(nibble_ashr_5):739pcmpeqb %xmm3, %xmm0 /* check nibble for null char */740pmovmskb %xmm0, %edx741test $0xffe0, %edx742jnz L(ashr_5_exittail)743744#ifdef USE_AS_STRNCMP745cmp $10, %r11746jbe L(ashr_5_exittail)747#endif748749pxor %xmm0, %xmm0750sub $0x1000, %r10751jmp L(gobble_ashr_5)752753.p2align 4754L(ashr_5_exittail):755movdqa (%rsi, %rcx), %xmm1756psrldq $5, %xmm0757psrldq $5, %xmm3758jmp L(aftertail)759760/*761* The following cases will be handled by ashr_6762* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case763* 
n(10~15) n - 10 5(15 +(n-10) - n) ashr_6764*/765.p2align 4766L(ashr_6):767pxor %xmm0, %xmm0768movdqa (%rdi), %xmm2769movdqa (%rsi), %xmm1770pcmpeqb %xmm1, %xmm0771pslldq $10, %xmm2772pcmpeqb %xmm1, %xmm2773psubb %xmm0, %xmm2774pmovmskb %xmm2, %r9d775shr %cl, %edx776shr %cl, %r9d777sub %r9d, %edx778jnz L(less32bytes)779movdqa (%rdi), %xmm3780781UPDATE_STRNCMP_COUNTER782783pxor %xmm0, %xmm0784mov $16, %rcx /* index for loads */785mov $6, %r9d /* byte position left over from less32bytes case */786/*787* Setup %r10 value allows us to detect crossing a page boundary.788* When %r10 goes positive we have crossed a page boundary and789* need to do a nibble.790*/791lea 6(%rdi), %r10792and $0xfff, %r10 /* offset into 4K page */793sub $0x1000, %r10 /* subtract 4K pagesize */794795.p2align 4796L(loop_ashr_6):797add $16, %r10798jg L(nibble_ashr_6)799800L(gobble_ashr_6):801movdqa (%rsi, %rcx), %xmm1802movdqa (%rdi, %rcx), %xmm2803movdqa %xmm2, %xmm4804805palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */806807pcmpeqb %xmm1, %xmm0808pcmpeqb %xmm2, %xmm1809psubb %xmm0, %xmm1810pmovmskb %xmm1, %edx811sub $0xffff, %edx812jnz L(exit)813814#ifdef USE_AS_STRNCMP815sub $16, %r11816jbe L(strcmp_exitz)817#endif818819add $16, %rcx820movdqa %xmm4, %xmm3821822add $16, %r10823jg L(nibble_ashr_6) /* cross page boundary */824825movdqa (%rsi, %rcx), %xmm1826movdqa (%rdi, %rcx), %xmm2827movdqa %xmm2, %xmm4828829palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */830831pcmpeqb %xmm1, %xmm0832pcmpeqb %xmm2, %xmm1833psubb %xmm0, %xmm1834pmovmskb %xmm1, %edx835sub $0xffff, %edx836jnz L(exit)837838#ifdef USE_AS_STRNCMP839sub $16, %r11840jbe L(strcmp_exitz)841#endif842843add $16, %rcx844movdqa %xmm4, %xmm3845jmp L(loop_ashr_6)846847.p2align 4848L(nibble_ashr_6):849pcmpeqb %xmm3, %xmm0 /* check nibble for null char */850pmovmskb %xmm0, %edx851test $0xffc0, %edx852jnz L(ashr_6_exittail)853854#ifdef USE_AS_STRNCMP855cmp $9, %r11856jbe L(ashr_6_exittail)857#endif858859pxor %xmm0, %xmm0860sub 
$0x1000, %r10861jmp L(gobble_ashr_6)862863.p2align 4864L(ashr_6_exittail):865movdqa (%rsi, %rcx), %xmm1866psrldq $6, %xmm0867psrldq $6, %xmm3868jmp L(aftertail)869870/*871* The following cases will be handled by ashr_7872* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case873* n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7874*/875.p2align 4876L(ashr_7):877pxor %xmm0, %xmm0878movdqa (%rdi), %xmm2879movdqa (%rsi), %xmm1880pcmpeqb %xmm1, %xmm0881pslldq $9, %xmm2882pcmpeqb %xmm1, %xmm2883psubb %xmm0, %xmm2884pmovmskb %xmm2, %r9d885shr %cl, %edx886shr %cl, %r9d887sub %r9d, %edx888jnz L(less32bytes)889movdqa (%rdi), %xmm3890891UPDATE_STRNCMP_COUNTER892893pxor %xmm0, %xmm0894mov $16, %rcx /* index for loads */895mov $7, %r9d /* byte position left over from less32bytes case */896/*897* Setup %r10 value allows us to detect crossing a page boundary.898* When %r10 goes positive we have crossed a page boundary and899* need to do a nibble.900*/901lea 7(%rdi), %r10902and $0xfff, %r10 /* offset into 4K page */903sub $0x1000, %r10 /* subtract 4K pagesize */904905.p2align 4906L(loop_ashr_7):907add $16, %r10908jg L(nibble_ashr_7)909910L(gobble_ashr_7):911movdqa (%rsi, %rcx), %xmm1912movdqa (%rdi, %rcx), %xmm2913movdqa %xmm2, %xmm4914915palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */916917pcmpeqb %xmm1, %xmm0918pcmpeqb %xmm2, %xmm1919psubb %xmm0, %xmm1920pmovmskb %xmm1, %edx921sub $0xffff, %edx922jnz L(exit)923924#ifdef USE_AS_STRNCMP925sub $16, %r11926jbe L(strcmp_exitz)927#endif928929add $16, %rcx930movdqa %xmm4, %xmm3931932add $16, %r10933jg L(nibble_ashr_7) /* cross page boundary */934935movdqa (%rsi, %rcx), %xmm1936movdqa (%rdi, %rcx), %xmm2937movdqa %xmm2, %xmm4938939palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */940941pcmpeqb %xmm1, %xmm0942pcmpeqb %xmm2, %xmm1943psubb %xmm0, %xmm1944pmovmskb %xmm1, %edx945sub $0xffff, %edx946jnz L(exit)947948#ifdef USE_AS_STRNCMP949sub $16, %r11950jbe L(strcmp_exitz)951#endif952953add $16, %rcx954movdqa 
%xmm4, %xmm3955jmp L(loop_ashr_7)956957.p2align 4958L(nibble_ashr_7):959pcmpeqb %xmm3, %xmm0 /* check nibble for null char */960pmovmskb %xmm0, %edx961test $0xff80, %edx962jnz L(ashr_7_exittail)963964#ifdef USE_AS_STRNCMP965cmp $8, %r11966jbe L(ashr_7_exittail)967#endif968969pxor %xmm0, %xmm0970sub $0x1000, %r10971jmp L(gobble_ashr_7)972973.p2align 4974L(ashr_7_exittail):975movdqa (%rsi, %rcx), %xmm1976psrldq $7, %xmm0977psrldq $7, %xmm3978jmp L(aftertail)979980/*981* The following cases will be handled by ashr_8982* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case983* n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8984*/985.p2align 4986L(ashr_8):987pxor %xmm0, %xmm0988movdqa (%rdi), %xmm2989movdqa (%rsi), %xmm1990pcmpeqb %xmm1, %xmm0991pslldq $8, %xmm2992pcmpeqb %xmm1, %xmm2993psubb %xmm0, %xmm2994pmovmskb %xmm2, %r9d995shr %cl, %edx996shr %cl, %r9d997sub %r9d, %edx998jnz L(less32bytes)999movdqa (%rdi), %xmm310001001UPDATE_STRNCMP_COUNTER10021003pxor %xmm0, %xmm01004mov $16, %rcx /* index for loads */1005mov $8, %r9d /* byte position left over from less32bytes case */1006/*1007* Setup %r10 value allows us to detect crossing a page boundary.1008* When %r10 goes positive we have crossed a page boundary and1009* need to do a nibble.1010*/1011lea 8(%rdi), %r101012and $0xfff, %r10 /* offset into 4K page */1013sub $0x1000, %r10 /* subtract 4K pagesize */10141015.p2align 41016L(loop_ashr_8):1017add $16, %r101018jg L(nibble_ashr_8)10191020L(gobble_ashr_8):1021movdqa (%rsi, %rcx), %xmm11022movdqa (%rdi, %rcx), %xmm21023movdqa %xmm2, %xmm410241025palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */10261027pcmpeqb %xmm1, %xmm01028pcmpeqb %xmm2, %xmm11029psubb %xmm0, %xmm11030pmovmskb %xmm1, %edx1031sub $0xffff, %edx1032jnz L(exit)10331034#ifdef USE_AS_STRNCMP1035sub $16, %r111036jbe L(strcmp_exitz)1037#endif10381039add $16, %rcx1040movdqa %xmm4, %xmm310411042add $16, %r101043jg L(nibble_ashr_8) /* cross page boundary */10441045movdqa (%rsi, %rcx), 
%xmm11046movdqa (%rdi, %rcx), %xmm21047movdqa %xmm2, %xmm410481049palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */10501051pcmpeqb %xmm1, %xmm01052pcmpeqb %xmm2, %xmm11053psubb %xmm0, %xmm11054pmovmskb %xmm1, %edx1055sub $0xffff, %edx1056jnz L(exit)10571058#ifdef USE_AS_STRNCMP1059sub $16, %r111060jbe L(strcmp_exitz)1061#endif10621063add $16, %rcx1064movdqa %xmm4, %xmm31065jmp L(loop_ashr_8)10661067.p2align 41068L(nibble_ashr_8):1069pcmpeqb %xmm3, %xmm0 /* check nibble for null char */1070pmovmskb %xmm0, %edx1071test $0xff00, %edx1072jnz L(ashr_8_exittail)10731074#ifdef USE_AS_STRNCMP1075cmp $7, %r111076jbe L(ashr_8_exittail)1077#endif10781079pxor %xmm0, %xmm01080sub $0x1000, %r101081jmp L(gobble_ashr_8)10821083.p2align 41084L(ashr_8_exittail):1085movdqa (%rsi, %rcx), %xmm11086psrldq $8, %xmm01087psrldq $8, %xmm31088jmp L(aftertail)10891090/*1091* The following cases will be handled by ashr_91092* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case1093* n(7~15) n - 7 8(15 +(n - 7) - n) ashr_91094*/1095.p2align 41096L(ashr_9):1097pxor %xmm0, %xmm01098movdqa (%rdi), %xmm21099movdqa (%rsi), %xmm11100pcmpeqb %xmm1, %xmm01101pslldq $7, %xmm21102pcmpeqb %xmm1, %xmm21103psubb %xmm0, %xmm21104pmovmskb %xmm2, %r9d1105shr %cl, %edx1106shr %cl, %r9d1107sub %r9d, %edx1108jnz L(less32bytes)1109movdqa (%rdi), %xmm311101111UPDATE_STRNCMP_COUNTER11121113pxor %xmm0, %xmm01114mov $16, %rcx /* index for loads */1115mov $9, %r9d /* byte position left over from less32bytes case */1116/*1117* Setup %r10 value allows us to detect crossing a page boundary.1118* When %r10 goes positive we have crossed a page boundary and1119* need to do a nibble.1120*/1121lea 9(%rdi), %r101122and $0xfff, %r10 /* offset into 4K page */1123sub $0x1000, %r10 /* subtract 4K pagesize */11241125.p2align 41126L(loop_ashr_9):1127add $16, %r101128jg L(nibble_ashr_9)11291130L(gobble_ashr_9):1131movdqa (%rsi, %rcx), %xmm11132movdqa (%rdi, %rcx), %xmm21133movdqa %xmm2, 
%xmm411341135palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */11361137pcmpeqb %xmm1, %xmm01138pcmpeqb %xmm2, %xmm11139psubb %xmm0, %xmm11140pmovmskb %xmm1, %edx1141sub $0xffff, %edx1142jnz L(exit)11431144#ifdef USE_AS_STRNCMP1145sub $16, %r111146jbe L(strcmp_exitz)1147#endif11481149add $16, %rcx1150movdqa %xmm4, %xmm311511152add $16, %r101153jg L(nibble_ashr_9) /* cross page boundary */11541155movdqa (%rsi, %rcx), %xmm11156movdqa (%rdi, %rcx), %xmm21157movdqa %xmm2, %xmm411581159palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */11601161pcmpeqb %xmm1, %xmm01162pcmpeqb %xmm2, %xmm11163psubb %xmm0, %xmm11164pmovmskb %xmm1, %edx1165sub $0xffff, %edx1166jnz L(exit)11671168#ifdef USE_AS_STRNCMP1169sub $16, %r111170jbe L(strcmp_exitz)1171#endif11721173add $16, %rcx1174movdqa %xmm4, %xmm3 /* store for next cycle */1175jmp L(loop_ashr_9)11761177.p2align 41178L(nibble_ashr_9):1179pcmpeqb %xmm3, %xmm0 /* check nibble for null char */1180pmovmskb %xmm0, %edx1181test $0xfe00, %edx1182jnz L(ashr_9_exittail)11831184#ifdef USE_AS_STRNCMP1185cmp $6, %r111186jbe L(ashr_9_exittail)1187#endif11881189pxor %xmm0, %xmm01190sub $0x1000, %r101191jmp L(gobble_ashr_9)11921193.p2align 41194L(ashr_9_exittail):1195movdqa (%rsi, %rcx), %xmm11196psrldq $9, %xmm01197psrldq $9, %xmm31198jmp L(aftertail)11991200/*1201* The following cases will be handled by ashr_101202* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case1203* n(6~15) n - 6 9(15 +(n - 6) - n) ashr_101204*/1205.p2align 41206L(ashr_10):1207pxor %xmm0, %xmm01208movdqa (%rdi), %xmm21209movdqa (%rsi), %xmm11210pcmpeqb %xmm1, %xmm01211pslldq $6, %xmm21212pcmpeqb %xmm1, %xmm21213psubb %xmm0, %xmm21214pmovmskb %xmm2, %r9d1215shr %cl, %edx1216shr %cl, %r9d1217sub %r9d, %edx1218jnz L(less32bytes)1219movdqa (%rdi), %xmm312201221UPDATE_STRNCMP_COUNTER12221223pxor %xmm0, %xmm01224mov $16, %rcx /* index for loads */1225mov $10, %r9d /* byte position left over from less32bytes case */1226/*1227* Setup %r10 
value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_10):
	add	$16, %r10
	jg	L(nibble_ashr_10)

L(gobble_ashr_10):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw %rdi block for the next merge */

	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* non-zero: some byte differs or is null */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_10)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_10)

	.p2align 4
L(nibble_ashr_10):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfc00, %edx
	jnz	L(ashr_10_exittail)

#ifdef USE_AS_STRNCMP
	/*
	 * Bytes 10..15 of %xmm3 are the 6 unconsumed %rdi bytes left on this
	 * page.  If the remaining count fits in them, finish from the tail
	 * instead of loading the next %rdi block across the page boundary.
	 */
	cmp	$6, %r11
	jbe	L(ashr_10_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_10)

	.p2align 4
L(ashr_10_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$10, %xmm0
	psrldq	$10, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_11
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
 *       n(5~15)             n - 5          10(15 +(n - 5) - n)        ashr_11
 */
	.p2align 4
L(ashr_11):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$5, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$11, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_11):
	add	$16, %r10
	jg	L(nibble_ashr_11)

L(gobble_ashr_11):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw %rdi block for the next merge */

	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* non-zero: some byte differs or is null */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_11)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_11)

	.p2align 4
L(nibble_ashr_11):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf800, %edx
	jnz	L(ashr_11_exittail)

#ifdef USE_AS_STRNCMP
	/*
	 * Bytes 11..15 of %xmm3 are the 5 unconsumed %rdi bytes left on this
	 * page.  If the remaining count fits in them, finish from the tail
	 * instead of loading the next %rdi block across the page boundary.
	 */
	cmp	$5, %r11
	jbe	L(ashr_11_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_11)

	.p2align 4
L(ashr_11_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$11, %xmm0
	psrldq	$11, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_12
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
 *       n(4~15)             n - 4          11(15 +(n - 4) - n)        ashr_12
 */
	.p2align 4
L(ashr_12):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$4, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$12, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_12):
	add	$16, %r10
	jg	L(nibble_ashr_12)

L(gobble_ashr_12):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw %rdi block for the next merge */

	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* non-zero: some byte differs or is null */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_12)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_12)

	.p2align 4
L(nibble_ashr_12):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xf000, %edx
	jnz	L(ashr_12_exittail)

#ifdef USE_AS_STRNCMP
	/*
	 * Bytes 12..15 of %xmm3 are the 4 unconsumed %rdi bytes left on this
	 * page.  If the remaining count fits in them, finish from the tail
	 * instead of loading the next %rdi block across the page boundary.
	 */
	cmp	$4, %r11
	jbe	L(ashr_12_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_12)

	.p2align 4
L(ashr_12_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$12, %xmm0
	psrldq	$12, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_13
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
 *       n(3~15)             n - 3          12(15 +(n - 3) - n)        ashr_13
 */
	.p2align 4
L(ashr_13):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$3, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$13, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_13):
	add	$16, %r10
	jg	L(nibble_ashr_13)

L(gobble_ashr_13):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw %rdi block for the next merge */

	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* non-zero: some byte differs or is null */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_13)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_13)

	.p2align 4
L(nibble_ashr_13):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xe000, %edx
	jnz	L(ashr_13_exittail)

#ifdef USE_AS_STRNCMP
	/*
	 * Bytes 13..15 of %xmm3 are the 3 unconsumed %rdi bytes left on this
	 * page.  If the remaining count fits in them, finish from the tail
	 * instead of loading the next %rdi block across the page boundary.
	 */
	cmp	$3, %r11
	jbe	L(ashr_13_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_13)

	.p2align 4
L(ashr_13_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$13, %xmm0
	psrldq	$13, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_14
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
 *       n(2~15)             n - 2          13(15 +(n - 2) - n)        ashr_14
 */
	.p2align 4
L(ashr_14):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$2, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$14, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_14):
	add	$16, %r10
	jg	L(nibble_ashr_14)

L(gobble_ashr_14):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw %rdi block for the next merge */

	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* non-zero: some byte differs or is null */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_14)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_14)

	.p2align 4
L(nibble_ashr_14):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xc000, %edx
	jnz	L(ashr_14_exittail)

#ifdef USE_AS_STRNCMP
	/*
	 * Bytes 14..15 of %xmm3 are the 2 unconsumed %rdi bytes left on this
	 * page.  If the remaining count fits in them, finish from the tail
	 * instead of loading the next %rdi block across the page boundary.
	 */
	cmp	$2, %r11
	jbe	L(ashr_14_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_14)

	.p2align 4
L(ashr_14_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$14, %xmm0
	psrldq	$14, %xmm3
	jmp	L(aftertail)

/*
 * The following cases will be handled by ashr_15
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset       corresponding case
 *       n(1~15)             n - 1          14(15 +(n - 1) - n)        ashr_15
 */
	.p2align 4
L(ashr_15):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$1, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	L(less32bytes)

	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$15, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */

	sub	$0x1000, %r10	/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_15):
	add	$16, %r10
	jg	L(nibble_ashr_15)

L(gobble_ashr_15):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw %rdi block for the next merge */

	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* non-zero: some byte differs or is null */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_15)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_15)

	.p2align 4
L(nibble_ashr_15):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0x8000, %edx
	jnz	L(ashr_15_exittail)

#ifdef USE_AS_STRNCMP
	/*
	 * Byte 15 of %xmm3 is the single unconsumed %rdi byte left on this
	 * page.  If the remaining count fits in it, finish from the tail
	 * instead of loading the next %rdi block across the page boundary.
	 */
	cmp	$1, %r11
	jbe	L(ashr_15_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10
	jmp	L(gobble_ashr_15)

	.p2align 4
L(ashr_15_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$15, %xmm3
	psrldq	$15, %xmm0
	/* falls through into L(aftertail) */

	.p2align 4
L(aftertail):
	/*
	 * Shared tail for every ashr_N_exittail: %xmm1 holds the %rsi block,
	 * %xmm3 the leftover %rdi bytes shifted down to byte 0, and %xmm0 the
	 * null mask shifted the same way.
	 */
	pcmpeqb	%xmm3, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	not	%edx			/* set bits now mark a mismatch or a flagged null */

	.p2align 4
L(exit):
	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
L(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	L(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

	.p2align 4
L(ret):
L(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

#ifdef USE_AS_STRNCMP
	sub	%rdx, %r11		/* index at or beyond the remaining count: equal */
	jbe	L(strcmp_exitz)
#endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

	sub	%ecx, %eax		/* result = byte from rdi - byte from rsi */
	ret

L(strcmp_exitz):
	xor	%eax, %eax		/* return 0 */
	ret

	.p2align 4
L(Byte0):
	/* Compare only the first byte of each operand. */
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

	sub	%ecx, %eax
	ret
END (STRCMP)

	.section .rodata,"a",@progbits
	.p2align 3
/*
 * Offsets of the L(ashr_N) handlers, each stored relative to the start of
 * this table.
 */
L(unaligned_table):
	.int	L(ashr_1) - L(unaligned_table)
	.int	L(ashr_2) - L(unaligned_table)
	.int	L(ashr_3) - L(unaligned_table)
	.int	L(ashr_4) - L(unaligned_table)
	.int	L(ashr_5) - L(unaligned_table)
	.int	L(ashr_6) - L(unaligned_table)
	.int	L(ashr_7) - L(unaligned_table)
	.int	L(ashr_8) - L(unaligned_table)
	.int	L(ashr_9) - L(unaligned_table)
	.int	L(ashr_10) - L(unaligned_table)
	.int	L(ashr_11) - L(unaligned_table)
	.int	L(ashr_12) - L(unaligned_table)
	.int	L(ashr_13) - L(unaligned_table)
	.int	L(ashr_14) - L(unaligned_table)
	.int	L(ashr_15) - L(unaligned_table)
	.int	L(ashr_0) - L(unaligned_table)