/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev67-strrchr.S
 * 21264 version by Rick Gorton <[email protected]>
 *
 * Finds the last occurrence of a character in a 0-terminated
 * string.  Optimized for the Alpha architecture:
 *
 *	- memory accessed as aligned quadwords only
 *	- uses cmpbge to compare 8 bytes in parallel
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 */

#include <linux/export.h>
#include <asm/regdef.h>

	.set noreorder
	.set noat

/*
 * char *strrchr(const char *s, int c)
 *
 * In:	a0 = s
 *	a1 = c (only the low byte is used)
 * Out:	v0 = address of the last byte equal to c that lies at or before
 *	     the terminating NUL, or 0 if there is no such byte.  The
 *	     terminator takes part in the comparison, so c == 0 returns
 *	     the address of the terminator.
 *
 * Register roles:
 *	a1	c replicated into all eight byte lanes
 *	v0	aligned address of the quadword being examined
 *	t0	quadword being examined
 *	t1	cmpbge mask, bit i set iff byte i of t0 is NUL
 *	t3	cmpbge mask, bit i set iff byte i of t0 equals c
 *	t6	aligned address of the last quadword with a match (0 = none)
 *	t8	match mask of that quadword (0 = none)
 *	t2, t4, t5	scratch
 *
 * Leaf routine: no stack frame, ra and a0 are left untouched.
 *
 * The instruction order and the nops follow the scheduling annotations
 * in the trailing comments; keep them in place when editing.
 */
	.align 4
	.ent strrchr
	.globl strrchr
strrchr:
	.frame sp, 0, ra
	.prologue 0

	/*
	 * Replicate c into every byte of a1 while the first quadword is
	 * being loaded.  ldq_u ignores the low three address bits, so t0
	 * may start with bytes that precede s.
	 */
	and	a1, 0xff, t2	# E : 00000000000000ch
	insbl	a1, 1, t4	# U : 000000000000ch00
	insbl	a1, 2, t5	# U : 0000000000ch0000
	ldq_u	t0, 0(a0)	# L : load first quadword Latency=3

	mov	zero, t6	# E : t6 is last match aligned addr
	or	t2, t4, a1	# E : 000000000000chch
	sll	t5, 8, t3	# U : 00000000ch000000
	mov	zero, t8	# E : t8 is last match byte compare mask

	andnot	a0, 7, v0	# E : align source addr
	or	t5, t3, t3	# E : 00000000chch0000
	sll	a1, 32, t2	# U : 0000chch00000000
	sll	a1, 48, t4	# U : chch000000000000

	or	t4, a1, a1	# E : chch00000000chch
	or	t2, t3, t2	# E : 0000chchchch0000
	or	a1, t2, a1	# E : chchchchchchchch
	lda	t5, -1		# E : build garbage mask

	/*
	 * Garbage mask: mskqh clears the low (a0 & 7) bytes of the all-ones
	 * value, cmpbge against zero then sets exactly those bits.  Both the
	 * NUL mask and the match mask of the first quadword are stripped of
	 * these lanes so bytes in front of s are never reported.
	 */
	cmpbge	zero, t0, t1	# E : bits set iff byte == zero
	mskqh	t5, a0, t4	# E : Complete garbage mask
	xor	t0, a1, t2	# E : make bytes == c zero
	cmpbge	zero, t4, t4	# E : bits set iff byte is garbage

	cmpbge	zero, t2, t3	# E : bits set iff byte == c
	andnot	t1, t4, t1	# E : clear garbage from null test
	andnot	t3, t4, t3	# E : clear garbage from char test
	bne	t1, $eos	# U : did we already hit the terminator?

	/*
	 * Character search main loop.
	 * On entry t3 is the match mask of the quadword at v0, which is
	 * known to hold no NUL.  Record it (if non-zero) as the latest
	 * match, then step to the next quadword.
	 */
$loop:
	ldq	t0, 8(v0)	# L : load next quadword
	cmovne	t3, v0, t6	# E : save previous comparisons match
	nop			#   : Latency=2, extra map slot (keep nop with cmov)
	nop

	cmovne	t3, t3, t8	# E : Latency=2, extra map slot
	nop			#   : keep with cmovne
	addq	v0, 8, v0	# E : advance to the quadword just loaded
	xor	t0, a1, t2	# E : make bytes == c zero

	cmpbge	zero, t0, t1	# E : bits set iff byte == zero
	cmpbge	zero, t2, t3	# E : bits set iff byte == c
	beq	t1, $loop	# U : if we have not seen a null, loop
	nop

	/*
	 * Mask out character matches after terminator.
	 * t1 & -t1 isolates the lowest set bit (the first NUL); OR-ing in
	 * that value minus one gives a mask of every lane up to and
	 * including the terminator.
	 */
$eos:
	negq	t1, t4		# E : isolate first null byte match
	and	t1, t4, t4	# E :
	subq	t4, 1, t5	# E : build a mask of the bytes up to...
	or	t4, t5, t4	# E : ... and including the null

	and	t3, t4, t3	# E : mask out char matches after null
	cmovne	t3, t3, t8	# E : save it, if match found Latency=2, extra map slot
	nop			#   : Keep with cmovne
	nop

	cmovne	t3, v0, t6	# E : and its quadword address
	nop			#   : Keep with cmovne
	/*
	 * Locate the address of the last matched character.
	 * t8 only has bits 0..7 set, so 63 - ctlz(t8) is the byte offset of
	 * the highest matching lane.  With no match t8 and t6 are both 0;
	 * the count is forced to 0x3f so the offset is 0 and v0 becomes 0.
	 */
	ctlz	t8, t2		# U0 : Latency=3 (0x40 for t8=0)
	nop

	cmoveq	t8, 0x3f, t2	# E : Compensate for case when no match is seen
	nop			# E : hide the cmov latency (2) behind ctlz latency
	lda	t5, 0x3f(zero)	# E : t5 = 63
	subq	t5, t2, t5	# E : Normalize leading zero count

	addq	t6, t5, v0	# E : and add to quadword address
	ret			# L0 : Latency=3
	nop
	nop

	.end strrchr
	EXPORT_SYMBOL(strrchr)