/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memchr.S
 *
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Finds characters in a memory area.  Optimized for the Alpha:
 *
 *    - memory accessed as aligned quadwords only
 *    - uses cmpbge to compare 8 bytes in parallel
 *    - does binary search to find 0 byte in last
 *      quadword (HAKMEM needed 12 instructions to
 *      do this instead of the 9 instructions that
 *      binary search needs).
 *
 * For correctness consider that:
 *
 *    - only minimum number of quadwords may be accessed
 *    - the third argument is an unsigned long
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */
#include <linux/export.h>
	.set noreorder
	.set noat

/*
 * void *memchr(const void *s, int c, size_t n)
 *
 * In:    $16 = address of the first byte to search
 *        $17 = byte to look for (only the low 8 bits are used; the
 *              register is overwritten with that byte replicated
 *              into all eight byte lanes)
 *        $18 = number of bytes to search (overwritten)
 * Out:   $0  = address of the first matching byte, or 0 if no byte
 *              in the area matches (including n == 0)
 * Uses:  $1-$7 as scratch.  Leaf routine with a zero-size frame;
 *        returns through $26.
 *
 * Matching technique: each quadword is XORed with the replicated
 * search byte so that a matching byte becomes 0x00, then
 * "cmpbge $31, x" produces one result bit per byte that is zero.
 * Bytes outside the requested range are masked out of that result
 * before it is tested.
 */
	.align	4
	.globl memchr
	.ent memchr
memchr:
	.frame $30,0,$26,0
	.prologue 0

	# Hack -- if someone passes in (size_t)-1, hoping to just
	# search til the end of the address space, we will overflow
	# below when we find the address of the last byte.  Given
	# that we will never have a 56-bit address space, cropping
	# the length is the easiest way to avoid trouble.
	zap	$18, 0x80, $5	# U : Bound length (clear the top byte)
	beq	$18, $not_found	# U : zero length: nothing to search
	ldq_u	$1, 0($16)	# L : load first quadword Latency=3
	and	$17, 0xff, $17	# E : L L U U : 00000000000000ch

	insbl	$17, 1, $2	# U : 000000000000ch00
	cmpult	$18, 9, $4	# E : small (<= 1 quad) string?
	or	$2, $17, $17	# E : 000000000000chch
	lda	$3, -1($31)	# E : U L L U : $3 = all ones

	sll	$17, 16, $2	# U : 00000000chch0000
	addq	$16, $5, $5	# E : Max search address
	or	$2, $17, $17	# E : 00000000chchchch
	sll	$17, 32, $2	# U : U L L U : chchchch00000000

	or	$2, $17, $17	# E : chchchchchchchch
	extql	$1, $16, $7	# U : $7 is upper bits
	beq	$4, $first_quad	# U :
	ldq_u	$6, -1($5)	# L : L U U L : eight or less bytes to search Latency=3

	extqh	$6, $16, $6	# U : 2 cycle stall for $6
	mov	$16, $0		# E :
	nop			# E :
	or	$7, $6, $1	# E : L U L U $1 = quadword starting at $16

	# Deal with the case where at most 8 bytes remain to be searched
	# in $1.  E.g.:
	#	$18 = 6
	#	$1 = ????c6c5c4c3c2c1
$last_quad:
	negq	$18, $6		# E :
	xor	$17, $1, $1	# E :
	srl	$3, $6, $6	# U : $6 = mask of $18 bits set
	cmpbge	$31, $1, $2	# E : L U L U

	nop
	nop
	and	$2, $6, $2	# E : ignore bytes beyond the last one requested
	beq	$2, $not_found	# U : U L U L

	# On entry $0 is the address of byte 0 of the quadword that was
	# tested and $2 has a bit set for every matching byte in it.  The
	# lowest set bit gives the offset of the first match.
$found_it:
#ifdef CONFIG_ALPHA_EV67
	/*
	 * Since we are guaranteed to have set one of the bits, we don't
	 * have to worry about coming back with a 0x40 out of cttz...
	 */
	cttz	$2, $3		# U0 :
	addq	$0, $3, $0	# E : All done
	nop			# E :
	ret			# L0 : L U L U
#else
	/*
	 * Slow and clunky.  It can probably be improved.
	 * An exercise left for others.
	 *
	 * Isolate the lowest set bit of $2, then binary-search its
	 * position: add 4, 2 and 1 to $0 whenever the bit is not in
	 * the 0x0f, 0x33 and 0x55 masks respectively.
	 */
	negq	$2, $3		# E :
	and	$2, $3, $2	# E : $2 = lowest set bit only
	and	$2, 0x0f, $1	# E :
	addq	$0, 4, $3	# E :

	cmoveq	$1, $3, $0	# E : Latency 2, extra map cycle
	nop			# E : keep with cmov
	and	$2, 0x33, $1	# E :
	addq	$0, 2, $3	# E : U L U L : 2 cycle stall on $0

	cmoveq	$1, $3, $0	# E : Latency 2, extra map cycle
	nop			# E : keep with cmov
	and	$2, 0x55, $1	# E :
	addq	$0, 1, $3	# E : U L U L : 2 cycle stall on $0

	cmoveq	$1, $3, $0	# E : Latency 2, extra map cycle
	nop
	nop
	ret			# L0 : L U L U
#endif

	# Deal with the case where $18 > 8 bytes remain to be
	# searched.  $16 may not be aligned.
	.align 4
$first_quad:
	andnot	$16, 0x7, $0	# E : $0 = aligned address of first quad
	insqh	$3, $16, $2	# U : $2 = 0000ffffffffffff ($16<0:2> ff)
	xor	$1, $17, $1	# E :
	or	$1, $2, $1	# E : U L U L $1 = ====ffffffffffff

	cmpbge	$31, $1, $2	# E :
	bne	$2, $found_it	# U :
	# At least one byte left to process.
	ldq	$1, 8($0)	# L :
	subq	$5, 1, $18	# E : U L U L

	addq	$0, 8, $0	# E :
	# Make $18 point to last quad to be accessed (the
	# last quad may or may not be partial).
	andnot	$18, 0x7, $18	# E :
	cmpult	$0, $18, $2	# E :
	beq	$2, $final	# U : U L U L

	# At least two quads remain to be accessed.

	subq	$18, $0, $4	# E : $4 <- nr quads to be processed
	and	$4, 8, $4	# E : odd number of quads?
	bne	$4, $odd_quad_count # U :
	# At least three quads remain to be accessed
	mov	$1, $4		# E : L U L U : move prefetched value to correct reg

	.align	4
$unrolled_loop:
	ldq	$1, 8($0)	# L : prefetch $1
	xor	$17, $4, $2	# E :
	cmpbge	$31, $2, $2	# E :
	bne	$2, $found_it	# U : U L U L

	addq	$0, 8, $0	# E :
	nop			# E :
	nop			# E :
	nop			# E :

$odd_quad_count:
	xor	$17, $1, $2	# E :
	ldq	$4, 8($0)	# L : prefetch $4
	cmpbge	$31, $2, $2	# E :
	addq	$0, 8, $6	# E :

	bne	$2, $found_it	# U :
	cmpult	$6, $18, $6	# E :
	addq	$0, 8, $0	# E :
	nop			# E :

	bne	$6, $unrolled_loop # U :
	mov	$4, $1		# E : move prefetched value into $1
	nop			# E :
	nop			# E :

$final:	subq	$5, $0, $18	# E : $18 <- number of bytes left to do
	nop			# E :
	nop			# E :
	bne	$18, $last_quad	# U :

$not_found:
	mov	$31, $0		# E : return 0 (no match)
	nop			# E :
	nop			# E :
	ret			# L0 :

	.end memchr
	EXPORT_SYMBOL(memchr)