/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Strahinja Stanisic <[email protected]>
 */

#include <machine/asm.h>

/*
 * a0 - const void *b
 * a1 - int c
 * a2 - size_t len
 */
ENTRY(memchr)
	/*
	 * a0 - const char *ptr
	 * a1 - char cccccccc[8]
	 * a2 - char iter[8]
	 * a3 - uint8_t *end
	 * a4 - uint64_t *end_align
	 * a5 - uint64_t *end_unroll
	 */

	beqz a2, .Lno_match

	/* c = (uint8_t) c */
	andi a1, a1, 0xFF

	/*
	 * t0 = 0x0101010101010101
	 * t1 = 0x8080808080808080
	 * t2 = b << 3
	 * cccccccc = (uint8_t)c * t0
	 * end = b + len;
	 * end = (b < end) ? end : NULL
	 * ptr = b & ~0b111
	 */
	add a3, a0, a2
	li t0, 0x01010101
	sltu t2, a0, a3
	slli t1, t0, 32
	neg t2, t2
	or t0, t0, t1
	and a3, a3, t2
	slli t1, t0, 7
	slli t2, a0, 3
	andi a0, a0, ~0b111
	mul a1, t0, a1

	ld a2, (a0)

	/*
	 * mask_start = REP8_0x01 ^ (REP8_0x01 << t2)
	 * iter = iter ^ cccccccc
	 * iter = iter | mask_start
	 *
	 * mask_start holds 0x01 in every byte of the first word that
	 * precedes `b`, so those bytes can never look like a match.
	 */
	sll t2, t0, t2
	xor a2, a2, a1
	xor t2, t2, t0
	or a2, a2, t2

	/* has_zero(iter)
	 * end_align = (end + 7) & ~0b111;
	 */
	addi a4, a3, 7
	not t2, a2
	sub a2, a2, t0
	and t2, t2, t1
	andi a4, a4, ~0b111
	and a2, a2, t2

	/* ptr = ptr + 8 */
	addi a0, a0, 8

	bnez a2, .Lfind_zero

	/* if(ptr == end_align) */
	beq a0, a4, .Lno_match

	/* end_unroll = end_align & ~0b1111 */
	andi a5, a4, ~0b1111

	/*
	 * Instead of branching to check if `ptr` is 16-byte aligned:
	 * - Probe the next 8 bytes for `c`
	 * - Align `ptr` down to the nearest 16-byte boundary
	 *
	 * If `ptr` was already 16-byte aligned, those 8 bytes will be
	 * checked again inside the unrolled loop.
	 *
	 * This removes an unpredictable branch and improves performance.
	 */

	ld a2, (a0)
	xor a2, a2, a1

	not t2, a2
	sub a2, a2, t0
	and t2, t2, t1
	and a2, a2, t2

	addi a0, a0, 8

	bnez a2, .Lfind_zero

	andi a0, a0, ~0b1111

	/* while(ptr != end_unroll) */
	beq a0, a5, .Lskip_loop
.Lloop:
	ld a2, (a0)
	ld t3, 8(a0)

	xor a2, a2, a1
	xor t3, t3, a1

	not t2, a2
	not t4, t3
	sub a2, a2, t0
	sub t3, t3, t0
	and t2, t2, t1
	and t4, t4, t1
	and a2, a2, t2
	and t3, t3, t4

	addi a0, a0, 8

	bnez a2, .Lfind_zero

	/* move into iter for find_zero */
	mv a2, t3

	addi a0, a0, 8

	bnez a2, .Lfind_zero

	bne a0, a5, .Lloop
.Lskip_loop:

	/* there might be one 8-byte word left */
	beq a0, a4, .Lno_match

	ld a2, (a0)
	xor a2, a2, a1

	not t2, a2
	sub a2, a2, t0
	and t2, t2, t1
	and a2, a2, t2

	addi a0, a0, 8

	beqz a2, .Lno_match

.Lfind_zero:
	/*
	 * ptr = ptr - 8
	 * t1 = 0x0001020304050607
	 * iter = iter & (-iter)
	 * iter = iter >> 7
	 * iter = iter * t1
	 * iter = iter >> 56
	 *
	 * iter & (-iter) keeps only the marker of the first matching
	 * byte; after the shift it sits at bit 8*i, and the multiply
	 * moves the byte index i into the top byte.
	 */
	li t1, 0x10203000
	neg t0, a2
	slli t1, t1, 4
	and a2, a2, t0
	addi t1, t1, 0x405
	srli a2, a2, 7
	slli t1, t1, 16
	addi a0, a0, -8
	addi t1, t1, 0x607
	mul a2, a2, t1
	srli a2, a2, 56

	/* left = end - ptr */
	sub t0, a3, a0

	/* return iter < left ? ptr + iter : NULL */
	sltu t1, a2, t0
	neg t1, t1
	add a0, a0, a2
	and a0, a0, t1
	ret

.Lno_match:
	li a0, 0
	ret
END(memchr)
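
/*
 * For reference, a minimal C sketch of the word-at-a-time technique used
 * above, assuming a little-endian RV64 target. REP8_0x01 and REP8_0x80
 * follow the names used in the comments above; has_zero() and
 * index_of_match() are illustrative names, not libc interfaces.
 *
 *	#include <stdint.h>
 *
 *	#define REP8_0x01 0x0101010101010101ULL
 *	#define REP8_0x80 0x8080808080808080ULL
 *
 *	// Nonzero iff some byte of x is 0x00. The lowest 0x80 marker
 *	// always sits at the first zero byte; markers above it may be
 *	// spurious due to borrows, which is why .Lfind_zero isolates
 *	// the lowest one.
 *	static uint64_t has_zero(uint64_t x)
 *	{
 *		return (x - REP8_0x01) & ~x & REP8_0x80;
 *	}
 *
 *	// Given z = has_zero(word ^ cccccccc) != 0, return the byte
 *	// index of the first match, mirroring .Lfind_zero above.
 *	static unsigned index_of_match(uint64_t z)
 *	{
 *		z &= (uint64_t)0 - z;	// isolate the lowest 0x80 marker
 *		z >>= 7;		// move it to bit 8*i
 *		return (unsigned)((z * 0x0001020304050607ULL) >> 56);
 *	}
 *
 * Example: for word = 0x1122334455667788 and c = 0x55, the xor zeroes
 * byte 3, has_zero() leaves a single marker at bit 31, and
 * index_of_match() returns 3.
 */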