Path: blob/main/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-opt-memset-memchr.S
213799 views
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memset and memchr functions.
///
/// These implementations depend on unaligned access and floating-point support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//

#include "../assembly.h"

//
// __arm_sc_memset
//
// In:       x0 (dstin) = destination address
//           w1 (valw)  = fill value; only its low byte is stored
//           x2 (count) = number of bytes to set
// Out:      x0 is never written, so it still holds the destination address
// Clobbers: x1-x5, v0 (z0 when built with SVE), condition flags
//
// Every size class writes from both ends of the buffer (dstin forwards and
// dstend2 backwards) with stores that are allowed to overlap, so no
// byte-granular tail loop is needed.
//

#define dstin    x0
#define val      x1
#define valw     w1
#define count    x2
#define dst      x3
#define dstend2  x4
#define zva_val  x5

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
        /* Replicate the fill byte across all 16 bytes of q0. */
#ifdef __ARM_FEATURE_SVE
        mov     z0.b, valw
#else
        bfi     valw, valw, #8, #8
        bfi     valw, valw, #16, #16
        bfi     val, val, #32, #32
        fmov    d0, val
        fmov    v0.d[1], val
#endif
        add     dstend2, dstin, count   /* One past the last byte to set. */

        cmp     count, 96
        b.hi    7f                      // set_long
        cmp     count, 16
        b.hs    4f                      // set_medium
        /* Both build variants continue with the replicated byte in val. */
        mov     val, v0.D[0]

        /* Set 0..15 bytes. */
        tbz     count, 3, 1f
        /* 8..15 bytes: two 8-byte stores that may overlap. */
        str     val, [dstin]
        str     val, [dstend2, -8]
        ret
        nop                             /* NOTE(review): presumably padding - confirm. */
1:      tbz     count, 2, 2f
        /* 4..7 bytes: two 4-byte stores that may overlap. */
        str     valw, [dstin]
        str     valw, [dstend2, -4]
        ret
2:      cbz     count, 3f
        /* 1..3 bytes: first byte, then the last two when count >= 2. */
        strb    valw, [dstin]
        tbz     count, 1, 3f
        strh    valw, [dstend2, -2]
3:      ret

        /* Set 16..96 bytes. */
4:      // set_medium
        str     q0, [dstin]
        tbnz    count, 6, 6f            // set96
        str     q0, [dstend2, -16]
        tbz     count, 5, 5f
        str     q0, [dstin, 16]
        str     q0, [dstend2, -32]
5:      ret

        .p2align 4
        /* Set 64..96 bytes. Write 64 bytes from the start and
           32 bytes from the end. */
6:      // set96
        str     q0, [dstin, 16]
        stp     q0, q0, [dstin, 32]
        stp     q0, q0, [dstend2, -32]
        ret

        .p2align 4
7:      // set_long
        and     valw, valw, 255
        bic     dst, dstin, 15          /* dst = dstin rounded down to 16 bytes. */
        str     q0, [dstin]
        /*
         * Take the DC ZVA path only when count >= 160 and the fill byte is
         * zero. When count < 160 the ccmp forces NZCV to 0, which reads as
         * "ne" and selects no_zva.
         */
        cmp     count, 160
        ccmp    valw, 0, 0, hs
        b.ne    9f                      // no_zva

#ifndef SKIP_ZVA_CHECK
        mrs     zva_val, dczid_el0
        and     zva_val, zva_val, 31
        cmp     zva_val, 4              /* ZVA size is 64 bytes. */
        b.ne    9f                      // no_zva
#endif
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        bic     dst, dst, 63            /* Align dst down to the 64-byte ZVA block. */
        sub     count, dstend2, dst     /* Count is now 64 too large. */
        sub     count, count, 128       /* Adjust count and bias for loop. */

        .p2align 4
8:      // zva_loop
        add     dst, dst, 64
        dc      zva, dst
        subs    count, count, 64
        b.hi    8b                      // zva_loop
        /* Finish with 64 bytes written backwards from the end. */
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret

9:      // no_zva
        sub     count, dstend2, dst     /* Count is 16 too large. */
        sub     dst, dst, 16            /* Dst is biased by -32. */
        sub     count, count, 64 + 16   /* Adjust count and bias for loop. */
10:     // no_zva_loop
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dst, 64]!      /* Pre-index: dst advances by 64. */
        subs    count, count, 64
        b.hi    10b                     // no_zva_loop
        /* Finish with 64 bytes written backwards from the end. */
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret
END_COMPILERRT_FUNCTION(__arm_sc_memset)

//
// __arm_sc_memchr
//
// In:       x0 (srcin) = start of the buffer to search
//           w1 (chrin) = byte value to look for (low byte is replicated)
//           x2 (cntin) = number of bytes to search
// Out:      x0 (result) = address of the first matching byte, or 0 if none
// Clobbers: x2-x6, x9, x10, v0-v6, condition flags
//
// Data is loaded in whole 32-byte blocks aligned to 32 bytes, so bytes
// before srcin and after srcin + cntin inside those blocks are read; their
// syndrome bits are masked off before a result is produced.
//

#define srcin   x0
#define chrin   w1
#define cntin   x2

#define result  x0

#define src     x3
#define tmp     x4
#define wtmp2   w5
#define synd    x6
#define soff    x9
#define cntrem  x10

#define vrepchr    v0
#define vdata1     v1
#define vdata2     v2
#define vhas_chr1  v3
#define vhas_chr2  v4
#define vrepmask   v5
#define vend       v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows us to
 * identify exactly which byte has matched.
 */

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memchr)
        /* Do not dereference srcin if no bytes to compare. */
        cbz     cntin, 4f
        /*
         * Magic constant 0x40100401 allows us to identify which lane matches
         * the requested byte.
         */
        mov     wtmp2, #0x0401
        movk    wtmp2, #0x4010, lsl #16
        dup     vrepchr.16b, chrin
        /* Work with aligned 32-byte chunks */
        bic     src, srcin, #31
        dup     vrepmask.4s, wtmp2
        ands    soff, srcin, #31        /* soff = offset of srcin in its block. */
        and     cntrem, cntin, #31
        b.eq    0f                      /* Already 32-byte aligned. */

        /*
         * Input string is not 32-byte aligned. We calculate the syndrome
         * value for the aligned 32 bytes block containing the first bytes
         * and mask the irrelevant part.
         */

        ld1     {vdata1.16b, vdata2.16b}, [src], #32
        sub     tmp, soff, #32
        /*
         * cntin -= (32 - soff). The flags set here are consumed by the
         * b.ls below; none of the instructions in between write NZCV.
         */
        adds    cntin, cntin, tmp
        cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
        cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
        and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
        and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
        addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b  /* 256->128 */
        addp    vend.16b, vend.16b, vend.16b            /* 128->64 */
        mov     synd, vend.d[0]
        /* Clear the soff*2 lower bits */
        lsl     tmp, soff, #1
        lsr     synd, synd, tmp
        lsl     synd, synd, tmp
        /* The first block can also be the last */
        b.ls    2f
        /* Have we found something already? */
        cbnz    synd, 3f

0:      // loop
        ld1     {vdata1.16b, vdata2.16b}, [src], #32
        /* Flags from this subs feed the b.ls below and the b.hi at "end". */
        subs    cntin, cntin, #32
        cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
        cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
        /* If we're out of data we finish regardless of the result */
        b.ls    1f
        /* Use a fast check for the termination condition */
        orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
        addp    vend.2d, vend.2d, vend.2d
        mov     synd, vend.d[0]
        /* We're not out of data, loop if we haven't found the character */
        cbz     synd, 0b

1:      // end
        /* Termination condition found, let's calculate the syndrome value */
        and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
        and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
        addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b  /* 256->128 */
        addp    vend.16b, vend.16b, vend.16b            /* 128->64 */
        mov     synd, vend.d[0]
        /* Only do the clear for the last possible block */
        b.hi    3f

2:      // masklast
        /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
        add     tmp, cntrem, soff
        and     tmp, tmp, #31
        sub     tmp, tmp, #32
        neg     tmp, tmp, lsl #1
        /*
         * When (cntrem + soff) % 32 == 0, tmp is 64; register shifts use the
         * amount modulo 64, so nothing is cleared and the whole block counts.
         */
        lsl     synd, synd, tmp
        lsr     synd, synd, tmp

3:      // tail
        /* Count the trailing zeros using bit reversing */
        rbit    synd, synd
        /* Compensate the last post-increment */
        sub     src, src, #32
        /* Check that we have found a character */
        cmp     synd, #0
        /* And count the leading zeros */
        clz     synd, synd
        /* Compute the potential result (two syndrome bits per byte) */
        add     result, src, synd, lsr #1
        /* Select result or NULL */
        csel    result, xzr, result, eq
        ret

4:      // zero_length
        mov     result, #0
        ret
END_COMPILERRT_FUNCTION(__arm_sc_memchr)