/* Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memset-sve.S */
/* 39486 views */
/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2024-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#include "asmdefs.h"

.arch armv8-a+sve

/* Register roles.
 *
 * Inputs, as consumed by the code below:
 *   dstin (x0)  - start of the region to fill. Never written, so x0 still
 *                 holds the destination pointer on return.
 *   valw  (w1)  - fill value; its low byte is replicated into v0.
 *   count (x2)  - number of bytes to fill.
 *
 * Scratch (several names deliberately share a register because their live
 * ranges are on different paths):
 *   x3  - dst (aligned running pointer) / off (0 or 16 in the 16..63 path)
 *   x4  - dstend = dstin + count
 *   x5  - zva_val (DCZID_EL0 read-out) / dstend2 (16..63 path)
 *   x8  - aligned tail pointer in the DC ZVA path
 *   v0/z0, p0 and the condition flags are also overwritten.
 *
 * val and vlen are defined but not referenced in this routine.
 * ENTRY, END and L() come from asmdefs.h.
 */
#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define zva_val	x5
#define vlen	x5
#define off	x3
#define dstend2 x5

ENTRY (__memset_aarch64_sve)
	dup	v0.16B, valw		/* q0 = fill byte in all 16 lanes. */
	cmp	count, 16
	b.lo	L(set_16)		/* count < 16. */

	add	dstend, dstin, count
	cmp	count, 64
	b.hs	L(set_128)		/* count >= 64. */

	/* Set 16..63 bytes. */
	/* off = 16 when bit 5 of count is set (count >= 32), else 0.  Four
	   possibly overlapping 16-byte stores then cover the whole range:
	   two anchored at the start, two anchored at the end. */
	mov	off, 16
	and	off, off, count, lsr 1
	sub	dstend2, dstend, off
	str	q0, [dstin]
	str	q0, [dstin, off]
	str	q0, [dstend2, -16]
	str	q0, [dstend, -16]
	ret

	.p2align 4
L(set_16):
	/* Set 0..15 bytes with a single predicated SVE store.  Lane i of p0
	   is active while i < count, so count == 0 stores nothing.  z0 is
	   the SVE view of the register filled by the dup above. */
	whilelo	p0.b, xzr, count
	st1b	z0.b, p0, [dstin]
	ret

	.p2align 4
L(set_128):
	/* count >= 64.  dst = dstin rounded down to 16 bytes; it is only
	   consumed on the set_long paths. */
	bic	dst, dstin, 15
	cmp	count, 128
	b.hi	L(set_long)
	/* Set 64..128 bytes: 64 bytes from the start plus 64 bytes ending at
	   dstend, overlapping in the middle when count < 128. */
	stp	q0, q0, [dstin]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 4
L(set_long):
	/* count > 128.  DC ZVA is only attempted for a zero fill byte and at
	   least 256 bytes; everything else takes the plain store loop. */
	cmp	count, 256
	b.lo	L(no_zva)
	tst	valw, 255
	b.ne	L(no_zva)

#ifndef SKIP_ZVA_CHECK
	/* Keep DCZID_EL0[4:0]: bits [3:0] are log2 of the block size in
	   words and bit 4 is the "DC ZVA prohibited" flag, so the masked
	   value is 4 only when zeroing is permitted with 64-byte blocks.
	   Building with SKIP_ZVA_CHECK drops this run-time test. */
	mrs	zva_val, dczid_el0
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes. */
	b.ne	L(no_zva)
#endif
	/* Head: ordinary stores from dstin up to (at least) the first
	   64-byte-aligned block that the loop below will zero. */
	str	q0, [dstin]
	str	q0, [dst, 16]
	bic	dst, dstin, 31
	stp	q0, q0, [dst, 32]
	bic	dst, dstin, 63
	sub	count, dstend, dst	/* Count is now 64 too large. */
	sub	count, count, 128	/* Adjust count and bias for loop. */

	sub	x8, dstend, 1		/* Write last bytes before ZVA loop. */
	bic	x8, x8, 15
	stp	q0, q0, [x8, -48]
	str	q0, [x8, -16]
	str	q0, [dstend, -16]

	.p2align 4
L(zva64_loop):
	/* Zero one 64-byte block per iteration; dst stays 64-byte aligned.
	   The tail was already written above, so the loop simply returns
	   once the biased count runs out. */
	add	dst, dst, 64
	dc	zva, dst
	subs	count, count, 64
	b.hi	L(zva64_loop)
	ret

L(no_zva):
	/* Plain store loop.  dst is dstin rounded down to 16 bytes (set in
	   set_128); the unaligned first 16 bytes are written separately and
	   the loop stores to 16-byte-aligned addresses from dst + 16 on. */
	str	q0, [dstin]
	sub	count, dstend, dst	/* Count is 16 too large. */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop. */
L(no_zva_loop):
	stp	q0, q0, [dst, 16]
	stp	q0, q0, [dst, 48]
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	/* Final 64 bytes, anchored at dstend; may overlap the loop's last
	   stores. */
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

END (__memset_aarch64_sve)