/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex strings work originally authored by
 * Linaro. The original code can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 */

dstin		.req	x0
val_x		.req	x1
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6

A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9

SYM_FUNC_START_LOCAL(__pi_memset_generic)
	mov	dst, dstin			/* Preserve return value. */
	/* Replicate the low byte of val into all eight bytes of A_l. */
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32

	cmp	count, #15
	b.hi	.Lover16_proc
	/* All stores below may be unaligned. */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* Check whether the start address is 16-byte aligned. */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
	/*
	 * The count is at least 16, so we can use an stp to store the first
	 * 16 bytes and then advance dst to the next 16-byte boundary. This
	 * leaves dst aligned for the rest of the fill.
	 */
	stp	A_l, A_l, [dst]			/* Unaligned store. */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2

.Laligned:
	cbz	A_l, .Lzero_mem

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
	/*
	 * The remaining length is less than 16; use an stp to write the last
	 * 16 bytes. Some bytes may be written twice and the access may be
	 * unaligned.
	 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]		/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16			/* Pre-bias. */
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
.Lexitfunc:
	ret

	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short			/* Proceed only if count >= 128. */

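	/*
	 * DCZID_EL0 layout as used below: bit 4 (DZP) is set when DC ZVA is
	 * prohibited, and bits [3:0] (BS) hold log2 of the block size in
	 * words, so the block size in bytes is 4 << BS (e.g. BS = 4 gives
	 * the common 64-byte block).
	 */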
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15		/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len

	ands	tmp3w, zva_len, #63
	/*
	 * Ensure zva_len is not less than 64: using ZVA is not worthwhile
	 * if the block size is smaller than 64 bytes.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short			/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	2f				/* Already aligned. */
	/* Not aligned; check that there's enough to zero after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length remaining after alignment is at least
	 * 64 bytes and at least one ZVA block, so the code at 2f cannot run
	 * past the end of the buffer.
	 */
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge		/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
2:
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
SYM_FUNC_END(__pi_memset_generic)

#ifdef CONFIG_AS_HAS_MOPS
	.arch_extension mops
SYM_FUNC_START(__pi_memset)
alternative_if_not ARM64_HAS_MOPS
	b	__pi_memset_generic
alternative_else_nop_endif

	/*
	 * FEAT_MOPS: SETP/SETM/SETE perform the prologue, main body and
	 * epilogue of the memory set, updating dst and count as they go.
	 */
	mov	dst, dstin
	setp	[dst]!, count!, val_x
	setm	[dst]!, count!, val_x
	sete	[dst]!, count!, val_x
	ret
SYM_FUNC_END(__pi_memset)
#else
SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic)
#endif

SYM_FUNC_ALIAS(__memset, __pi_memset)
EXPORT_SYMBOL(__memset)

SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
EXPORT_SYMBOL(memset)