/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2011, The Linux Foundation. All rights reserved.
 */


/* HEXAGON assembly optimized memset */
/* Replaces the standard library function memset */

/*
 * Register usage shared by both variants below (as used by the code):
 *   r0  = destination pointer; never written, so it is still the
 *         destination when the function returns through r31
 *   r1  = fill value; only its low byte is stored
 *   r2  = byte count
 *
 * NOTE(review): the per-line comments assume standard Hexagon packet
 * semantics - every instruction inside one { } packet reads the register
 * and predicate values from before the packet, except predicates
 * referenced as pN.new.  Confirm against the Hexagon Programmer's
 * Reference Manual before reordering anything.
 */


/*
 * HEXAGON_OPT_FUNC_BEGIN name
 * Opens a global function symbol "name" in .text, aligned to 2^4 bytes.
 */
	.macro HEXAGON_OPT_FUNC_BEGIN name
	.text
	.p2align 4
	.globl \name
	.type  \name, @function
\name:
	.endm

/*
 * HEXAGON_OPT_FUNC_FINISH name
 * Records the symbol size as the distance from the label to this point.
 */
	.macro HEXAGON_OPT_FUNC_FINISH name
	.size  \name, . - \name
	.endm

/*
 * FUNCTION: memset (v2 version)
 *
 * Strategy:
 *   - count == 0: return immediately.
 *   - count <  8: store one byte at a time.
 *   - otherwise : store a byte / halfword / word as needed to reach
 *                 doubleword alignment, store doublewords in a hardware
 *                 loop, then finish with a word / halfword / byte tail.
 *
 * Working registers:
 *   r4     = fill byte replicated into all four bytes (vsplatb)
 *   r5:4   = 64-bit fill pattern for the doubleword loop
 *   r8     = write cursor
 *   r9     = 8 - (r0 & 7); its bits 0..2 select the alignment stores
 *   r3:2   = remaining count as a register pair (r3 is zeroed)
 *   r7:6   = size of the store just issued (r7 is zeroed)
 *   r10    = doubleword loop trip count
 *
 * The predicates are software pipelined: each stage computes the p0/p1
 * that the following packet consumes, while its own conditional jump
 * consumes the p0 produced by the previous stage.
 */
#if __HEXAGON_ARCH__ < 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r6 = #8
		r7 = extractu(r0, #3 , #0)  /* r7 = dest & 7 */
		p0 = cmp.eq(r2, #0)         /* p0 = (count == 0) */
		p1 = cmp.gtu(r2, #7)        /* p1 = (count >= 8) */
	}
	{
		r4 = vsplatb(r1)
		r8 = r0           /* leave r0 intact for return val  */
		r9 = sub(r6, r7)  /* bytes until double alignment  */
		if p0 jumpr r31   /* count == 0, so return  */
	}
	{
		r3 = #0           /* high word of the r3:2 count pair */
		r7 = #0           /* high word of the r7:6 step pair */
		p0 = tstbit(r9, #0)  /* need a 1-byte alignment store? */
		if p1 jump 2f /* skip byte loop */
	}

/* less than 8 bytes to set, so just set a byte at a time and return  */

		loop0(1f, r2) /* byte loop */
	.falign
1: /* byte loop */
	{
		memb(r8++#1) = r4
	}:endloop0
		jumpr r31
	.falign
2: /* skip byte loop */
	{
		r6 = #1
		p0 = tstbit(r9, #1)  /* for stage 3: need a halfword store? */
		p1 = cmp.eq(r2, #1)
		if !p0 jump 3f /* skip initial byte store */
	}
	{
		memb(r8++#1) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 1 */
		if p1 jumpr r31
	}
	.falign
3: /* skip initial byte store */
	{
		r6 = #2
		p0 = tstbit(r9, #2)  /* for stage 4: need a word store? */
		p1 = cmp.eq(r2, #2)
		if !p0 jump 4f /* skip initial half store */
	}
	{
		memh(r8++#2) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 2 */
		if p1 jumpr r31
	}
	.falign
4: /* skip initial half store */
	{
		r6 = #4
		p0 = cmp.gtu(r2, #7)  /* for stage 5: at least one doubleword left? */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 5f /* skip initial word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 4 */
		p0 = cmp.gtu(r2, #11)   /* same test, on the count before the -4 */
		if p1 jumpr r31
	}
	.falign
5: /* skip initial word store */
	{
		r10 = lsr(r2, #3)     /* number of whole doublewords */
		p1 = cmp.eq(r3, #1)   /* r3 is 0 here, so this clears p1 */
		if !p0 jump 7f /* skip double loop */
	}
	{
		r5 = r4               /* r5:4 = 8 copies of the fill byte */
		r6 = #8
		loop0(6f, r10) /* double loop */
	}

/* set bytes a double word at a time  */

	.falign
6: /* double loop */
	{
		memd(r8++#8) = r5:4
		r3:2 = sub(r3:2, r7:6)  /* count -= 8 */
		p1 = cmp.eq(r2, #8)     /* true when this store was the last 8 bytes */
	}:endloop0
	.falign
7: /* skip double loop */
	{
		p0 = tstbit(r2, #2)   /* tail needs a word store? */
		if p1 jumpr r31       /* nothing left after the double loop */
	}
	{
		r6 = #4
		p0 = tstbit(r2, #1)   /* for stage 8: tail needs a halfword store? */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 8f /* skip final word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)  /* count -= 4 */
		if p1 jumpr r31
	}
	.falign
8: /* skip final word store */
	{
		p1 = cmp.eq(r2, #2)
		if !p0 jump 9f /* skip final half store */
	}
	{
		memh(r8++#2) = r4
		if p1 jumpr r31
	}
	.falign
9: /* skip final half store */
	{
		memb(r8++#1) = r4
		jumpr r31
	}
HEXAGON_OPT_FUNC_FINISH memset
#endif


/*
 * FUNCTION: memset (v3 and higher version)
 *
 * Strategy:
 *   - count == 0 : return.
 *   - count <= 8 : store one byte at a time.
 *   - otherwise  : byte / halfword / word stores until the cursor is
 *                  doubleword aligned.  If more than 127 bytes remain,
 *                  up to three doubleword stores bring the cursor to a
 *                  32-byte boundary and the bulk is written 32 bytes per
 *                  iteration using dczeroa.  The rest is written with a
 *                  doubleword loop and a word / halfword / byte tail.
 *
 * Working registers:
 *   r7     = fill byte replicated into all four bytes (vsplatb)
 *   r5:4   = 64-bit fill pattern
 *   r6     = write cursor
 *   r3, r8 = scratch / loop trip counts
 *
 * NOTE(review): dczeroa is presumed to zero the 32-byte cache line at the
 * given address; both loops that use it advance by exactly 32 bytes per
 * iteration.  Confirm against the architecture manual.
 */
#if __HEXAGON_ARCH__ >= 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r7=vsplatb(r1)
		r6 = r0
		if (r2==#0) jump:nt .L1   /* count == 0: nothing to do */
	}
	{
		r5:4=combine(r7,r7)
		p0 = cmp.gtu(r2,#8)
		if (p0.new) jump:nt .L3   /* more than 8 bytes: aligned path */
	}
	{
		r3 = r0
		loop0(.L47,r2)            /* 1..8 bytes: plain byte loop */
	}
	.falign
.L47:
	{
		memb(r3++#1) = r1
	}:endloop0 /* start=.L47 */
		jumpr r31
.L3:
	/* Align the cursor to 2 bytes. */
	{
		p0 = tstbit(r0,#0)
		if (!p0.new) jump:nt .L8
		p1 = cmp.eq(r2, #1)       /* count > 8 on this path */
	}
	{
		r6 = add(r0, #1)
		r2 = add(r2,#-1)
		memb(r0) = r1
		if (p1) jump .L1
	}
.L8:
	/* Align the cursor to 4 bytes. */
	{
		p0 = tstbit(r6,#1)
		if (!p0.new) jump:nt .L10
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
		p0 = cmp.eq(r2, #2)       /* tests the count before the -2 */
		if (p0.new) jump:nt .L1
	}
.L10:
	/* Align the cursor to 8 bytes. */
	{
		p0 = tstbit(r6,#2)
		if (!p0.new) jump:nt .L12
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
		p0 = cmp.eq(r2, #4)       /* tests the count before the -4 */
		if (p0.new) jump:nt .L1
	}
.L12:
	/* 127 bytes or fewer left: skip the 32-byte line path. */
	{
		p0 = cmp.gtu(r2,#127)
		if (!p0.new) jump:nt .L14
	}
	/* Up to three doubleword stores to reach a 32-byte boundary. */
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
.L17:
	{
		r3 = lsr(r2,#5)           /* number of whole 32-byte lines */
		if (r1!=#0) jump:nt .L18  /* non-zero fill: dczeroa + stores */
	}
	/* Fill value register is zero: dczeroa alone writes each line. */
	{
		r8 = r3
		r3 = r6
		loop0(.L46,r3)            /* trip count is r3 from before this packet */
	}
	.falign
.L46:
	{
		dczeroa(r6)
		r6 = add(r6,#32)
		r2 = add(r2,#-32)
	}:endloop0 /* start=.L46 */
.L14:
	/* Remaining whole doublewords. */
	{
		p0 = cmp.gtu(r2,#7)
		if (!p0.new) jump:nt .L28
		r8 = lsr(r2,#3)
	}
		loop0(.L44,r8)
	.falign
.L44:
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}:endloop0 /* start=.L44 */
.L28:
	/* Tail: optional word ... */
	{
		p0 = tstbit(r2,#2)
		if (!p0.new) jump:nt .L33
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
	}
.L33:
	/* ... optional halfword ... */
	{
		p0 = tstbit(r2,#1)
		if (!p0.new) jump:nt .L35
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
	}
.L35:
	/* ... optional final byte. */
		p0 = cmp.eq(r2,#1)
		if (p0) memb(r6) = r1
.L1:
		jumpr r31
.L18:
	/*
	 * Non-zero fill, 32 bytes per iteration: dczeroa on the line, then
	 * four doubleword stores of the pattern over the same 32 bytes.
	 */
		loop0(.L45,r3)
	.falign
.L45:
		dczeroa(r6)
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-32)
	}
		memd(r6++#8) = r5:4
		memd(r6++#8) = r5:4
	{
		memd(r6++#8) = r5:4
	}:endloop0 /* start=.L45 */
		jump .L14
HEXAGON_OPT_FUNC_FINISH memset
#endif