/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */

	/* Return if nothing to do */
	beq a0, a1, .Lreturn_from_memmove
	beqz a2, .Lreturn_from_memmove

	/*
	 * Register Uses
	 *      Forward Copy: a1 - Index counter of src
	 *      Reverse Copy: a4 - Index counter of src
	 *      Forward Copy: t3 - Index counter of dest
	 *      Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv t3, a0
	add t4, a0, a2
	add a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, .Lbyte_copy

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3) rounded down to the nearest naturally
	 * aligned SZREG address does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region.  Note that this could overshoot the dest memory
	 * region if n is less than SZREG.  This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq t5, t3, 1f
		addi t5, t5, SZREG
	1:

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, .Lcoaligned_copy
	/* Fall through to misaligned fixup copy */

.Lmisaligned_fixup_copy:
	bltu a1, a0, .Lmisaligned_fixup_copy_reverse

.Lmisaligned_fixup_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not a7, a6
	addi a7, a7, (SZREG * 8 + 1)
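
	/*
	 * Worked example (illustrative): on RV64, SZREG = 8 and
	 * XLEN = 64.  If the src alignment offset found above is 3,
	 * then a6 = 24 and a7 = 40, and every store in the loop below
	 * combines the upper 40 bits of one aligned src word with the
	 * lower 24 bits of the next.  Because src and dest are not
	 * co-aligned on this path, that offset is never 0, so both
	 * shift amounts stay within 1..(XLEN - 1).
	 */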

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 * 	load_val1 = load_ptr[1];
	 * 	store_ptr += 2;
	 * 	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val0 = load_ptr[2];
	 * 	load_ptr += 2;
	 * 	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi t3, t3, (2 * SZREG)
	srl t0, t0, a6
	sll t2, t1, a7
	or t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi a1, a1, (2 * SZREG)
	srl t1, t1, a6
	sll t2, t0, a7
	or t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne t3, t6, 1b
	2:
	mv t3, t6 /* Fix the dest pointer in case the loop was broken */

	add a1, t3, a5 /* Restore the src pointer */
	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lmisaligned_fixup_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 * 	load_val0 = load_ptr[-1];
	 * 	store_ptr -= 2;
	 * 	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val1 = load_ptr[-2];
	 * 	load_ptr -= 2;
	 * 	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi t4, t4, (-2 * SZREG)
	sll t1, t1, a7
	srl t2, t0, a6
	or t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	beq t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi a4, a4, (-2 * SZREG)
	sll t0, t0, a7
	srl t2, t1, a6
	or t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne t4, t5, 1b
	2:
	mv t4, t5 /* Fix the dest pointer in case the loop was broken */

	add a4, t4, a5 /* Restore the src pointer */
	j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
 */
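/*
 * Roughly, in the pseudocode style of the misaligned fixup loops
 * above (illustrative sketch only; load_ptr/store_ptr stand for the
 * src/dest index registers, and store_ptr_end is t6 for the forward
 * loop and t5 for the reverse loop):
 *
 * Forward:
 * do {
 * 	load_val = load_ptr[0];
 * 	load_ptr += 1;
 * 	store_ptr += 1;
 * 	store_ptr[-1] = load_val;
 * } while (store_ptr != store_ptr_end);
 *
 * Reverse:
 * do {
 * 	load_val = load_ptr[-1];
 * 	load_ptr -= 1;
 * 	store_ptr -= 1;
 * 	store_ptr[0] = load_val;
 * } while (store_ptr != store_ptr_end);
 */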
.Lcoaligned_copy:
	bltu a1, a0, .Lcoaligned_copy_reverse

.Lcoaligned_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi a1, a1, SZREG
	addi t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne t3, t6, 1b

	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lcoaligned_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi a4, a4, -SZREG
	addi t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne t4, t5, 1b

	j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function.  They
 * are used to byte copy until the dest pointer is in alignment,
 * at which point a bulk copy method can be used by the calling
 * code.  These work on the same registers as the bulk copy
 * loops, so the register values can be picked up from where
 * they were left and we avoid code duplication without any
 * overhead except the call-in and return jumps.
 */
.Lbyte_copy_until_aligned_forward:
	beq t3, t5, 2f
	1:
	lb t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb t1, -1(t3)
	bne t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

.Lbyte_copy_until_aligned_reverse:
	beq t4, t6, 2f
	1:
	lb t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb t1, 0(t4)
	bne t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of data to copy.
 * At that point, they return from memmove.
 */
.Lbyte_copy:
	bltu a1, a0, .Lbyte_copy_reverse

.Lbyte_copy_forward:
	beq t3, t4, 2f
	1:
	lb t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb t1, -1(t3)
	bne t3, t4, 1b
	2:
	ret

.Lbyte_copy_reverse:
	beq t4, t3, 2f
	1:
	lb t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb t1, 0(t4)
	bne t4, t3, 1b
	2:
	/* Fall through to return */

.Lreturn_from_memmove:
	ret

SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)