/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Strahinja Stanisic <[email protected]>
 */

#include <machine/asm.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * In:
 *	a0 - void* dst
 *	a1 - const void* src
 *	a2 - size_t len
 * Out:
 *	a0 - dst, unchanged (a0 is never written)
 * Clobbers:
 *	a1-a6, t0-t3. The stack is not used.
 *
 * Outline:
 *	1. len == 0: return immediately.
 *	2. len < 8: copy len bytes through the tail byte table (.Lend).
 *	3. Otherwise copy (-dst) & 7 leading bytes so that the dst cursor (a3)
 *	   becomes 8-byte aligned.
 *	4. If dst - src is a multiple of 8, src is now aligned as well:
 *	   copy whole dwords in .Lmemcpy8.
 *	5. Otherwise read src through aligned dword loads and merge two
 *	   neighbouring dwords with shifts for every aligned dword store.
 *	6. Copy the remaining len % 8 bytes through the tail byte table.
 *
 * Byte tables:
 *	Both byte tables are lists of seven lb/sb pairs ordered from offset 6
 *	down to offset 0 and are entered with a computed jump to
 *	"table end - 8 * count". This only works while every lb/sb pair
 *	assembles to exactly 8 bytes (two 4-byte instructions); keep that in
 *	mind when changing the registers or offsets used inside the tables.
 */
ENTRY(memcpy)
	beqz	a2, .Lreturn

	/* diff = (dstv - srcv) & 0b111 */
	sub	t0, a0, a1
	andi	t0, t0, 0b111

	/* t1 = len < 8 (unsigned) */
	sltiu	t1, a2, 8

	/* we never change a0, because memcpy returns the original dst */
	mv	a3, a0

	/* len < 8: everything is handled by the tail byte table */
	bnez	t1, .Lend

	/* t1 = (-dst) & 0b111, the number of bytes needed to align dst */
	neg	t1, a0
	andi	t1, t1, 0b111

	/* len -= t1; len was >= 8 and t1 <= 7, so len stays >= 1 */
	sub	a2, a2, t1

	/*
	 * Jump to .Lduff_start - 8 * t1, which executes exactly the last t1
	 * lb/sb pairs, i.e. copies bytes t1-1 .. 0. lla is used because the
	 * label is local; it is always resolved PC-relative, even under PIC.
	 */
	lla	t2, .Lduff_start
	slli	t3, t1, 3
	sub	t2, t2, t3
	jr	t2
	lb	t3, 6(a1)
	sb	t3, 6(a3)
	lb	t3, 5(a1)
	sb	t3, 5(a3)
	lb	t3, 4(a1)
	sb	t3, 4(a3)
	lb	t3, 3(a1)
	sb	t3, 3(a3)
	lb	t3, 2(a1)
	sb	t3, 2(a3)
	lb	t3, 1(a1)
	sb	t3, 1(a3)
	lb	t3, 0(a1)
	sb	t3, 0(a3)
.Lduff_start:

	/* advance both cursors past the bytes copied above; a3 is now aligned */
	add	a1, a1, t1
	add	a3, a3, t1

	beqz	a2, .Lreturn

	/* diff == 0: src is 8-byte aligned as well */
	beqz	t0, .Lmemcpy8

	/*
	 * Mutually misaligned copy: dst (a3) is aligned, src (a1) is not.
	 *
	 * a4 - size_t right_shift
	 * a5 - size_t left_shift
	 * a6 - size_t whole (number of dword stores)
	 */

	/* right_shift = (src & 0b111) * 8; non-zero on this path */
	andi	a4, a1, 0b111
	slli	a4, a4, 3

	/*
	 * left_shift = 64 - right_shift
	 * sll only looks at the low 6 bits of the shift amount, so
	 * -right_shift is equivalent to 64 - right_shift here.
	 */
	neg	a5, a4

	/* whole = len / 8 */
	srli	a6, a2, 3

	/* len = len % 8 */
	andi	a2, a2, 0b111

	/* t0 - uint64_t* ptr */

	/* ptr = src & ~0b111 */
	andi	t0, a1, ~0b111

	/* src += whole * 8; src then points at the tail bytes */
	slli	t1, a6, 3
	add	a1, a1, t1

	/*
	 * t1 - uint64_t low
	 * t2 - uint64_t high
	 */

	/* low = *ptr++ (the aligned dword that contains the first src byte) */
	ld	t1, (t0)
	addi	t0, t0, 8

	/* low >>= right_shift */
	srl	t1, t1, a4

	beqz	a6, .Llmain_skip
.Llmain:
	/* high = *ptr++ */
	ld	t2, (t0)
	addi	t0, t0, 8

	/* whole-- */
	addi	a6, a6, -1

	/* temp = (high << left_shift) | low */
	sll	t3, t2, a5
	or	t3, t3, t1

	/* low = high >> right_shift */
	srl	t1, t2, a4

	/* *dst++ = temp */
	sd	t3, (a3)
	addi	a3, a3, 8

	bnez	a6, .Llmain

.Llmain_skip:

	/*
	 * Tail: copy the last a2 (0..7) bytes from a1 to a3 by jumping to
	 * .Lduff_end - 8 * a2. With a2 == 0 this lands on .Lduff_end itself.
	 */
.Lend:
	lla	t1, .Lduff_end
	slli	t2, a2, 3
	sub	t1, t1, t2
	jr	t1
	lb	t2, 6(a1)
	sb	t2, 6(a3)
	lb	t2, 5(a1)
	sb	t2, 5(a3)
	lb	t2, 4(a1)
	sb	t2, 4(a3)
	lb	t2, 3(a1)
	sb	t2, 3(a3)
	lb	t2, 2(a1)
	sb	t2, 2(a3)
	lb	t2, 1(a1)
	sb	t2, 1(a3)
	lb	t2, 0(a1)
	sb	t2, 0(a3)
.Lduff_end:

.Lreturn:
	ret

/*
 * Executed when dst - src is a multiple of 8, after dst has been aligned.
 * a0 - void* dst (original value, returned to the caller)
 * a1 - const void* src cursor (8-byte aligned)
 * a2 - size_t remaining len
 * a3 - void* dst cursor (8-byte aligned)
 */
.Lmemcpy8:

	beqz	a2, .Lreturn

	/* the 64-byte unrolled loop is only used when len >= 128 */
	sltiu	t0, a2, 128
	bnez	t0, .Llmain8_64_skip

	/* a4 - uint64_t* end_unroll */

	/* end_unroll = dst + len / 64 * 64 */
	andi	t0, a2, ~0b111111
	add	a4, a3, t0

	/* len = len % 64 */
	andi	a2, a2, 0b111111

	/* copy 64 bytes per iteration, four dwords at a time */
.Llmain8_64:
	ld	t0, 0(a1)
	ld	t1, 8(a1)
	ld	t2, 16(a1)
	ld	t3, 24(a1)
	sd	t0, 0(a3)
	sd	t1, 8(a3)
	sd	t2, 16(a3)
	sd	t3, 24(a3)
	ld	t0, 32(a1)
	ld	t1, 40(a1)
	ld	t2, 48(a1)
	ld	t3, 56(a1)
	sd	t0, 32(a3)
	sd	t1, 40(a3)
	sd	t2, 48(a3)
	sd	t3, 56(a3)
	addi	a3, a3, 64
	addi	a1, a1, 64
	bne	a3, a4, .Llmain8_64
.Llmain8_64_skip:

	beqz	a2, .Lreturn

	/* a4 - uint64_t* end_align */

	/* end_align = (dst + len) & ~0b111 */
	add	a4, a3, a2
	andi	a4, a4, ~0b111

	/* len = len % 8 */
	andi	a2, a2, 0b111

	/* copy one dword per iteration until dst reaches end_align */
	beq	a3, a4, .Llmain8_skip
.Llmain8:
	ld	t0, (a1)
	sd	t0, (a3)
	addi	a3, a3, 8
	addi	a1, a1, 8
	bne	a3, a4, .Llmain8
.Llmain8_skip:

	/* copy the last a2 (0..7) bytes through the tail byte table */
	lla	t1, .Lduff_end
	slli	t2, a2, 3
	sub	t1, t1, t2
	jr	t1
END(memcpy)