/*
 * Copyright (C) 2008-2009 Michal Simek <[email protected]>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <[email protected]>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input :	Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 *
 * Register usage (both entry points):
 *	r3	return value (original destination)
 *	r4	n   - byte count of the phase currently running
 *	r5	d   - destination cursor (modified)
 *	r6	s   - source cursor (modified)
 *	r7	c   - bytes still to copy (modified)
 *	r8	as  - source cursor rounded down to a word boundary
 *	r9	t1  - scratch / merged output word
 *	r10	t2  - scratch, or running offset in the ascending word loops
 *	r11	h   - scratch / bytes carried over from the previous source word
 *	r12	v   - scratch / most recently loaded source word
 *	Both routines return with "rtsd r15, 8" and do not touch the stack.
 *
 * Structure of each copy direction:
 *	1. if c < 4, go straight to the byte loop
 *	2. byte copies until the destination is word aligned
 *	3. 32-byte blocks (aligned source: plain word copies; unaligned
 *	   source: aligned word loads merged with barrel shifts)
 *	4. remaining whole words, same aligned/unaligned split
 *	5. remaining 0-3 bytes, one at a time
 *
 * Byte order:
 *	The unaligned paths build each destination word as
 *	(h << k) | (v >> (32 - k)) when ascending, which is only correct
 *	when the byte at the lowest address is the most significant byte of
 *	a word, i.e. on big-endian.  Little-endian builds are rejected below
 *	instead of silently corrupting unaligned copies.
 */

#include <linux/linkage.h>

#ifdef __MICROBLAZEEL__
#error fastcopy.S assumes big-endian byte order, use the C memcpy/memmove on little-endian
#endif

	.text
	.globl	memcpy
	.type	memcpy, @function
	.ent	memcpy

/*
 * memcpy: ascending copy of r7 bytes from r6 to r5, returns r5 in r3.
 * fast_memcpy_ascending is also the target memmove branches to when an
 * ascending copy is safe (destination not above source).
 */
memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end		/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

a_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

	/* source and destination both word aligned: 8 words per iteration */
a_block_aligned:
	lwi	r9, r6, 0		/* t1 = *(s + 0) */
	lwi	r10, r6, 4		/* t2 = *(s + 4) */
	lwi	r11, r6, 8		/* t3 = *(s + 8) */
	lwi	r12, r6, 12		/* t4 = *(s + 12) */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	swi	r10, r5, 4		/* *(d + 4) = t2 */
	swi	r11, r5, 8		/* *(d + 8) = t3 */
	swi	r12, r5, 12		/* *(d + 12) = t4 */
	lwi	r9, r6, 16		/* t1 = *(s + 16) */
	lwi	r10, r6, 20		/* t2 = *(s + 20) */
	lwi	r11, r6, 24		/* t3 = *(s + 24) */
	lwi	r12, r6, 28		/* t4 = *(s + 28) */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	swi	r10, r5, 20		/* *(d + 20) = t2 */
	swi	r11, r5, 24		/* *(d + 24) = t3 */
	swi	r12, r5, 28		/* *(d + 28) = t4 */
	addi	r6, r6, 32		/* s = s + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/*
	 * Source not word aligned: read aligned words through "as" and
	 * merge neighbouring words.  s is advanced past the whole block
	 * region up front because only "as" is used inside the loops.
	 */
a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

	/* source offset 3: 1 byte from the previous word, 3 from the next */
a_block_u3:
	bslli	r11, r11, 24		/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 32		/* v = *(as + 32) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	addi	r8, r8, 32		/* as = as + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_bu3_loop		/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/* source offset 1: 3 bytes from the previous word, 1 from the next */
a_block_u1:
	bslli	r11, r11, 8		/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 32		/* v = *(as + 32) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	addi	r8, r8, 32		/* as = as + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_bu1_loop		/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/* source offset 2: 2 bytes from the previous word, 2 from the next */
a_block_u2:
	bslli	r11, r11, 16		/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 32		/* v = *(as + 32) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	addi	r8, r8, 32		/* as = as + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_bu2_loop		/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	/* falls through to a_block_done */

a_block_done:
	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end		/* if n < 0, less than one word to transfer */

	/* whole words left over after the 32-byte blocks, indexed by offset */
a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s + offset) */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	addi	r4, r4,-4		/* n = n - 4 */
	bneid	r4, a_word_aligned	/* while (n) loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24		/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10		/* v = *(as + offset) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	addi	r4, r4,-4		/* n = n - 4 */
	bneid	r4, a_wu3_loop		/* while (n) loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8		/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10		/* v = *(as + offset) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	addi	r4, r4,-4		/* n = n - 4 */
	bneid	r4, a_wu1_loop		/* while (n) loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16		/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10		/* v = *(as + offset) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	addi	r4, r4,-4		/* n = n - 4 */
	bneid	r4, a_wu2_loop		/* while (n) loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */
	/* falls through to a_word_done */

	/* fold the word-loop offset back into the cursors and the count */
a_word_done:
	add	r5, r5, r10		/* d = d + offset */
	add	r6, r6, r10		/* s = s + offset */
	rsub	r7, r10, r7		/* c = c - offset */

	/* trailing bytes (also the whole copy when c < 4 on entry) */
a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done		/* while (c) */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r6, r6, 1		/* s++ */
	sbi	r9, r5, 0		/* *d = t1 */
	addi	r7, r7, -1		/* c-- */
	brid	a_xfer_end_loop		/* loop */
	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop				/* (IN DELAY SLOT) */

	.size	memcpy, . - memcpy
	.end	memcpy
/*----------------------------------------------------------------------------*/
	.globl	memmove
	.type	memmove, @function
	.ent	memmove

/*
 * memmove: copy r7 bytes from r6 to r5, returns r5 in r3.
 * When the destination is not above the source (unsigned compare) the
 * ascending memcpy body is used.  Otherwise the copy runs from the end of
 * both buffers downwards, so each source byte is read before the copy
 * can overwrite it.
 */
memmove:
	cmpu	r4, r5, r6		/* n = s - d */
	bgei	r4,fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	/* both cursors start one past the last byte and are pre-decremented */
	add	r5, r5, r7		/* d = d + c */
	add	r6, r6, r7		/* s = s + c */

	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4,d_xfer_end		/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4,d_dalign_done
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4,d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

	/* source and destination both word aligned: 8 words per iteration */
d_block_aligned:
	addi	r6, r6, -32		/* s = s - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r9, r6, 28		/* t1 = *(s + 28) */
	lwi	r10, r6, 24		/* t2 = *(s + 24) */
	lwi	r11, r6, 20		/* t3 = *(s + 20) */
	lwi	r12, r6, 16		/* t4 = *(s + 16) */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	swi	r10, r5, 24		/* *(d + 24) = t2 */
	swi	r11, r5, 20		/* *(d + 20) = t3 */
	swi	r12, r5, 16		/* *(d + 16) = t4 */
	lwi	r9, r6, 12		/* t1 = *(s + 12) */
	lwi	r10, r6, 8		/* t2 = *(s + 8) */
	lwi	r11, r6, 4		/* t3 = *(s + 4) */
	lwi	r12, r6, 0		/* t4 = *(s + 0) */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	swi	r10, r5, 8		/* *(d + 8) = t2 */
	swi	r11, r5, 4		/* *(d + 4) = t3 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

	/*
	 * Source not word aligned: read aligned words through "as" and
	 * merge neighbouring words.  s is moved below the whole block
	 * region up front because only "as" is used inside the loops.
	 */
d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9,d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_block_u2		/* t1 was 2 => 2 byte offset */

	/* source offset 3: 3 bytes from the upper word, 1 from the lower */
d_block_u3:
	bsrli	r11, r11, 8		/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32		/* as = as - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 0		/* v = *(as + 0) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_bu3_loop		/* while (n) loop */
	bsrli	r11, r12, 8		/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

	/* source offset 1: 1 byte from the upper word, 3 from the lower */
d_block_u1:
	bsrli	r11, r11, 24		/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32		/* as = as - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 0		/* v = *(as + 0) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_bu1_loop		/* while (n) loop */
	bsrli	r11, r12, 24		/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

	/* source offset 2: 2 bytes from the upper word, 2 from the lower */
d_block_u2:
	bsrli	r11, r11, 16		/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32		/* as = as - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 0		/* v = *(as + 0) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_bu2_loop		/* while (n) loop */
	bsrli	r11, r12, 16		/* h = v >> 16 (IN DELAY SLOT) */
	/* falls through to d_block_done */

d_block_done:
	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4,d_xfer_end		/* if n < 0, less than one word to transfer */

	/*
	 * Whole words left over after the 32-byte blocks.  The cursors are
	 * moved down by n first; n then serves as the descending index.
	 */
d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4,-4		/* n = n - 4 */
	lw	r9, r6, r4		/* t1 = *(s + n) */
	bneid	r4, d_word_aligned	/* while (n) loop */
	sw	r9, r5, r4		/* *(d + n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9,d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8		/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4,-4		/* n = n - 4 */
	lw	r12, r8, r4		/* v = *(as + n) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r4		/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop		/* while (n) loop */
	bsrli	r11, r12, 8		/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24		/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4,-4		/* n = n - 4 */
	lw	r12, r8, r4		/* v = *(as + n) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r4		/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop		/* while (n) loop */
	bsrli	r11, r12, 24		/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16		/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4,-4		/* n = n - 4 */
	lw	r12, r8, r4		/* v = *(as + n) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r4		/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop		/* while (n) loop */
	bsrli	r11, r12, 16		/* h = v >> 16 (IN DELAY SLOT) */
	/* falls through to d_word_done */

d_word_done:

	/* trailing bytes (also the whole copy when c < 4 on entry) */
d_xfer_end:
d_xfer_end_loop:
	/* exit through memmove's own epilogue, not memcpy's a_done */
	beqi	r7, d_done		/* while (c) */
	addi	r6, r6, -1		/* s-- */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r5, r5, -1		/* d-- */
	sbi	r9, r5, 0		/* *d = t1 */
	brid	d_xfer_end_loop		/* loop */
	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop				/* (IN DELAY SLOT) */

	.size	memmove, . - memmove
	.end	memmove