/*
 * Copyright (C) 2008-2009 Michal Simek <[email protected]>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <[email protected]>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input :	Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */

/*
 * Register usage (both routines):
 *	r3	return value: the destination address as passed in
 *	r4	n  - byte count of the phase currently running
 *	r5	d  - destination cursor
 *	r6	s  - source cursor
 *	r7	c  - bytes still to be copied by later phases
 *	r8	as - source cursor rounded down to a word boundary
 *	r9-r12	temporaries; r10 is the running offset in the ascending
 *		word loops, r11 (h) carries the bytes left over from the
 *		previously loaded source word in the unaligned loops
 *	r15	return address; both routines return with "rtsd r15, 8"
 *
 * Phases (ascending shown, descending mirrors it from the end):
 *	1. copy 0-3 bytes until the destination is word aligned
 *	2. copy 32-byte blocks
 *	3. copy the remaining whole words
 *	4. copy the remaining 0-3 bytes
 * Phases 2 and 3 have one variant for a word aligned source and one
 * variant per source byte offset (1, 2, 3).  The unaligned variants only
 * issue aligned word loads and merge neighbouring words with shifts; the
 * shift directions match big-endian byte order, which is why the file
 * refuses to build for __MICROBLAZEEL__.
 *
 * Instructions marked (IN DELAY SLOT) follow a delayed branch and are
 * executed whether or not the branch is taken.
 */

#ifdef __MICROBLAZEEL__
#error Microblaze LE not support ASM optimized lib func. Disable OPT_LIB_ASM.
#endif

#include <linux/linkage.h>
	.text
	.globl	memcpy
	.type	memcpy, @function
	.ent	memcpy

memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end		/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

a_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

a_block_aligned:
	lwi	r9, r6, 0		/* t1 = *(s + 0) */
	lwi	r10, r6, 4		/* t2 = *(s + 4) */
	lwi	r11, r6, 8		/* t3 = *(s + 8) */
	lwi	r12, r6, 12		/* t4 = *(s + 12) */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	swi	r10, r5, 4		/* *(d + 4) = t2 */
	swi	r11, r5, 8		/* *(d + 8) = t3 */
	swi	r12, r5, 12		/* *(d + 12) = t4 */
	lwi	r9, r6, 16		/* t1 = *(s + 16) */
	lwi	r10, r6, 20		/* t2 = *(s + 20) */
	lwi	r11, r6, 24		/* t3 = *(s + 24) */
	lwi	r12, r6, 28		/* t4 = *(s + 28) */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	swi	r10, r5, 20		/* *(d + 20) = t2 */
	swi	r11, r5, 24		/* *(d + 24) = t3 */
	swi	r12, r5, 28		/* *(d + 28) = t4 */
	addi	r6, r6, 32		/* s = s + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	/* dispatch on the source byte offset (1, 2 or 3) */
	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

a_block_u3:
	bslli	r11, r11, 24		/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	lwi	r12, r8, 32		/* v = *(as + 32) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	addi	r8, r8, 32		/* as = as + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_bu3_loop		/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u1:
	bslli	r11, r11, 8		/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	lwi	r12, r8, 32		/* v = *(as + 32) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	addi	r8, r8, 32		/* as = as + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_bu1_loop		/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u2:
	bslli	r11, r11, 16		/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	lwi	r12, r8, 32		/* v = *(as + 32) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	addi	r8, r8, 32		/* as = as + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_bu2_loop		/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	/* falls through to a_block_done */

a_block_done:
	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end		/* if n < 0, less than one word to transfer */

a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s+offset) */
	sw	r9, r5, r10		/* *(d+offset) = t1 */
	addi	r4, r4, -4		/* n = n - 4 */
	bneid	r4, a_word_aligned	/* loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	/* dispatch on the source byte offset (1, 2 or 3) */
	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24		/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10		/* v = *(as + offset) */
	bsrli	r9, r12, 8		/* t1 = v >> 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	bslli	r11, r12, 24		/* h = v << 24 */
	addi	r4, r4, -4		/* n = n - 4 */
	bneid	r4, a_wu3_loop		/* while (n) loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8		/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10		/* v = *(as + offset) */
	bsrli	r9, r12, 24		/* t1 = v >> 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	bslli	r11, r12, 8		/* h = v << 8 */
	addi	r4, r4, -4		/* n = n - 4 */
	bneid	r4, a_wu1_loop		/* while (n) loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16		/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10		/* v = *(as + offset) */
	bsrli	r9, r12, 16		/* t1 = v >> 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	bslli	r11, r12, 16		/* h = v << 16 */
	addi	r4, r4, -4		/* n = n - 4 */
	bneid	r4, a_wu2_loop		/* while (n) loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */
	/* falls through to a_word_done */

a_word_done:
	add	r5, r5, r10		/* d = d + offset */
	add	r6, r6, r10		/* s = s + offset */
	rsub	r7, r10, r7		/* c = c - offset */

a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done		/* while (c) */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r6, r6, 1		/* s++ */
	sbi	r9, r5, 0		/* *d = t1 */
	addi	r7, r7, -1		/* c-- */
	brid	a_xfer_end_loop		/* loop */
	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

	.size	memcpy, . - memcpy
	.end	memcpy
/*----------------------------------------------------------------------------*/
	.globl	memmove
	.type	memmove, @function
	.ent	memmove

/*
 * memmove: same register interface as memcpy.  When the destination is
 * not above the source the ascending memcpy body is reused; otherwise the
 * copy runs from the last byte down to the first.
 */
memmove:
	cmpu	r4, r5, r6		/* n = s - d */
	bgei	r4, fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7		/* d = d + c */
	add	r6, r6, r7		/* s = s + c */

	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end		/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, d_dalign_done
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

d_block_aligned:
	addi	r6, r6, -32		/* s = s - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r9, r6, 28		/* t1 = *(s + 28) */
	lwi	r10, r6, 24		/* t2 = *(s + 24) */
	lwi	r11, r6, 20		/* t3 = *(s + 20) */
	lwi	r12, r6, 16		/* t4 = *(s + 16) */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	swi	r10, r5, 24		/* *(d + 24) = t2 */
	swi	r11, r5, 20		/* *(d + 20) = t3 */
	swi	r12, r5, 16		/* *(d + 16) = t4 */
	lwi	r9, r6, 12		/* t1 = *(s + 12) */
	lwi	r10, r6, 8		/* t2 = *(s + 8) */
	lwi	r11, r6, 4		/* t3 = *(s + 4) */
	lwi	r12, r6, 0		/* t4 = *(s + 0) */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	swi	r10, r5, 8		/* *(d + 8) = t2 */
	swi	r11, r5, 4		/* *(d + 4) = t3 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	/* dispatch on the source byte offset (1, 2 or 3) */
	addi	r9, r9, -1
	beqi	r9, d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_block_u2		/* t1 was 2 => 2 byte offset */

d_block_u3:
	bsrli	r11, r11, 8		/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32		/* as = as - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bsrli	r11, r12, 8		/* h = v >> 8 */
	lwi	r12, r8, 0		/* v = *(as + 0) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_bu3_loop		/* while (n) loop */
	bsrli	r11, r12, 8		/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u1:
	bsrli	r11, r11, 24		/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32		/* as = as - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bsrli	r11, r12, 24		/* h = v >> 24 */
	lwi	r12, r8, 0		/* v = *(as + 0) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_bu1_loop		/* while (n) loop */
	bsrli	r11, r12, 24		/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u2:
	bsrli	r11, r11, 16		/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32		/* as = as - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r12, r8, 28		/* v = *(as + 28) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 24		/* v = *(as + 24) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 24		/* *(d + 24) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 20		/* v = *(as + 20) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 20		/* *(d + 20) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 16		/* v = *(as + 16) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 12		/* v = *(as + 12) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 8		/* v = *(as + 8) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 8		/* *(d + 8) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 4		/* v = *(as + 4) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 4		/* *(d + 4) = t1 */
	bsrli	r11, r12, 16		/* h = v >> 16 */
	lwi	r12, r8, 0		/* v = *(as + 0) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_bu2_loop		/* while (n) loop */
	bsrli	r11, r12, 16		/* h = v >> 16 (IN DELAY SLOT) */
	/* falls through to d_block_done */

d_block_done:
	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end		/* if n < 0, less than one word to transfer */

d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4, -4		/* n = n - 4 */
	lw	r9, r6, r4		/* t1 = *(s+n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4		/* *(d+n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	/* dispatch on the source byte offset (1, 2 or 3) */
	addi	r9, r9, -1
	beqi	r9, d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8		/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4, -4		/* n = n - 4 */
	lw	r12, r8, r4		/* v = *(as + n) */
	bslli	r9, r12, 24		/* t1 = v << 24 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r4		/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop		/* while (n) loop */
	bsrli	r11, r12, 8		/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24		/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4, -4		/* n = n - 4 */
	lw	r12, r8, r4		/* v = *(as + n) */
	bslli	r9, r12, 8		/* t1 = v << 8 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r4		/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop		/* while (n) loop */
	bsrli	r11, r12, 24		/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16		/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4, -4		/* n = n - 4 */
	lw	r12, r8, r4		/* v = *(as + n) */
	bslli	r9, r12, 16		/* t1 = v << 16 */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r4		/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop		/* while (n) loop */
	bsrli	r11, r12, 16		/* h = v >> 16 (IN DELAY SLOT) */
	/* falls through to d_word_done */

d_word_done:

d_xfer_end:
d_xfer_end_loop:
	/*
	 * Exit through memmove's own epilogue.  The loop used to branch to
	 * a_done inside memcpy, which left d_done unreachable.
	 */
	beqi	r7, d_done		/* while (c) */
	addi	r6, r6, -1		/* s-- */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r5, r5, -1		/* d-- */
	sbi	r9, r5, 0		/* *d = t1 */
	brid	d_xfer_end_loop		/* loop */
	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

	.size	memmove, . - memmove
	.end	memmove