/*-
 * Copyright (c) 2018 Instituto de Pesquisas Eldorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of its contributors may
 *    be used to endorse or promote products derived from this software
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <machine/asm.h>
#define BLOCK_SIZE_BITS		6
#define BLOCK_SIZE		(1 << BLOCK_SIZE_BITS)
#define BLOCK_SIZE_MASK		(BLOCK_SIZE - 1)

/* Minimum 8 byte alignment, to avoid cache-inhibited alignment faults.*/
#ifndef ALIGN_MASK
#define ALIGN_MASK		0x7
#endif

#define MULTI_PHASE_THRESHOLD	512

#ifndef FN_NAME
#ifdef MEMMOVE
#define FN_NAME	__memmove
WEAK_REFERENCE(__memmove, memmove);
#else
#define FN_NAME	__bcopy
WEAK_REFERENCE(__bcopy, bcopy);
#endif
#endif

/*
 * r3: dst
 * r4: src
 * r5: len
 */

ENTRY(FN_NAME)
	cmpld	%r3, %r4		/* src == dst? nothing to do */
	beqlr-
	cmpdi	%r5, 0			/* len == 0? nothing to do */
	beqlr-

#ifdef MEMMOVE
	std	%r3, -8(%r1)		/* save dst */
#else	/* bcopy: swap src/dst */
	mr	%r0, %r3
	mr	%r3, %r4
	mr	%r4, %r0
#endif

	/* First check for relative alignment, if unaligned copy one byte at a time */
	andi.	%r8, %r3, ALIGN_MASK
	andi.	%r7, %r4, ALIGN_MASK
	cmpd	%r7, %r8
	bne	.Lunaligned


	cmpldi	%r5, MULTI_PHASE_THRESHOLD
	bge	.Lmulti_phase
	b	.Lfast_copy

.Lunaligned:
	/* forward or backward copy? */
	cmpd	%r4, %r3
	blt	.Lbackward_unaligned

	/* Just need to setup increment and jump to copy */
	li	%r0, 1
	mtctr	%r5
	b	.Lsingle_1_loop

.Lbackward_unaligned:
	/* advance src and dst to last byte, set decrement and jump to copy */
	add	%r3, %r3, %r5
	addi	%r3, %r3, -1
	add	%r4, %r4, %r5
	addi	%r4, %r4, -1
	li	%r0, -1
	mtctr	%r5
	b	.Lsingle_1_loop

.Lfast_copy:
	/* align src */
	cmpd	%r4, %r3		/* forward or backward copy? */
	blt	.Lbackward_align

	.align 5
.Lalign:
	andi.	%r0, %r4, 15
	beq	.Lsingle_copy
	lbz	%r0, 0(%r4)
	addi	%r4, %r4, 1
	stb	%r0, 0(%r3)
	addi	%r3, %r3, 1
	addi	%r5, %r5, -1
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lalign

.Lbackward_align:
	/* advance src and dst to end (past last byte) */
	add	%r3, %r3, %r5
	add	%r4, %r4, %r5
	.align 5
.Lbackward_align_loop:
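	/* copy bytes backwards, one at a time, until src (%r4) is 16-byte aligned or len reaches zero */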
	andi.	%r0, %r4, 15
	beq	.Lbackward_single_copy
	lbzu	%r0, -1(%r4)
	addi	%r5, %r5, -1
	stbu	%r0, -1(%r3)
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lbackward_align_loop

.Lsingle_copy:
	/* forward copy */
	li	%r0, 1
	li	%r8, 16
	li	%r9, 0
	b	.Lsingle_phase

.Lbackward_single_copy:
	/* backward copy */
	li	%r0, -1
	li	%r8, -16
	li	%r9, -15
	/* point src and dst to last byte */
	addi	%r3, %r3, -1
	addi	%r4, %r4, -1

.Lsingle_phase:
	srdi.	%r6, %r5, 4		/* number of 16-bytes */
	beq	.Lsingle_1

	/* pre-adjustment */
	add	%r3, %r3, %r9
	add	%r4, %r4, %r9

	mtctr	%r6
	.align 5
.Lsingle_16_loop:
	ld	%r6, 0(%r4)
	ld	%r7, 8(%r4)
	add	%r4, %r4, %r8
	std	%r6, 0(%r3)
	std	%r7, 8(%r3)
	add	%r3, %r3, %r8
	bdnz	.Lsingle_16_loop

	/* post-adjustment */
	sub	%r3, %r3, %r9
	sub	%r4, %r4, %r9

.Lsingle_1:
	andi.	%r6, %r5, 0x0f		/* number of 1-bytes */
	beq	.Ldone			/* 1-bytes == 0? done */

	mtctr	%r6
	.align 5
.Lsingle_1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* increment */
	bdnz	.Lsingle_1_loop

.Ldone:
#ifdef MEMMOVE
	ld	%r3, -8(%r1)		/* restore dst */
#endif
	blr


.Lmulti_phase:
	/* set up multi-phase copy parameters */

	/* r7 = bytes before the aligned section of the buffer */
	andi.	%r6, %r4, 15
	subfic	%r7, %r6, 16
	/* r8 = bytes in and after the aligned section of the buffer */
	sub	%r8, %r5, %r7
	/* r9 = bytes after the aligned section of the buffer */
	andi.	%r9, %r8, BLOCK_SIZE_MASK
	/* r10 = BLOCKS in the aligned section of the buffer */
	srdi	%r10, %r8, BLOCK_SIZE_BITS

	/* forward or backward copy? */
	cmpd	%r4, %r3
	blt	.Lbackward_multi_copy

	/* set up forward copy parameters */
	std	%r7,  -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r9,  -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, 1			/* increment for phases 1 and 3 */
	li	%r5, BLOCK_SIZE		/* increment for phase 2 */

	/* op offsets for phase 2 */
	li	%r7,  0
	li	%r8,  16
	li	%r9,  32
	li	%r10, 48

	std	%r8, -16(%r1)		/* 16-byte increment (16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (0) */

	b	.Lphase1

.Lbackward_multi_copy:
	/* set up backward copy parameters */
	std	%r9,  -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r7,  -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, -1			/* increment for phases 1 and 3 */
	add	%r6, %r5, %r0		/* r6 = len - 1 */
	li	%r5, -BLOCK_SIZE	/* increment for phase 2 */
	/* advance src and dst to the last position */
	add	%r3, %r3, %r6
	add	%r4, %r4, %r6

	/* op offsets for phase 2 */
	li	%r7,  -15
	li	%r8,  -31
	li	%r9,  -47
	li	%r10, -63

	add	%r6, %r7, %r0		/* r6 = -16 */
	std	%r6, -16(%r1)		/* 16-byte increment (-16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (-15) */

.Lphase1:
	ld	%r6, -32(%r1)		/* bytes to copy in phase 1 */
	cmpldi	%r6, 0			/* r6 == 0? skip phase 1 */
	beq+	.Lphase2

	mtctr	%r6
	.align 5
.Lphase1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* phase 1 increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* phase 1 increment */
	bdnz	.Lphase1_loop

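	/*
	 * Phase 2: bulk-copy the src-aligned middle of the buffer in
	 * BLOCK_SIZE (64-byte) chunks, eight doublewords per iteration
	 * via indexed loads/stores.
	 */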
.Lphase2:
	ld	%r6, -40(%r1)		/* BLOCKS to copy in phase 2 */
	cmpldi	%r6, 0			/* %r6 == 0? skip phase 2 */
	beq	.Lphase3

#ifdef FN_PHASE2
FN_PHASE2
#else
	/* save registers */
	std	%r14, -56(%r1)
	std	%r15, -64(%r1)
	std	%r16, -72(%r1)
	std	%r17, -80(%r1)
	std	%r18, -88(%r1)
	std	%r19, -96(%r1)
	std	%r20, -104(%r1)
	std	%r21, -112(%r1)

	addi	%r18, %r7, 8
	addi	%r19, %r8, 8
	addi	%r20, %r9, 8
	addi	%r21, %r10, 8

	mtctr	%r6
	.align 5
.Lphase2_loop:
	ldx	%r14, %r7, %r4
	ldx	%r15, %r18, %r4
	ldx	%r16, %r8, %r4
	ldx	%r17, %r19, %r4
	stdx	%r14, %r7, %r3
	stdx	%r15, %r18, %r3
	stdx	%r16, %r8, %r3
	stdx	%r17, %r19, %r3

	ldx	%r14, %r9, %r4
	ldx	%r15, %r20, %r4
	ldx	%r16, %r10, %r4
	ldx	%r17, %r21, %r4
	stdx	%r14, %r9, %r3
	stdx	%r15, %r20, %r3
	stdx	%r16, %r10, %r3
	stdx	%r17, %r21, %r3

	add	%r4, %r4, %r5		/* phase 2 increment */
	add	%r3, %r3, %r5		/* phase 2 increment */

	bdnz	.Lphase2_loop

	/* restore registers */
	ld	%r14, -56(%r1)
	ld	%r15, -64(%r1)
	ld	%r16, -72(%r1)
	ld	%r17, -80(%r1)
	ld	%r18, -88(%r1)
	ld	%r19, -96(%r1)
	ld	%r20, -104(%r1)
	ld	%r21, -112(%r1)
#endif

.Lphase3:
	/* load registers for transitioning into the single-phase logic */
	ld	%r5, -48(%r1)		/* bytes to copy in phase 3 */
	ld	%r8, -16(%r1)		/* 16-byte increment */
	ld	%r9, -24(%r1)		/* 16-byte pre/post adjustment */
	b	.Lsingle_phase

END(FN_NAME)

	.section .note.GNU-stack,"",%progbits