;;; memcpy: copy r2 bytes from the buffer at r1 to the buffer at r0.
;;;
;;; In:   r0 = destination, r1 = source, r2 = size in bytes
;;; Out:  r0 is never written here, so it still holds the destination
;;; Uses: r3 is the running destination pointer; r1 is advanced and r2 is
;;;       decremented as bytes are consumed; r4-r10, lp_count and the flags
;;;       are scratch. The helper macros used below are defined outside this
;;;       view and may touch further registers (TODO confirm).
;;;
;;; Strategy:
;;;   - size == 0 : return at once
;;;   - size <= 8 : plain byte loop
;;;   - otherwise : copy single bytes until the destination is 4-byte
;;;                 aligned, then pick a path from the source alignment
;;;                 (r1 & 3): 0 = wide aligned copy, 1/2/3 = read aligned
;;;                 words and recombine them with shifts, carrying the
;;;                 not-yet-stored bytes in r5.
;;;
;;; Every loop is a zero-overhead loop: lpnz arms a loop that ends at the
;;; given label and runs lp_count times, and is skipped when Z is set.
;;; Instructions carrying the .d suffix execute the following instruction
;;; in their delay slot, whether or not the branch is taken.
ENTRY_CFI(memcpy)
;; Set the flags from the size without writing any register.
mov.f 0, r2
;;; if size is zero, return (the delay slot below still executes)
jz.d [blink]
mov r3, r0 ; work on a copy so that r0, the return value, stays intact
;;; if size <= 8, go straight to the byte loop
cmp r2, 8
bls.d @.Lsmallchunk
;; Delay slot: lp_count = size for the byte loop at .Lsmallchunk. Size is
;; non-zero here, so Z is clear and that loop will run. On the fall-through
;; path lp_count is simply overwritten below.
mov.f lp_count, r2
;;; Align the destination to 32 bits.
;; r4 = destination & 3; when it is already aligned Z is set and the loop
;; is skipped, otherwise copy 4 - r4 single bytes.
and.f r4, r0, 0x03
rsub lp_count, r4, 4
lpnz @.Laligndestination
;; LOOP BEGIN
ldb.ab r5, [r1,1]
sub r2, r2, 1
stb.ab r5, [r3,1]
.Laligndestination:
;;; Check the alignment of the source
and.f r4, r1, 0x03
bnz.d @.Lsourceunaligned
;;; CASE 0: Both source and destination are 32bit aligned
;;; Convert len to a count of wide-loop iterations, unfold x4.
;;; The instruction below sits in the delay slot of the branch above, so it
;;; also runs on the unaligned path, where lp_count and the flags are
;;; recomputed before being used.
;;; ZOLSHFT, ZOLAND, LOADX and STOREX are defined outside this view;
;;; presumably ZOLSHFT is log2 of the bytes moved per iteration and ZOLAND
;;; the matching remainder mask - TODO confirm.
lsr.f lp_count, r2, ZOLSHFT
lpnz @.Lcopy32_64bytes
;; LOOP START
;; Four loads followed by four stores per iteration.
LOADX (r6, r1)
LOADX (r8, r1)
LOADX (r10, r1)
LOADX (r4, r1)
STOREX (r6, r3)
STOREX (r8, r3)
STOREX (r10, r3)
STOREX (r4, r3)
.Lcopy32_64bytes:
and.f lp_count, r2, ZOLAND ; bytes left over after the wide loop
.Lsmallchunk:
;; Byte-wise copy of lp_count bytes. Reached either from the small-size
;; shortcut (lp_count = size) or by falling through with the remainder.
lpnz @.Lcopyremainingbytes
;; LOOP START
ldb.ab r5, [r1,1]
stb.ab r5, [r3,1]
.Lcopyremainingbytes:
j [blink]
;;; END CASE 0
;;; Source is not 32bit aligned: r4 = source & 3 is 1, 2 or 3.
;;; Dispatch on r4. Neither the sub nor the ldb in the delay slots changes
;;; the flags, so the bhi still tests the result of the cmp.
.Lsourceunaligned:
cmp r4, 2
beq.d @.LunalignedOffby2
;; Delay slot, executed for all three cases: account for one byte.
sub r2, r2, 1
bhi.d @.LunalignedOffby3
;; Delay slot, executed for cases 1 and 3: fetch that byte into r5.
ldb.ab r5, [r1, 1]
;;; CASE 1: The source is unaligned, off by 1
;; One byte has been read to reach 16bit alignment; read 2 more bytes
;; to reach 32bit alignment. Three bytes are then pending in r5/r6.
ldh.ab r6, [r1, 2]
sub r2, r2, 2
;; Convert the remaining length to loop iterations of 8 bytes, unfold x2
lsr.f lp_count, r2, 3
;; Combine the pending byte and halfword into the carry register r5.
;; MERGE_1 and MERGE_2 are defined outside this view (TODO confirm layout).
MERGE_1 (r6, r6, 8)
MERGE_2 (r5, r5, 24)
or r5, r5, r6
;; Both src and dst are aligned from here on
lpnz @.Lcopy8bytes_1
;; LOOP START
;; Per iteration: load two aligned words; each stored word is built from
;; the carry in r5 plus part of the new word, and the rest of the new word
;; becomes the next carry.
ld.ab r6, [r1, 4]
ld.ab r8, [r1,4]
SHIFT_1 (r7, r6, 24)
or r7, r7, r5
SHIFT_2 (r5, r6, 8)
SHIFT_1 (r9, r8, 24)
or r9, r9, r5
SHIFT_2 (r5, r8, 8)
st.ab r7, [r3, 4]
st.ab r9, [r3, 4]
.Lcopy8bytes_1:
;; Flush the three pending bytes still held in r5.
;; Write back the remaining 16bits
EXTRACT_1 (r6, r5, 16)
sth.ab r6, [r3, 2]
;; Write back the remaining 8bits
EXTRACT_2 (r5, r5, 16)
stb.ab r5, [r3, 1]
and.f lp_count, r2, 0x07 ; 0 to 7 trailing bytes
lpnz @.Lcopybytewise_1
;; LOOP START
ldb.ab r6, [r1,1]
stb.ab r6, [r3,1]
.Lcopybytewise_1:
j [blink]
.LunalignedOffby2:
;;; CASE 2: The source is unaligned, off by 2
;; Read the 2 bytes needed to reach 32bit alignment. One byte was already
;; accounted for in the dispatch delay slot, so subtract one more.
ldh.ab r5, [r1, 2]
sub r2, r2, 1
;; Both src and dst are aligned
;; Convert the remaining length to loop iterations of 8 bytes, unfold x2
lsr.f lp_count, r2, 3
;; Only when the loop will run (Z clear from the lsr.f above): move the
;; pending halfword into the upper 16 bits of r5.
;; NOTE(review): this shift has to agree with what SHIFT_1 and SHIFT_2
;; expand to, which is presumably byte-order dependent and is not visible
;; here - confirm.
asl.nz r5, r5, 16
lpnz @.Lcopy8bytes_2
;; LOOP START
;; Same carry scheme as case 1, with a 16/16 bit split.
ld.ab r6, [r1, 4]
ld.ab r8, [r1,4]
SHIFT_1 (r7, r6, 16)
or r7, r7, r5
SHIFT_2 (r5, r6, 16)
SHIFT_1 (r9, r8, 16)
or r9, r9, r5
SHIFT_2 (r5, r8, 16)
st.ab r7, [r3, 4]
st.ab r9, [r3, 4]
.Lcopy8bytes_2:
;; Bring the pending halfword back down under the same condition as the
;; shift before the loop. No visible instruction since the lsr.f sets
;; flags (assuming the SHIFT macros leave them alone - TODO confirm).
lsr.nz r5, r5, 16
;; Flush the two pending bytes.
sth.ab r5, [r3, 2]
and.f lp_count, r2, 0x07 ; 0 to 7 trailing bytes
lpnz @.Lcopybytewise_2
;; LOOP START
ldb.ab r6, [r1,1]
stb.ab r6, [r3,1]
.Lcopybytewise_2:
j [blink]
.LunalignedOffby3:
;;; CASE 3: The source is unaligned, off by 3
;;; The single byte needed to reach 32bit alignment was already read into
;;; r5 and accounted for in the dispatch delay slots.
;; Both src and dst are aligned
;; Convert the remaining length to loop iterations of 8 bytes, unfold x2
lsr.f lp_count, r2, 3
;; Only when the loop will run: move the pending byte into the top 8 bits
;; of r5 (.ne is the same condition as .nz).
;; NOTE(review): as in case 2, this has to agree with the SHIFT_1 and
;; SHIFT_2 definitions, which are not visible here - confirm.
asl.ne r5, r5, 24
lpnz @.Lcopy8bytes_3
;; LOOP START
;; Same carry scheme as case 1, with an 8/24 bit split.
ld.ab r6, [r1, 4]
ld.ab r8, [r1,4]
SHIFT_1 (r7, r6, 8)
or r7, r7, r5
SHIFT_2 (r5, r6, 24)
SHIFT_1 (r9, r8, 8)
or r9, r9, r5
SHIFT_2 (r5, r8, 24)
st.ab r7, [r3, 4]
st.ab r9, [r3, 4]
.Lcopy8bytes_3:
;; Bring the pending byte back down under the same condition as the shift
;; before the loop, then flush it.
lsr.nz r5, r5, 24
stb.ab r5, [r3, 1]
and.f lp_count, r2, 0x07 ; 0 to 7 trailing bytes
lpnz @.Lcopybytewise_3
;; LOOP START
ldb.ab r6, [r1,1]
stb.ab r6, [r3,1]
.Lcopybytewise_3:
j [blink]
END_CFI(memcpy)