/* Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memcmp.S */
/* memcmp - compare memory
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "asmdefs.h"

/* Register roles (AAPCS64):
   x0 = src1 / result (w0), x1 = src2, x2 = limit (byte count).
   data1..data4 hold loaded words; note tmp (x6) aliases data4 —
   the two uses never overlap (tmp is only live in the SIMD path).  */
#define src1	x0
#define src2	x1
#define limit	x2
#define result	w0

#define data1	x3
#define data1w	w3
#define data2	x4
#define data2w	w4
#define data3	x5
#define data3w	w5
#define data4	x6
#define data4w	w6
#define tmp	x6
#define src1end	x7
#define src2end	x8


ENTRY (__memcmp_aarch64)
	cmp	limit, 16
	b.lo	L(less16)
	/* Compare the first 16 bytes.  The two ccmp instructions chain the
	   comparisons: flags end up "ne" if either 8-byte pair differs.  */
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	add	src1end, src1, limit
	add	src2end, src2, limit
	cmp	limit, 32
	b.ls	L(last_bytes)
	cmp	limit, 160
	b.hs	L(loop_align)	/* Large inputs use the SIMD loop.  */
	sub	limit, limit, 32

	.p2align 4
	/* Medium inputs (33-159 bytes): compare 32 bytes per iteration
	   using scalar ldp pairs; first 16 bytes were already checked.  */
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  */
	/* The loads from srcNend-16 may overlap bytes already compared;
	   re-comparing equal bytes is harmless.  */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
L(return2):
	/* Select the first differing 8-byte pair (data1/data2 if they
	   differ, otherwise data3/data4).  */
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	/* On little-endian, byte-reverse so an unsigned 64-bit compare
	   orders the words by memory (lexicographic) byte order.  */
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne		/* 0 if equal, 1 otherwise ...  */
	cneg	result, result, lo	/* ... negated to -1 if data1 < data2.  */
	ret

	.p2align 4
	/* Inputs of 0-15 bytes.  Each size class uses two overlapping
	   loads (from the start and from the end) to cover the range.  */
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

L(less4):
	tbz	limit, 1, L(less2)
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
L(less2):
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w	/* Byte difference is the result.  */
L(return_zero):
	ret

	/* Large inputs (>= 160 bytes): SIMD loop comparing 64 bytes per
	   iteration after aligning src2.  */
L(loop_align):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit.  */
	/* tmp = (src2 & 15) - 16 is in [-16, -1]; subtracting it advances
	   both pointers past the bytes just compared while making src2
	   16-byte aligned.  limit is rebiased so the loop's subs/ccmp
	   exit test works with the 64-byte stride.  */
	and	tmp, src2, 15
	sub	tmp, tmp, 16
	sub	src2, src2, tmp
	add	limit, limit, tmp
	sub	src1, src1, tmp
	sub	limit, limit, 64 + 16

	.p2align 4
L(loop64):
	/* XOR each 16-byte pair, then umaxp-reduce the four XOR results
	   down to 8 bytes in d0: tmp becomes nonzero iff any of the 64
	   bytes differ.  */
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b
	fmov	tmp, d0
	/* Loop while more than 64 bytes remain (hi from subs) AND the
	   block was equal (tmp == 0); otherwise the ccmp forces "ne".  */
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop.  */
	add	limit, limit, 64 + 16
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.  */
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	rev	tmp, tmp
	clz	tmp, tmp
	bic	tmp, tmp, 7
	/* Offset is relative to the post-incremented src pointers: the
	   compared block spans [src - 48, src + 16).  */
	sub	tmp, tmp, 48
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo	/* -1 if data1 < data2, else 1.  */
	ret

END (__memcmp_aarch64)