/* Source: thirdparty/mbedtls/library/bn_mul.h */
/**1* \file bn_mul.h2*3* \brief Multi-precision integer library4*/5/*6* Copyright The Mbed TLS Contributors7* SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later8*/9/*10* Multiply source vector [s] with b, add result11* to destination vector [d] and set carry c.12*13* Currently supports:14*15* . IA-32 (386+) . AMD64 / EM64T16* . IA-32 (SSE2) . Motorola 6800017* . PowerPC, 32-bit . MicroBlaze18* . PowerPC, 64-bit . TriCore19* . SPARC v8 . ARM v3+20* . Alpha . MIPS3221* . C, longlong . C, generic22*/23#ifndef MBEDTLS_BN_MUL_H24#define MBEDTLS_BN_MUL_H2526#include "mbedtls/build_info.h"2728#include "mbedtls/bignum.h"293031/*32* Conversion macros for embedded constants:33* build lists of mbedtls_mpi_uint's from lists of unsigned char's grouped by 8, 4 or 234*/35#if defined(MBEDTLS_HAVE_INT32)3637#define MBEDTLS_BYTES_TO_T_UINT_4(a, b, c, d) \38((mbedtls_mpi_uint) (a) << 0) | \39((mbedtls_mpi_uint) (b) << 8) | \40((mbedtls_mpi_uint) (c) << 16) | \41((mbedtls_mpi_uint) (d) << 24)4243#define MBEDTLS_BYTES_TO_T_UINT_2(a, b) \44MBEDTLS_BYTES_TO_T_UINT_4(a, b, 0, 0)4546#define MBEDTLS_BYTES_TO_T_UINT_8(a, b, c, d, e, f, g, h) \47MBEDTLS_BYTES_TO_T_UINT_4(a, b, c, d), \48MBEDTLS_BYTES_TO_T_UINT_4(e, f, g, h)4950#else /* 64-bits */5152#define MBEDTLS_BYTES_TO_T_UINT_8(a, b, c, d, e, f, g, h) \53((mbedtls_mpi_uint) (a) << 0) | \54((mbedtls_mpi_uint) (b) << 8) | \55((mbedtls_mpi_uint) (c) << 16) | \56((mbedtls_mpi_uint) (d) << 24) | \57((mbedtls_mpi_uint) (e) << 32) | \58((mbedtls_mpi_uint) (f) << 40) | \59((mbedtls_mpi_uint) (g) << 48) | \60((mbedtls_mpi_uint) (h) << 56)6162#define MBEDTLS_BYTES_TO_T_UINT_4(a, b, c, d) \63MBEDTLS_BYTES_TO_T_UINT_8(a, b, c, d, 0, 0, 0, 0)6465#define MBEDTLS_BYTES_TO_T_UINT_2(a, b) \66MBEDTLS_BYTES_TO_T_UINT_8(a, b, 0, 0, 0, 0, 0, 0)6768#endif /* bits in mbedtls_mpi_uint */6970/* *INDENT-OFF* */71#if defined(MBEDTLS_HAVE_ASM)7273/* armcc5 --gnu defines __GNUC__ but doesn't support GNU's extended asm */74#if defined(__GNUC__) && \75( 
!defined(__ARMCC_VERSION) || __ARMCC_VERSION >= 6000000 )7677/*78* GCC < 5.0 treated the x86 ebx (which is used for the GOT) as a79* fixed reserved register when building as PIC, leading to errors80* like: bn_mul.h:46:13: error: PIC register clobbered by 'ebx' in 'asm'81*82* This is fixed by an improved register allocator in GCC 5+. From the83* release notes:84* Register allocation improvements: Reuse of the PIC hard register,85* instead of using a fixed register, was implemented on x86/x86-6486* targets. This improves generated PIC code performance as more hard87* registers can be used.88*/89#if defined(__GNUC__) && __GNUC__ < 5 && defined(__PIC__)90#define MULADDC_CANNOT_USE_EBX91#endif9293/*94* Disable use of the i386 assembly code below if option -O0, to disable all95* compiler optimisations, is passed, detected with __OPTIMIZE__96* This is done as the number of registers used in the assembly code doesn't97* work with the -O0 option.98*/99#if defined(__i386__) && defined(__OPTIMIZE__) && !defined(MULADDC_CANNOT_USE_EBX)100101#define MULADDC_X1_INIT \102{ mbedtls_mpi_uint t; \103asm( \104"movl %%ebx, %0 \n\t" \105"movl %5, %%esi \n\t" \106"movl %6, %%edi \n\t" \107"movl %7, %%ecx \n\t" \108"movl %8, %%ebx \n\t"109110#define MULADDC_X1_CORE \111"lodsl \n\t" \112"mull %%ebx \n\t" \113"addl %%ecx, %%eax \n\t" \114"adcl $0, %%edx \n\t" \115"addl (%%edi), %%eax \n\t" \116"adcl $0, %%edx \n\t" \117"movl %%edx, %%ecx \n\t" \118"stosl \n\t"119120#define MULADDC_X1_STOP \121"movl %4, %%ebx \n\t" \122"movl %%ecx, %1 \n\t" \123"movl %%edi, %2 \n\t" \124"movl %%esi, %3 \n\t" \125: "=m" (t), "=m" (c), "=m" (d), "=m" (s) \126: "m" (t), "m" (s), "m" (d), "m" (c), "m" (b) \127: "eax", "ebx", "ecx", "edx", "esi", "edi" \128); }129130#if defined(MBEDTLS_HAVE_SSE2)131132#define MULADDC_X8_INIT MULADDC_X1_INIT133134#define MULADDC_X8_CORE \135"movd %%ecx, %%mm1 \n\t" \136"movd %%ebx, %%mm0 \n\t" \137"movd (%%edi), %%mm3 \n\t" \138"paddq %%mm3, %%mm1 \n\t" \139"movd (%%esi), %%mm2 
\n\t" \140"pmuludq %%mm0, %%mm2 \n\t" \141"movd 4(%%esi), %%mm4 \n\t" \142"pmuludq %%mm0, %%mm4 \n\t" \143"movd 8(%%esi), %%mm6 \n\t" \144"pmuludq %%mm0, %%mm6 \n\t" \145"movd 12(%%esi), %%mm7 \n\t" \146"pmuludq %%mm0, %%mm7 \n\t" \147"paddq %%mm2, %%mm1 \n\t" \148"movd 4(%%edi), %%mm3 \n\t" \149"paddq %%mm4, %%mm3 \n\t" \150"movd 8(%%edi), %%mm5 \n\t" \151"paddq %%mm6, %%mm5 \n\t" \152"movd 12(%%edi), %%mm4 \n\t" \153"paddq %%mm4, %%mm7 \n\t" \154"movd %%mm1, (%%edi) \n\t" \155"movd 16(%%esi), %%mm2 \n\t" \156"pmuludq %%mm0, %%mm2 \n\t" \157"psrlq $32, %%mm1 \n\t" \158"movd 20(%%esi), %%mm4 \n\t" \159"pmuludq %%mm0, %%mm4 \n\t" \160"paddq %%mm3, %%mm1 \n\t" \161"movd 24(%%esi), %%mm6 \n\t" \162"pmuludq %%mm0, %%mm6 \n\t" \163"movd %%mm1, 4(%%edi) \n\t" \164"psrlq $32, %%mm1 \n\t" \165"movd 28(%%esi), %%mm3 \n\t" \166"pmuludq %%mm0, %%mm3 \n\t" \167"paddq %%mm5, %%mm1 \n\t" \168"movd 16(%%edi), %%mm5 \n\t" \169"paddq %%mm5, %%mm2 \n\t" \170"movd %%mm1, 8(%%edi) \n\t" \171"psrlq $32, %%mm1 \n\t" \172"paddq %%mm7, %%mm1 \n\t" \173"movd 20(%%edi), %%mm5 \n\t" \174"paddq %%mm5, %%mm4 \n\t" \175"movd %%mm1, 12(%%edi) \n\t" \176"psrlq $32, %%mm1 \n\t" \177"paddq %%mm2, %%mm1 \n\t" \178"movd 24(%%edi), %%mm5 \n\t" \179"paddq %%mm5, %%mm6 \n\t" \180"movd %%mm1, 16(%%edi) \n\t" \181"psrlq $32, %%mm1 \n\t" \182"paddq %%mm4, %%mm1 \n\t" \183"movd 28(%%edi), %%mm5 \n\t" \184"paddq %%mm5, %%mm3 \n\t" \185"movd %%mm1, 20(%%edi) \n\t" \186"psrlq $32, %%mm1 \n\t" \187"paddq %%mm6, %%mm1 \n\t" \188"movd %%mm1, 24(%%edi) \n\t" \189"psrlq $32, %%mm1 \n\t" \190"paddq %%mm3, %%mm1 \n\t" \191"movd %%mm1, 28(%%edi) \n\t" \192"addl $32, %%edi \n\t" \193"addl $32, %%esi \n\t" \194"psrlq $32, %%mm1 \n\t" \195"movd %%mm1, %%ecx \n\t"196197#define MULADDC_X8_STOP \198"emms \n\t" \199"movl %4, %%ebx \n\t" \200"movl %%ecx, %1 \n\t" \201"movl %%edi, %2 \n\t" \202"movl %%esi, %3 \n\t" \203: "=m" (t), "=m" (c), "=m" (d), "=m" (s) \204: "m" (t), "m" (s), "m" (d), "m" (c), "m" (b) \205: "eax", 
"ebx", "ecx", "edx", "esi", "edi" \206); } \207208#endif /* SSE2 */209210#endif /* i386 */211212#if defined(__amd64__) || defined (__x86_64__)213214#define MULADDC_X1_INIT \215asm( \216"xorq %%r8, %%r8\n"217218#define MULADDC_X1_CORE \219"movq (%%rsi), %%rax\n" \220"mulq %%rbx\n" \221"addq $8, %%rsi\n" \222"addq %%rcx, %%rax\n" \223"movq %%r8, %%rcx\n" \224"adcq $0, %%rdx\n" \225"nop \n" \226"addq %%rax, (%%rdi)\n" \227"adcq %%rdx, %%rcx\n" \228"addq $8, %%rdi\n"229230#define MULADDC_X1_STOP \231: "+c" (c), "+D" (d), "+S" (s), "+m" (*(uint64_t (*)[16]) d) \232: "b" (b), "m" (*(const uint64_t (*)[16]) s) \233: "rax", "rdx", "r8" \234);235236#endif /* AMD64 */237238// The following assembly code assumes that a pointer will fit in a 64-bit register239// (including ILP32 __aarch64__ ABIs such as on watchOS, hence the 2^32 - 1)240#if defined(__aarch64__) && (UINTPTR_MAX == 0xfffffffful || UINTPTR_MAX == 0xfffffffffffffffful)241242/*243* There are some issues around different compilers requiring different constraint244* syntax for updating pointers from assembly code (see notes for245* MBEDTLS_ASM_AARCH64_PTR_CONSTRAINT in common.h), especially on aarch64_32 (aka ILP32).246*247* For this reason we cast the pointers to/from uintptr_t here.248*/249#define MULADDC_X1_INIT \250do { uintptr_t muladdc_d = (uintptr_t) d, muladdc_s = (uintptr_t) s; asm(251252#define MULADDC_X1_CORE \253"ldr x4, [%x2], #8 \n\t" \254"ldr x5, [%x1] \n\t" \255"mul x6, x4, %4 \n\t" \256"umulh x7, x4, %4 \n\t" \257"adds x5, x5, x6 \n\t" \258"adc x7, x7, xzr \n\t" \259"adds x5, x5, %0 \n\t" \260"adc %0, x7, xzr \n\t" \261"str x5, [%x1], #8 \n\t"262263#define MULADDC_X1_STOP \264: "+r" (c), \265"+r" (muladdc_d), \266"+r" (muladdc_s), \267"+m" (*(uint64_t (*)[16]) d) \268: "r" (b), "m" (*(const uint64_t (*)[16]) s) \269: "x4", "x5", "x6", "x7", "cc" \270); d = (mbedtls_mpi_uint *)muladdc_d; s = (mbedtls_mpi_uint *)muladdc_s; } while (0);271272#endif /* Aarch64 */273274#if defined(__mc68020__) || 
defined(__mcpu32__)275276#define MULADDC_X1_INIT \277asm( \278"movl %3, %%a2 \n\t" \279"movl %4, %%a3 \n\t" \280"movl %5, %%d3 \n\t" \281"movl %6, %%d2 \n\t" \282"moveq #0, %%d0 \n\t"283284#define MULADDC_X1_CORE \285"movel %%a2@+, %%d1 \n\t" \286"mulul %%d2, %%d4:%%d1 \n\t" \287"addl %%d3, %%d1 \n\t" \288"addxl %%d0, %%d4 \n\t" \289"moveq #0, %%d3 \n\t" \290"addl %%d1, %%a3@+ \n\t" \291"addxl %%d4, %%d3 \n\t"292293#define MULADDC_X1_STOP \294"movl %%d3, %0 \n\t" \295"movl %%a3, %1 \n\t" \296"movl %%a2, %2 \n\t" \297: "=m" (c), "=m" (d), "=m" (s) \298: "m" (s), "m" (d), "m" (c), "m" (b) \299: "d0", "d1", "d2", "d3", "d4", "a2", "a3" \300);301302#define MULADDC_X8_INIT MULADDC_X1_INIT303304#define MULADDC_X8_CORE \305"movel %%a2@+, %%d1 \n\t" \306"mulul %%d2, %%d4:%%d1 \n\t" \307"addxl %%d3, %%d1 \n\t" \308"addxl %%d0, %%d4 \n\t" \309"addl %%d1, %%a3@+ \n\t" \310"movel %%a2@+, %%d1 \n\t" \311"mulul %%d2, %%d3:%%d1 \n\t" \312"addxl %%d4, %%d1 \n\t" \313"addxl %%d0, %%d3 \n\t" \314"addl %%d1, %%a3@+ \n\t" \315"movel %%a2@+, %%d1 \n\t" \316"mulul %%d2, %%d4:%%d1 \n\t" \317"addxl %%d3, %%d1 \n\t" \318"addxl %%d0, %%d4 \n\t" \319"addl %%d1, %%a3@+ \n\t" \320"movel %%a2@+, %%d1 \n\t" \321"mulul %%d2, %%d3:%%d1 \n\t" \322"addxl %%d4, %%d1 \n\t" \323"addxl %%d0, %%d3 \n\t" \324"addl %%d1, %%a3@+ \n\t" \325"movel %%a2@+, %%d1 \n\t" \326"mulul %%d2, %%d4:%%d1 \n\t" \327"addxl %%d3, %%d1 \n\t" \328"addxl %%d0, %%d4 \n\t" \329"addl %%d1, %%a3@+ \n\t" \330"movel %%a2@+, %%d1 \n\t" \331"mulul %%d2, %%d3:%%d1 \n\t" \332"addxl %%d4, %%d1 \n\t" \333"addxl %%d0, %%d3 \n\t" \334"addl %%d1, %%a3@+ \n\t" \335"movel %%a2@+, %%d1 \n\t" \336"mulul %%d2, %%d4:%%d1 \n\t" \337"addxl %%d3, %%d1 \n\t" \338"addxl %%d0, %%d4 \n\t" \339"addl %%d1, %%a3@+ \n\t" \340"movel %%a2@+, %%d1 \n\t" \341"mulul %%d2, %%d3:%%d1 \n\t" \342"addxl %%d4, %%d1 \n\t" \343"addxl %%d0, %%d3 \n\t" \344"addl %%d1, %%a3@+ \n\t" \345"addxl %%d0, %%d3 \n\t"346347#define MULADDC_X8_STOP MULADDC_X1_STOP348349#endif /* 
MC68000 */350351#if defined(__powerpc64__) || defined(__ppc64__)352353#if defined(__MACH__) && defined(__APPLE__)354355#define MULADDC_X1_INIT \356asm( \357"ld r3, %3 \n\t" \358"ld r4, %4 \n\t" \359"ld r5, %5 \n\t" \360"ld r6, %6 \n\t" \361"addi r3, r3, -8 \n\t" \362"addi r4, r4, -8 \n\t" \363"addic r5, r5, 0 \n\t"364365#define MULADDC_X1_CORE \366"ldu r7, 8(r3) \n\t" \367"mulld r8, r7, r6 \n\t" \368"mulhdu r9, r7, r6 \n\t" \369"adde r8, r8, r5 \n\t" \370"ld r7, 8(r4) \n\t" \371"addze r5, r9 \n\t" \372"addc r8, r8, r7 \n\t" \373"stdu r8, 8(r4) \n\t"374375#define MULADDC_X1_STOP \376"addze r5, r5 \n\t" \377"addi r4, r4, 8 \n\t" \378"addi r3, r3, 8 \n\t" \379"std r5, %0 \n\t" \380"std r4, %1 \n\t" \381"std r3, %2 \n\t" \382: "=m" (c), "=m" (d), "=m" (s) \383: "m" (s), "m" (d), "m" (c), "m" (b) \384: "r3", "r4", "r5", "r6", "r7", "r8", "r9" \385);386387388#else /* __MACH__ && __APPLE__ */389390#define MULADDC_X1_INIT \391asm( \392"ld %%r3, %3 \n\t" \393"ld %%r4, %4 \n\t" \394"ld %%r5, %5 \n\t" \395"ld %%r6, %6 \n\t" \396"addi %%r3, %%r3, -8 \n\t" \397"addi %%r4, %%r4, -8 \n\t" \398"addic %%r5, %%r5, 0 \n\t"399400#define MULADDC_X1_CORE \401"ldu %%r7, 8(%%r3) \n\t" \402"mulld %%r8, %%r7, %%r6 \n\t" \403"mulhdu %%r9, %%r7, %%r6 \n\t" \404"adde %%r8, %%r8, %%r5 \n\t" \405"ld %%r7, 8(%%r4) \n\t" \406"addze %%r5, %%r9 \n\t" \407"addc %%r8, %%r8, %%r7 \n\t" \408"stdu %%r8, 8(%%r4) \n\t"409410#define MULADDC_X1_STOP \411"addze %%r5, %%r5 \n\t" \412"addi %%r4, %%r4, 8 \n\t" \413"addi %%r3, %%r3, 8 \n\t" \414"std %%r5, %0 \n\t" \415"std %%r4, %1 \n\t" \416"std %%r3, %2 \n\t" \417: "=m" (c), "=m" (d), "=m" (s) \418: "m" (s), "m" (d), "m" (c), "m" (b) \419: "r3", "r4", "r5", "r6", "r7", "r8", "r9" \420);421422#endif /* __MACH__ && __APPLE__ */423424#elif defined(__powerpc__) || defined(__ppc__) /* end PPC64/begin PPC32 */425426#if defined(__MACH__) && defined(__APPLE__)427428#define MULADDC_X1_INIT \429asm( \430"lwz r3, %3 \n\t" \431"lwz r4, %4 \n\t" \432"lwz r5, %5 \n\t" 
\433"lwz r6, %6 \n\t" \434"addi r3, r3, -4 \n\t" \435"addi r4, r4, -4 \n\t" \436"addic r5, r5, 0 \n\t"437438#define MULADDC_X1_CORE \439"lwzu r7, 4(r3) \n\t" \440"mullw r8, r7, r6 \n\t" \441"mulhwu r9, r7, r6 \n\t" \442"adde r8, r8, r5 \n\t" \443"lwz r7, 4(r4) \n\t" \444"addze r5, r9 \n\t" \445"addc r8, r8, r7 \n\t" \446"stwu r8, 4(r4) \n\t"447448#define MULADDC_X1_STOP \449"addze r5, r5 \n\t" \450"addi r4, r4, 4 \n\t" \451"addi r3, r3, 4 \n\t" \452"stw r5, %0 \n\t" \453"stw r4, %1 \n\t" \454"stw r3, %2 \n\t" \455: "=m" (c), "=m" (d), "=m" (s) \456: "m" (s), "m" (d), "m" (c), "m" (b) \457: "r3", "r4", "r5", "r6", "r7", "r8", "r9" \458);459460#else /* __MACH__ && __APPLE__ */461462#define MULADDC_X1_INIT \463asm( \464"lwz %%r3, %3 \n\t" \465"lwz %%r4, %4 \n\t" \466"lwz %%r5, %5 \n\t" \467"lwz %%r6, %6 \n\t" \468"addi %%r3, %%r3, -4 \n\t" \469"addi %%r4, %%r4, -4 \n\t" \470"addic %%r5, %%r5, 0 \n\t"471472#define MULADDC_X1_CORE \473"lwzu %%r7, 4(%%r3) \n\t" \474"mullw %%r8, %%r7, %%r6 \n\t" \475"mulhwu %%r9, %%r7, %%r6 \n\t" \476"adde %%r8, %%r8, %%r5 \n\t" \477"lwz %%r7, 4(%%r4) \n\t" \478"addze %%r5, %%r9 \n\t" \479"addc %%r8, %%r8, %%r7 \n\t" \480"stwu %%r8, 4(%%r4) \n\t"481482#define MULADDC_X1_STOP \483"addze %%r5, %%r5 \n\t" \484"addi %%r4, %%r4, 4 \n\t" \485"addi %%r3, %%r3, 4 \n\t" \486"stw %%r5, %0 \n\t" \487"stw %%r4, %1 \n\t" \488"stw %%r3, %2 \n\t" \489: "=m" (c), "=m" (d), "=m" (s) \490: "m" (s), "m" (d), "m" (c), "m" (b) \491: "r3", "r4", "r5", "r6", "r7", "r8", "r9" \492);493494#endif /* __MACH__ && __APPLE__ */495496#endif /* PPC32 */497498/*499* The Sparc(64) assembly is reported to be broken.500* Disable it for now, until we're able to fix it.501*/502#if 0 && defined(__sparc__)503#if defined(__sparc64__)504505#define MULADDC_X1_INIT \506asm( \507"ldx %3, %%o0 \n\t" \508"ldx %4, %%o1 \n\t" \509"ld %5, %%o2 \n\t" \510"ld %6, %%o3 \n\t"511512#define MULADDC_X1_CORE \513"ld [%%o0], %%o4 \n\t" \514"inc 4, %%o0 \n\t" \515"ld [%%o1], %%o5 \n\t" \516"umul 
%%o3, %%o4, %%o4 \n\t" \517"addcc %%o4, %%o2, %%o4 \n\t" \518"rd %%y, %%g1 \n\t" \519"addx %%g1, 0, %%g1 \n\t" \520"addcc %%o4, %%o5, %%o4 \n\t" \521"st %%o4, [%%o1] \n\t" \522"addx %%g1, 0, %%o2 \n\t" \523"inc 4, %%o1 \n\t"524525#define MULADDC_X1_STOP \526"st %%o2, %0 \n\t" \527"stx %%o1, %1 \n\t" \528"stx %%o0, %2 \n\t" \529: "=m" (c), "=m" (d), "=m" (s) \530: "m" (s), "m" (d), "m" (c), "m" (b) \531: "g1", "o0", "o1", "o2", "o3", "o4", \532"o5" \533);534535#else /* __sparc64__ */536537#define MULADDC_X1_INIT \538asm( \539"ld %3, %%o0 \n\t" \540"ld %4, %%o1 \n\t" \541"ld %5, %%o2 \n\t" \542"ld %6, %%o3 \n\t"543544#define MULADDC_X1_CORE \545"ld [%%o0], %%o4 \n\t" \546"inc 4, %%o0 \n\t" \547"ld [%%o1], %%o5 \n\t" \548"umul %%o3, %%o4, %%o4 \n\t" \549"addcc %%o4, %%o2, %%o4 \n\t" \550"rd %%y, %%g1 \n\t" \551"addx %%g1, 0, %%g1 \n\t" \552"addcc %%o4, %%o5, %%o4 \n\t" \553"st %%o4, [%%o1] \n\t" \554"addx %%g1, 0, %%o2 \n\t" \555"inc 4, %%o1 \n\t"556557#define MULADDC_X1_STOP \558"st %%o2, %0 \n\t" \559"st %%o1, %1 \n\t" \560"st %%o0, %2 \n\t" \561: "=m" (c), "=m" (d), "=m" (s) \562: "m" (s), "m" (d), "m" (c), "m" (b) \563: "g1", "o0", "o1", "o2", "o3", "o4", \564"o5" \565);566567#endif /* __sparc64__ */568#endif /* __sparc__ */569570#if defined(__microblaze__) || defined(microblaze)571572#define MULADDC_X1_INIT \573asm( \574"lwi r3, %3 \n\t" \575"lwi r4, %4 \n\t" \576"lwi r5, %5 \n\t" \577"lwi r6, %6 \n\t" \578"andi r7, r6, 0xffff \n\t" \579"bsrli r6, r6, 16 \n\t"580581#if(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)582#define MULADDC_LHUI \583"lhui r9, r3, 0 \n\t" \584"addi r3, r3, 2 \n\t" \585"lhui r8, r3, 0 \n\t"586#else587#define MULADDC_LHUI \588"lhui r8, r3, 0 \n\t" \589"addi r3, r3, 2 \n\t" \590"lhui r9, r3, 0 \n\t"591#endif592593#define MULADDC_X1_CORE \594MULADDC_LHUI \595"addi r3, r3, 2 \n\t" \596"mul r10, r9, r6 \n\t" \597"mul r11, r8, r7 \n\t" \598"mul r12, r9, r7 \n\t" \599"mul r13, r8, r6 \n\t" \600"bsrli r8, r10, 16 \n\t" \601"bsrli r9, r11, 16 \n\t" 
\602"add r13, r13, r8 \n\t" \603"add r13, r13, r9 \n\t" \604"bslli r10, r10, 16 \n\t" \605"bslli r11, r11, 16 \n\t" \606"add r12, r12, r10 \n\t" \607"addc r13, r13, r0 \n\t" \608"add r12, r12, r11 \n\t" \609"addc r13, r13, r0 \n\t" \610"lwi r10, r4, 0 \n\t" \611"add r12, r12, r10 \n\t" \612"addc r13, r13, r0 \n\t" \613"add r12, r12, r5 \n\t" \614"addc r5, r13, r0 \n\t" \615"swi r12, r4, 0 \n\t" \616"addi r4, r4, 4 \n\t"617618#define MULADDC_X1_STOP \619"swi r5, %0 \n\t" \620"swi r4, %1 \n\t" \621"swi r3, %2 \n\t" \622: "=m" (c), "=m" (d), "=m" (s) \623: "m" (s), "m" (d), "m" (c), "m" (b) \624: "r3", "r4", "r5", "r6", "r7", "r8", \625"r9", "r10", "r11", "r12", "r13" \626);627628#endif /* MicroBlaze */629630#if defined(__tricore__)631632#define MULADDC_X1_INIT \633asm( \634"ld.a %%a2, %3 \n\t" \635"ld.a %%a3, %4 \n\t" \636"ld.w %%d4, %5 \n\t" \637"ld.w %%d1, %6 \n\t" \638"xor %%d5, %%d5 \n\t"639640#define MULADDC_X1_CORE \641"ld.w %%d0, [%%a2+] \n\t" \642"madd.u %%e2, %%e4, %%d0, %%d1 \n\t" \643"ld.w %%d0, [%%a3] \n\t" \644"addx %%d2, %%d2, %%d0 \n\t" \645"addc %%d3, %%d3, 0 \n\t" \646"mov %%d4, %%d3 \n\t" \647"st.w [%%a3+], %%d2 \n\t"648649#define MULADDC_X1_STOP \650"st.w %0, %%d4 \n\t" \651"st.a %1, %%a3 \n\t" \652"st.a %2, %%a2 \n\t" \653: "=m" (c), "=m" (d), "=m" (s) \654: "m" (s), "m" (d), "m" (c), "m" (b) \655: "d0", "d1", "e2", "d4", "a2", "a3" \656);657658#endif /* TriCore */659660#if defined(__arm__)661662#if defined(__thumb__) && !defined(__thumb2__)663#if defined(MBEDTLS_COMPILER_IS_GCC)664/*665* Thumb 1 ISA. 
This code path has only been tested successfully on gcc;666* it does not compile on clang or armclang.667*/668669#if !defined(__OPTIMIZE__) && defined(__GNUC__)670/*671* Note, gcc -O0 by default uses r7 for the frame pointer, so it complains about672* our use of r7 below, unless -fomit-frame-pointer is passed.673*674* On the other hand, -fomit-frame-pointer is implied by any -Ox options with675* x !=0, which we can detect using __OPTIMIZE__ (which is also defined by676* clang and armcc5 under the same conditions).677*678* If gcc needs to use r7, we use r1 as a scratch register and have a few extra679* instructions to preserve/restore it; otherwise, we can use r7 and avoid680* the preserve/restore overhead.681*/682#define MULADDC_SCRATCH "RS .req r1 \n\t"683#define MULADDC_PRESERVE_SCRATCH "mov r10, r1 \n\t"684#define MULADDC_RESTORE_SCRATCH "mov r1, r10 \n\t"685#define MULADDC_SCRATCH_CLOBBER "r10"686#else /* !defined(__OPTIMIZE__) && defined(__GNUC__) */687#define MULADDC_SCRATCH "RS .req r7 \n\t"688#define MULADDC_PRESERVE_SCRATCH ""689#define MULADDC_RESTORE_SCRATCH ""690#define MULADDC_SCRATCH_CLOBBER "r7"691#endif /* !defined(__OPTIMIZE__) && defined(__GNUC__) */692693#define MULADDC_X1_INIT \694asm( \695MULADDC_SCRATCH \696"ldr r0, %3 \n\t" \697"ldr r1, %4 \n\t" \698"ldr r2, %5 \n\t" \699"ldr r3, %6 \n\t" \700"lsr r4, r3, #16 \n\t" \701"mov r9, r4 \n\t" \702"lsl r4, r3, #16 \n\t" \703"lsr r4, r4, #16 \n\t" \704"mov r8, r4 \n\t" \705706707#define MULADDC_X1_CORE \708MULADDC_PRESERVE_SCRATCH \709"ldmia r0!, {r6} \n\t" \710"lsr RS, r6, #16 \n\t" \711"lsl r6, r6, #16 \n\t" \712"lsr r6, r6, #16 \n\t" \713"mov r4, r8 \n\t" \714"mul r4, r6 \n\t" \715"mov r3, r9 \n\t" \716"mul r6, r3 \n\t" \717"mov r5, r9 \n\t" \718"mul r5, RS \n\t" \719"mov r3, r8 \n\t" \720"mul RS, r3 \n\t" \721"lsr r3, r6, #16 \n\t" \722"add r5, r5, r3 \n\t" \723"lsr r3, RS, #16 \n\t" \724"add r5, r5, r3 \n\t" \725"add r4, r4, r2 \n\t" \726"mov r2, #0 \n\t" \727"adc r5, r2 \n\t" \728"lsl r3, r6, 
#16 \n\t" \729"add r4, r4, r3 \n\t" \730"adc r5, r2 \n\t" \731"lsl r3, RS, #16 \n\t" \732"add r4, r4, r3 \n\t" \733"adc r5, r2 \n\t" \734MULADDC_RESTORE_SCRATCH \735"ldr r3, [r1] \n\t" \736"add r4, r4, r3 \n\t" \737"adc r2, r5 \n\t" \738"stmia r1!, {r4} \n\t"739740#define MULADDC_X1_STOP \741"str r2, %0 \n\t" \742"str r1, %1 \n\t" \743"str r0, %2 \n\t" \744: "=m" (c), "=m" (d), "=m" (s) \745: "m" (s), "m" (d), "m" (c), "m" (b) \746: "r0", "r1", "r2", "r3", "r4", "r5", \747"r6", MULADDC_SCRATCH_CLOBBER, "r8", "r9", "cc" \748);749#endif /* !defined(__ARMCC_VERSION) && !defined(__clang__) */750751#elif (__ARM_ARCH >= 6) && \752defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)753/* Armv6-M (or later) with DSP Instruction Set Extensions.754* Requires support for either Thumb 2 or Arm ISA.755*/756757#define MULADDC_X1_INIT \758{ \759mbedtls_mpi_uint tmp_a, tmp_b; \760asm volatile (761762#define MULADDC_X1_CORE \763".p2align 2 \n\t" \764"ldr %[a], [%[in]], #4 \n\t" \765"ldr %[b], [%[acc]] \n\t" \766"umaal %[b], %[carry], %[scalar], %[a] \n\t" \767"str %[b], [%[acc]], #4 \n\t"768769#define MULADDC_X1_STOP \770: [a] "=&r" (tmp_a), \771[b] "=&r" (tmp_b), \772[in] "+r" (s), \773[acc] "+r" (d), \774[carry] "+l" (c) \775: [scalar] "r" (b) \776: "memory" \777); \778}779780#define MULADDC_X2_INIT \781{ \782mbedtls_mpi_uint tmp_a0, tmp_b0; \783mbedtls_mpi_uint tmp_a1, tmp_b1; \784asm volatile (785786/* - Make sure loop is 4-byte aligned to avoid stalls787* upon repeated non-word aligned instructions in788* some microarchitectures.789* - Don't use ldm with post-increment or back-to-back790* loads with post-increment and same address register791* to avoid stalls on some microarchitectures.792* - Bunch loads and stores to reduce latency on some793* microarchitectures. E.g., on Cortex-M4, the first794* in a series of load/store operations has latency795* 2 cycles, while subsequent loads/stores are single-cycle. 
*/796#define MULADDC_X2_CORE \797".p2align 2 \n\t" \798"ldr %[a0], [%[in]], #+8 \n\t" \799"ldr %[b0], [%[acc]], #+8 \n\t" \800"ldr %[a1], [%[in], #-4] \n\t" \801"ldr %[b1], [%[acc], #-4] \n\t" \802"umaal %[b0], %[carry], %[scalar], %[a0] \n\t" \803"umaal %[b1], %[carry], %[scalar], %[a1] \n\t" \804"str %[b0], [%[acc], #-8] \n\t" \805"str %[b1], [%[acc], #-4] \n\t"806807#define MULADDC_X2_STOP \808: [a0] "=&r" (tmp_a0), \809[b0] "=&r" (tmp_b0), \810[a1] "=&r" (tmp_a1), \811[b1] "=&r" (tmp_b1), \812[in] "+r" (s), \813[acc] "+r" (d), \814[carry] "+l" (c) \815: [scalar] "r" (b) \816: "memory" \817); \818}819820#else /* Thumb 2 or Arm ISA, without DSP extensions */821822#define MULADDC_X1_INIT \823asm( \824"ldr r0, %3 \n\t" \825"ldr r1, %4 \n\t" \826"ldr r2, %5 \n\t" \827"ldr r3, %6 \n\t"828829#define MULADDC_X1_CORE \830"ldr r4, [r0], #4 \n\t" \831"mov r5, #0 \n\t" \832"ldr r6, [r1] \n\t" \833"umlal r2, r5, r3, r4 \n\t" \834"adds r4, r6, r2 \n\t" \835"adc r2, r5, #0 \n\t" \836"str r4, [r1], #4 \n\t"837838#define MULADDC_X1_STOP \839"str r2, %0 \n\t" \840"str r1, %1 \n\t" \841"str r0, %2 \n\t" \842: "=m" (c), "=m" (d), "=m" (s) \843: "m" (s), "m" (d), "m" (c), "m" (b) \844: "r0", "r1", "r2", "r3", "r4", "r5", \845"r6", "cc" \846);847848#endif /* ISA codepath selection */849850#endif /* defined(__arm__) */851852#if defined(__alpha__)853854#define MULADDC_X1_INIT \855asm( \856"ldq $1, %3 \n\t" \857"ldq $2, %4 \n\t" \858"ldq $3, %5 \n\t" \859"ldq $4, %6 \n\t"860861#define MULADDC_X1_CORE \862"ldq $6, 0($1) \n\t" \863"addq $1, 8, $1 \n\t" \864"mulq $6, $4, $7 \n\t" \865"umulh $6, $4, $6 \n\t" \866"addq $7, $3, $7 \n\t" \867"cmpult $7, $3, $3 \n\t" \868"ldq $5, 0($2) \n\t" \869"addq $7, $5, $7 \n\t" \870"cmpult $7, $5, $5 \n\t" \871"stq $7, 0($2) \n\t" \872"addq $2, 8, $2 \n\t" \873"addq $6, $3, $3 \n\t" \874"addq $5, $3, $3 \n\t"875876#define MULADDC_X1_STOP \877"stq $3, %0 \n\t" \878"stq $2, %1 \n\t" \879"stq $1, %2 \n\t" \880: "=m" (c), "=m" (d), "=m" (s) \881: "m" (s), 
"m" (d), "m" (c), "m" (b) \882: "$1", "$2", "$3", "$4", "$5", "$6", "$7" \883);884#endif /* Alpha */885886#if defined(__mips__) && !defined(__mips64)887888#define MULADDC_X1_INIT \889asm( \890"lw $10, %3 \n\t" \891"lw $11, %4 \n\t" \892"lw $12, %5 \n\t" \893"lw $13, %6 \n\t"894895#define MULADDC_X1_CORE \896"lw $14, 0($10) \n\t" \897"multu $13, $14 \n\t" \898"addi $10, $10, 4 \n\t" \899"mflo $14 \n\t" \900"mfhi $9 \n\t" \901"addu $14, $12, $14 \n\t" \902"lw $15, 0($11) \n\t" \903"sltu $12, $14, $12 \n\t" \904"addu $15, $14, $15 \n\t" \905"sltu $14, $15, $14 \n\t" \906"addu $12, $12, $9 \n\t" \907"sw $15, 0($11) \n\t" \908"addu $12, $12, $14 \n\t" \909"addi $11, $11, 4 \n\t"910911#define MULADDC_X1_STOP \912"sw $12, %0 \n\t" \913"sw $11, %1 \n\t" \914"sw $10, %2 \n\t" \915: "=m" (c), "=m" (d), "=m" (s) \916: "m" (s), "m" (d), "m" (c), "m" (b) \917: "$9", "$10", "$11", "$12", "$13", "$14", "$15", "lo", "hi" \918);919920#endif /* MIPS */921#endif /* GNUC */922923#if (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)924925#define MULADDC_X1_INIT \926__asm mov esi, s \927__asm mov edi, d \928__asm mov ecx, c \929__asm mov ebx, b930931#define MULADDC_X1_CORE \932__asm lodsd \933__asm mul ebx \934__asm add eax, ecx \935__asm adc edx, 0 \936__asm add eax, [edi] \937__asm adc edx, 0 \938__asm mov ecx, edx \939__asm stosd940941#define MULADDC_X1_STOP \942__asm mov c, ecx \943__asm mov d, edi \944__asm mov s, esi945946#if defined(MBEDTLS_HAVE_SSE2)947948#define EMIT __asm _emit949950#define MULADDC_X8_INIT MULADDC_X1_INIT951952#define MULADDC_X8_CORE \953EMIT 0x0F EMIT 0x6E EMIT 0xC9 \954EMIT 0x0F EMIT 0x6E EMIT 0xC3 \955EMIT 0x0F EMIT 0x6E EMIT 0x1F \956EMIT 0x0F EMIT 0xD4 EMIT 0xCB \957EMIT 0x0F EMIT 0x6E EMIT 0x16 \958EMIT 0x0F EMIT 0xF4 EMIT 0xD0 \959EMIT 0x0F EMIT 0x6E EMIT 0x66 EMIT 0x04 \960EMIT 0x0F EMIT 0xF4 EMIT 0xE0 \961EMIT 0x0F EMIT 0x6E EMIT 0x76 EMIT 0x08 \962EMIT 0x0F EMIT 0xF4 EMIT 0xF0 \963EMIT 0x0F EMIT 0x6E EMIT 0x7E EMIT 0x0C \964EMIT 0x0F 
EMIT 0xF4 EMIT 0xF8 \965EMIT 0x0F EMIT 0xD4 EMIT 0xCA \966EMIT 0x0F EMIT 0x6E EMIT 0x5F EMIT 0x04 \967EMIT 0x0F EMIT 0xD4 EMIT 0xDC \968EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x08 \969EMIT 0x0F EMIT 0xD4 EMIT 0xEE \970EMIT 0x0F EMIT 0x6E EMIT 0x67 EMIT 0x0C \971EMIT 0x0F EMIT 0xD4 EMIT 0xFC \972EMIT 0x0F EMIT 0x7E EMIT 0x0F \973EMIT 0x0F EMIT 0x6E EMIT 0x56 EMIT 0x10 \974EMIT 0x0F EMIT 0xF4 EMIT 0xD0 \975EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \976EMIT 0x0F EMIT 0x6E EMIT 0x66 EMIT 0x14 \977EMIT 0x0F EMIT 0xF4 EMIT 0xE0 \978EMIT 0x0F EMIT 0xD4 EMIT 0xCB \979EMIT 0x0F EMIT 0x6E EMIT 0x76 EMIT 0x18 \980EMIT 0x0F EMIT 0xF4 EMIT 0xF0 \981EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x04 \982EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \983EMIT 0x0F EMIT 0x6E EMIT 0x5E EMIT 0x1C \984EMIT 0x0F EMIT 0xF4 EMIT 0xD8 \985EMIT 0x0F EMIT 0xD4 EMIT 0xCD \986EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x10 \987EMIT 0x0F EMIT 0xD4 EMIT 0xD5 \988EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x08 \989EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \990EMIT 0x0F EMIT 0xD4 EMIT 0xCF \991EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x14 \992EMIT 0x0F EMIT 0xD4 EMIT 0xE5 \993EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x0C \994EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \995EMIT 0x0F EMIT 0xD4 EMIT 0xCA \996EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x18 \997EMIT 0x0F EMIT 0xD4 EMIT 0xF5 \998EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x10 \999EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \1000EMIT 0x0F EMIT 0xD4 EMIT 0xCC \1001EMIT 0x0F EMIT 0x6E EMIT 0x6F EMIT 0x1C \1002EMIT 0x0F EMIT 0xD4 EMIT 0xDD \1003EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x14 \1004EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \1005EMIT 0x0F EMIT 0xD4 EMIT 0xCE \1006EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x18 \1007EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \1008EMIT 0x0F EMIT 0xD4 EMIT 0xCB \1009EMIT 0x0F EMIT 0x7E EMIT 0x4F EMIT 0x1C \1010EMIT 0x83 EMIT 0xC7 EMIT 0x20 \1011EMIT 0x83 EMIT 0xC6 EMIT 0x20 \1012EMIT 0x0F EMIT 0x73 EMIT 0xD1 EMIT 0x20 \1013EMIT 0x0F EMIT 0x7E EMIT 0xC910141015#define MULADDC_X8_STOP 
\1016EMIT 0x0F EMIT 0x77 \1017__asm mov c, ecx \1018__asm mov d, edi \1019__asm mov s, esi10201021#endif /* SSE2 */1022#endif /* MSVC */10231024#endif /* MBEDTLS_HAVE_ASM */10251026#if !defined(MULADDC_X1_CORE)1027#if defined(MBEDTLS_HAVE_UDBL)10281029#define MULADDC_X1_INIT \1030{ \1031mbedtls_t_udbl r; \1032mbedtls_mpi_uint r0, r1;10331034#define MULADDC_X1_CORE \1035r = *(s++) * (mbedtls_t_udbl) b; \1036r0 = (mbedtls_mpi_uint) r; \1037r1 = (mbedtls_mpi_uint)( r >> biL ); \1038r0 += c; r1 += (r0 < c); \1039r0 += *d; r1 += (r0 < *d); \1040c = r1; *(d++) = r0;10411042#define MULADDC_X1_STOP \1043}10441045#else /* MBEDTLS_HAVE_UDBL */10461047#define MULADDC_X1_INIT \1048{ \1049mbedtls_mpi_uint s0, s1, b0, b1; \1050mbedtls_mpi_uint r0, r1, rx, ry; \1051b0 = ( b << biH ) >> biH; \1052b1 = ( b >> biH );10531054#define MULADDC_X1_CORE \1055s0 = ( *s << biH ) >> biH; \1056s1 = ( *s >> biH ); s++; \1057rx = s0 * b1; r0 = s0 * b0; \1058ry = s1 * b0; r1 = s1 * b1; \1059r1 += ( rx >> biH ); \1060r1 += ( ry >> biH ); \1061rx <<= biH; ry <<= biH; \1062r0 += rx; r1 += (r0 < rx); \1063r0 += ry; r1 += (r0 < ry); \1064r0 += c; r1 += (r0 < c); \1065r0 += *d; r1 += (r0 < *d); \1066c = r1; *(d++) = r0;10671068#define MULADDC_X1_STOP \1069}10701071#endif /* C (longlong) */1072#endif /* C (generic) */10731074#if !defined(MULADDC_X2_CORE)1075#define MULADDC_X2_INIT MULADDC_X1_INIT1076#define MULADDC_X2_STOP MULADDC_X1_STOP1077#define MULADDC_X2_CORE MULADDC_X1_CORE MULADDC_X1_CORE1078#endif /* MULADDC_X2_CORE */10791080#if !defined(MULADDC_X4_CORE)1081#define MULADDC_X4_INIT MULADDC_X2_INIT1082#define MULADDC_X4_STOP MULADDC_X2_STOP1083#define MULADDC_X4_CORE MULADDC_X2_CORE MULADDC_X2_CORE1084#endif /* MULADDC_X4_CORE */10851086#if !defined(MULADDC_X8_CORE)1087#define MULADDC_X8_INIT MULADDC_X4_INIT1088#define MULADDC_X8_STOP MULADDC_X4_STOP1089#define MULADDC_X8_CORE MULADDC_X4_CORE MULADDC_X4_CORE1090#endif /* MULADDC_X8_CORE */10911092/* *INDENT-ON* */1093#endif /* bn_mul.h 
*/109410951096