Path: blob/linux/scryptjane/scrypt-jane-portable-x86.h
1201 views
#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))1#define X86ASM2/* gcc 2.95 royally screws up stack alignments on variables */3#if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))4#define X86ASM_SSE5#define X86ASM_SSE26#endif7#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))8#define X86ASM_SSSE39#endif10#if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))11#define X86ASM_AVX12#endif13#endif1415#if defined(CPU_X86_64) && defined(COMPILER_GCC)16#define X86_64ASM17#define X86_64ASM_SSE218#if (COMPILER_GCC >= 40102)19#define X86_64ASM_SSSE320#endif21#if (COMPILER_GCC >= 40400)22#define X86_64ASM_AVX23#endif24#endif2526#if defined(COMPILER_MSVC)27#define X86_INTRINSIC28#if defined(CPU_X86_64) || defined(X86ASM_SSE)29#define X86_INTRINSIC_SSE30#endif31#if defined(CPU_X86_64) || defined(X86ASM_SSE2)32#define X86_INTRINSIC_SSE233#endif34#if (COMPILER_MSVC >= 1400)35#define X86_INTRINSIC_SSSE336#endif37#endif3839#if defined(COMPILER_MSVC) && defined(CPU_X86_64)40#define X86_64USE_INTRINSIC41#endif4243#if defined(COMPILER_MSVC) && defined(CPU_X86_64)44#define X86_64USE_INTRINSIC45#endif4647#ifdef __AVX__48#define X86_INTRINSIC_AVX49#endif5051#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)52#define X86_INTRINSIC53#if defined(__SSE__)54#define X86_INTRINSIC_SSE55#endif56#if defined(__SSE2__)57#define X86_INTRINSIC_SSE258#endif59#if defined(__SSSE3__)60#define X86_INTRINSIC_SSSE361#endif62#if defined(__AVX__)63#define X86_INTRINSIC_AVX64#endif6566/* HACK - I want to use CPU_X86_FORCE_INTRINSICS with mingw64 so these need to be undefined - mikaelh */67#undef X86_64ASM_SSSE368#undef X86_64ASM_AVX69#undef X86_64ASM_SSE270#undef X86ASM_AVX71#undef X86ASM_SSSE372#undef X86ASM_SSE273#undef X86ASM_SSE74#endif7576/* only use simd on windows (or SSE2 on gcc)! */77#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)78#if defined(X86_INTRINSIC_SSE)79#define X86_INTRINSIC80#include <mmintrin.h>81#include <xmmintrin.h>82typedef __m64 qmm;83typedef __m128 xmm;84typedef __m128d xmmd;85#endif86#if defined(X86_INTRINSIC_SSE2)87#define X86_INTRINSIC_SSE288#include <emmintrin.h>89typedef __m128i xmmi;90#endif91#if defined(X86_INTRINSIC_SSSE3)92#define X86_INTRINSIC_SSSE393#include <tmmintrin.h>94#endif95#if defined (X86_INTRINSIC_AVX)96#define X86_INTRINSIC_AVX97#include <immintrin.h>98#endif99#endif100101102#if defined(X86_INTRINSIC_SSE2)103typedef union packedelem8_t {104uint8_t u[16];105xmmi v;106} packedelem8;107108typedef union packedelem32_t {109uint32_t u[4];110xmmi v;111} packedelem32;112113typedef union packedelem64_t {114uint64_t u[2];115xmmi v;116} packedelem64;117#else118typedef union packedelem8_t {119uint8_t u[16];120uint32_t dw[4];121} packedelem8;122123typedef union packedelem32_t {124uint32_t u[4];125uint8_t b[16];126} packedelem32;127128typedef union packedelem64_t {129uint64_t u[2];130uint8_t b[16];131} packedelem64;132#endif133134#if defined(X86_INTRINSIC_SSSE3)135static const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};136static const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};137#endif138139/*140x86 inline asm for gcc/msvc. usage:141142asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)143asm_naked_fn(name)144a1(..)145a2(.., ..)146a3(.., .., ..)14764bit OR 0 paramters: a1(ret)14832bit AND n parameters: aret(4n), eg aret(16) for 4 parameters149asm_naked_fn_end(name)150*/151152#if defined(X86ASM) || defined(X86_64ASM)153154#if defined(COMPILER_MSVC)155#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */156#define a1(x) __asm {x}157#define a2(x, y) __asm {x, y}158#define a3(x, y, z) __asm {x, y, z}159#define a4(x, y, z, w) __asm {x, y, z, w}160#define al(x) __asm {label##x:}161#define aj(x, y, z) __asm {x label##y}162#define asm_align8 a1(ALIGN 8)163#define asm_align16 a1(ALIGN 16)164165#define asm_calling_convention STDCALL166#define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn167#define asm_naked_fn(fn) {168#define asm_naked_fn_end(fn) }169#elif defined(COMPILER_GCC)170#define GNU_AS1(x) #x ";\n"171#define GNU_AS2(x, y) #x ", " #y ";\n"172#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"173#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"174#define GNU_ASL(x) "\n" #x ":\n"175#define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"176#define GNU_ASJ(x, y, z) #x " " #y #z ";"177178#define a1(x) GNU_AS1(x)179#define a2(x, y) GNU_AS2(x, y)180#define a3(x, y, z) GNU_AS3(x, y, z)181#define a4(x, y, z, w) GNU_AS4(x, y, z, w)182#define al(x) GNU_ASL(x)183#define aj(x, y, z) GNU_ASJ(x, y, z)184#define asm_align8 a1(.align 8)185#define asm_align16 a1(.align 16)186187#if defined(OS_WINDOWS)188#define asm_calling_convention CDECL189#define aret(n) a1(ret)190#define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );191#else192#define asm_calling_convention STDCALL193#define aret(n) a1(ret n)194#define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" );195#endif196#define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn197#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)198199#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"200#define asm_gcc_parms() ".att_syntax prefix;"201#define asm_gcc_trashed() __asm__ __volatile__("" :::202#define asm_gcc_end() );203#else204need x86 asm205#endif206207#endif /* X86ASM || X86_64ASM */208209210#if defined(CPU_X86) || defined(CPU_X86_64)211212typedef enum cpu_flags_x86_t {213cpu_mmx = 1 << 0,214cpu_sse = 1 << 1,215cpu_sse2 = 1 << 2,216cpu_sse3 = 1 << 3,217cpu_ssse3 = 1 << 4,218cpu_sse4_1 = 1 << 5,219cpu_sse4_2 = 1 << 6,220cpu_avx = 1 << 7221} cpu_flags_x86;222223typedef enum cpu_vendors_x86_t {224cpu_nobody,225cpu_intel,226cpu_amd227} cpu_vendors_x86;228229typedef struct x86_regs_t {230uint32_t eax, ebx, ecx, edx;231} x86_regs;232233#if defined(X86ASM)234asm_naked_fn_proto(int, has_cpuid)(void)235asm_naked_fn(has_cpuid)236a1(pushfd)237a1(pop eax)238a2(mov ecx, eax)239a2(xor eax, 0x200000)240a1(push eax)241a1(popfd)242a1(pushfd)243a1(pop eax)244a2(xor eax, ecx)245a2(shr eax, 21)246a2(and eax, 1)247a1(push ecx)248a1(popfd)249a1(ret)250asm_naked_fn_end(has_cpuid)251#endif /* X86ASM */252253254static void NOINLINE255get_cpuid(x86_regs *regs, uint32_t flags) {256#if defined(COMPILER_MSVC)257__cpuid((int *)regs, (int)flags);258#else259#if defined(CPU_X86_64)260#define cpuid_bx rbx261#else262#define cpuid_bx ebx263#endif264265asm_gcc()266a1(push cpuid_bx)267a1(cpuid)268a2(mov [%1 + 0], eax)269a2(mov [%1 + 4], ebx)270a2(mov [%1 + 8], ecx)271a2(mov [%1 + 12], edx)272a1(pop cpuid_bx)273asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc"274asm_gcc_end()275#endif276}277278#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)279static uint64_t NOINLINE280get_xgetbv(uint32_t flags) {281#if defined(COMPILER_MSVC)282return _xgetbv(flags);283#else284uint32_t lo, hi;285asm_gcc()286a1(xgetbv)287asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi)288asm_gcc_end()289return ((uint64_t)lo | ((uint64_t)hi << 32));290#endif291}292#endif // AVX support293294#if defined(SCRYPT_TEST_SPEED)295size_t cpu_detect_mask = (size_t)-1;296#endif297298static size_t299detect_cpu(void) {300union { uint8_t s[12]; uint32_t i[3]; } vendor_string;301cpu_vendors_x86 vendor = cpu_nobody;302x86_regs regs;303uint32_t max_level;304size_t cpu_flags = 0;305#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)306uint64_t xgetbv_flags;307#endif308309#if defined(CPU_X86)310if (!has_cpuid())311return cpu_flags;312#endif313314get_cpuid(®s, 0);315max_level = regs.eax;316vendor_string.i[0] = regs.ebx;317vendor_string.i[1] = regs.edx;318vendor_string.i[2] = regs.ecx;319320if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))321vendor = cpu_intel;322else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))323vendor = cpu_amd;324325if (max_level & 0x00000500) {326/* "Intel P5 pre-B0" */327cpu_flags |= cpu_mmx;328return cpu_flags;329}330331if (max_level < 1)332return cpu_flags;333334get_cpuid(®s, 1);335#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)336/* xsave/xrestore */337if (regs.ecx & (1 << 27)) {338xgetbv_flags = get_xgetbv(0);339if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx;340}341#endif342if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;343if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_2;344if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3;345if (regs.ecx & (1 )) cpu_flags |= cpu_sse3;346if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;347if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;348if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;349350#if defined(SCRYPT_TEST_SPEED)351cpu_flags &= cpu_detect_mask;352#endif353354return cpu_flags;355}356357#if defined(SCRYPT_TEST_SPEED)358static const char *359get_top_cpuflag_desc(size_t flag) {360if (flag & cpu_avx) return "AVX";361else if (flag & cpu_sse4_2) return "SSE4.2";362else if (flag & cpu_sse4_1) return "SSE4.1";363else if (flag & cpu_ssse3) return "SSSE3";364else if (flag & cpu_sse2) return "SSE2";365else if (flag & cpu_sse) return "SSE";366else if (flag & cpu_mmx) return "MMX";367else return "Basic";368}369#endif370371/* enable the highest system-wide option */372#if defined(SCRYPT_CHOOSE_COMPILETIME)373#if !defined(__AVX__)374#undef X86_64ASM_AVX375#undef X86ASM_AVX376#undef X86_INTRINSIC_AVX377#endif378#if !defined(__SSSE3__)379#undef X86_64ASM_SSSE3380#undef X86ASM_SSSE3381#undef X86_INTRINSIC_SSSE3382#endif383#if !defined(__SSE2__)384#undef X86_64ASM_SSE2385#undef X86ASM_SSE2386#undef X86_INTRINSIC_SSE2387#endif388#endif389390#endif /* defined(CPU_X86) || defined(CPU_X86_64) */391392393