/* CpuArch.c -- CPU specific code1Igor Pavlov : Public domain */23#include "Precomp.h"45// #include <stdio.h>67#include "CpuArch.h"89#ifdef MY_CPU_X86_OR_AMD641011#undef NEED_CHECK_FOR_CPUID12#if !defined(MY_CPU_AMD64)13#define NEED_CHECK_FOR_CPUID14#endif1516/*17cpuid instruction supports (subFunction) parameter in ECX,18that is used only with some specific (function) parameter values.19most functions use only (subFunction==0).20*/21/*22__cpuid(): MSVC and GCC/CLANG use same function/macro name23but parameters are different.24We use MSVC __cpuid() parameters style for our z7_x86_cpuid() function.25*/2627#if defined(__GNUC__) /* && (__GNUC__ >= 10) */ \28|| defined(__clang__) /* && (__clang_major__ >= 10) */2930/* there was some CLANG/GCC compilers that have issues with31rbx(ebx) handling in asm blocks in -fPIC mode (__PIC__ is defined).32compiler's <cpuid.h> contains the macro __cpuid() that is similar to our code.33The history of __cpuid() changes in CLANG/GCC:34GCC:352007: it preserved ebx for (__PIC__ && __i386__)362013: it preserved rbx and ebx for __PIC__372014: it doesn't preserves rbx and ebx anymore38we suppose that (__GNUC__ >= 5) fixed that __PIC__ ebx/rbx problem.39CLANG:402014+: it preserves rbx, but only for 64-bit code. No __PIC__ check.41Why CLANG cares about 64-bit mode only, and doesn't care about ebx (in 32-bit)?42Do we need __PIC__ test for CLANG or we must care about rbx even if43__PIC__ is not defined?44*/4546#define ASM_LN "\n"4748#if defined(MY_CPU_AMD64) && defined(__PIC__) \49&& ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))5051/* "=&r" selects free register. It can select even rbx, if that register is free.52"=&D" for (RDI) also works, but the code can be larger with "=&D"53"2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). */5455#define x86_cpuid_MACRO_2(p, func, subFunc) { \56__asm__ __volatile__ ( \57ASM_LN "mov %%rbx, %q1" \58ASM_LN "cpuid" \59ASM_LN "xchg %%rbx, %q1" \60: "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }6162#elif defined(MY_CPU_X86) && defined(__PIC__) \63&& ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))6465#define x86_cpuid_MACRO_2(p, func, subFunc) { \66__asm__ __volatile__ ( \67ASM_LN "mov %%ebx, %k1" \68ASM_LN "cpuid" \69ASM_LN "xchg %%ebx, %k1" \70: "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }7172#else7374#define x86_cpuid_MACRO_2(p, func, subFunc) { \75__asm__ __volatile__ ( \76ASM_LN "cpuid" \77: "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }7879#endif8081#define x86_cpuid_MACRO(p, func) x86_cpuid_MACRO_2(p, func, 0)8283void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)84{85x86_cpuid_MACRO(p, func)86}8788static89void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)90{91x86_cpuid_MACRO_2(p, func, subFunc)92}939495Z7_NO_INLINE96UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)97{98#if defined(NEED_CHECK_FOR_CPUID)99#define EFALGS_CPUID_BIT 21100UInt32 a;101__asm__ __volatile__ (102ASM_LN "pushf"103ASM_LN "pushf"104ASM_LN "pop %0"105// ASM_LN "movl %0, %1"106// ASM_LN "xorl $0x200000, %0"107ASM_LN "btc %1, %0"108ASM_LN "push %0"109ASM_LN "popf"110ASM_LN "pushf"111ASM_LN "pop %0"112ASM_LN "xorl (%%esp), %0"113114ASM_LN "popf"115ASM_LN116: "=&r" (a) // "=a"117: "i" (EFALGS_CPUID_BIT)118);119if ((a & (1 << EFALGS_CPUID_BIT)) == 0)120return 0;121#endif122{123UInt32 p[4];124x86_cpuid_MACRO(p, 0)125return p[0];126}127}128129#undef ASM_LN130131#elif !defined(_MSC_VER)132133/*134// for gcc/clang and other: we can try to use __cpuid macro:135#include <cpuid.h>136void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)137{138__cpuid(func, p[0], p[1], p[2], p[3]);139}140UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)141{142return (UInt32)__get_cpuid_max(0, NULL);143}144*/145// for unsupported cpuid:146void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)147{148UNUSED_VAR(func)149p[0] = p[1] = p[2] = p[3] = 0;150}151UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)152{153return 0;154}155156#else // _MSC_VER157158#if !defined(MY_CPU_AMD64)159160UInt32 __declspec(naked) Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)161{162#if defined(NEED_CHECK_FOR_CPUID)163#define EFALGS_CPUID_BIT 21164__asm pushfd165__asm pushfd166/*167__asm pop eax168// __asm mov edx, eax169__asm btc eax, EFALGS_CPUID_BIT170__asm push eax171*/172__asm btc dword ptr [esp], EFALGS_CPUID_BIT173__asm popfd174__asm pushfd175__asm pop eax176// __asm xor eax, edx177__asm xor eax, [esp]178// __asm push edx179__asm popfd180__asm and eax, (1 shl EFALGS_CPUID_BIT)181__asm jz end_func182#endif183__asm push ebx184__asm xor eax, eax // func185__asm xor ecx, ecx // subFunction (optional) for (func == 0)186__asm cpuid187__asm pop ebx188#if defined(NEED_CHECK_FOR_CPUID)189end_func:190#endif191__asm ret 0192}193194void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)195{196UNUSED_VAR(p)197UNUSED_VAR(func)198__asm push ebx199__asm push edi200__asm mov edi, ecx // p201__asm mov eax, edx // func202__asm xor ecx, ecx // subfunction (optional) for (func == 0)203__asm cpuid204__asm mov [edi ], eax205__asm mov [edi + 4], ebx206__asm mov [edi + 8], ecx207__asm mov [edi + 12], edx208__asm pop edi209__asm pop ebx210__asm ret 0211}212213static214void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)215{216UNUSED_VAR(p)217UNUSED_VAR(func)218UNUSED_VAR(subFunc)219__asm push ebx220__asm push edi221__asm mov edi, ecx // p222__asm mov eax, edx // func223__asm mov ecx, [esp + 12] // subFunc224__asm cpuid225__asm mov [edi ], eax226__asm mov [edi + 4], ebx227__asm mov [edi + 8], ecx228__asm mov [edi + 12], edx229__asm pop edi230__asm pop ebx231__asm ret 4232}233234#else // MY_CPU_AMD64235236#if _MSC_VER >= 1600237#include <intrin.h>238#define MY_cpuidex __cpuidex239240static241void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)242{243__cpuidex((int *)p, func, subFunc);244}245246#else247/*248__cpuid (func == (0 or 7)) requires subfunction number in ECX.249MSDN: The __cpuid intrinsic clears the ECX register before calling the cpuid instruction.250__cpuid() in new MSVC clears ECX.251__cpuid() in old MSVC (14.00) x64 doesn't clear ECX252We still can use __cpuid for low (func) values that don't require ECX,253but __cpuid() in old MSVC will be incorrect for some func values: (func == 7).254So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction,255where ECX value is first parameter for FASTCALL / NO_INLINE func.256So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and257old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value.258259DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!!260*/261static262Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int32 *CPUInfo)263{264UNUSED_VAR(subFunction)265__cpuid(CPUInfo, func);266}267#define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info)268#pragma message("======== MY_cpuidex_HACK WAS USED ========")269static270void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)271{272MY_cpuidex_HACK(subFunc, func, (Int32 *)p);273}274#endif // _MSC_VER >= 1600275276#if !defined(MY_CPU_AMD64)277/* inlining for __cpuid() in MSVC x86 (32-bit) produces big ineffective code,278so we disable inlining here */279Z7_NO_INLINE280#endif281void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)282{283MY_cpuidex((Int32 *)p, (Int32)func, 0);284}285286Z7_NO_INLINE287UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)288{289Int32 a[4];290MY_cpuidex(a, 0, 0);291return a[0];292}293294#endif // MY_CPU_AMD64295#endif // _MSC_VER296297#if defined(NEED_CHECK_FOR_CPUID)298#define CHECK_CPUID_IS_SUPPORTED { if (z7_x86_cpuid_GetMaxFunc() == 0) return 0; }299#else300#define CHECK_CPUID_IS_SUPPORTED301#endif302#undef NEED_CHECK_FOR_CPUID303304305static306BoolInt x86cpuid_Func_1(UInt32 *p)307{308CHECK_CPUID_IS_SUPPORTED309z7_x86_cpuid(p, 1);310return True;311}312313/*314static const UInt32 kVendors[][1] =315{316{ 0x756E6547 }, // , 0x49656E69, 0x6C65746E },317{ 0x68747541 }, // , 0x69746E65, 0x444D4163 },318{ 0x746E6543 } // , 0x48727561, 0x736C7561 }319};320*/321322/*323typedef struct324{325UInt32 maxFunc;326UInt32 vendor[3];327UInt32 ver;328UInt32 b;329UInt32 c;330UInt32 d;331} Cx86cpuid;332333enum334{335CPU_FIRM_INTEL,336CPU_FIRM_AMD,337CPU_FIRM_VIA338};339int x86cpuid_GetFirm(const Cx86cpuid *p);340#define x86cpuid_ver_GetFamily(ver) (((ver >> 16) & 0xff0) | ((ver >> 8) & 0xf))341#define x86cpuid_ver_GetModel(ver) (((ver >> 12) & 0xf0) | ((ver >> 4) & 0xf))342#define x86cpuid_ver_GetStepping(ver) (ver & 0xf)343344int x86cpuid_GetFirm(const Cx86cpuid *p)345{346unsigned i;347for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[0]); i++)348{349const UInt32 *v = kVendors[i];350if (v[0] == p->vendor[0]351// && v[1] == p->vendor[1]352// && v[2] == p->vendor[2]353)354return (int)i;355}356return -1;357}358359BoolInt CPU_Is_InOrder()360{361Cx86cpuid p;362UInt32 family, model;363if (!x86cpuid_CheckAndRead(&p))364return True;365366family = x86cpuid_ver_GetFamily(p.ver);367model = x86cpuid_ver_GetModel(p.ver);368369switch (x86cpuid_GetFirm(&p))370{371case CPU_FIRM_INTEL: return (family < 6 || (family == 6 && (372// In-Order Atom CPU373model == 0x1C // 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330374|| model == 0x26 // 45 nm, Z6xx375|| model == 0x27 // 32 nm, Z2460376|| model == 0x35 // 32 nm, Z2760377|| model == 0x36 // 32 nm, N2xxx, D2xxx378)));379case CPU_FIRM_AMD: return (family < 5 || (family == 5 && (model < 6 || model == 0xA)));380case CPU_FIRM_VIA: return (family < 6 || (family == 6 && model < 0xF));381}382return False; // v23 : unknown processors are not In-Order383}384*/385386#ifdef _WIN32387#include "7zWindows.h"388#endif389390#if !defined(MY_CPU_AMD64) && defined(_WIN32)391392/* for legacy SSE ia32: there is no user-space cpu instruction to check393that OS supports SSE register storing/restoring on context switches.394So we need some OS-specific function to check that it's safe to use SSE registers.395*/396397Z7_FORCE_INLINE398static BoolInt CPU_Sys_Is_SSE_Supported(void)399{400#ifdef _MSC_VER401#pragma warning(push)402#pragma warning(disable : 4996) // `GetVersion': was declared deprecated403#endif404/* low byte is major version of Windows405We suppose that any Windows version since406Windows2000 (major == 5) supports SSE registers */407return (Byte)GetVersion() >= 5;408#if defined(_MSC_VER)409#pragma warning(pop)410#endif411}412#define CHECK_SYS_SSE_SUPPORT if (!CPU_Sys_Is_SSE_Supported()) return False;413#else414#define CHECK_SYS_SSE_SUPPORT415#endif416417418#if !defined(MY_CPU_AMD64)419420BoolInt CPU_IsSupported_CMOV(void)421{422UInt32 a[4];423if (!x86cpuid_Func_1(&a[0]))424return 0;425return (BoolInt)(a[3] >> 15) & 1;426}427428BoolInt CPU_IsSupported_SSE(void)429{430UInt32 a[4];431CHECK_SYS_SSE_SUPPORT432if (!x86cpuid_Func_1(&a[0]))433return 0;434return (BoolInt)(a[3] >> 25) & 1;435}436437BoolInt CPU_IsSupported_SSE2(void)438{439UInt32 a[4];440CHECK_SYS_SSE_SUPPORT441if (!x86cpuid_Func_1(&a[0]))442return 0;443return (BoolInt)(a[3] >> 26) & 1;444}445446#endif447448449static UInt32 x86cpuid_Func_1_ECX(void)450{451UInt32 a[4];452CHECK_SYS_SSE_SUPPORT453if (!x86cpuid_Func_1(&a[0]))454return 0;455return a[2];456}457458BoolInt CPU_IsSupported_AES(void)459{460return (BoolInt)(x86cpuid_Func_1_ECX() >> 25) & 1;461}462463BoolInt CPU_IsSupported_SSSE3(void)464{465return (BoolInt)(x86cpuid_Func_1_ECX() >> 9) & 1;466}467468BoolInt CPU_IsSupported_SSE41(void)469{470return (BoolInt)(x86cpuid_Func_1_ECX() >> 19) & 1;471}472473BoolInt CPU_IsSupported_SHA(void)474{475CHECK_SYS_SSE_SUPPORT476477if (z7_x86_cpuid_GetMaxFunc() < 7)478return False;479{480UInt32 d[4];481z7_x86_cpuid(d, 7);482return (BoolInt)(d[1] >> 29) & 1;483}484}485486487BoolInt CPU_IsSupported_SHA512(void)488{489if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here490491if (z7_x86_cpuid_GetMaxFunc() < 7)492return False;493{494UInt32 d[4];495z7_x86_cpuid_subFunc(d, 7, 0);496if (d[0] < 1) // d[0] - is max supported subleaf value497return False;498z7_x86_cpuid_subFunc(d, 7, 1);499return (BoolInt)(d[0]) & 1;500}501}502503/*504MSVC: _xgetbv() intrinsic is available since VS2010SP1.505MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in506<immintrin.h> that we can use or check.507For any 32-bit x86 we can use asm code in MSVC,508but MSVC asm code is huge after compilation.509So _xgetbv() is better510511ICC: _xgetbv() intrinsic is available (in what version of ICC?)512ICC defines (__GNUC___) and it supports gnu assembler513also ICC supports MASM style code with -use-msasm switch.514but ICC doesn't support __attribute__((__target__))515516GCC/CLANG 9:517_xgetbv() is macro that works via __builtin_ia32_xgetbv()518and we need __attribute__((__target__("xsave")).519But with __target__("xsave") the function will be not520inlined to function that has no __target__("xsave") attribute.521If we want _xgetbv() call inlining, then we should use asm version522instead of calling _xgetbv().523Note:intrinsic is broke before GCC 8.2:524https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85684525*/526527#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1100) \528|| defined(_MSC_VER) && (_MSC_VER >= 1600) && (_MSC_FULL_VER >= 160040219) \529|| defined(__GNUC__) && (__GNUC__ >= 9) \530|| defined(__clang__) && (__clang_major__ >= 9)531// we define ATTRIB_XGETBV, if we want to use predefined _xgetbv() from compiler532#if defined(__INTEL_COMPILER)533#define ATTRIB_XGETBV534#elif defined(__GNUC__) || defined(__clang__)535// we don't define ATTRIB_XGETBV here, because asm version is better for inlining.536// #define ATTRIB_XGETBV __attribute__((__target__("xsave")))537#else538#define ATTRIB_XGETBV539#endif540#endif541542#if defined(ATTRIB_XGETBV)543#include <immintrin.h>544#endif545546547// XFEATURE_ENABLED_MASK/XCR0548#define MY_XCR_XFEATURE_ENABLED_MASK 0549550#if defined(ATTRIB_XGETBV)551ATTRIB_XGETBV552#endif553static UInt64 x86_xgetbv_0(UInt32 num)554{555#if defined(ATTRIB_XGETBV)556{557return558#if (defined(_MSC_VER))559_xgetbv(num);560#else561__builtin_ia32_xgetbv(562#if !defined(__clang__)563(int)564#endif565num);566#endif567}568569#elif defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_CC)570571UInt32 a, d;572#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))573__asm__574(575"xgetbv"576: "=a"(a), "=d"(d) : "c"(num) : "cc"577);578#else // is old gcc579__asm__580(581".byte 0x0f, 0x01, 0xd0" "\n\t"582: "=a"(a), "=d"(d) : "c"(num) : "cc"583);584#endif585return ((UInt64)d << 32) | a;586// return a;587588#elif defined(_MSC_VER) && !defined(MY_CPU_AMD64)589590UInt32 a, d;591__asm {592push eax593push edx594push ecx595mov ecx, num;596// xor ecx, ecx // = MY_XCR_XFEATURE_ENABLED_MASK597_emit 0x0f598_emit 0x01599_emit 0xd0600mov a, eax601mov d, edx602pop ecx603pop edx604pop eax605}606return ((UInt64)d << 32) | a;607// return a;608609#else // it's unknown compiler610// #error "Need xgetbv function"611UNUSED_VAR(num)612// for MSVC-X64 we could call external function from external file.613/* Actually we had checked OSXSAVE/AVX in cpuid before.614So it's expected that OS supports at least AVX and below. */615// if (num != MY_XCR_XFEATURE_ENABLED_MASK) return 0; // if not XCR0616return617// (1 << 0) | // x87618(1 << 1) // SSE619| (1 << 2); // AVX620621#endif622}623624#ifdef _WIN32625/*626Windows versions do not know about new ISA extensions that627can be introduced. But we still can use new extensions,628even if Windows doesn't report about supporting them,629But we can use new extensions, only if Windows knows about new ISA extension630that changes the number or size of registers: SSE, AVX/XSAVE, AVX512631So it's enough to check632MY_PF_AVX_INSTRUCTIONS_AVAILABLE633instead of634MY_PF_AVX2_INSTRUCTIONS_AVAILABLE635*/636#define MY_PF_XSAVE_ENABLED 17637// #define MY_PF_SSSE3_INSTRUCTIONS_AVAILABLE 36638// #define MY_PF_SSE4_1_INSTRUCTIONS_AVAILABLE 37639// #define MY_PF_SSE4_2_INSTRUCTIONS_AVAILABLE 38640// #define MY_PF_AVX_INSTRUCTIONS_AVAILABLE 39641// #define MY_PF_AVX2_INSTRUCTIONS_AVAILABLE 40642// #define MY_PF_AVX512F_INSTRUCTIONS_AVAILABLE 41643#endif644645BoolInt CPU_IsSupported_AVX(void)646{647#ifdef _WIN32648if (!IsProcessorFeaturePresent(MY_PF_XSAVE_ENABLED))649return False;650/* PF_AVX_INSTRUCTIONS_AVAILABLE probably is supported starting from651some latest Win10 revisions. But we need AVX in older Windows also.652So we don't use the following check: */653/*654if (!IsProcessorFeaturePresent(MY_PF_AVX_INSTRUCTIONS_AVAILABLE))655return False;656*/657#endif658659/*660OS must use new special XSAVE/XRSTOR instructions to save661AVX registers when it required for context switching.662At OS statring:663OS sets CR4.OSXSAVE flag to signal the processor that OS supports the XSAVE extensions.664Also OS sets bitmask in XCR0 register that defines what665registers will be processed by XSAVE instruction:666XCR0.SSE[bit 0] - x87 registers and state667XCR0.SSE[bit 1] - SSE registers and state668XCR0.AVX[bit 2] - AVX registers and state669CR4.OSXSAVE is reflected to CPUID.1:ECX.OSXSAVE[bit 27].670So we can read that bit in user-space.671XCR0 is available for reading in user-space by new XGETBV instruction.672*/673{674const UInt32 c = x86cpuid_Func_1_ECX();675if (0 == (1676& (c >> 28) // AVX instructions are supported by hardware677& (c >> 27))) // OSXSAVE bit: XSAVE and related instructions are enabled by OS.678return False;679}680681/* also we can check682CPUID.1:ECX.XSAVE [bit 26] : that shows that683XSAVE, XRESTOR, XSETBV, XGETBV instructions are supported by hardware.684But that check is redundant, because if OSXSAVE bit is set, then XSAVE is also set */685686/* If OS have enabled XSAVE extension instructions (OSXSAVE == 1),687in most cases we expect that OS also will support storing/restoring688for AVX and SSE states at least.689But to be ensure for that we call user-space instruction690XGETBV(0) to get XCR0 value that contains bitmask that defines691what exact states(registers) OS have enabled for storing/restoring.692*/693694{695const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);696// printf("\n=== XGetBV=0x%x\n", bm);697return 1698& (BoolInt)(bm >> 1) // SSE state is supported (set by OS) for storing/restoring699& (BoolInt)(bm >> 2); // AVX state is supported (set by OS) for storing/restoring700}701// since Win7SP1: we can use GetEnabledXStateFeatures();702}703704705BoolInt CPU_IsSupported_AVX2(void)706{707if (!CPU_IsSupported_AVX())708return False;709if (z7_x86_cpuid_GetMaxFunc() < 7)710return False;711{712UInt32 d[4];713z7_x86_cpuid(d, 7);714// printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);715return 1716& (BoolInt)(d[1] >> 5); // avx2717}718}719720#if 0721BoolInt CPU_IsSupported_AVX512F_AVX512VL(void)722{723if (!CPU_IsSupported_AVX())724return False;725if (z7_x86_cpuid_GetMaxFunc() < 7)726return False;727{728UInt32 d[4];729BoolInt v;730z7_x86_cpuid(d, 7);731// printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);732v = 1733& (BoolInt)(d[1] >> 16) // avx512f734& (BoolInt)(d[1] >> 31); // avx512vl735if (!v)736return False;737}738{739const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);740// printf("\n=== XGetBV=0x%x\n", bm);741return 1742& (BoolInt)(bm >> 5) // OPMASK743& (BoolInt)(bm >> 6) // ZMM upper 256-bit744& (BoolInt)(bm >> 7); // ZMM16 ... ZMM31745}746}747#endif748749BoolInt CPU_IsSupported_VAES_AVX2(void)750{751if (!CPU_IsSupported_AVX())752return False;753if (z7_x86_cpuid_GetMaxFunc() < 7)754return False;755{756UInt32 d[4];757z7_x86_cpuid(d, 7);758// printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);759return 1760& (BoolInt)(d[1] >> 5) // avx2761// & (d[1] >> 31) // avx512vl762& (BoolInt)(d[2] >> 9); // vaes // VEX-256/EVEX763}764}765766BoolInt CPU_IsSupported_PageGB(void)767{768CHECK_CPUID_IS_SUPPORTED769{770UInt32 d[4];771z7_x86_cpuid(d, 0x80000000);772if (d[0] < 0x80000001)773return False;774z7_x86_cpuid(d, 0x80000001);775return (BoolInt)(d[3] >> 26) & 1;776}777}778779780#elif defined(MY_CPU_ARM_OR_ARM64)781782#ifdef _WIN32783784#include "7zWindows.h"785786BoolInt CPU_IsSupported_CRC32(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }787BoolInt CPU_IsSupported_CRYPTO(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }788BoolInt CPU_IsSupported_NEON(void) { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }789790#else791792#if defined(__APPLE__)793794/*795#include <stdio.h>796#include <string.h>797static void Print_sysctlbyname(const char *name)798{799size_t bufSize = 256;800char buf[256];801int res = sysctlbyname(name, &buf, &bufSize, NULL, 0);802{803int i;804printf("\nres = %d : %s : '%s' : bufSize = %d, numeric", res, name, buf, (unsigned)bufSize);805for (i = 0; i < 20; i++)806printf(" %2x", (unsigned)(Byte)buf[i]);807808}809}810*/811/*812Print_sysctlbyname("hw.pagesize");813Print_sysctlbyname("machdep.cpu.brand_string");814*/815816static BoolInt z7_sysctlbyname_Get_BoolInt(const char *name)817{818UInt32 val = 0;819if (z7_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1)820return 1;821return 0;822}823824BoolInt CPU_IsSupported_CRC32(void)825{826return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32");827}828829BoolInt CPU_IsSupported_NEON(void)830{831return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");832}833834BoolInt CPU_IsSupported_SHA512(void)835{836return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512");837}838839/*840BoolInt CPU_IsSupported_SHA3(void)841{842return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3");843}844*/845846#ifdef MY_CPU_ARM64847#define APPLE_CRYPTO_SUPPORT_VAL 1848#else849#define APPLE_CRYPTO_SUPPORT_VAL 0850#endif851852BoolInt CPU_IsSupported_SHA1(void) { return APPLE_CRYPTO_SUPPORT_VAL; }853BoolInt CPU_IsSupported_SHA2(void) { return APPLE_CRYPTO_SUPPORT_VAL; }854BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; }855856857#else // __APPLE__858859#if defined(__GLIBC__) && (__GLIBC__ * 100 + __GLIBC_MINOR__ >= 216)860#define Z7_GETAUXV_AVAILABLE861#elif !defined(__QNXNTO__)862// #pragma message("=== is not NEW GLIBC === ")863#if defined __has_include864#if __has_include (<sys/auxv.h>)865// #pragma message("=== sys/auxv.h is avail=== ")866#define Z7_GETAUXV_AVAILABLE867#endif868#endif869#endif870871#ifdef Z7_GETAUXV_AVAILABLE872// #pragma message("=== Z7_GETAUXV_AVAILABLE === ")873#include <sys/auxv.h>874#define USE_HWCAP875#endif876877#ifdef USE_HWCAP878879#if defined(__FreeBSD__) || defined(__OpenBSD__)880static unsigned long MY_getauxval(int aux)881{882unsigned long val;883if (elf_aux_info(aux, &val, sizeof(val)))884return 0;885return val;886}887#else888#define MY_getauxval getauxval889#if defined __has_include890#if __has_include (<asm/hwcap.h>)891#include <asm/hwcap.h>892#endif893#endif894#endif895896#define MY_HWCAP_CHECK_FUNC_2(name1, name2) \897BoolInt CPU_IsSupported_ ## name1(void) { return (MY_getauxval(AT_HWCAP) & (HWCAP_ ## name2)); }898899#ifdef MY_CPU_ARM64900#define MY_HWCAP_CHECK_FUNC(name) \901MY_HWCAP_CHECK_FUNC_2(name, name)902#if 1 || defined(__ARM_NEON)903BoolInt CPU_IsSupported_NEON(void) { return True; }904#else905MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD)906#endif907// MY_HWCAP_CHECK_FUNC (ASIMD)908#elif defined(MY_CPU_ARM)909#define MY_HWCAP_CHECK_FUNC(name) \910BoolInt CPU_IsSupported_ ## name(void) { return (MY_getauxval(AT_HWCAP2) & (HWCAP2_ ## name)); }911MY_HWCAP_CHECK_FUNC_2(NEON, NEON)912#endif913914#else // USE_HWCAP915916#define MY_HWCAP_CHECK_FUNC(name) \917BoolInt CPU_IsSupported_ ## name(void) { return 0; }918#if defined(__ARM_NEON)919BoolInt CPU_IsSupported_NEON(void) { return True; }920#else921MY_HWCAP_CHECK_FUNC(NEON)922#endif923924#endif // USE_HWCAP925926MY_HWCAP_CHECK_FUNC (CRC32)927MY_HWCAP_CHECK_FUNC (SHA1)928MY_HWCAP_CHECK_FUNC (SHA2)929MY_HWCAP_CHECK_FUNC (AES)930#ifdef MY_CPU_ARM64931// <hwcap.h> supports HWCAP_SHA512 and HWCAP_SHA3 since 2017.932// we define them here, if they are not defined933#ifndef HWCAP_SHA3934// #define HWCAP_SHA3 (1 << 17)935#endif936#ifndef HWCAP_SHA512937// #pragma message("=== HWCAP_SHA512 define === ")938#define HWCAP_SHA512 (1 << 21)939#endif940MY_HWCAP_CHECK_FUNC (SHA512)941// MY_HWCAP_CHECK_FUNC (SHA3)942#endif943944#endif // __APPLE__945#endif // _WIN32946947#endif // MY_CPU_ARM_OR_ARM64948949950951#ifdef __APPLE__952953#include <sys/sysctl.h>954955int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize)956{957return sysctlbyname(name, buf, bufSize, NULL, 0);958}959960int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val)961{962size_t bufSize = sizeof(*val);963const int res = z7_sysctlbyname_Get(name, val, &bufSize);964if (res == 0 && bufSize != sizeof(*val))965return EFAULT;966return res;967}968969#endif970971972