Path: blob/main/sys/contrib/ck/include/gcc/x86_64/ck_pr.h
48420 views
/*1* Copyright 2009-2015 Samy Al Bahra.2* All rights reserved.3*4* Redistribution and use in source and binary forms, with or without5* modification, are permitted provided that the following conditions6* are met:7* 1. Redistributions of source code must retain the above copyright8* notice, this list of conditions and the following disclaimer.9* 2. Redistributions in binary form must reproduce the above copyright10* notice, this list of conditions and the following disclaimer in the11* documentation and/or other materials provided with the distribution.12*13* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND14* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE15* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE16* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE17* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL18* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS19* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)20* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT21* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY22* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF23* SUCH DAMAGE.24*/2526#ifndef CK_PR_X86_64_H27#define CK_PR_X86_64_H2829#ifndef CK_PR_H30#error Do not include this file directly, use ck_pr.h31#endif3233#include <ck_cc.h>34#include <ck_md.h>35#include <ck_stdint.h>3637/*38* The following represent supported atomic operations.39* These operations may be emulated.40*/41#include "ck_f_pr.h"4243/*44* Support for TSX extensions.45*/46#ifdef CK_MD_RTM_ENABLE47#include "ck_pr_rtm.h"48#endif4950/* Minimum requirements for the CK_PR interface are met. */51#define CK_F_PR5253#ifdef CK_MD_UMP54#define CK_PR_LOCK_PREFIX55#else56#define CK_PR_LOCK_PREFIX "lock "57#endif5859/*60* Prevent speculative execution in busy-wait loops (P4 <=) or "predefined61* delay".62*/63CK_CC_INLINE static void64ck_pr_stall(void)65{66__asm__ __volatile__("pause" ::: "memory");67return;68}6970#define CK_PR_FENCE(T, I) \71CK_CC_INLINE static void \72ck_pr_fence_strict_##T(void) \73{ \74__asm__ __volatile__(I ::: "memory"); \75}7677/* Atomic operations are always serializing. */78CK_PR_FENCE(atomic, "")79CK_PR_FENCE(atomic_store, "")80CK_PR_FENCE(atomic_load, "")81CK_PR_FENCE(store_atomic, "")82CK_PR_FENCE(load_atomic, "")8384/* Traditional fence interface. */85CK_PR_FENCE(load, "lfence")86CK_PR_FENCE(load_store, "mfence")87CK_PR_FENCE(store, "sfence")88CK_PR_FENCE(store_load, "mfence")89CK_PR_FENCE(memory, "mfence")9091/* Below are stdatomic-style fences. */9293/*94* Provides load-store and store-store ordering. However, Intel specifies that95* the WC memory model is relaxed. It is likely an sfence *is* sufficient (in96* particular, stores are not re-ordered with respect to prior loads and it is97* really just the stores that are subject to re-ordering). However, we take98* the conservative route as the manuals are too ambiguous for my taste.99*/100CK_PR_FENCE(release, "mfence")101102/*103* Provides load-load and load-store ordering. The lfence instruction ensures104* all prior load operations are complete before any subsequent instructions105* actually begin execution. However, the manual also ends up going to describe106* WC memory as a relaxed model.107*/108CK_PR_FENCE(acquire, "mfence")109110CK_PR_FENCE(acqrel, "mfence")111CK_PR_FENCE(lock, "mfence")112CK_PR_FENCE(unlock, "mfence")113114#undef CK_PR_FENCE115116/*117* Read for ownership. Older compilers will generate the 32-bit118* 3DNow! variant which is binary compatible with x86-64 variant119* of prefetchw.120*/121#ifndef CK_F_PR_RFO122#define CK_F_PR_RFO123CK_CC_INLINE static void124ck_pr_rfo(const void *m)125{126127__asm__ __volatile__("prefetchw (%0)"128:129: "r" (m)130: "memory");131132return;133}134#endif /* CK_F_PR_RFO */135136/*137* Atomic fetch-and-store operations.138*/139#define CK_PR_FAS(S, M, T, C, I) \140CK_CC_INLINE static T \141ck_pr_fas_##S(M *target, T v) \142{ \143__asm__ __volatile__(I " %0, %1" \144: "+m" (*(C *)target), \145"+q" (v) \146: \147: "memory"); \148return v; \149}150151CK_PR_FAS(ptr, void, void *, uint64_t, "xchgq")152153#define CK_PR_FAS_S(S, T, I) CK_PR_FAS(S, T, T, T, I)154155#ifndef CK_PR_DISABLE_DOUBLE156CK_PR_FAS_S(double, double, "xchgq")157#endif158CK_PR_FAS_S(char, char, "xchgb")159CK_PR_FAS_S(uint, unsigned int, "xchgl")160CK_PR_FAS_S(int, int, "xchgl")161CK_PR_FAS_S(64, uint64_t, "xchgq")162CK_PR_FAS_S(32, uint32_t, "xchgl")163CK_PR_FAS_S(16, uint16_t, "xchgw")164CK_PR_FAS_S(8, uint8_t, "xchgb")165166#undef CK_PR_FAS_S167#undef CK_PR_FAS168169/*170* Atomic load-from-memory operations.171*/172#define CK_PR_LOAD(S, M, T, C, I) \173CK_CC_INLINE static T \174ck_pr_md_load_##S(const M *target) \175{ \176T r; \177__asm__ __volatile__(I " %1, %0" \178: "=q" (r) \179: "m" (*(const C *)target) \180: "memory"); \181return (r); \182}183184CK_PR_LOAD(ptr, void, void *, uint64_t, "movq")185186#define CK_PR_LOAD_S(S, T, I) CK_PR_LOAD(S, T, T, T, I)187188CK_PR_LOAD_S(char, char, "movb")189CK_PR_LOAD_S(uint, unsigned int, "movl")190CK_PR_LOAD_S(int, int, "movl")191#ifndef CK_PR_DISABLE_DOUBLE192CK_PR_LOAD_S(double, double, "movq")193#endif194CK_PR_LOAD_S(64, uint64_t, "movq")195CK_PR_LOAD_S(32, uint32_t, "movl")196CK_PR_LOAD_S(16, uint16_t, "movw")197CK_PR_LOAD_S(8, uint8_t, "movb")198199#undef CK_PR_LOAD_S200#undef CK_PR_LOAD201202CK_CC_INLINE static void203ck_pr_load_64_2(const uint64_t target[2], uint64_t v[2])204{205__asm__ __volatile__("movq %%rdx, %%rcx;"206"movq %%rax, %%rbx;"207CK_PR_LOCK_PREFIX "cmpxchg16b %2;"208: "=a" (v[0]),209"=d" (v[1])210: "m" (*(const uint64_t *)target)211: "rbx", "rcx", "memory", "cc");212return;213}214215CK_CC_INLINE static void216ck_pr_load_ptr_2(const void *t, void *v)217{218ck_pr_load_64_2(CK_CPP_CAST(const uint64_t *, t),219CK_CPP_CAST(uint64_t *, v));220return;221}222223#define CK_PR_LOAD_2(S, W, T) \224CK_CC_INLINE static void \225ck_pr_md_load_##S##_##W(const T t[2], T v[2]) \226{ \227ck_pr_load_64_2((const uint64_t *)(const void *)t, \228(uint64_t *)(void *)v); \229return; \230}231232CK_PR_LOAD_2(char, 16, char)233CK_PR_LOAD_2(int, 4, int)234CK_PR_LOAD_2(uint, 4, unsigned int)235CK_PR_LOAD_2(32, 4, uint32_t)236CK_PR_LOAD_2(16, 8, uint16_t)237CK_PR_LOAD_2(8, 16, uint8_t)238239#undef CK_PR_LOAD_2240241/*242* Atomic store-to-memory operations.243*/244#define CK_PR_STORE_IMM(S, M, T, C, I, K) \245CK_CC_INLINE static void \246ck_pr_md_store_##S(M *target, T v) \247{ \248__asm__ __volatile__(I " %1, %0" \249: "=m" (*(C *)target) \250: K "q" (v) \251: "memory"); \252return; \253}254255#define CK_PR_STORE(S, M, T, C, I) \256CK_CC_INLINE static void \257ck_pr_md_store_##S(M *target, T v) \258{ \259__asm__ __volatile__(I " %1, %0" \260: "=m" (*(C *)target) \261: "q" (v) \262: "memory"); \263return; \264}265266CK_PR_STORE_IMM(ptr, void, const void *, uint64_t, "movq", CK_CC_IMM_U32)267#ifndef CK_PR_DISABLE_DOUBLE268CK_PR_STORE(double, double, double, double, "movq")269#endif270271#define CK_PR_STORE_S(S, T, I, K) CK_PR_STORE_IMM(S, T, T, T, I, K)272273CK_PR_STORE_S(char, char, "movb", CK_CC_IMM_S32)274CK_PR_STORE_S(int, int, "movl", CK_CC_IMM_S32)275CK_PR_STORE_S(uint, unsigned int, "movl", CK_CC_IMM_U32)276CK_PR_STORE_S(64, uint64_t, "movq", CK_CC_IMM_U32)277CK_PR_STORE_S(32, uint32_t, "movl", CK_CC_IMM_U32)278CK_PR_STORE_S(16, uint16_t, "movw", CK_CC_IMM_U32)279CK_PR_STORE_S(8, uint8_t, "movb", CK_CC_IMM_U32)280281#undef CK_PR_STORE_S282#undef CK_PR_STORE_IMM283#undef CK_PR_STORE284285/*286* Atomic fetch-and-add operations.287*/288#define CK_PR_FAA(S, M, T, C, I) \289CK_CC_INLINE static T \290ck_pr_faa_##S(M *target, T d) \291{ \292__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %1, %0" \293: "+m" (*(C *)target), \294"+q" (d) \295: \296: "memory", "cc"); \297return (d); \298}299300CK_PR_FAA(ptr, void, uintptr_t, uint64_t, "xaddq")301302#define CK_PR_FAA_S(S, T, I) CK_PR_FAA(S, T, T, T, I)303304CK_PR_FAA_S(char, char, "xaddb")305CK_PR_FAA_S(uint, unsigned int, "xaddl")306CK_PR_FAA_S(int, int, "xaddl")307CK_PR_FAA_S(64, uint64_t, "xaddq")308CK_PR_FAA_S(32, uint32_t, "xaddl")309CK_PR_FAA_S(16, uint16_t, "xaddw")310CK_PR_FAA_S(8, uint8_t, "xaddb")311312#undef CK_PR_FAA_S313#undef CK_PR_FAA314315/*316* Atomic store-only unary operations.317*/318#define CK_PR_UNARY(K, S, T, C, I) \319CK_PR_UNARY_R(K, S, T, C, I) \320CK_PR_UNARY_V(K, S, T, C, I)321322#define CK_PR_UNARY_R(K, S, T, C, I) \323CK_CC_INLINE static void \324ck_pr_##K##_##S(T *target) \325{ \326__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %0" \327: "+m" (*(C *)target) \328: \329: "memory", "cc"); \330return; \331}332333#define CK_PR_UNARY_V(K, S, T, C, I) \334CK_CC_INLINE static bool \335ck_pr_##K##_##S##_is_zero(T *target) \336{ \337bool ret; \338__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %0; setz %1" \339: "+m" (*(C *)target), \340"=rm" (ret) \341: \342: "memory", "cc"); \343return ret; \344}345346#define CK_PR_UNARY_S(K, S, T, I) CK_PR_UNARY(K, S, T, T, I)347348#define CK_PR_GENERATE(K) \349CK_PR_UNARY(K, ptr, void, uint64_t, #K "q") \350CK_PR_UNARY_S(K, char, char, #K "b") \351CK_PR_UNARY_S(K, int, int, #K "l") \352CK_PR_UNARY_S(K, uint, unsigned int, #K "l") \353CK_PR_UNARY_S(K, 64, uint64_t, #K "q") \354CK_PR_UNARY_S(K, 32, uint32_t, #K "l") \355CK_PR_UNARY_S(K, 16, uint16_t, #K "w") \356CK_PR_UNARY_S(K, 8, uint8_t, #K "b")357358CK_PR_GENERATE(inc)359CK_PR_GENERATE(dec)360CK_PR_GENERATE(neg)361362/* not does not affect condition flags. */363#undef CK_PR_UNARY_V364#define CK_PR_UNARY_V(a, b, c, d, e)365CK_PR_GENERATE(not)366367#undef CK_PR_GENERATE368#undef CK_PR_UNARY_S369#undef CK_PR_UNARY_V370#undef CK_PR_UNARY_R371#undef CK_PR_UNARY372373/*374* Atomic store-only binary operations.375*/376#define CK_PR_BINARY(K, S, M, T, C, I, O) \377CK_CC_INLINE static void \378ck_pr_##K##_##S(M *target, T d) \379{ \380__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %1, %0" \381: "+m" (*(C *)target) \382: O "q" (d) \383: "memory", "cc"); \384return; \385}386387#define CK_PR_BINARY_S(K, S, T, I, O) CK_PR_BINARY(K, S, T, T, T, I, O)388389#define CK_PR_GENERATE(K) \390CK_PR_BINARY(K, ptr, void, uintptr_t, uint64_t, #K "q", CK_CC_IMM_U32) \391CK_PR_BINARY_S(K, char, char, #K "b", CK_CC_IMM_S32) \392CK_PR_BINARY_S(K, int, int, #K "l", CK_CC_IMM_S32) \393CK_PR_BINARY_S(K, uint, unsigned int, #K "l", CK_CC_IMM_U32) \394CK_PR_BINARY_S(K, 64, uint64_t, #K "q", CK_CC_IMM_U32) \395CK_PR_BINARY_S(K, 32, uint32_t, #K "l", CK_CC_IMM_U32) \396CK_PR_BINARY_S(K, 16, uint16_t, #K "w", CK_CC_IMM_U32) \397CK_PR_BINARY_S(K, 8, uint8_t, #K "b", CK_CC_IMM_U32)398399CK_PR_GENERATE(add)400CK_PR_GENERATE(sub)401CK_PR_GENERATE(and)402CK_PR_GENERATE(or)403CK_PR_GENERATE(xor)404405#undef CK_PR_GENERATE406#undef CK_PR_BINARY_S407#undef CK_PR_BINARY408409/*410* Atomic compare and swap, with a variant that sets *v to the old value of target.411*/412#ifdef __GCC_ASM_FLAG_OUTPUTS__413#define CK_PR_CAS(S, M, T, C, I) \414CK_CC_INLINE static bool \415ck_pr_cas_##S(M *target, T compare, T set) \416{ \417bool z; \418__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0" \419: "+m" (*(C *)target), \420"=@ccz" (z), \421/* RAX is clobbered by cmpxchg. */ \422"+a" (compare) \423: "q" (set) \424: "memory", "cc"); \425return z; \426} \427\428CK_CC_INLINE static bool \429ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \430{ \431bool z; \432__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \433: "+m" (*(C *)target), \434"=@ccz" (z), \435"+a" (compare) \436: "q" (set) \437: "memory", "cc"); \438*(T *)v = compare; \439return z; \440}441#else442#define CK_PR_CAS(S, M, T, C, I) \443CK_CC_INLINE static bool \444ck_pr_cas_##S(M *target, T compare, T set) \445{ \446bool z; \447__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %2, %0; setz %1" \448: "+m" (*(C *)target), \449"=a" (z) \450: "q" (set), \451"a" (compare) \452: "memory", "cc"); \453return z; \454} \455\456CK_CC_INLINE static bool \457ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \458{ \459bool z; \460__asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \461"setz %1;" \462: "+m" (*(C *)target), \463"=q" (z), \464"+a" (compare) \465: "q" (set) \466: "memory", "cc"); \467*(T *)v = compare; \468return z; \469}470#endif471472CK_PR_CAS(ptr, void, void *, uint64_t, "cmpxchgq")473474#define CK_PR_CAS_S(S, T, I) CK_PR_CAS(S, T, T, T, I)475476CK_PR_CAS_S(char, char, "cmpxchgb")477CK_PR_CAS_S(int, int, "cmpxchgl")478CK_PR_CAS_S(uint, unsigned int, "cmpxchgl")479#ifndef CK_PR_DISABLE_DOUBLE480CK_PR_CAS_S(double, double, "cmpxchgq")481#endif482CK_PR_CAS_S(64, uint64_t, "cmpxchgq")483CK_PR_CAS_S(32, uint32_t, "cmpxchgl")484CK_PR_CAS_S(16, uint16_t, "cmpxchgw")485CK_PR_CAS_S(8, uint8_t, "cmpxchgb")486487#undef CK_PR_CAS_S488#undef CK_PR_CAS489490/*491* Contrary to C-interface, alignment requirements are that of uint64_t[2].492*/493CK_CC_INLINE static bool494ck_pr_cas_64_2(uint64_t target[2], uint64_t compare[2], uint64_t set[2])495{496bool z;497498__asm__ __volatile__("movq 0(%4), %%rax;"499"movq 8(%4), %%rdx;"500CK_PR_LOCK_PREFIX "cmpxchg16b %0; setz %1"501: "+m" (*target),502"=q" (z)503: "b" (set[0]),504"c" (set[1]),505"q" (compare)506: "memory", "cc", "%rax", "%rdx");507return z;508}509510CK_CC_INLINE static bool511ck_pr_cas_ptr_2(void *t, void *c, void *s)512{513return ck_pr_cas_64_2(CK_CPP_CAST(uint64_t *, t),514CK_CPP_CAST(uint64_t *, c),515CK_CPP_CAST(uint64_t *, s));516}517518CK_CC_INLINE static bool519ck_pr_cas_64_2_value(uint64_t target[2],520uint64_t compare[2],521uint64_t set[2],522uint64_t v[2])523{524bool z;525526__asm__ __volatile__(CK_PR_LOCK_PREFIX "cmpxchg16b %0;"527"setz %3"528: "+m" (*target),529"=a" (v[0]),530"=d" (v[1]),531"=q" (z)532: "a" (compare[0]),533"d" (compare[1]),534"b" (set[0]),535"c" (set[1])536: "memory", "cc");537return z;538}539540CK_CC_INLINE static bool541ck_pr_cas_ptr_2_value(void *t, void *c, void *s, void *v)542{543return ck_pr_cas_64_2_value(CK_CPP_CAST(uint64_t *,t),544CK_CPP_CAST(uint64_t *,c),545CK_CPP_CAST(uint64_t *,s),546CK_CPP_CAST(uint64_t *,v));547}548549#define CK_PR_CAS_V(S, W, T) \550CK_CC_INLINE static bool \551ck_pr_cas_##S##_##W(T t[W], T c[W], T s[W]) \552{ \553return ck_pr_cas_64_2((uint64_t *)(void *)t, \554(uint64_t *)(void *)c, \555(uint64_t *)(void *)s); \556} \557CK_CC_INLINE static bool \558ck_pr_cas_##S##_##W##_value(T *t, T c[W], T s[W], T *v) \559{ \560return ck_pr_cas_64_2_value((uint64_t *)(void *)t, \561(uint64_t *)(void *)c, \562(uint64_t *)(void *)s, \563(uint64_t *)(void *)v); \564}565566#ifndef CK_PR_DISABLE_DOUBLE567CK_PR_CAS_V(double, 2, double)568#endif569CK_PR_CAS_V(char, 16, char)570CK_PR_CAS_V(int, 4, int)571CK_PR_CAS_V(uint, 4, unsigned int)572CK_PR_CAS_V(32, 4, uint32_t)573CK_PR_CAS_V(16, 8, uint16_t)574CK_PR_CAS_V(8, 16, uint8_t)575576#undef CK_PR_CAS_V577578/*579* Atomic bit test operations.580*/581#define CK_PR_BT(K, S, T, P, C, I) \582CK_CC_INLINE static bool \583ck_pr_##K##_##S(T *target, unsigned int b) \584{ \585bool c; \586__asm__ __volatile__(CK_PR_LOCK_PREFIX I "; setc %1" \587: "+m" (*(C *)target), \588"=q" (c) \589: "q" ((P)b) \590: "memory", "cc"); \591return c; \592}593594#define CK_PR_BT_S(K, S, T, I) CK_PR_BT(K, S, T, T, T, I)595596#define CK_PR_GENERATE(K) \597CK_PR_BT(K, ptr, void, uint64_t, uint64_t, #K "q %2, %0") \598CK_PR_BT_S(K, uint, unsigned int, #K "l %2, %0") \599CK_PR_BT_S(K, int, int, #K "l %2, %0") \600CK_PR_BT_S(K, 64, uint64_t, #K "q %2, %0") \601CK_PR_BT_S(K, 32, uint32_t, #K "l %2, %0") \602CK_PR_BT_S(K, 16, uint16_t, #K "w %w2, %0")603604CK_PR_GENERATE(btc)605CK_PR_GENERATE(bts)606CK_PR_GENERATE(btr)607608#undef CK_PR_GENERATE609#undef CK_PR_BT610611#endif /* CK_PR_X86_64_H */612613614615