Path: blob/main/contrib/llvm-project/clang/lib/Headers/__clang_hip_math.h
35233 views
/*===---- __clang_hip_math.h - Device-side HIP math support ----------------===1*2* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3* See https://llvm.org/LICENSE.txt for license information.4* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5*6*===-----------------------------------------------------------------------===7*/8#ifndef __CLANG_HIP_MATH_H__9#define __CLANG_HIP_MATH_H__1011#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)12#error "This file is for HIP and OpenMP AMDGCN device compilation only."13#endif1415#if !defined(__HIPCC_RTC__)16#include <limits.h>17#include <stdint.h>18#ifdef __OPENMP_AMDGCN__19#include <omp.h>20#endif21#endif // !defined(__HIPCC_RTC__)2223#pragma push_macro("__DEVICE__")2425#ifdef __OPENMP_AMDGCN__26#define __DEVICE__ static inline __attribute__((always_inline, nothrow))27#else28#define __DEVICE__ static __device__ inline __attribute__((always_inline))29#endif3031// Device library provides fast low precision and slow full-recision32// implementations for some functions. Which one gets selected depends on33// __CLANG_GPU_APPROX_TRANSCENDENTALS__ which gets defined by clang if34// -ffast-math or -fgpu-approx-transcendentals are in effect.35#pragma push_macro("__FAST_OR_SLOW")36#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)37#define __FAST_OR_SLOW(fast, slow) fast38#else39#define __FAST_OR_SLOW(fast, slow) slow40#endif4142// A few functions return bool type starting only in C++11.43#pragma push_macro("__RETURN_TYPE")44#ifdef __OPENMP_AMDGCN__45#define __RETURN_TYPE int46#else47#if defined(__cplusplus)48#define __RETURN_TYPE bool49#else50#define __RETURN_TYPE int51#endif52#endif // __OPENMP_AMDGCN__5354#if defined (__cplusplus) && __cplusplus < 201103L55// emulate static_assert on type sizes56template<bool>57struct __compare_result{};58template<>59struct __compare_result<true> {60static const __device__ bool valid;61};6263__DEVICE__64void __suppress_unused_warning(bool b){};65template <unsigned int S, unsigned int T>66__DEVICE__ void __static_assert_equal_size() {67__suppress_unused_warning(__compare_result<S == T>::valid);68}6970#define __static_assert_type_size_equal(A, B) \71__static_assert_equal_size<A,B>()7273#else74#define __static_assert_type_size_equal(A,B) \75static_assert((A) == (B), "")7677#endif7879__DEVICE__80uint64_t __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) {81uint64_t __r = 0;82while (*__tagp != '\0') {83char __tmp = *__tagp;8485if (__tmp >= '0' && __tmp <= '7')86__r = (__r * 8u) + __tmp - '0';87else88return 0;8990++__tagp;91}9293return __r;94}9596__DEVICE__97uint64_t __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) {98uint64_t __r = 0;99while (*__tagp != '\0') {100char __tmp = *__tagp;101102if (__tmp >= '0' && __tmp <= '9')103__r = (__r * 10u) + __tmp - '0';104else105return 0;106107++__tagp;108}109110return __r;111}112113__DEVICE__114uint64_t __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) {115uint64_t __r = 0;116while (*__tagp != '\0') {117char __tmp = *__tagp;118119if (__tmp >= '0' && __tmp <= '9')120__r = (__r * 16u) + __tmp - '0';121else if (__tmp >= 'a' && __tmp <= 'f')122__r = (__r * 16u) + __tmp - 'a' + 10;123else if (__tmp >= 'A' && __tmp <= 'F')124__r = (__r * 16u) + __tmp - 'A' + 10;125else126return 0;127128++__tagp;129}130131return __r;132}133134__DEVICE__135uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) {136if (*__tagp == '0') {137++__tagp;138139if (*__tagp == 'x' || *__tagp == 'X')140return __make_mantissa_base16(__tagp);141else142return __make_mantissa_base8(__tagp);143}144145return __make_mantissa_base10(__tagp);146}147148// BEGIN FLOAT149150// BEGIN INTRINSICS151152__DEVICE__153float __cosf(float __x) { return __ocml_native_cos_f32(__x); }154155__DEVICE__156float __exp10f(float __x) {157const float __log2_10 = 0x1.a934f0p+1f;158return __builtin_amdgcn_exp2f(__log2_10 * __x);159}160161__DEVICE__162float __expf(float __x) {163const float __log2_e = 0x1.715476p+0;164return __builtin_amdgcn_exp2f(__log2_e * __x);165}166167#if defined OCML_BASIC_ROUNDED_OPERATIONS168__DEVICE__169float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); }170__DEVICE__171float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); }172__DEVICE__173float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); }174__DEVICE__175float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); }176#else177__DEVICE__178float __fadd_rn(float __x, float __y) { return __x + __y; }179#endif180181#if defined OCML_BASIC_ROUNDED_OPERATIONS182__DEVICE__183float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); }184__DEVICE__185float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); }186__DEVICE__187float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); }188__DEVICE__189float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); }190#else191__DEVICE__192float __fdiv_rn(float __x, float __y) { return __x / __y; }193#endif194195__DEVICE__196float __fdividef(float __x, float __y) { return __x / __y; }197198#if defined OCML_BASIC_ROUNDED_OPERATIONS199__DEVICE__200float __fmaf_rd(float __x, float __y, float __z) {201return __ocml_fma_rtn_f32(__x, __y, __z);202}203__DEVICE__204float __fmaf_rn(float __x, float __y, float __z) {205return __ocml_fma_rte_f32(__x, __y, __z);206}207__DEVICE__208float __fmaf_ru(float __x, float __y, float __z) {209return __ocml_fma_rtp_f32(__x, __y, __z);210}211__DEVICE__212float __fmaf_rz(float __x, float __y, float __z) {213return __ocml_fma_rtz_f32(__x, __y, __z);214}215#else216__DEVICE__217float __fmaf_rn(float __x, float __y, float __z) {218return __builtin_fmaf(__x, __y, __z);219}220#endif221222#if defined OCML_BASIC_ROUNDED_OPERATIONS223__DEVICE__224float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); }225__DEVICE__226float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); }227__DEVICE__228float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); }229__DEVICE__230float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); }231#else232__DEVICE__233float __fmul_rn(float __x, float __y) { return __x * __y; }234#endif235236#if defined OCML_BASIC_ROUNDED_OPERATIONS237__DEVICE__238float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); }239__DEVICE__240float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); }241__DEVICE__242float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); }243__DEVICE__244float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); }245#else246__DEVICE__247float __frcp_rn(float __x) { return 1.0f / __x; }248#endif249250__DEVICE__251float __frsqrt_rn(float __x) { return __builtin_amdgcn_rsqf(__x); }252253#if defined OCML_BASIC_ROUNDED_OPERATIONS254__DEVICE__255float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); }256__DEVICE__257float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); }258__DEVICE__259float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); }260__DEVICE__261float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); }262#else263__DEVICE__264float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }265#endif266267#if defined OCML_BASIC_ROUNDED_OPERATIONS268__DEVICE__269float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); }270__DEVICE__271float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); }272__DEVICE__273float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); }274__DEVICE__275float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); }276#else277__DEVICE__278float __fsub_rn(float __x, float __y) { return __x - __y; }279#endif280281__DEVICE__282float __log10f(float __x) { return __builtin_log10f(__x); }283284__DEVICE__285float __log2f(float __x) { return __builtin_amdgcn_logf(__x); }286287__DEVICE__288float __logf(float __x) { return __builtin_logf(__x); }289290__DEVICE__291float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }292293__DEVICE__294float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); }295296__DEVICE__297void __sincosf(float __x, float *__sinptr, float *__cosptr) {298*__sinptr = __ocml_native_sin_f32(__x);299*__cosptr = __ocml_native_cos_f32(__x);300}301302__DEVICE__303float __sinf(float __x) { return __ocml_native_sin_f32(__x); }304305__DEVICE__306float __tanf(float __x) {307return __sinf(__x) * __builtin_amdgcn_rcpf(__cosf(__x));308}309// END INTRINSICS310311#if defined(__cplusplus)312__DEVICE__313int abs(int __x) {314return __builtin_abs(__x);315}316__DEVICE__317long labs(long __x) {318return __builtin_labs(__x);319}320__DEVICE__321long long llabs(long long __x) {322return __builtin_llabs(__x);323}324#endif325326__DEVICE__327float acosf(float __x) { return __ocml_acos_f32(__x); }328329__DEVICE__330float acoshf(float __x) { return __ocml_acosh_f32(__x); }331332__DEVICE__333float asinf(float __x) { return __ocml_asin_f32(__x); }334335__DEVICE__336float asinhf(float __x) { return __ocml_asinh_f32(__x); }337338__DEVICE__339float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); }340341__DEVICE__342float atanf(float __x) { return __ocml_atan_f32(__x); }343344__DEVICE__345float atanhf(float __x) { return __ocml_atanh_f32(__x); }346347__DEVICE__348float cbrtf(float __x) { return __ocml_cbrt_f32(__x); }349350__DEVICE__351float ceilf(float __x) { return __builtin_ceilf(__x); }352353__DEVICE__354float copysignf(float __x, float __y) { return __builtin_copysignf(__x, __y); }355356__DEVICE__357float cosf(float __x) { return __FAST_OR_SLOW(__cosf, __ocml_cos_f32)(__x); }358359__DEVICE__360float coshf(float __x) { return __ocml_cosh_f32(__x); }361362__DEVICE__363float cospif(float __x) { return __ocml_cospi_f32(__x); }364365__DEVICE__366float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); }367368__DEVICE__369float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); }370371__DEVICE__372float erfcf(float __x) { return __ocml_erfc_f32(__x); }373374__DEVICE__375float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); }376377__DEVICE__378float erfcxf(float __x) { return __ocml_erfcx_f32(__x); }379380__DEVICE__381float erff(float __x) { return __ocml_erf_f32(__x); }382383__DEVICE__384float erfinvf(float __x) { return __ocml_erfinv_f32(__x); }385386__DEVICE__387float exp10f(float __x) { return __ocml_exp10_f32(__x); }388389__DEVICE__390float exp2f(float __x) { return __builtin_exp2f(__x); }391392__DEVICE__393float expf(float __x) { return __builtin_expf(__x); }394395__DEVICE__396float expm1f(float __x) { return __ocml_expm1_f32(__x); }397398__DEVICE__399float fabsf(float __x) { return __builtin_fabsf(__x); }400401__DEVICE__402float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }403404__DEVICE__405float fdividef(float __x, float __y) { return __x / __y; }406407__DEVICE__408float floorf(float __x) { return __builtin_floorf(__x); }409410__DEVICE__411float fmaf(float __x, float __y, float __z) {412return __builtin_fmaf(__x, __y, __z);413}414415__DEVICE__416float fmaxf(float __x, float __y) { return __builtin_fmaxf(__x, __y); }417418__DEVICE__419float fminf(float __x, float __y) { return __builtin_fminf(__x, __y); }420421__DEVICE__422float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }423424__DEVICE__425float frexpf(float __x, int *__nptr) {426return __builtin_frexpf(__x, __nptr);427}428429__DEVICE__430float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); }431432__DEVICE__433int ilogbf(float __x) { return __ocml_ilogb_f32(__x); }434435__DEVICE__436__RETURN_TYPE __finitef(float __x) { return __builtin_isfinite(__x); }437438__DEVICE__439__RETURN_TYPE __isinff(float __x) { return __builtin_isinf(__x); }440441__DEVICE__442__RETURN_TYPE __isnanf(float __x) { return __builtin_isnan(__x); }443444__DEVICE__445float j0f(float __x) { return __ocml_j0_f32(__x); }446447__DEVICE__448float j1f(float __x) { return __ocml_j1_f32(__x); }449450__DEVICE__451float jnf(int __n, float __x) { // TODO: we could use Ahmes multiplication452// and the Miller & Brown algorithm453// for linear recurrences to get O(log n) steps, but it's unclear if454// it'd be beneficial in this case.455if (__n == 0)456return j0f(__x);457if (__n == 1)458return j1f(__x);459460float __x0 = j0f(__x);461float __x1 = j1f(__x);462for (int __i = 1; __i < __n; ++__i) {463float __x2 = (2 * __i) / __x * __x1 - __x0;464__x0 = __x1;465__x1 = __x2;466}467468return __x1;469}470471__DEVICE__472float ldexpf(float __x, int __e) { return __builtin_amdgcn_ldexpf(__x, __e); }473474__DEVICE__475float lgammaf(float __x) { return __ocml_lgamma_f32(__x); }476477__DEVICE__478long long int llrintf(float __x) { return __builtin_rintf(__x); }479480__DEVICE__481long long int llroundf(float __x) { return __builtin_roundf(__x); }482483__DEVICE__484float log10f(float __x) { return __builtin_log10f(__x); }485486__DEVICE__487float log1pf(float __x) { return __ocml_log1p_f32(__x); }488489__DEVICE__490float log2f(float __x) { return __FAST_OR_SLOW(__log2f, __ocml_log2_f32)(__x); }491492__DEVICE__493float logbf(float __x) { return __ocml_logb_f32(__x); }494495__DEVICE__496float logf(float __x) { return __FAST_OR_SLOW(__logf, __ocml_log_f32)(__x); }497498__DEVICE__499long int lrintf(float __x) { return __builtin_rintf(__x); }500501__DEVICE__502long int lroundf(float __x) { return __builtin_roundf(__x); }503504__DEVICE__505float modff(float __x, float *__iptr) {506float __tmp;507#ifdef __OPENMP_AMDGCN__508#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)509#endif510float __r =511__ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);512*__iptr = __tmp;513return __r;514}515516__DEVICE__517float nanf(const char *__tagp __attribute__((nonnull))) {518union {519float val;520struct ieee_float {521unsigned int mantissa : 22;522unsigned int quiet : 1;523unsigned int exponent : 8;524unsigned int sign : 1;525} bits;526} __tmp;527__static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits));528529__tmp.bits.sign = 0u;530__tmp.bits.exponent = ~0u;531__tmp.bits.quiet = 1u;532__tmp.bits.mantissa = __make_mantissa(__tagp);533534return __tmp.val;535}536537__DEVICE__538float nearbyintf(float __x) { return __builtin_nearbyintf(__x); }539540__DEVICE__541float nextafterf(float __x, float __y) {542return __ocml_nextafter_f32(__x, __y);543}544545__DEVICE__546float norm3df(float __x, float __y, float __z) {547return __ocml_len3_f32(__x, __y, __z);548}549550__DEVICE__551float norm4df(float __x, float __y, float __z, float __w) {552return __ocml_len4_f32(__x, __y, __z, __w);553}554555__DEVICE__556float normcdff(float __x) { return __ocml_ncdf_f32(__x); }557558__DEVICE__559float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); }560561__DEVICE__562float normf(int __dim,563const float *__a) { // TODO: placeholder until OCML adds support.564float __r = 0;565while (__dim--) {566__r += __a[0] * __a[0];567++__a;568}569570return __builtin_sqrtf(__r);571}572573__DEVICE__574float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }575576__DEVICE__577float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); }578579__DEVICE__580float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); }581582__DEVICE__583float remainderf(float __x, float __y) {584return __ocml_remainder_f32(__x, __y);585}586587__DEVICE__588float remquof(float __x, float __y, int *__quo) {589int __tmp;590#ifdef __OPENMP_AMDGCN__591#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)592#endif593float __r = __ocml_remquo_f32(594__x, __y, (__attribute__((address_space(5))) int *)&__tmp);595*__quo = __tmp;596597return __r;598}599600__DEVICE__601float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); }602603__DEVICE__604float rintf(float __x) { return __builtin_rintf(__x); }605606__DEVICE__607float rnorm3df(float __x, float __y, float __z) {608return __ocml_rlen3_f32(__x, __y, __z);609}610611__DEVICE__612float rnorm4df(float __x, float __y, float __z, float __w) {613return __ocml_rlen4_f32(__x, __y, __z, __w);614}615616__DEVICE__617float rnormf(int __dim,618const float *__a) { // TODO: placeholder until OCML adds support.619float __r = 0;620while (__dim--) {621__r += __a[0] * __a[0];622++__a;623}624625return __ocml_rsqrt_f32(__r);626}627628__DEVICE__629float roundf(float __x) { return __builtin_roundf(__x); }630631__DEVICE__632float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); }633634__DEVICE__635float scalblnf(float __x, long int __n) {636return (__n < INT_MAX) ? __builtin_amdgcn_ldexpf(__x, __n)637: __ocml_scalb_f32(__x, __n);638}639640__DEVICE__641float scalbnf(float __x, int __n) { return __builtin_amdgcn_ldexpf(__x, __n); }642643__DEVICE__644__RETURN_TYPE __signbitf(float __x) { return __builtin_signbitf(__x); }645646__DEVICE__647void sincosf(float __x, float *__sinptr, float *__cosptr) {648float __tmp;649#ifdef __OPENMP_AMDGCN__650#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)651#endif652#ifdef __CLANG_CUDA_APPROX_TRANSCENDENTALS__653__sincosf(__x, __sinptr, __cosptr);654#else655*__sinptr =656__ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);657*__cosptr = __tmp;658#endif659}660661__DEVICE__662void sincospif(float __x, float *__sinptr, float *__cosptr) {663float __tmp;664#ifdef __OPENMP_AMDGCN__665#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)666#endif667*__sinptr = __ocml_sincospi_f32(668__x, (__attribute__((address_space(5))) float *)&__tmp);669*__cosptr = __tmp;670}671672__DEVICE__673float sinf(float __x) { return __FAST_OR_SLOW(__sinf, __ocml_sin_f32)(__x); }674675__DEVICE__676float sinhf(float __x) { return __ocml_sinh_f32(__x); }677678__DEVICE__679float sinpif(float __x) { return __ocml_sinpi_f32(__x); }680681__DEVICE__682float sqrtf(float __x) { return __builtin_sqrtf(__x); }683684__DEVICE__685float tanf(float __x) { return __ocml_tan_f32(__x); }686687__DEVICE__688float tanhf(float __x) { return __ocml_tanh_f32(__x); }689690__DEVICE__691float tgammaf(float __x) { return __ocml_tgamma_f32(__x); }692693__DEVICE__694float truncf(float __x) { return __builtin_truncf(__x); }695696__DEVICE__697float y0f(float __x) { return __ocml_y0_f32(__x); }698699__DEVICE__700float y1f(float __x) { return __ocml_y1_f32(__x); }701702__DEVICE__703float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication704// and the Miller & Brown algorithm705// for linear recurrences to get O(log n) steps, but it's unclear if706// it'd be beneficial in this case. Placeholder until OCML adds707// support.708if (__n == 0)709return y0f(__x);710if (__n == 1)711return y1f(__x);712713float __x0 = y0f(__x);714float __x1 = y1f(__x);715for (int __i = 1; __i < __n; ++__i) {716float __x2 = (2 * __i) / __x * __x1 - __x0;717__x0 = __x1;718__x1 = __x2;719}720721return __x1;722}723724725// END FLOAT726727// BEGIN DOUBLE728__DEVICE__729double acos(double __x) { return __ocml_acos_f64(__x); }730731__DEVICE__732double acosh(double __x) { return __ocml_acosh_f64(__x); }733734__DEVICE__735double asin(double __x) { return __ocml_asin_f64(__x); }736737__DEVICE__738double asinh(double __x) { return __ocml_asinh_f64(__x); }739740__DEVICE__741double atan(double __x) { return __ocml_atan_f64(__x); }742743__DEVICE__744double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); }745746__DEVICE__747double atanh(double __x) { return __ocml_atanh_f64(__x); }748749__DEVICE__750double cbrt(double __x) { return __ocml_cbrt_f64(__x); }751752__DEVICE__753double ceil(double __x) { return __builtin_ceil(__x); }754755__DEVICE__756double copysign(double __x, double __y) {757return __builtin_copysign(__x, __y);758}759760__DEVICE__761double cos(double __x) { return __ocml_cos_f64(__x); }762763__DEVICE__764double cosh(double __x) { return __ocml_cosh_f64(__x); }765766__DEVICE__767double cospi(double __x) { return __ocml_cospi_f64(__x); }768769__DEVICE__770double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); }771772__DEVICE__773double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); }774775__DEVICE__776double erf(double __x) { return __ocml_erf_f64(__x); }777778__DEVICE__779double erfc(double __x) { return __ocml_erfc_f64(__x); }780781__DEVICE__782double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); }783784__DEVICE__785double erfcx(double __x) { return __ocml_erfcx_f64(__x); }786787__DEVICE__788double erfinv(double __x) { return __ocml_erfinv_f64(__x); }789790__DEVICE__791double exp(double __x) { return __ocml_exp_f64(__x); }792793__DEVICE__794double exp10(double __x) { return __ocml_exp10_f64(__x); }795796__DEVICE__797double exp2(double __x) { return __ocml_exp2_f64(__x); }798799__DEVICE__800double expm1(double __x) { return __ocml_expm1_f64(__x); }801802__DEVICE__803double fabs(double __x) { return __builtin_fabs(__x); }804805__DEVICE__806double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }807808__DEVICE__809double floor(double __x) { return __builtin_floor(__x); }810811__DEVICE__812double fma(double __x, double __y, double __z) {813return __builtin_fma(__x, __y, __z);814}815816__DEVICE__817double fmax(double __x, double __y) { return __builtin_fmax(__x, __y); }818819__DEVICE__820double fmin(double __x, double __y) { return __builtin_fmin(__x, __y); }821822__DEVICE__823double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }824825__DEVICE__826double frexp(double __x, int *__nptr) {827return __builtin_frexp(__x, __nptr);828}829830__DEVICE__831double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); }832833__DEVICE__834int ilogb(double __x) { return __ocml_ilogb_f64(__x); }835836__DEVICE__837__RETURN_TYPE __finite(double __x) { return __builtin_isfinite(__x); }838839__DEVICE__840__RETURN_TYPE __isinf(double __x) { return __builtin_isinf(__x); }841842__DEVICE__843__RETURN_TYPE __isnan(double __x) { return __builtin_isnan(__x); }844845__DEVICE__846double j0(double __x) { return __ocml_j0_f64(__x); }847848__DEVICE__849double j1(double __x) { return __ocml_j1_f64(__x); }850851__DEVICE__852double jn(int __n, double __x) { // TODO: we could use Ahmes multiplication853// and the Miller & Brown algorithm854// for linear recurrences to get O(log n) steps, but it's unclear if855// it'd be beneficial in this case. Placeholder until OCML adds856// support.857if (__n == 0)858return j0(__x);859if (__n == 1)860return j1(__x);861862double __x0 = j0(__x);863double __x1 = j1(__x);864for (int __i = 1; __i < __n; ++__i) {865double __x2 = (2 * __i) / __x * __x1 - __x0;866__x0 = __x1;867__x1 = __x2;868}869return __x1;870}871872__DEVICE__873double ldexp(double __x, int __e) { return __builtin_amdgcn_ldexp(__x, __e); }874875__DEVICE__876double lgamma(double __x) { return __ocml_lgamma_f64(__x); }877878__DEVICE__879long long int llrint(double __x) { return __builtin_rint(__x); }880881__DEVICE__882long long int llround(double __x) { return __builtin_round(__x); }883884__DEVICE__885double log(double __x) { return __ocml_log_f64(__x); }886887__DEVICE__888double log10(double __x) { return __ocml_log10_f64(__x); }889890__DEVICE__891double log1p(double __x) { return __ocml_log1p_f64(__x); }892893__DEVICE__894double log2(double __x) { return __ocml_log2_f64(__x); }895896__DEVICE__897double logb(double __x) { return __ocml_logb_f64(__x); }898899__DEVICE__900long int lrint(double __x) { return __builtin_rint(__x); }901902__DEVICE__903long int lround(double __x) { return __builtin_round(__x); }904905__DEVICE__906double modf(double __x, double *__iptr) {907double __tmp;908#ifdef __OPENMP_AMDGCN__909#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)910#endif911double __r =912__ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp);913*__iptr = __tmp;914915return __r;916}917918__DEVICE__919double nan(const char *__tagp) {920#if !_WIN32921union {922double val;923struct ieee_double {924uint64_t mantissa : 51;925uint32_t quiet : 1;926uint32_t exponent : 11;927uint32_t sign : 1;928} bits;929} __tmp;930__static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits));931932__tmp.bits.sign = 0u;933__tmp.bits.exponent = ~0u;934__tmp.bits.quiet = 1u;935__tmp.bits.mantissa = __make_mantissa(__tagp);936937return __tmp.val;938#else939__static_assert_type_size_equal(sizeof(uint64_t), sizeof(double));940uint64_t __val = __make_mantissa(__tagp);941__val |= 0xFFF << 51;942return *reinterpret_cast<double *>(&__val);943#endif944}945946__DEVICE__947double nearbyint(double __x) { return __builtin_nearbyint(__x); }948949__DEVICE__950double nextafter(double __x, double __y) {951return __ocml_nextafter_f64(__x, __y);952}953954__DEVICE__955double norm(int __dim,956const double *__a) { // TODO: placeholder until OCML adds support.957double __r = 0;958while (__dim--) {959__r += __a[0] * __a[0];960++__a;961}962963return __builtin_sqrt(__r);964}965966__DEVICE__967double norm3d(double __x, double __y, double __z) {968return __ocml_len3_f64(__x, __y, __z);969}970971__DEVICE__972double norm4d(double __x, double __y, double __z, double __w) {973return __ocml_len4_f64(__x, __y, __z, __w);974}975976__DEVICE__977double normcdf(double __x) { return __ocml_ncdf_f64(__x); }978979__DEVICE__980double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); }981982__DEVICE__983double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); }984985__DEVICE__986double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); }987988__DEVICE__989double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); }990991__DEVICE__992double remainder(double __x, double __y) {993return __ocml_remainder_f64(__x, __y);994}995996__DEVICE__997double remquo(double __x, double __y, int *__quo) {998int __tmp;999#ifdef __OPENMP_AMDGCN__1000#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)1001#endif1002double __r = __ocml_remquo_f64(1003__x, __y, (__attribute__((address_space(5))) int *)&__tmp);1004*__quo = __tmp;10051006return __r;1007}10081009__DEVICE__1010double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); }10111012__DEVICE__1013double rint(double __x) { return __builtin_rint(__x); }10141015__DEVICE__1016double rnorm(int __dim,1017const double *__a) { // TODO: placeholder until OCML adds support.1018double __r = 0;1019while (__dim--) {1020__r += __a[0] * __a[0];1021++__a;1022}10231024return __ocml_rsqrt_f64(__r);1025}10261027__DEVICE__1028double rnorm3d(double __x, double __y, double __z) {1029return __ocml_rlen3_f64(__x, __y, __z);1030}10311032__DEVICE__1033double rnorm4d(double __x, double __y, double __z, double __w) {1034return __ocml_rlen4_f64(__x, __y, __z, __w);1035}10361037__DEVICE__1038double round(double __x) { return __builtin_round(__x); }10391040__DEVICE__1041double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); }10421043__DEVICE__1044double scalbln(double __x, long int __n) {1045return (__n < INT_MAX) ? __builtin_amdgcn_ldexp(__x, __n)1046: __ocml_scalb_f64(__x, __n);1047}1048__DEVICE__1049double scalbn(double __x, int __n) { return __builtin_amdgcn_ldexp(__x, __n); }10501051__DEVICE__1052__RETURN_TYPE __signbit(double __x) { return __builtin_signbit(__x); }10531054__DEVICE__1055double sin(double __x) { return __ocml_sin_f64(__x); }10561057__DEVICE__1058void sincos(double __x, double *__sinptr, double *__cosptr) {1059double __tmp;1060#ifdef __OPENMP_AMDGCN__1061#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)1062#endif1063*__sinptr = __ocml_sincos_f64(1064__x, (__attribute__((address_space(5))) double *)&__tmp);1065*__cosptr = __tmp;1066}10671068__DEVICE__1069void sincospi(double __x, double *__sinptr, double *__cosptr) {1070double __tmp;1071#ifdef __OPENMP_AMDGCN__1072#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)1073#endif1074*__sinptr = __ocml_sincospi_f64(1075__x, (__attribute__((address_space(5))) double *)&__tmp);1076*__cosptr = __tmp;1077}10781079__DEVICE__1080double sinh(double __x) { return __ocml_sinh_f64(__x); }10811082__DEVICE__1083double sinpi(double __x) { return __ocml_sinpi_f64(__x); }10841085__DEVICE__1086double sqrt(double __x) { return __builtin_sqrt(__x); }10871088__DEVICE__1089double tan(double __x) { return __ocml_tan_f64(__x); }10901091__DEVICE__1092double tanh(double __x) { return __ocml_tanh_f64(__x); }10931094__DEVICE__1095double tgamma(double __x) { return __ocml_tgamma_f64(__x); }10961097__DEVICE__1098double trunc(double __x) { return __builtin_trunc(__x); }10991100__DEVICE__1101double y0(double __x) { return __ocml_y0_f64(__x); }11021103__DEVICE__1104double y1(double __x) { return __ocml_y1_f64(__x); }11051106__DEVICE__1107double yn(int __n, double __x) { // TODO: we could use Ahmes multiplication1108// and the Miller & Brown algorithm1109// for linear recurrences to get O(log n) steps, but it's unclear if1110// it'd be beneficial in this case. Placeholder until OCML adds1111// support.1112if (__n == 0)1113return y0(__x);1114if (__n == 1)1115return y1(__x);11161117double __x0 = y0(__x);1118double __x1 = y1(__x);1119for (int __i = 1; __i < __n; ++__i) {1120double __x2 = (2 * __i) / __x * __x1 - __x0;1121__x0 = __x1;1122__x1 = __x2;1123}11241125return __x1;1126}11271128// BEGIN INTRINSICS1129#if defined OCML_BASIC_ROUNDED_OPERATIONS1130__DEVICE__1131double __dadd_rd(double __x, double __y) {1132return __ocml_add_rtn_f64(__x, __y);1133}1134__DEVICE__1135double __dadd_rn(double __x, double __y) {1136return __ocml_add_rte_f64(__x, __y);1137}1138__DEVICE__1139double __dadd_ru(double __x, double __y) {1140return __ocml_add_rtp_f64(__x, __y);1141}1142__DEVICE__1143double __dadd_rz(double __x, double __y) {1144return __ocml_add_rtz_f64(__x, __y);1145}1146#else1147__DEVICE__1148double __dadd_rn(double __x, double __y) { return __x + __y; }1149#endif11501151#if defined OCML_BASIC_ROUNDED_OPERATIONS1152__DEVICE__1153double __ddiv_rd(double __x, double __y) {1154return __ocml_div_rtn_f64(__x, __y);1155}1156__DEVICE__1157double __ddiv_rn(double __x, double __y) {1158return __ocml_div_rte_f64(__x, __y);1159}1160__DEVICE__1161double __ddiv_ru(double __x, double __y) {1162return __ocml_div_rtp_f64(__x, __y);1163}1164__DEVICE__1165double __ddiv_rz(double __x, double __y) {1166return __ocml_div_rtz_f64(__x, __y);1167}1168#else1169__DEVICE__1170double __ddiv_rn(double __x, double __y) { return __x / __y; }1171#endif11721173#if defined OCML_BASIC_ROUNDED_OPERATIONS1174__DEVICE__1175double __dmul_rd(double __x, double __y) {1176return __ocml_mul_rtn_f64(__x, __y);1177}1178__DEVICE__1179double __dmul_rn(double __x, double __y) {1180return __ocml_mul_rte_f64(__x, __y);1181}1182__DEVICE__1183double __dmul_ru(double __x, double __y) {1184return __ocml_mul_rtp_f64(__x, __y);1185}1186__DEVICE__1187double __dmul_rz(double __x, double __y) {1188return __ocml_mul_rtz_f64(__x, __y);1189}1190#else1191__DEVICE__1192double __dmul_rn(double __x, double __y) { return __x * __y; }1193#endif11941195#if defined OCML_BASIC_ROUNDED_OPERATIONS1196__DEVICE__1197double __drcp_rd(double __x) { return __ocml_div_rtn_f64(1.0, __x); }1198__DEVICE__1199double __drcp_rn(double __x) { return __ocml_div_rte_f64(1.0, __x); }1200__DEVICE__1201double __drcp_ru(double __x) { return __ocml_div_rtp_f64(1.0, __x); }1202__DEVICE__1203double __drcp_rz(double __x) { return __ocml_div_rtz_f64(1.0, __x); }1204#else1205__DEVICE__1206double __drcp_rn(double __x) { return 1.0 / __x; }1207#endif12081209#if defined OCML_BASIC_ROUNDED_OPERATIONS1210__DEVICE__1211double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); }1212__DEVICE__1213double __dsqrt_rn(double __x) { return __ocml_sqrt_rte_f64(__x); }1214__DEVICE__1215double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); }1216__DEVICE__1217double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); }1218#else1219__DEVICE__1220double __dsqrt_rn(double __x) { return __builtin_sqrt(__x); }1221#endif12221223#if defined OCML_BASIC_ROUNDED_OPERATIONS1224__DEVICE__1225double __dsub_rd(double __x, double __y) {1226return __ocml_sub_rtn_f64(__x, __y);1227}1228__DEVICE__1229double __dsub_rn(double __x, double __y) {1230return __ocml_sub_rte_f64(__x, __y);1231}1232__DEVICE__1233double __dsub_ru(double __x, double __y) {1234return __ocml_sub_rtp_f64(__x, __y);1235}1236__DEVICE__1237double __dsub_rz(double __x, double __y) {1238return __ocml_sub_rtz_f64(__x, __y);1239}1240#else1241__DEVICE__1242double __dsub_rn(double __x, double __y) { return __x - __y; }1243#endif12441245#if defined OCML_BASIC_ROUNDED_OPERATIONS1246__DEVICE__1247double __fma_rd(double __x, double __y, double __z) {1248return __ocml_fma_rtn_f64(__x, __y, __z);1249}1250__DEVICE__1251double __fma_rn(double __x, double __y, double __z) {1252return __ocml_fma_rte_f64(__x, __y, __z);1253}1254__DEVICE__1255double __fma_ru(double __x, double __y, double __z) {1256return __ocml_fma_rtp_f64(__x, __y, __z);1257}1258__DEVICE__1259double __fma_rz(double __x, double __y, double __z) {1260return __ocml_fma_rtz_f64(__x, __y, __z);1261}1262#else1263__DEVICE__1264double __fma_rn(double __x, double __y, double __z) {1265return __builtin_fma(__x, __y, __z);1266}1267#endif1268// END INTRINSICS1269// END DOUBLE12701271// C only macros1272#if !defined(__cplusplus) && __STDC_VERSION__ >= 201112L1273#define isfinite(__x) _Generic((__x), float : __finitef, double : __finite)(__x)1274#define isinf(__x) _Generic((__x), float : __isinff, double : __isinf)(__x)1275#define isnan(__x) _Generic((__x), float : __isnanf, double : __isnan)(__x)1276#define signbit(__x) \1277_Generic((__x), float : __signbitf, double : __signbit)(__x)1278#endif // !defined(__cplusplus) && __STDC_VERSION__ >= 201112L12791280#if defined(__cplusplus)1281template <class T> __DEVICE__ T min(T __arg1, T __arg2) {1282return (__arg1 < __arg2) ? __arg1 : __arg2;1283}12841285template <class T> __DEVICE__ T max(T __arg1, T __arg2) {1286return (__arg1 > __arg2) ? __arg1 : __arg2;1287}12881289__DEVICE__ int min(int __arg1, int __arg2) {1290return (__arg1 < __arg2) ? __arg1 : __arg2;1291}1292__DEVICE__ int max(int __arg1, int __arg2) {1293return (__arg1 > __arg2) ? __arg1 : __arg2;1294}12951296__DEVICE__1297float max(float __x, float __y) { return __builtin_fmaxf(__x, __y); }12981299__DEVICE__1300double max(double __x, double __y) { return __builtin_fmax(__x, __y); }13011302__DEVICE__1303float min(float __x, float __y) { return __builtin_fminf(__x, __y); }13041305__DEVICE__1306double min(double __x, double __y) { return __builtin_fmin(__x, __y); }13071308#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)1309__host__ inline static int min(int __arg1, int __arg2) {1310return __arg1 < __arg2 ? __arg1 : __arg2;1311}13121313__host__ inline static int max(int __arg1, int __arg2) {1314return __arg1 > __arg2 ? __arg1 : __arg2;1315}1316#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)1317#endif13181319#pragma pop_macro("__DEVICE__")1320#pragma pop_macro("__RETURN_TYPE")1321#pragma pop_macro("__FAST_OR_SLOW")13221323#endif // __CLANG_HIP_MATH_H__132413251326