Path: blob/master/thirdparty/embree/common/sys/sycl.h
9912 views
// Copyright 2009-2021 Intel Corporation1// SPDX-License-Identifier: Apache-2.023#pragma once45#include "platform.h"67using sycl::float16;8using sycl::float8;9using sycl::float4;10using sycl::float3;11using sycl::float2;12using sycl::int16;13using sycl::int8;14using sycl::int4;15using sycl::int3;16using sycl::int2;17using sycl::uint16;18using sycl::uint8;19using sycl::uint4;20using sycl::uint3;21using sycl::uint2;22using sycl::uchar16;23using sycl::uchar8;24using sycl::uchar4;25using sycl::uchar3;26using sycl::uchar2;27using sycl::ushort16;28using sycl::ushort8;29using sycl::ushort4;30using sycl::ushort3;31using sycl::ushort2;3233#ifdef __SYCL_DEVICE_ONLY__34#define GLOBAL __attribute__((opencl_global))35#define LOCAL __attribute__((opencl_local))3637SYCL_EXTERNAL extern int work_group_reduce_add(int x);38SYCL_EXTERNAL extern float work_group_reduce_min(float x);39SYCL_EXTERNAL extern float work_group_reduce_max(float x);4041SYCL_EXTERNAL extern float atomic_min(volatile GLOBAL float *p, float val);42SYCL_EXTERNAL extern float atomic_min(volatile LOCAL float *p, float val);43SYCL_EXTERNAL extern float atomic_max(volatile GLOBAL float *p, float val);44SYCL_EXTERNAL extern float atomic_max(volatile LOCAL float *p, float val);4546SYCL_EXTERNAL extern "C" unsigned int intel_sub_group_ballot(bool valid);4748SYCL_EXTERNAL extern "C" void __builtin_IB_assume_uniform(void *p);4950// Load message caching control5152enum LSC_LDCC {53LSC_LDCC_DEFAULT,54LSC_LDCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached55LSC_LDCC_L1UC_L3C, // Override to L1 uncached and L3 cached56LSC_LDCC_L1C_L3UC, // Override to L1 cached and L3 uncached57LSC_LDCC_L1C_L3C, // Override to L1 cached and L3 cached58LSC_LDCC_L1S_L3UC, // Override to L1 streaming load and L3 uncached59LSC_LDCC_L1S_L3C, // Override to L1 streaming load and L3 cached60LSC_LDCC_L1IAR_L3C, // Override to L1 invalidate-after-read, and L3 cached61};62636465// Store message caching control (also used for atomics)6667enum LSC_STCC {68LSC_STCC_DEFAULT,69LSC_STCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached70LSC_STCC_L1UC_L3WB, // Override to L1 uncached and L3 written back71LSC_STCC_L1WT_L3UC, // Override to L1 written through and L3 uncached72LSC_STCC_L1WT_L3WB, // Override to L1 written through and L3 written back73LSC_STCC_L1S_L3UC, // Override to L1 streaming and L3 uncached74LSC_STCC_L1S_L3WB, // Override to L1 streaming and L3 written back75LSC_STCC_L1WB_L3WB, // Override to L1 written through and L3 written back76};77787980///////////////////////////////////////////////////////////////////////8182// LSC Loads8384///////////////////////////////////////////////////////////////////////8586SYCL_EXTERNAL /* extern "C" */ uint32_t __builtin_IB_lsc_load_global_uchar_to_uint (const GLOBAL uint8_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D8U3287SYCL_EXTERNAL /* extern "C" */ uint32_t __builtin_IB_lsc_load_global_ushort_to_uint(const GLOBAL uint16_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D16U3288SYCL_EXTERNAL /* extern "C" */ uint32_t __builtin_IB_lsc_load_global_uint (const GLOBAL uint32_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V189SYCL_EXTERNAL /* extern "C" */ sycl::uint2 __builtin_IB_lsc_load_global_uint2 (const GLOBAL sycl::uint2 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V290SYCL_EXTERNAL /* extern "C" */ sycl::uint3 __builtin_IB_lsc_load_global_uint3 (const GLOBAL sycl::uint3 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V391SYCL_EXTERNAL /* extern "C" */ sycl::uint4 __builtin_IB_lsc_load_global_uint4 (const GLOBAL sycl::uint4 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V492SYCL_EXTERNAL /* extern "C" */ sycl::uint8 __builtin_IB_lsc_load_global_uint8 (const GLOBAL sycl::uint8 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V893SYCL_EXTERNAL /* extern "C" */ uint64_t __builtin_IB_lsc_load_global_ulong (const GLOBAL uint64_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V194SYCL_EXTERNAL /* extern "C" */ sycl::ulong2 __builtin_IB_lsc_load_global_ulong2 (const GLOBAL sycl::ulong2 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V295SYCL_EXTERNAL /* extern "C" */ sycl::ulong3 __builtin_IB_lsc_load_global_ulong3 (const GLOBAL sycl::ulong3 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V396SYCL_EXTERNAL /* extern "C" */ sycl::ulong4 __builtin_IB_lsc_load_global_ulong4 (const GLOBAL sycl::ulong4 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V497SYCL_EXTERNAL /* extern "C" */ sycl::ulong8 __builtin_IB_lsc_load_global_ulong8 (const GLOBAL sycl::ulong8 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V89899// global address space100SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uchar_from_uint (GLOBAL uint8_t *base, int immElemOff, uint32_t val, enum LSC_STCC cacheOpt); //D8U32101SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ushort_from_uint(GLOBAL uint16_t *base, int immElemOff, uint32_t val, enum LSC_STCC cacheOpt); //D16U32102SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint (GLOBAL uint32_t *base, int immElemOff, uint32_t val, enum LSC_STCC cacheOpt); //D32V1103SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint2 (GLOBAL sycl::uint2 *base, int immElemOff, sycl::uint2 val, enum LSC_STCC cacheOpt); //D32V2104SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint3 (GLOBAL sycl::uint3 *base, int immElemOff, sycl::uint3 val, enum LSC_STCC cacheOpt); //D32V3105SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint4 (GLOBAL sycl::uint4 *base, int immElemOff, sycl::uint4 val, enum LSC_STCC cacheOpt); //D32V4106SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint8 (GLOBAL sycl::uint8 *base, int immElemOff, sycl::uint8 val, enum LSC_STCC cacheOpt); //D32V8107SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong (GLOBAL uint64_t *base, int immElemOff, uint64_t val, enum LSC_STCC cacheOpt); //D64V1108SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong2 (GLOBAL sycl::ulong2 *base, int immElemOff, sycl::ulong2 val, enum LSC_STCC cacheOpt); //D64V2109SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong3 (GLOBAL sycl::ulong3 *base, int immElemOff, sycl::ulong3 val, enum LSC_STCC cacheOpt); //D64V3110SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong4 (GLOBAL sycl::ulong4 *base, int immElemOff, sycl::ulong4 val, enum LSC_STCC cacheOpt); //D64V4111SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong8 (GLOBAL sycl::ulong8 *base, int immElemOff, sycl::ulong8 val, enum LSC_STCC cacheOpt); //D64V8112113///////////////////////////////////////////////////////////////////////114// prefetching115///////////////////////////////////////////////////////////////////////116//117// LSC Pre-Fetch Load functions with CacheControls118// global address space119SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uchar (const GLOBAL uint8_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32120SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ushort(const GLOBAL uint16_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32121SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint (const GLOBAL uint32_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1122SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint2 (const GLOBAL sycl::uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2123SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint3 (const GLOBAL sycl::uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3124SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint4 (const GLOBAL sycl::uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4125SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint8 (const GLOBAL sycl::uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8126SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong (const GLOBAL uint64_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1127SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong2(const GLOBAL sycl::ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2128SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong3(const GLOBAL sycl::ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3129SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong4(const GLOBAL sycl::ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4130SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong8(const GLOBAL sycl::ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8131132#else133134#define GLOBAL135#define LOCAL136137/* dummy functions for host */138inline int work_group_reduce_add(int x) { return x; }139inline float work_group_reduce_min(float x) { return x; }140inline float work_group_reduce_max(float x) { return x; }141142inline float atomic_min(volatile float *p, float val) { return val; };143inline float atomic_max(volatile float *p, float val) { return val; };144145inline uint32_t intel_sub_group_ballot(bool valid) { return 0; }146147#endif148149/* creates a temporary that is enforced to be uniform */150#define SYCL_UNIFORM_VAR(Ty,tmp,k) \151Ty tmp##_data; \152Ty* p##tmp##_data = (Ty*) sub_group_broadcast((uint64_t)&tmp##_data,k); \153Ty& tmp = *p##tmp##_data;154155#if !defined(__forceinline)156#define __forceinline inline __attribute__((always_inline))157#endif158159#if __SYCL_COMPILER_VERSION < 20210801160#define all_of_group all_of161#define any_of_group any_of162#define none_of_group none_of163#define group_broadcast broadcast164#define reduce_over_group reduce165#define exclusive_scan_over_group exclusive_scan166#define inclusive_scan_over_group inclusive_scan167#endif168169namespace embree170{171template<typename T>172__forceinline T cselect(const bool mask, const T &a, const T &b)173{174return sycl::select(b,a,(int)mask);175}176177template<typename T, typename M>178__forceinline T cselect(const M &mask, const T &a, const T &b)179{180return sycl::select(b,a,mask);181}182183#define XSTR(x) STR(x)184#define STR(x) #x185186__forceinline const sycl::sub_group this_sub_group() {187#if __LIBSYCL_MAJOR_VERSION >= 8188return sycl::ext::oneapi::this_work_item::get_sub_group();189#else190return sycl::ext::oneapi::experimental::this_sub_group();191#endif192}193194__forceinline const uint32_t get_sub_group_local_id() {195return this_sub_group().get_local_id()[0];196}197198__forceinline const uint32_t get_sub_group_size() {199return this_sub_group().get_max_local_range().size();200}201202__forceinline const uint32_t get_sub_group_id() {203return this_sub_group().get_group_id()[0];204}205206__forceinline const uint32_t get_num_sub_groups() {207return this_sub_group().get_group_range().size();208}209210__forceinline uint32_t sub_group_ballot(bool pred) {211return intel_sub_group_ballot(pred);212}213214__forceinline bool sub_group_all_of(bool pred) {215return sycl::all_of_group(this_sub_group(),pred);216}217218__forceinline bool sub_group_any_of(bool pred) {219return sycl::any_of_group(this_sub_group(),pred);220}221222__forceinline bool sub_group_none_of(bool pred) {223return sycl::none_of_group(this_sub_group(),pred);224}225226template <typename T> __forceinline T sub_group_broadcast(T x, sycl::id<1> local_id) {227return sycl::group_broadcast<sycl::sub_group>(this_sub_group(),x,local_id);228}229230template <typename T> __forceinline T sub_group_make_uniform(T x) {231return sub_group_broadcast(x,sycl::ctz(intel_sub_group_ballot(true)));232}233234__forceinline void assume_uniform_array(void* ptr) {235#ifdef __SYCL_DEVICE_ONLY__236__builtin_IB_assume_uniform(ptr);237#endif238}239240template <typename T, class BinaryOperation> __forceinline T sub_group_reduce(T x, BinaryOperation binary_op) {241return sycl::reduce_over_group<sycl::sub_group>(this_sub_group(),x,binary_op);242}243244template <typename T, class BinaryOperation> __forceinline T sub_group_reduce(T x, T init, BinaryOperation binary_op) {245return sycl::reduce_over_group<sycl::sub_group>(this_sub_group(),x,init,binary_op);246}247248template <typename T> __forceinline T sub_group_reduce_min(T x, T init) {249return sub_group_reduce(x, init, sycl::ext::oneapi::minimum<T>());250}251252template <typename T> __forceinline T sub_group_reduce_min(T x) {253return sub_group_reduce(x, sycl::ext::oneapi::minimum<T>());254}255256template <typename T> __forceinline T sub_group_reduce_max(T x) {257return sub_group_reduce(x, sycl::ext::oneapi::maximum<T>());258}259260template <typename T> __forceinline T sub_group_reduce_add(T x) {261return sub_group_reduce(x, sycl::ext::oneapi::plus<T>());262}263264template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan(T x, BinaryOperation binary_op) {265return sycl::exclusive_scan_over_group(this_sub_group(),x,binary_op);266}267268template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan_min(T x) {269return sub_group_exclusive_scan(x,sycl::ext::oneapi::minimum<T>());270}271272template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan(T x, T init, BinaryOperation binary_op) {273return sycl::exclusive_scan_over_group(this_sub_group(),x,init,binary_op);274}275276template <typename T, class BinaryOperation> __forceinline T sub_group_inclusive_scan(T x, BinaryOperation binary_op) {277return sycl::inclusive_scan_over_group(this_sub_group(),x,binary_op);278}279280template <typename T, class BinaryOperation> __forceinline T sub_group_inclusive_scan(T x, BinaryOperation binary_op, T init) {281return sycl::inclusive_scan_over_group(this_sub_group(),x,binary_op,init);282}283284template <typename T> __forceinline T sub_group_load(const void* src) {285return this_sub_group().load(sycl::multi_ptr<T,sycl::access::address_space::global_space>((T*)src));286}287288template <typename T> __forceinline void sub_group_store(void* dst, const T& x) {289this_sub_group().store(sycl::multi_ptr<T,sycl::access::address_space::global_space>((T*)dst),x);290}291}292293#if __SYCL_COMPILER_VERSION < 20210801294#undef all_of_group295#undef any_of_group296#undef none_of_group297#undef group_broadcast298#undef reduce_over_group299#undef exclusive_scan_over_group300#undef inclusive_scan_over_group301#endif302303304