Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/embree/common/sys/sycl.h
9912 views
1
// Copyright 2009-2021 Intel Corporation
2
// SPDX-License-Identifier: Apache-2.0
3
4
#pragma once
5
6
#include "platform.h"
7
8
using sycl::float16;
9
using sycl::float8;
10
using sycl::float4;
11
using sycl::float3;
12
using sycl::float2;
13
using sycl::int16;
14
using sycl::int8;
15
using sycl::int4;
16
using sycl::int3;
17
using sycl::int2;
18
using sycl::uint16;
19
using sycl::uint8;
20
using sycl::uint4;
21
using sycl::uint3;
22
using sycl::uint2;
23
using sycl::uchar16;
24
using sycl::uchar8;
25
using sycl::uchar4;
26
using sycl::uchar3;
27
using sycl::uchar2;
28
using sycl::ushort16;
29
using sycl::ushort8;
30
using sycl::ushort4;
31
using sycl::ushort3;
32
using sycl::ushort2;
33
34
#ifdef __SYCL_DEVICE_ONLY__
35
#define GLOBAL __attribute__((opencl_global))
36
#define LOCAL __attribute__((opencl_local))
37
38
SYCL_EXTERNAL extern int work_group_reduce_add(int x);
39
SYCL_EXTERNAL extern float work_group_reduce_min(float x);
40
SYCL_EXTERNAL extern float work_group_reduce_max(float x);
41
42
SYCL_EXTERNAL extern float atomic_min(volatile GLOBAL float *p, float val);
43
SYCL_EXTERNAL extern float atomic_min(volatile LOCAL float *p, float val);
44
SYCL_EXTERNAL extern float atomic_max(volatile GLOBAL float *p, float val);
45
SYCL_EXTERNAL extern float atomic_max(volatile LOCAL float *p, float val);
46
47
SYCL_EXTERNAL extern "C" unsigned int intel_sub_group_ballot(bool valid);
48
49
SYCL_EXTERNAL extern "C" void __builtin_IB_assume_uniform(void *p);
50
51
// Load message caching control
52
53
enum LSC_LDCC {
54
LSC_LDCC_DEFAULT,
55
LSC_LDCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached
56
LSC_LDCC_L1UC_L3C, // Override to L1 uncached and L3 cached
57
LSC_LDCC_L1C_L3UC, // Override to L1 cached and L3 uncached
58
LSC_LDCC_L1C_L3C, // Override to L1 cached and L3 cached
59
LSC_LDCC_L1S_L3UC, // Override to L1 streaming load and L3 uncached
60
LSC_LDCC_L1S_L3C, // Override to L1 streaming load and L3 cached
61
LSC_LDCC_L1IAR_L3C, // Override to L1 invalidate-after-read, and L3 cached
62
};
63
64
65
66
// Store message caching control (also used for atomics)
67
68
enum LSC_STCC {
69
LSC_STCC_DEFAULT,
70
LSC_STCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached
71
LSC_STCC_L1UC_L3WB, // Override to L1 uncached and L3 written back
72
LSC_STCC_L1WT_L3UC, // Override to L1 written through and L3 uncached
73
LSC_STCC_L1WT_L3WB, // Override to L1 written through and L3 written back
74
LSC_STCC_L1S_L3UC, // Override to L1 streaming and L3 uncached
75
LSC_STCC_L1S_L3WB, // Override to L1 streaming and L3 written back
76
LSC_STCC_L1WB_L3WB, // Override to L1 written through and L3 written back
77
};
78
79
80
81
///////////////////////////////////////////////////////////////////////
82
83
// LSC Loads
84
85
///////////////////////////////////////////////////////////////////////
86
87
SYCL_EXTERNAL /* extern "C" */ uint32_t __builtin_IB_lsc_load_global_uchar_to_uint (const GLOBAL uint8_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D8U32
88
SYCL_EXTERNAL /* extern "C" */ uint32_t __builtin_IB_lsc_load_global_ushort_to_uint(const GLOBAL uint16_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D16U32
89
SYCL_EXTERNAL /* extern "C" */ uint32_t __builtin_IB_lsc_load_global_uint (const GLOBAL uint32_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V1
90
SYCL_EXTERNAL /* extern "C" */ sycl::uint2 __builtin_IB_lsc_load_global_uint2 (const GLOBAL sycl::uint2 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V2
91
SYCL_EXTERNAL /* extern "C" */ sycl::uint3 __builtin_IB_lsc_load_global_uint3 (const GLOBAL sycl::uint3 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V3
92
SYCL_EXTERNAL /* extern "C" */ sycl::uint4 __builtin_IB_lsc_load_global_uint4 (const GLOBAL sycl::uint4 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V4
93
SYCL_EXTERNAL /* extern "C" */ sycl::uint8 __builtin_IB_lsc_load_global_uint8 (const GLOBAL sycl::uint8 *base, int elemOff, enum LSC_LDCC cacheOpt); //D32V8
94
SYCL_EXTERNAL /* extern "C" */ uint64_t __builtin_IB_lsc_load_global_ulong (const GLOBAL uint64_t *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V1
95
SYCL_EXTERNAL /* extern "C" */ sycl::ulong2 __builtin_IB_lsc_load_global_ulong2 (const GLOBAL sycl::ulong2 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V2
96
SYCL_EXTERNAL /* extern "C" */ sycl::ulong3 __builtin_IB_lsc_load_global_ulong3 (const GLOBAL sycl::ulong3 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V3
97
SYCL_EXTERNAL /* extern "C" */ sycl::ulong4 __builtin_IB_lsc_load_global_ulong4 (const GLOBAL sycl::ulong4 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V4
98
SYCL_EXTERNAL /* extern "C" */ sycl::ulong8 __builtin_IB_lsc_load_global_ulong8 (const GLOBAL sycl::ulong8 *base, int elemOff, enum LSC_LDCC cacheOpt); //D64V8
99
100
// global address space
101
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uchar_from_uint (GLOBAL uint8_t *base, int immElemOff, uint32_t val, enum LSC_STCC cacheOpt); //D8U32
102
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ushort_from_uint(GLOBAL uint16_t *base, int immElemOff, uint32_t val, enum LSC_STCC cacheOpt); //D16U32
103
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint (GLOBAL uint32_t *base, int immElemOff, uint32_t val, enum LSC_STCC cacheOpt); //D32V1
104
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint2 (GLOBAL sycl::uint2 *base, int immElemOff, sycl::uint2 val, enum LSC_STCC cacheOpt); //D32V2
105
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint3 (GLOBAL sycl::uint3 *base, int immElemOff, sycl::uint3 val, enum LSC_STCC cacheOpt); //D32V3
106
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint4 (GLOBAL sycl::uint4 *base, int immElemOff, sycl::uint4 val, enum LSC_STCC cacheOpt); //D32V4
107
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_uint8 (GLOBAL sycl::uint8 *base, int immElemOff, sycl::uint8 val, enum LSC_STCC cacheOpt); //D32V8
108
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong (GLOBAL uint64_t *base, int immElemOff, uint64_t val, enum LSC_STCC cacheOpt); //D64V1
109
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong2 (GLOBAL sycl::ulong2 *base, int immElemOff, sycl::ulong2 val, enum LSC_STCC cacheOpt); //D64V2
110
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong3 (GLOBAL sycl::ulong3 *base, int immElemOff, sycl::ulong3 val, enum LSC_STCC cacheOpt); //D64V3
111
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong4 (GLOBAL sycl::ulong4 *base, int immElemOff, sycl::ulong4 val, enum LSC_STCC cacheOpt); //D64V4
112
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_store_global_ulong8 (GLOBAL sycl::ulong8 *base, int immElemOff, sycl::ulong8 val, enum LSC_STCC cacheOpt); //D64V8
113
114
///////////////////////////////////////////////////////////////////////
115
// prefetching
116
///////////////////////////////////////////////////////////////////////
117
//
118
// LSC Pre-Fetch Load functions with CacheControls
119
// global address space
120
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uchar (const GLOBAL uint8_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32
121
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ushort(const GLOBAL uint16_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32
122
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint (const GLOBAL uint32_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1
123
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint2 (const GLOBAL sycl::uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2
124
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint3 (const GLOBAL sycl::uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3
125
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint4 (const GLOBAL sycl::uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4
126
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_uint8 (const GLOBAL sycl::uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8
127
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong (const GLOBAL uint64_t *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1
128
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong2(const GLOBAL sycl::ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2
129
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong3(const GLOBAL sycl::ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3
130
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong4(const GLOBAL sycl::ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4
131
SYCL_EXTERNAL extern "C" void __builtin_IB_lsc_prefetch_global_ulong8(const GLOBAL sycl::ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8
132
133
#else
134
135
#define GLOBAL
136
#define LOCAL
137
138
/* dummy functions for host */
139
inline int work_group_reduce_add(int x) { return x; }
140
inline float work_group_reduce_min(float x) { return x; }
141
inline float work_group_reduce_max(float x) { return x; }
142
143
inline float atomic_min(volatile float *p, float val) { return val; };
144
inline float atomic_max(volatile float *p, float val) { return val; };
145
146
inline uint32_t intel_sub_group_ballot(bool valid) { return 0; }
147
148
#endif
149
150
/* creates a temporary that is enforced to be uniform */
151
#define SYCL_UNIFORM_VAR(Ty,tmp,k) \
152
Ty tmp##_data; \
153
Ty* p##tmp##_data = (Ty*) sub_group_broadcast((uint64_t)&tmp##_data,k); \
154
Ty& tmp = *p##tmp##_data;
155
156
#if !defined(__forceinline)
157
#define __forceinline inline __attribute__((always_inline))
158
#endif
159
160
#if __SYCL_COMPILER_VERSION < 20210801
161
#define all_of_group all_of
162
#define any_of_group any_of
163
#define none_of_group none_of
164
#define group_broadcast broadcast
165
#define reduce_over_group reduce
166
#define exclusive_scan_over_group exclusive_scan
167
#define inclusive_scan_over_group inclusive_scan
168
#endif
169
170
namespace embree
171
{
172
template<typename T>
173
__forceinline T cselect(const bool mask, const T &a, const T &b)
174
{
175
return sycl::select(b,a,(int)mask);
176
}
177
178
template<typename T, typename M>
179
__forceinline T cselect(const M &mask, const T &a, const T &b)
180
{
181
return sycl::select(b,a,mask);
182
}
183
184
#define XSTR(x) STR(x)
185
#define STR(x) #x
186
187
__forceinline const sycl::sub_group this_sub_group() {
188
#if __LIBSYCL_MAJOR_VERSION >= 8
189
return sycl::ext::oneapi::this_work_item::get_sub_group();
190
#else
191
return sycl::ext::oneapi::experimental::this_sub_group();
192
#endif
193
}
194
195
__forceinline const uint32_t get_sub_group_local_id() {
196
return this_sub_group().get_local_id()[0];
197
}
198
199
__forceinline const uint32_t get_sub_group_size() {
200
return this_sub_group().get_max_local_range().size();
201
}
202
203
__forceinline const uint32_t get_sub_group_id() {
204
return this_sub_group().get_group_id()[0];
205
}
206
207
__forceinline const uint32_t get_num_sub_groups() {
208
return this_sub_group().get_group_range().size();
209
}
210
211
__forceinline uint32_t sub_group_ballot(bool pred) {
212
return intel_sub_group_ballot(pred);
213
}
214
215
__forceinline bool sub_group_all_of(bool pred) {
216
return sycl::all_of_group(this_sub_group(),pred);
217
}
218
219
__forceinline bool sub_group_any_of(bool pred) {
220
return sycl::any_of_group(this_sub_group(),pred);
221
}
222
223
__forceinline bool sub_group_none_of(bool pred) {
224
return sycl::none_of_group(this_sub_group(),pred);
225
}
226
227
template <typename T> __forceinline T sub_group_broadcast(T x, sycl::id<1> local_id) {
228
return sycl::group_broadcast<sycl::sub_group>(this_sub_group(),x,local_id);
229
}
230
231
template <typename T> __forceinline T sub_group_make_uniform(T x) {
232
return sub_group_broadcast(x,sycl::ctz(intel_sub_group_ballot(true)));
233
}
234
235
__forceinline void assume_uniform_array(void* ptr) {
236
#ifdef __SYCL_DEVICE_ONLY__
237
__builtin_IB_assume_uniform(ptr);
238
#endif
239
}
240
241
template <typename T, class BinaryOperation> __forceinline T sub_group_reduce(T x, BinaryOperation binary_op) {
242
return sycl::reduce_over_group<sycl::sub_group>(this_sub_group(),x,binary_op);
243
}
244
245
template <typename T, class BinaryOperation> __forceinline T sub_group_reduce(T x, T init, BinaryOperation binary_op) {
246
return sycl::reduce_over_group<sycl::sub_group>(this_sub_group(),x,init,binary_op);
247
}
248
249
template <typename T> __forceinline T sub_group_reduce_min(T x, T init) {
250
return sub_group_reduce(x, init, sycl::ext::oneapi::minimum<T>());
251
}
252
253
template <typename T> __forceinline T sub_group_reduce_min(T x) {
254
return sub_group_reduce(x, sycl::ext::oneapi::minimum<T>());
255
}
256
257
template <typename T> __forceinline T sub_group_reduce_max(T x) {
258
return sub_group_reduce(x, sycl::ext::oneapi::maximum<T>());
259
}
260
261
template <typename T> __forceinline T sub_group_reduce_add(T x) {
262
return sub_group_reduce(x, sycl::ext::oneapi::plus<T>());
263
}
264
265
template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan(T x, BinaryOperation binary_op) {
266
return sycl::exclusive_scan_over_group(this_sub_group(),x,binary_op);
267
}
268
269
template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan_min(T x) {
270
return sub_group_exclusive_scan(x,sycl::ext::oneapi::minimum<T>());
271
}
272
273
template <typename T, class BinaryOperation> __forceinline T sub_group_exclusive_scan(T x, T init, BinaryOperation binary_op) {
274
return sycl::exclusive_scan_over_group(this_sub_group(),x,init,binary_op);
275
}
276
277
template <typename T, class BinaryOperation> __forceinline T sub_group_inclusive_scan(T x, BinaryOperation binary_op) {
278
return sycl::inclusive_scan_over_group(this_sub_group(),x,binary_op);
279
}
280
281
template <typename T, class BinaryOperation> __forceinline T sub_group_inclusive_scan(T x, BinaryOperation binary_op, T init) {
282
return sycl::inclusive_scan_over_group(this_sub_group(),x,binary_op,init);
283
}
284
285
template <typename T> __forceinline T sub_group_load(const void* src) {
286
return this_sub_group().load(sycl::multi_ptr<T,sycl::access::address_space::global_space>((T*)src));
287
}
288
289
template <typename T> __forceinline void sub_group_store(void* dst, const T& x) {
290
this_sub_group().store(sycl::multi_ptr<T,sycl::access::address_space::global_space>((T*)dst),x);
291
}
292
}
293
294
#if __SYCL_COMPILER_VERSION < 20210801
295
#undef all_of_group
296
#undef any_of_group
297
#undef none_of_group
298
#undef group_broadcast
299
#undef reduce_over_group
300
#undef exclusive_scan_over_group
301
#undef inclusive_scan_over_group
302
#endif
303
304