GitHub repository: freebsd/freebsd-src (branch: main)
Path: contrib/llvm-project/libcxx/include/__pstl/backends/libdispatch.h
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
#define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H

#include <__algorithm/inplace_merge.h>
#include <__algorithm/lower_bound.h>
#include <__algorithm/max.h>
#include <__algorithm/merge.h>
#include <__algorithm/upper_bound.h>
#include <__atomic/atomic.h>
#include <__config>
#include <__exception/terminate.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/move_iterator.h>
#include <__memory/allocator.h>
#include <__memory/construct_at.h>
#include <__memory/unique_ptr.h>
#include <__numeric/reduce.h>
#include <__pstl/backend_fwd.h>
#include <__pstl/cpu_algos/any_of.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__pstl/cpu_algos/fill.h>
#include <__pstl/cpu_algos/find_if.h>
#include <__pstl/cpu_algos/for_each.h>
#include <__pstl/cpu_algos/merge.h>
#include <__pstl/cpu_algos/stable_sort.h>
#include <__pstl/cpu_algos/transform.h>
#include <__pstl/cpu_algos/transform_reduce.h>
#include <__utility/empty.h>
#include <__utility/exception_guard.h>
#include <__utility/move.h>
#include <__utility/pair.h>
#include <cstddef>
#include <new>
#include <optional>

_LIBCPP_PUSH_MACROS
#include <__undef_macros>

_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {

namespace __libdispatch {
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;

template <class _Func>
_LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
  __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
    (*static_cast<_Func*>(__context))(__chunk);
  });
}
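
// The template above type-erases an arbitrary callable: __func is smuggled through the
// void* __context parameter of the exported C-style callback and invoked once per chunk.
// A rough usage sketch (illustrative only, not part of the public interface):
//
//   __libdispatch::__dispatch_apply(4, [](size_t __chunk) {
//     // process chunk number __chunk; the callable must not throw, since
//     // ::dispatch_apply does not let exceptions propagate
//   });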

struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;
  ptrdiff_t __first_chunk_size_;
};

[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;
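
// The partitioning contract implied by the index math below: chunk 0 has
// __first_chunk_size_ elements and every later chunk has __chunk_size_ elements, so
//   __first_chunk_size_ + (__chunk_count_ - 1) * __chunk_size_ == __size.
// Illustrative numbers only (the actual split is an implementation detail of the
// dylib): __size == 10 could yield {__chunk_count_ = 3, __chunk_size_ = 3,
// __first_chunk_size_ = 4}.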

template <class _RandomAccessIterator, class _Functor>
_LIBCPP_HIDE_FROM_ABI optional<__empty>
__dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
  // Perform the chunked execution.
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
    auto __index =
        __chunk == 0
            ? 0
            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
    __func(__first + __index, __first + __index + __this_chunk_size);
  });

  return __empty{};
}
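
// For __chunk > 0 the start offset above simplifies to
//   __first_chunk_size_ + (__chunk - 1) * __chunk_size_,
// i.e. one oversized first chunk followed by __chunk - 1 regular chunks. With the
// illustrative split above (__chunk_count_ = 3, __chunk_size_ = 3,
// __first_chunk_size_ = 4), chunk 1 starts at index 4 and chunk 2 at index 7.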
} // namespace __libdispatch

template <>
struct __cpu_traits<__libdispatch_backend_tag> {
  template <class _RandomAccessIterator, class _Functor>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
    return __libdispatch::__dispatch_parallel_for(
        __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
  }

  template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
  struct __merge_range {
    __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
        : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

    _RandomAccessIterator1 __mid1_;
    _RandomAccessIterator2 __mid2_;
    _RandomAccessIteratorOut __result_;
  };
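
  // Each __merge_range records one cut: a position in both input sequences plus the
  // matching position in the output. N parallel sub-merges are then described by
  // N + 1 consecutive records, where records __i and __i + 1 bound sub-merge __i.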

  template <typename _RandomAccessIterator1,
            typename _RandomAccessIterator2,
            typename _RandomAccessIterator3,
            typename _Compare,
            typename _LeafMerge>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __merge(_RandomAccessIterator1 __first1,
          _RandomAccessIterator1 __last1,
          _RandomAccessIterator2 __first2,
          _RandomAccessIterator2 __last2,
          _RandomAccessIterator3 __result,
          _Compare __comp,
          _LeafMerge __leaf_merge) noexcept {
    __libdispatch::__chunk_partitions __partitions =
        __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
      return __empty{};
    }

    using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
    auto const __n_ranges = __partitions.__chunk_count_ + 1;

    // TODO: use __uninitialized_buffer
    auto __destroy = [=](__merge_range_t* __ptr) {
      std::destroy_n(__ptr, __n_ranges);
      std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
    };

    unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
        [&]() -> __merge_range_t* {
#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
          try {
#endif
            return std::allocator<__merge_range_t>().allocate(__n_ranges);
#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
          } catch (const std::bad_alloc&) {
            return nullptr;
          }
#endif
        }(),
        __destroy);

    if (!__ranges)
      return nullopt;
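
    // An empty optional is how these CPU backends report allocation failure to the
    // PSTL frontend; the frontend is then expected to surface the failure (typically
    // by throwing std::bad_alloc).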

    // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger range
    __merge_range_t* __r = __ranges.get();
    std::__construct_at(__r++, __first1, __first2, __result);

    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
      auto [__mid1, __mid2] = [&] {
        if (__iterate_first_range) {
          auto __m1 = __first1 + __chunk_size;
          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
          return std::make_pair(__m1, __m2);
        } else {
          auto __m2 = __first2 + __chunk_size;
          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
          return std::make_pair(__m1, __m2);
        }
      }();

      __result += (__mid1 - __first1) + (__mid2 - __first2);
      __first1 = __mid1;
      __first2 = __mid2;
      return {std::move(__mid1), std::move(__mid2), __result};
    };
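
    // Each call to __compute_chunk advances a fixed number of elements through the
    // longer input and finds the matching cut in the shorter one with
    // std::lower_bound, so consecutive records delimit input regions that can be
    // merged independently. As a side effect it advances __first1, __first2, and
    // __result past the chunk it just computed.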

    // handle the first chunk
    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

    // handle chunks 2 through N - 1
    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
      std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

    // handle the last chunk
    std::__construct_at(__r, __last1, __last2, __result);

    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
      auto __first_iters = __ranges[__index];
      auto __last_iters = __ranges[__index + 1];
      __leaf_merge(
          __first_iters.__mid1_,
          __last_iters.__mid1_,
          __first_iters.__mid2_,
          __last_iters.__mid2_,
          __first_iters.__result_,
          __comp);
    });

    return __empty{};
  }
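
  // The sub-merges share no state: records __index and __index + 1 delimit disjoint
  // input and output regions, so running __leaf_merge on each pair of adjacent
  // records in parallel is safe.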

  template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
  _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce(
      _RandomAccessIterator __first,
      _RandomAccessIterator __last,
      _Transform __transform,
      _Value __init,
      _Combiner __combiner,
      _Reduction __reduction) {
    if (__first == __last)
      return __init;

    auto __partitions = __libdispatch::__partition_chunks(__last - __first);

    auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
      std::destroy_n(__ptr, __count);
      std::allocator<_Value>().deallocate(__ptr, __count);
    };

    // TODO: use __uninitialized_buffer
    // TODO: allocate one element per worker instead of one element per chunk
    unique_ptr<_Value[], decltype(__destroy)> __values(
        std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);

    // __dispatch_apply is noexcept
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
      auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
      auto __index = __chunk == 0 ? 0
                                  : (__chunk * __partitions.__chunk_size_) +
                                        (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
      if (__this_chunk_size != 1) {
        std::__construct_at(
            __values.get() + __chunk,
            __reduction(__first + __index + 2,
                        __first + __index + __this_chunk_size,
                        __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
      } else {
        std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
      }
    });

    return std::reduce(
        std::make_move_iterator(__values.get()),
        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
        std::move(__init),
        __combiner);
  }
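
  // Each per-chunk partial result is constructed in place in the uninitialized
  // buffer: chunks with at least two elements seed the accumulator with __combiner
  // applied to their first two transformed elements and let __reduction fold in the
  // rest, while single-element chunks store the lone transformed value. The partials
  // are then combined sequentially with std::reduce, using move iterators so they
  // are moved rather than copied out of the buffer.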

  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
    const auto __size = __last - __first;
    auto __partitions = __libdispatch::__partition_chunks(__size);

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_sort(__first, __last, __comp);
      return __empty{};
    }

    using _Value = __iter_value_type<_RandomAccessIterator>;

    auto __destroy = [__size](_Value* __ptr) {
      std::destroy_n(__ptr, __size);
      std::allocator<_Value>().deallocate(__ptr, __size);
    };

    // TODO: use __uninitialized_buffer
    unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);

    // Initialize all elements to a moved-from state
    // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
    std::__construct_at(__values.get(), std::move(*__first));
    for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
      std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
    }
    *__first = std::move(__values.get()[__size - 1]);
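
    // The loop above daisy-chains a single value through the buffer: each element is
    // constructed from its (now moved-from) predecessor, and the value finally lands
    // back in *__first. Every buffer slot is thus constructed, so it can later be
    // assigned to and destroyed, without requiring _Value to be default-constructible.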

    __libdispatch::__dispatch_parallel_for(
        __partitions,
        __first,
        [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
          __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
        });

    bool __objects_are_in_buffer = false;
    do {
      const auto __old_chunk_size = __partitions.__chunk_size_;
      if (__partitions.__chunk_count_ % 2 == 1) {
        auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
          std::inplace_merge(
              __first_chunk_begin,
              __first_chunk_begin + __partitions.__first_chunk_size_,
              __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
              __comp);
        };
        if (__objects_are_in_buffer)
          __inplace_merge_chunks(__values.get());
        else
          __inplace_merge_chunks(__first);
        __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
      } else {
        __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
      }

      __partitions.__chunk_size_ *= 2;
      __partitions.__chunk_count_ /= 2;

      auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
        __libdispatch::__dispatch_parallel_for(
            __partitions,
            __from_first,
            [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
              std::merge(std::make_move_iterator(__chunk_first),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last),
                         __to_first + (__chunk_first - __from_first),
                         __comp);
            });
      };

      if (__objects_are_in_buffer)
        __merge_chunks(__values.get(), __first);
      else
        __merge_chunks(__first, __values.get());
      __objects_are_in_buffer = !__objects_are_in_buffer;
    } while (__partitions.__chunk_count_ > 1);
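
    // Bottom-up merge sort: each pass merges pairs of adjacent sorted chunks in
    // parallel, halving the chunk count and doubling the chunk size, while the
    // elements ping-pong between the input range and the temporary buffer. When the
    // chunk count is odd, the first chunk is serially merged into its neighbor with
    // std::inplace_merge first, so that the remaining chunks pair up evenly.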

    if (__objects_are_in_buffer) {
      std::move(__values.get(), __values.get() + __size, __first);
    }

    return __empty{};
  }
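
  // __cancel_execution is a hook that the generic CPU algorithms may call to stop
  // outstanding work (e.g. once find_if has found a match). It is a no-op here,
  // presumably because work already submitted to libdispatch cannot be revoked.
  // __lane_size appears to match the common 64-byte cache line size, letting the
  // generic algorithms space out per-worker state to reduce false sharing.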

  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}

  static constexpr size_t __lane_size = 64;
};

// Mandatory implementations of the computational basis
template <class _ExecutionPolicy>
struct __find_if<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_find_if<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __for_each<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_for_each<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __merge<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_merge<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __stable_sort<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_stable_sort<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};
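
// Each specialization above plugs this backend into the generic CPU algorithm layer:
// the __cpu_parallel_* templates implement the algorithms in terms of the primitives
// that __cpu_traits<__libdispatch_backend_tag> provides (__for_each, __merge,
// __transform_reduce, __stable_sort, __cancel_execution, __lane_size).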

// Not mandatory, but better optimized
template <class _ExecutionPolicy>
struct __any_of<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_any_of<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __fill<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_fill<__libdispatch_backend_tag, _ExecutionPolicy> {};

} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD

_LIBCPP_POP_MACROS

#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H