Path: blob/main/contrib/llvm-project/libcxx/include/__pstl/backends/libdispatch.h
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
#define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H

#include <__algorithm/inplace_merge.h>
#include <__algorithm/lower_bound.h>
#include <__algorithm/max.h>
#include <__algorithm/merge.h>
#include <__algorithm/upper_bound.h>
#include <__atomic/atomic.h>
#include <__config>
#include <__exception/terminate.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/move_iterator.h>
#include <__memory/allocator.h>
#include <__memory/construct_at.h>
#include <__memory/unique_ptr.h>
#include <__numeric/reduce.h>
#include <__pstl/backend_fwd.h>
#include <__pstl/cpu_algos/any_of.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__pstl/cpu_algos/fill.h>
#include <__pstl/cpu_algos/find_if.h>
#include <__pstl/cpu_algos/for_each.h>
#include <__pstl/cpu_algos/merge.h>
#include <__pstl/cpu_algos/stable_sort.h>
#include <__pstl/cpu_algos/transform.h>
#include <__pstl/cpu_algos/transform_reduce.h>
#include <__utility/empty.h>
#include <__utility/exception_guard.h>
#include <__utility/move.h>
#include <__utility/pair.h>
#include <cstddef>
#include <new>
#include <optional>

_LIBCPP_PUSH_MACROS
#include <__undef_macros>

_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {

namespace __libdispatch {
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
// we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;

template <class _Func>
_LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
  __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
    (*static_cast<_Func*>(__context))(__chunk);
  });
}

struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;
  ptrdiff_t __first_chunk_size_;
};

[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;

template <class _RandomAccessIterator, class _Functor>
_LIBCPP_HIDE_FROM_ABI optional<__empty>
__dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
  // Perform the chunked execution.
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
    auto __index =
        __chunk == 0
            ? 0
            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
    __func(__first + __index, __first + __index + __this_chunk_size);
  });

  return __empty{};
}
} // namespace __libdispatch
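
// The __cpu_traits specialization below provides the backend operations consumed by the generic
// CPU algorithms in <__pstl/cpu_algos/*.h>. As an illustrative sketch (assuming libc++ is
// configured with the libdispatch PSTL backend), a call such as
//
//   std::for_each(std::execution::par, __vec.begin(), __vec.end(), __func);
//
// eventually reaches __cpu_traits<__libdispatch_backend_tag>::__for_each, which partitions the
// range with __partition_chunks and runs __func on each chunk via ::dispatch_apply. Chunk 0
// covers the first __first_chunk_size_ elements and every later chunk covers __chunk_size_
// elements, matching the index computation in __dispatch_parallel_for above.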
template <>
struct __cpu_traits<__libdispatch_backend_tag> {
  template <class _RandomAccessIterator, class _Functor>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
    return __libdispatch::__dispatch_parallel_for(
        __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
  }

  template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
  struct __merge_range {
    __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
        : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

    _RandomAccessIterator1 __mid1_;
    _RandomAccessIterator2 __mid2_;
    _RandomAccessIteratorOut __result_;
  };

  template <typename _RandomAccessIterator1,
            typename _RandomAccessIterator2,
            typename _RandomAccessIterator3,
            typename _Compare,
            typename _LeafMerge>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __merge(_RandomAccessIterator1 __first1,
          _RandomAccessIterator1 __last1,
          _RandomAccessIterator2 __first2,
          _RandomAccessIterator2 __last2,
          _RandomAccessIterator3 __result,
          _Compare __comp,
          _LeafMerge __leaf_merge) noexcept {
    __libdispatch::__chunk_partitions __partitions =
        __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
      return __empty{};
    }

    using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
    auto const __n_ranges = __partitions.__chunk_count_ + 1;

    // TODO: use __uninitialized_buffer
    auto __destroy = [=](__merge_range_t* __ptr) {
      std::destroy_n(__ptr, __n_ranges);
      std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
    };

    unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
        [&]() -> __merge_range_t* {
#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
          try {
#endif
            return std::allocator<__merge_range_t>().allocate(__n_ranges);
#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
          } catch (const std::bad_alloc&) {
            return nullptr;
          }
#endif
        }(),
        __destroy);

    if (!__ranges)
      return nullopt;

    // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
    __merge_range_t* __r = __ranges.get();
    std::__construct_at(__r++, __first1, __first2, __result);

    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
      auto [__mid1, __mid2] = [&] {
        if (__iterate_first_range) {
          auto __m1 = __first1 + __chunk_size;
          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
          return std::make_pair(__m1, __m2);
        } else {
          auto __m2 = __first2 + __chunk_size;
          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
          return std::make_pair(__m1, __m2);
        }
      }();

      __result += (__mid1 - __first1) + (__mid2 - __first2);
      __first1 = __mid1;
      __first2 = __mid2;
      return {std::move(__mid1), std::move(__mid2), __result};
    };
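
    // __ranges holds __chunk_count_ + 1 boundary entries: entries __i and __i + 1 delimit the
    // sub-ranges of [__first1, __last1), [__first2, __last2), and the output position that
    // chunk __i merges below. __compute_chunk advances the larger input range by one chunk and
    // uses std::lower_bound to find the matching split point in the smaller input range.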
    // handle first chunk
    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

    // handle 2 -> N - 1 chunks
    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
      std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

    // handle last chunk
    std::__construct_at(__r, __last1, __last2, __result);

    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
      auto __first_iters = __ranges[__index];
      auto __last_iters = __ranges[__index + 1];
      __leaf_merge(
          __first_iters.__mid1_,
          __last_iters.__mid1_,
          __first_iters.__mid2_,
          __last_iters.__mid2_,
          __first_iters.__result_,
          __comp);
    });

    return __empty{};
  }
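
  // __transform_reduce computes one partial result per chunk in parallel and then combines the
  // partials sequentially with std::reduce. Each chunk seeds its partial from its first one or
  // two transformed elements, so _Value does not need to be default-constructible.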
  template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
  _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce(
      _RandomAccessIterator __first,
      _RandomAccessIterator __last,
      _Transform __transform,
      _Value __init,
      _Combiner __combiner,
      _Reduction __reduction) {
    if (__first == __last)
      return __init;

    auto __partitions = __libdispatch::__partition_chunks(__last - __first);

    auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
      std::destroy_n(__ptr, __count);
      std::allocator<_Value>().deallocate(__ptr, __count);
    };

    // TODO: use __uninitialized_buffer
    // TODO: allocate one element per worker instead of one element per chunk
    unique_ptr<_Value[], decltype(__destroy)> __values(
        std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);

    // __dispatch_apply is noexcept
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
      auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
      auto __index = __chunk == 0 ? 0
                                  : (__chunk * __partitions.__chunk_size_) +
                                        (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
      if (__this_chunk_size != 1) {
        std::__construct_at(
            __values.get() + __chunk,
            __reduction(__first + __index + 2,
                        __first + __index + __this_chunk_size,
                        __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
      } else {
        std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
      }
    });

    return std::reduce(
        std::make_move_iterator(__values.get()),
        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
        std::move(__init),
        __combiner);
  }

  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
    const auto __size = __last - __first;
    auto __partitions = __libdispatch::__partition_chunks(__size);

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_sort(__first, __last, __comp);
      return __empty{};
    }

    using _Value = __iter_value_type<_RandomAccessIterator>;

    auto __destroy = [__size](_Value* __ptr) {
      std::destroy_n(__ptr, __size);
      std::allocator<_Value>().deallocate(__ptr, __size);
    };

    // TODO: use __uninitialized_buffer
    unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);

    // Initialize all elements to a moved-from state
    // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
    std::__construct_at(__values.get(), std::move(*__first));
    for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
      std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
    }
    *__first = std::move(__values.get()[__size - 1]);

    __libdispatch::__dispatch_parallel_for(
        __partitions,
        __first,
        [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
          __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
        });

    bool __objects_are_in_buffer = false;
    do {
      const auto __old_chunk_size = __partitions.__chunk_size_;
      if (__partitions.__chunk_count_ % 2 == 1) {
        auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
          std::inplace_merge(
              __first_chunk_begin,
              __first_chunk_begin + __partitions.__first_chunk_size_,
              __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
              __comp);
        };
        if (__objects_are_in_buffer)
          __inplace_merge_chunks(__values.get());
        else
          __inplace_merge_chunks(__first);
        __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
      } else {
        __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
      }

      __partitions.__chunk_size_ *= 2;
      __partitions.__chunk_count_ /= 2;

      auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
        __libdispatch::__dispatch_parallel_for(
            __partitions,
            __from_first,
            [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
              std::merge(std::make_move_iterator(__chunk_first),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last),
                         __to_first + (__chunk_first - __from_first),
                         __comp);
            });
      };
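
      // Merge pairs of adjacent sorted chunks from wherever the elements currently live into the
      // other storage; the input range and the temporary buffer swap roles on every pass.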
      if (__objects_are_in_buffer)
        __merge_chunks(__values.get(), __first);
      else
        __merge_chunks(__first, __values.get());
      __objects_are_in_buffer = !__objects_are_in_buffer;
    } while (__partitions.__chunk_count_ > 1);

    if (__objects_are_in_buffer) {
      std::move(__values.get(), __values.get() + __size, __first);
    }

    return __empty{};
  }

  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}

  static constexpr size_t __lane_size = 64;
};

// Mandatory implementations of the computational basis
template <class _ExecutionPolicy>
struct __find_if<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_find_if<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __for_each<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_for_each<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __merge<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_merge<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __stable_sort<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_stable_sort<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

// Not mandatory, but better optimized
template <class _ExecutionPolicy>
struct __any_of<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_any_of<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __fill<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_fill<__libdispatch_backend_tag, _ExecutionPolicy> {};

} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD

_LIBCPP_POP_MACROS

#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H