Path: blob/master/thirdparty/embree/common/algorithms/parallel_partition.h
9912 views
// Copyright 2009-2021 Intel Corporation1// SPDX-License-Identifier: Apache-2.023#pragma once45#include "parallel_for.h"6#include "../math/range.h"78namespace embree9{10/* serial partitioning */11template<typename T, typename V, typename IsLeft, typename Reduction_T>12__forceinline size_t serial_partitioning(T* array,13const size_t begin,14const size_t end,15V& leftReduction,16V& rightReduction,17const IsLeft& is_left,18const Reduction_T& reduction_t)19{20T* l = array + begin;21T* r = array + end - 1;2223while(1)24{25/* *l < pivot */26while (likely(l <= r && is_left(*l) ))27{28//prefetchw(l+4); // FIXME: enable?29reduction_t(leftReduction,*l);30++l;31}32/* *r >= pivot) */33while (likely(l <= r && !is_left(*r)))34{35//prefetchw(r-4); FIXME: enable?36reduction_t(rightReduction,*r);37--r;38}39if (r<l) break;4041reduction_t(leftReduction ,*r);42reduction_t(rightReduction,*l);43xchg(*l,*r);44l++; r--;45}4647return l - array;48}4950template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>51class __aligned(64) parallel_partition_task52{53ALIGNED_CLASS_(64);54private:5556static const size_t MAX_TASKS = 64;5758T* array;59size_t N;60const IsLeft& is_left;61const Reduction_T& reduction_t;62const Reduction_V& reduction_v;63const Vi& identity;6465size_t numTasks;66__aligned(64) size_t counter_start[MAX_TASKS+1];67__aligned(64) size_t counter_left[MAX_TASKS+1];68__aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];69__aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS];70__aligned(64) V leftReductions[MAX_TASKS];71__aligned(64) V rightReductions[MAX_TASKS];7273public:7475__forceinline parallel_partition_task(T* array,76const size_t N,77const Vi& identity,78const IsLeft& is_left,79const Reduction_T& reduction_t,80const Reduction_V& reduction_v,81const size_t BLOCK_SIZE)8283: array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),84numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}8586__forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)87{88size_t i = 0;89while(index >= (size_t)r[i].size())90{91assert(i < numRanges);92index -= (size_t)r[i].size();93i++;94}95return &r[i];96}9798__forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,99const size_t numRightMisplacedRanges,100const size_t startID,101const size_t endID)102{103size_t leftLocalIndex = startID;104size_t rightLocalIndex = startID;105const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);106const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);107108size_t l_left = l_range->size() - leftLocalIndex;109size_t r_left = r_range->size() - rightLocalIndex;110T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];111T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];112size_t size = endID - startID;113size_t items = min(size,min(l_left,r_left));114115while (size)116{117if (unlikely(l_left == 0))118{119l_range++;120l_left = l_range->size();121l = &array[l_range->begin()];122items = min(size,min(l_left,r_left));123}124125if (unlikely(r_left == 0))126{127r_range++;128r_left = r_range->size();129r = &array[r_range->begin()];130items = min(size,min(l_left,r_left));131}132133size -= items;134l_left -= items;135r_left -= items;136137while(items) {138items--;139xchg(*l++,*r++);140}141}142}143144__forceinline size_t partition(V& leftReduction, V& rightReduction)145{146/* partition the individual ranges for each task */147parallel_for(numTasks,[&] (const size_t taskID) {148const size_t startID = (taskID+0)*N/numTasks;149const size_t endID = (taskID+1)*N/numTasks;150V local_left(identity);151V local_right(identity);152const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);153counter_start[taskID] = startID;154counter_left [taskID] = mid-startID;155leftReductions[taskID] = local_left;156rightReductions[taskID] = local_right;157});158counter_start[numTasks] = N;159counter_left[numTasks] = 0;160161/* finalize the reductions */162for (size_t i=0; i<numTasks; i++) {163reduction_v(leftReduction,leftReductions[i]);164reduction_v(rightReduction,rightReductions[i]);165}166167/* calculate mid point for partitioning */168size_t mid = counter_left[0];169for (size_t i=1; i<numTasks; i++)170mid += counter_left[i];171const range<ssize_t> globalLeft (0,mid);172const range<ssize_t> globalRight(mid,N);173174/* calculate all left and right ranges that are on the wrong global side */175size_t numMisplacedRangesLeft = 0;176size_t numMisplacedRangesRight = 0;177size_t numMisplacedItemsLeft MAYBE_UNUSED = 0;178size_t numMisplacedItemsRight MAYBE_UNUSED = 0;179180for (size_t i=0; i<numTasks; i++)181{182const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);183const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);184const range<ssize_t> left_misplaced = globalLeft. intersect(right_range);185const range<ssize_t> right_misplaced = globalRight.intersect(left_range);186187if (!left_misplaced.empty())188{189numMisplacedItemsLeft += left_misplaced.size();190leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;191}192193if (!right_misplaced.empty())194{195numMisplacedItemsRight += right_misplaced.size();196rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;197}198}199assert( numMisplacedItemsLeft == numMisplacedItemsRight );200201/* if no items are misplaced we are done */202if (numMisplacedItemsLeft == 0)203return mid;204205/* otherwise we copy the items to the right place in parallel */206parallel_for(numTasks,[&] (const size_t taskID) {207const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;208const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks;209swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);210});211212return mid;213}214};215216template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>217__noinline size_t parallel_partitioning(T* array,218const size_t begin,219const size_t end,220const Vi &identity,221V &leftReduction,222V &rightReduction,223const IsLeft& is_left,224const Reduction_T& reduction_t,225const Reduction_V& reduction_v,226size_t BLOCK_SIZE = 128)227{228/* fall back to single threaded partitioning for small N */229if (unlikely(end-begin < BLOCK_SIZE))230return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);231232/* otherwise use parallel code */233else {234typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;235std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));236return begin+p->partition(leftReduction,rightReduction);237}238}239240template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>241__noinline size_t parallel_partitioning(T* array,242const size_t begin,243const size_t end,244const Vi &identity,245V &leftReduction,246V &rightReduction,247const IsLeft& is_left,248const Reduction_T& reduction_t,249const Reduction_V& reduction_v,250size_t BLOCK_SIZE,251size_t PARALLEL_THRESHOLD)252{253/* fall back to single threaded partitioning for small N */254if (unlikely(end-begin < PARALLEL_THRESHOLD))255return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);256257/* otherwise use parallel code */258else {259typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;260std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));261return begin+p->partition(leftReduction,rightReduction);262}263}264265266template<typename T, typename IsLeft>267inline size_t parallel_partitioning(T* array,268const size_t begin,269const size_t end,270const IsLeft& is_left,271size_t BLOCK_SIZE = 128)272{273size_t leftReduction = 0;274size_t rightReduction = 0;275return parallel_partitioning(276array,begin,end,0,leftReduction,rightReduction,is_left,277[] (size_t& t,const T& ref) { },278[] (size_t& t0,size_t& t1) { },279BLOCK_SIZE);280}281282}283284285