// Path: blob/main/crates/polars-expr/src/groups/binview.rs (6940 views)
use arrow::array::{Array, BinaryViewArrayGeneric, View, ViewType};1use arrow::bitmap::{Bitmap, MutableBitmap};2use arrow::buffer::Buffer;3use polars_compute::binview_index_map::{BinaryViewIndexMap, Entry};45use super::*;6use crate::hash_keys::HashKeys;78#[derive(Default)]9pub struct BinviewHashGrouper {10idx_map: BinaryViewIndexMap<()>,11null_idx: IdxSize,12}1314impl BinviewHashGrouper {15pub fn new() -> Self {16Self {17idx_map: BinaryViewIndexMap::default(),18null_idx: IdxSize::MAX,19}20}2122/// # Safety23/// The view must be valid for the given buffer set.24#[inline(always)]25unsafe fn insert_key(&mut self, hash: u64, view: View, buffers: &Arc<[Buffer<u8>]>) -> IdxSize {26unsafe {27match self.idx_map.entry_view(hash, view, buffers) {28Entry::Occupied(o) => o.index(),29Entry::Vacant(v) => {30let index = v.index();31v.insert(());32index33},34}35}36}3738#[inline(always)]39fn insert_null(&mut self) -> IdxSize {40if self.null_idx == IdxSize::MAX {41self.null_idx = self.idx_map.push_unmapped_empty_entry(());42}43self.null_idx44}4546/// # Safety47/// The view must be valid for the given buffer set.48#[inline(always)]49unsafe fn contains_key(&self, hash: u64, view: &View, buffers: &Arc<[Buffer<u8>]>) -> bool {50unsafe { self.idx_map.get_view(hash, view, buffers).is_some() }51}5253#[inline(always)]54fn contains_null(&self) -> bool {55self.null_idx < IdxSize::MAX56}5758/// # Safety59/// The views must be valid for the given buffers.60unsafe fn finalize_keys<V: ViewType + ?Sized>(61&self,62schema: &Schema,63views: Buffer<View>,64buffers: Arc<[Buffer<u8>]>,65validity: Option<Bitmap>,66) -> DataFrame {67let (name, dtype) = schema.get_at_index(0).unwrap();68unsafe {69let arrow_dtype = dtype.to_arrow(CompatLevel::newest());70let keys = BinaryViewArrayGeneric::<V>::new_unchecked_unknown_md(71arrow_dtype,72views,73buffers,74validity,75None,76);77let s =78Series::from_chunks_and_dtype_unchecked(name.clone(), vec![Box::new(keys)], 
dtype);79DataFrame::new(vec![Column::from(s)]).unwrap()80}81}82}8384impl Grouper for BinviewHashGrouper {85fn new_empty(&self) -> Box<dyn Grouper> {86Box::new(Self::new())87}8889fn reserve(&mut self, additional: usize) {90self.idx_map.reserve(additional);91}9293fn num_groups(&self) -> IdxSize {94self.idx_map.len()95}9697unsafe fn insert_keys_subset(98&mut self,99hash_keys: &HashKeys,100subset: &[IdxSize],101group_idxs: Option<&mut Vec<IdxSize>>,102) {103let HashKeys::Binview(hash_keys) = hash_keys else {104unreachable!()105};106107unsafe {108let views = hash_keys.keys.views().as_slice();109let buffers = hash_keys.keys.data_buffers();110if let Some(validity) = hash_keys.keys.validity() {111if hash_keys.null_is_valid {112let groups = subset.iter().map(|idx| {113if validity.get_bit_unchecked(*idx as usize) {114let hash = hash_keys.hashes.value_unchecked(*idx as usize);115let view = views.get_unchecked(*idx as usize);116self.insert_key(hash, *view, buffers)117} else {118self.insert_null()119}120});121if let Some(group_idxs) = group_idxs {122group_idxs.reserve(subset.len());123group_idxs.extend(groups);124} else {125groups.for_each(drop);126}127} else {128let groups = subset.iter().filter_map(|idx| {129if validity.get_bit_unchecked(*idx as usize) {130let hash = hash_keys.hashes.value_unchecked(*idx as usize);131let view = views.get_unchecked(*idx as usize);132Some(self.insert_key(hash, *view, buffers))133} else {134None135}136});137if let Some(group_idxs) = group_idxs {138group_idxs.reserve(subset.len());139group_idxs.extend(groups);140} else {141groups.for_each(drop);142}143}144} else {145let groups = subset.iter().map(|idx| {146let hash = hash_keys.hashes.value_unchecked(*idx as usize);147let view = views.get_unchecked(*idx as usize);148self.insert_key(hash, *view, buffers)149});150if let Some(group_idxs) = group_idxs {151group_idxs.reserve(subset.len());152group_idxs.extend(groups);153} else {154groups.for_each(drop);155}156}157}158}159160fn 
get_keys_in_group_order(&self, schema: &Schema) -> DataFrame {161let buffers: Arc<[_]> = self162.idx_map163.buffers()164.iter()165.map(|b| Buffer::from(b.to_vec()))166.collect();167let views = self.idx_map.iter_hash_views().map(|(_h, v)| v).collect();168let validity = if self.null_idx < IdxSize::MAX {169let mut validity = MutableBitmap::new();170validity.extend_constant(self.idx_map.len() as usize, true);171validity.set(self.null_idx as usize, false);172Some(validity.freeze())173} else {174None175};176177unsafe {178let (_name, dt) = schema.get_at_index(0).unwrap();179match dt {180DataType::Binary => self.finalize_keys::<[u8]>(schema, views, buffers, validity),181DataType::String => self.finalize_keys::<str>(schema, views, buffers, validity),182_ => unreachable!(),183}184}185}186187/// # Safety188/// All groupers must be a BinviewHashGrouper.189unsafe fn probe_partitioned_groupers(190&self,191groupers: &[Box<dyn Grouper>],192hash_keys: &HashKeys,193partitioner: &HashPartitioner,194invert: bool,195probe_matches: &mut Vec<IdxSize>,196) {197let HashKeys::Binview(hash_keys) = hash_keys else {198unreachable!()199};200assert!(partitioner.num_partitions() == groupers.len());201202unsafe {203let null_p = partitioner.null_partition();204let buffers = hash_keys.keys.data_buffers();205let views = hash_keys.keys.views().as_slice();206hash_keys.for_each_hash(|idx, opt_h| {207let has_group = if let Some(h) = opt_h {208let p = partitioner.hash_to_partition(h);209let dyn_grouper: &dyn Grouper = &**groupers.get_unchecked(p);210let grouper =211&*(dyn_grouper as *const dyn Grouper as *const BinviewHashGrouper);212let view = views.get_unchecked(idx as usize);213grouper.contains_key(h, view, buffers)214} else {215let dyn_grouper: &dyn Grouper = &**groupers.get_unchecked(null_p);216let grouper =217&*(dyn_grouper as *const dyn Grouper as *const BinviewHashGrouper);218grouper.contains_null()219};220221if has_group != invert {222probe_matches.push(idx);223}224});225}226}227228/// # 
Safety229/// All groupers must be a BinviewHashGrouper.230unsafe fn contains_key_partitioned_groupers(231&self,232groupers: &[Box<dyn Grouper>],233hash_keys: &HashKeys,234partitioner: &HashPartitioner,235invert: bool,236contains_key: &mut BitmapBuilder,237) {238let HashKeys::Binview(hash_keys) = hash_keys else {239unreachable!()240};241assert!(partitioner.num_partitions() == groupers.len());242243unsafe {244let null_p = partitioner.null_partition();245let buffers = hash_keys.keys.data_buffers();246let views = hash_keys.keys.views().as_slice();247hash_keys.for_each_hash(|idx, opt_h| {248let has_group = if let Some(h) = opt_h {249let p = partitioner.hash_to_partition(h);250let dyn_grouper: &dyn Grouper = &**groupers.get_unchecked(p);251let grouper =252&*(dyn_grouper as *const dyn Grouper as *const BinviewHashGrouper);253let view = views.get_unchecked(idx as usize);254grouper.contains_key(h, view, buffers)255} else {256let dyn_grouper: &dyn Grouper = &**groupers.get_unchecked(null_p);257let grouper =258&*(dyn_grouper as *const dyn Grouper as *const BinviewHashGrouper);259grouper.contains_null()260};261262contains_key.push(has_group != invert);263});264}265}266267fn as_any(&self) -> &dyn Any {268self269}270}271272273