// Path: blob/main/crates/polars-expr/src/groups/binview.rs
// 8395 views
use arrow::array::{Array, BinaryViewArrayGeneric, View, ViewType};1use arrow::bitmap::{Bitmap, MutableBitmap};2use polars_buffer::Buffer;3use polars_compute::binview_index_map::{BinaryViewIndexMap, Entry};45use super::*;6use crate::hash_keys::HashKeys;78#[derive(Default)]9pub struct BinviewHashGrouper {10idx_map: BinaryViewIndexMap<()>,11null_idx: IdxSize,12}1314impl BinviewHashGrouper {15pub fn new() -> Self {16Self {17idx_map: BinaryViewIndexMap::default(),18null_idx: IdxSize::MAX,19}20}2122/// # Safety23/// The view must be valid for the given buffer set.24#[inline(always)]25unsafe fn insert_key(26&mut self,27hash: u64,28view: View,29buffers: &Buffer<Buffer<u8>>,30) -> IdxSize {31unsafe {32match self.idx_map.entry_view(hash, view, buffers) {33Entry::Occupied(o) => o.index(),34Entry::Vacant(v) => {35let index = v.index();36v.insert(());37index38},39}40}41}4243#[inline(always)]44fn insert_null(&mut self) -> IdxSize {45if self.null_idx == IdxSize::MAX {46self.null_idx = self.idx_map.push_unmapped_empty_entry(());47}48self.null_idx49}5051/// # Safety52/// The view must be valid for the given buffer set.53#[inline(always)]54unsafe fn contains_key(&self, hash: u64, view: &View, buffers: &Buffer<Buffer<u8>>) -> bool {55unsafe { self.idx_map.get_view(hash, view, buffers).is_some() }56}5758#[inline(always)]59fn contains_null(&self) -> bool {60self.null_idx < IdxSize::MAX61}6263/// # Safety64/// The views must be valid for the given buffers.65unsafe fn finalize_keys<V: ViewType + ?Sized>(66&self,67schema: &Schema,68views: Buffer<View>,69buffers: Buffer<Buffer<u8>>,70validity: Option<Bitmap>,71) -> DataFrame {72let (name, dtype) = schema.get_at_index(0).unwrap();73unsafe {74let arrow_dtype = dtype.to_arrow(CompatLevel::newest());75let keys = BinaryViewArrayGeneric::<V>::new_unchecked_unknown_md(76arrow_dtype,77views,78buffers,79validity,80None,81);82let s =83Series::from_chunks_and_dtype_unchecked(name.clone(), vec![Box::new(keys)], 
dtype);84DataFrame::new_unchecked(s.len(), vec![Column::from(s)])85}86}87}8889impl Grouper for BinviewHashGrouper {90fn new_empty(&self) -> Box<dyn Grouper> {91Box::new(Self::new())92}9394fn reserve(&mut self, additional: usize) {95self.idx_map.reserve(additional);96}9798fn num_groups(&self) -> IdxSize {99self.idx_map.len()100}101102unsafe fn insert_keys_subset(103&mut self,104hash_keys: &HashKeys,105subset: &[IdxSize],106group_idxs: Option<&mut Vec<IdxSize>>,107) {108let HashKeys::Binview(hash_keys) = hash_keys else {109unreachable!()110};111112unsafe {113let views = hash_keys.keys.views().as_slice();114let buffers = hash_keys.keys.data_buffers();115if let Some(validity) = hash_keys.keys.validity() {116if hash_keys.null_is_valid {117let groups = subset.iter().map(|idx| {118if validity.get_bit_unchecked(*idx as usize) {119let hash = hash_keys.hashes.value_unchecked(*idx as usize);120let view = views.get_unchecked(*idx as usize);121self.insert_key(hash, *view, buffers)122} else {123self.insert_null()124}125});126if let Some(group_idxs) = group_idxs {127group_idxs.reserve(subset.len());128group_idxs.extend(groups);129} else {130groups.for_each(drop);131}132} else {133let groups = subset.iter().filter_map(|idx| {134if validity.get_bit_unchecked(*idx as usize) {135let hash = hash_keys.hashes.value_unchecked(*idx as usize);136let view = views.get_unchecked(*idx as usize);137Some(self.insert_key(hash, *view, buffers))138} else {139None140}141});142if let Some(group_idxs) = group_idxs {143group_idxs.reserve(subset.len());144group_idxs.extend(groups);145} else {146groups.for_each(drop);147}148}149} else {150let groups = subset.iter().map(|idx| {151let hash = hash_keys.hashes.value_unchecked(*idx as usize);152let view = views.get_unchecked(*idx as usize);153self.insert_key(hash, *view, buffers)154});155if let Some(group_idxs) = group_idxs {156group_idxs.reserve(subset.len());157group_idxs.extend(groups);158} else {159groups.for_each(drop);160}161}162}163}164165fn 
get_keys_in_group_order(&self, schema: &Schema) -> DataFrame {166let buffers = self167.idx_map168.buffers()169.iter()170.map(|b| Buffer::from(b.to_vec()))171.collect();172let views = self.idx_map.iter_hash_views().map(|(_h, v)| v).collect();173let validity = if self.null_idx < IdxSize::MAX {174let mut validity = MutableBitmap::new();175validity.extend_constant(self.idx_map.len() as usize, true);176validity.set(self.null_idx as usize, false);177Some(validity.freeze())178} else {179None180};181182unsafe {183let (_name, dt) = schema.get_at_index(0).unwrap();184match dt {185DataType::Binary => self.finalize_keys::<[u8]>(schema, views, buffers, validity),186DataType::String => self.finalize_keys::<str>(schema, views, buffers, validity),187_ => unreachable!(),188}189}190}191192/// # Safety193/// All groupers must be a BinviewHashGrouper.194unsafe fn probe_partitioned_groupers(195&self,196groupers: &[Box<dyn Grouper>],197hash_keys: &HashKeys,198partitioner: &HashPartitioner,199invert: bool,200probe_matches: &mut Vec<IdxSize>,201) {202let HashKeys::Binview(hash_keys) = hash_keys else {203unreachable!()204};205assert!(partitioner.num_partitions() == groupers.len());206207unsafe {208let null_p = partitioner.null_partition();209let buffers = hash_keys.keys.data_buffers();210let views = hash_keys.keys.views().as_slice();211hash_keys.for_each_hash(|idx, opt_h| {212let has_group = if let Some(h) = opt_h {213let p = partitioner.hash_to_partition(h);214let dyn_grouper: &dyn Grouper = &**groupers.get_unchecked(p);215let grouper =216&*(dyn_grouper as *const dyn Grouper as *const BinviewHashGrouper);217let view = views.get_unchecked(idx as usize);218grouper.contains_key(h, view, buffers)219} else {220let dyn_grouper: &dyn Grouper = &**groupers.get_unchecked(null_p);221let grouper =222&*(dyn_grouper as *const dyn Grouper as *const BinviewHashGrouper);223grouper.contains_null()224};225226if has_group != invert {227probe_matches.push(idx);228}229});230}231}232233/// # Safety234/// All 
groupers must be a BinviewHashGrouper.235unsafe fn contains_key_partitioned_groupers(236&self,237groupers: &[Box<dyn Grouper>],238hash_keys: &HashKeys,239partitioner: &HashPartitioner,240invert: bool,241contains_key: &mut BitmapBuilder,242) {243let HashKeys::Binview(hash_keys) = hash_keys else {244unreachable!()245};246assert!(partitioner.num_partitions() == groupers.len());247248unsafe {249let null_p = partitioner.null_partition();250let buffers = hash_keys.keys.data_buffers();251let views = hash_keys.keys.views().as_slice();252hash_keys.for_each_hash(|idx, opt_h| {253let has_group = if let Some(h) = opt_h {254let p = partitioner.hash_to_partition(h);255let dyn_grouper: &dyn Grouper = &**groupers.get_unchecked(p);256let grouper =257&*(dyn_grouper as *const dyn Grouper as *const BinviewHashGrouper);258let view = views.get_unchecked(idx as usize);259grouper.contains_key(h, view, buffers)260} else {261let dyn_grouper: &dyn Grouper = &**groupers.get_unchecked(null_p);262let grouper =263&*(dyn_grouper as *const dyn Grouper as *const BinviewHashGrouper);264grouper.contains_null()265};266267contains_key.push(has_group != invert);268});269}270}271272fn as_any(&self) -> &dyn Any {273self274}275}276277278