Path: blob/main/crates/polars-ops/src/chunked_array/gather/chunked.rs
8421 views
#![allow(unsafe_op_in_unsafe_fn)]1use std::fmt::Debug;23use arrow::array::{Array, BinaryViewArrayGeneric, View, ViewType};4use arrow::bitmap::BitmapBuilder;5use arrow::legacy::trusted_len::TrustedLenPush;6use hashbrown::hash_map::Entry;7use polars_buffer::Buffer;8use polars_core::prelude::gather::_update_gather_sorted_flag;9use polars_core::prelude::*;10use polars_core::series::IsSorted;11use polars_core::utils::Container;12use polars_core::{with_match_categorical_physical_type, with_match_physical_numeric_polars_type};1314use crate::frame::IntoDf;1516/// Gather by [`ChunkId`]17pub trait TakeChunked {18/// Gathers elements from a ChunkedArray, specifying for each element a19/// chunk index and index within that chunk through ChunkId. If20/// avoid_sharing is true the returned data should not share references21/// with the original array (like shared buffers in views).22///23/// # Safety24/// This function doesn't do any bound checks.25unsafe fn take_chunked_unchecked<const B: u64>(26&self,27by: &[ChunkId<B>],28sorted: IsSorted,29avoid_sharing: bool,30) -> Self;3132/// # Safety33/// This function doesn't do any bound checks.34unsafe fn take_opt_chunked_unchecked<const B: u64>(35&self,36by: &[ChunkId<B>],37avoid_sharing: bool,38) -> Self;39}4041impl TakeChunked for DataFrame {42/// Take elements by a slice of [`ChunkId`]s.43///44/// # Safety45/// Does not do any bound checks.46/// `sorted` indicates if the chunks are sorted.47unsafe fn take_chunked_unchecked<const B: u64>(48&self,49idx: &[ChunkId<B>],50sorted: IsSorted,51avoid_sharing: bool,52) -> DataFrame {53let cols = self54.to_df()55.apply_columns(|s| s.take_chunked_unchecked(idx, sorted, avoid_sharing));5657unsafe { DataFrame::new_unchecked_infer_height(cols) }58}5960/// Take elements by a slice of optional [`ChunkId`]s.61///62/// # Safety63/// Does not do any bound checks.64unsafe fn take_opt_chunked_unchecked<const B: u64>(65&self,66idx: &[ChunkId<B>],67avoid_sharing: bool,68) -> DataFrame {69let cols = self70.to_df()71.apply_columns(|s| s.take_opt_chunked_unchecked(idx, avoid_sharing));7273unsafe { DataFrame::new_unchecked_infer_height(cols) }74}75}7677pub trait TakeChunkedHorPar: IntoDf {78/// # Safety79/// Doesn't perform any bound checks80unsafe fn _take_chunked_unchecked_hor_par<const B: u64>(81&self,82idx: &[ChunkId<B>],83sorted: IsSorted,84) -> DataFrame {85let cols = self86.to_df()87.apply_columns_par(|s| s.take_chunked_unchecked(idx, sorted, false));8889unsafe { DataFrame::new_unchecked_infer_height(cols) }90}9192/// # Safety93/// Doesn't perform any bound checks94///95/// Check for null state in `ChunkId`.96unsafe fn _take_opt_chunked_unchecked_hor_par<const B: u64>(97&self,98idx: &[ChunkId<B>],99) -> DataFrame {100let cols = self101.to_df()102.apply_columns_par(|s| s.take_opt_chunked_unchecked(idx, false));103104unsafe { DataFrame::new_unchecked_infer_height(cols) }105}106}107108impl TakeChunkedHorPar for DataFrame {}109110impl TakeChunked for Column {111unsafe fn take_chunked_unchecked<const B: u64>(112&self,113by: &[ChunkId<B>],114sorted: IsSorted,115avoid_sharing: bool,116) -> Self {117// @scalar-opt118let s = self.as_materialized_series();119let s = unsafe { s.take_chunked_unchecked(by, sorted, avoid_sharing) };120s.into_column()121}122123unsafe fn take_opt_chunked_unchecked<const B: u64>(124&self,125by: &[ChunkId<B>],126avoid_sharing: bool,127) -> Self {128// @scalar-opt129let s = self.as_materialized_series();130let s = unsafe { s.take_opt_chunked_unchecked(by, avoid_sharing) };131s.into_column()132}133}134135impl TakeChunked for Series {136unsafe fn take_chunked_unchecked<const B: u64>(137&self,138by: &[ChunkId<B>],139sorted: IsSorted,140avoid_sharing: bool,141) -> Self {142use DataType::*;143match self.dtype() {144dt if dt.is_primitive_numeric() => {145with_match_physical_numeric_polars_type!(self.dtype(), |$T| {146let ca: &ChunkedArray<$T> = self.as_ref().as_ref().as_ref();147ca.take_chunked_unchecked(by, sorted, avoid_sharing).into_series()148})149},150Boolean => {151let ca = self.bool().unwrap();152ca.take_chunked_unchecked(by, sorted, avoid_sharing)153.into_series()154},155Binary => {156let ca = self.binary().unwrap();157take_chunked_unchecked_binview(ca, by, sorted, avoid_sharing).into_series()158},159String => {160let ca = self.str().unwrap();161take_chunked_unchecked_binview(ca, by, sorted, avoid_sharing).into_series()162},163List(_) => {164let ca = self.list().unwrap();165ca.take_chunked_unchecked(by, sorted, avoid_sharing)166.into_series()167},168#[cfg(feature = "dtype-array")]169Array(_, _) => {170let ca = self.array().unwrap();171ca.take_chunked_unchecked(by, sorted, avoid_sharing)172.into_series()173},174#[cfg(feature = "dtype-struct")]175Struct(_) => {176let ca = self.struct_().unwrap();177take_chunked_unchecked_struct(ca, by, sorted, avoid_sharing).into_series()178},179#[cfg(feature = "object")]180Object(_) => take_unchecked_object(self, by, sorted),181#[cfg(feature = "dtype-decimal")]182Decimal(_, _) => {183let ca = self.decimal().unwrap();184let out = ca.phys.take_chunked_unchecked(by, sorted, avoid_sharing);185out.into_decimal_unchecked(ca.precision(), ca.scale())186.into_series()187},188#[cfg(feature = "dtype-date")]189Date => {190let ca = self.date().unwrap();191ca.physical()192.take_chunked_unchecked(by, sorted, avoid_sharing)193.into_date()194.into_series()195},196#[cfg(feature = "dtype-datetime")]197Datetime(u, z) => {198let ca = self.datetime().unwrap();199ca.physical()200.take_chunked_unchecked(by, sorted, avoid_sharing)201.into_datetime(*u, z.clone())202.into_series()203},204#[cfg(feature = "dtype-duration")]205Duration(u) => {206let ca = self.duration().unwrap();207ca.physical()208.take_chunked_unchecked(by, sorted, avoid_sharing)209.into_duration(*u)210.into_series()211},212#[cfg(feature = "dtype-time")]213Time => {214let ca = self.time().unwrap();215ca.physical()216.take_chunked_unchecked(by, sorted, avoid_sharing)217.into_time()218.into_series()219},220#[cfg(feature = "dtype-categorical")]221Categorical(_, _) | Enum(_, _) => {222with_match_categorical_physical_type!(self.dtype().cat_physical().unwrap(), |$C| {223let ca = self.cat::<$C>().unwrap();224CategoricalChunked::<$C>::from_cats_and_dtype_unchecked(225ca.physical().take_chunked_unchecked(by, sorted, avoid_sharing),226self.dtype().clone()227)228.into_series()229})230},231Null => Series::new_null(self.name().clone(), by.len()),232_ => unreachable!(),233}234}235236/// Take function that checks of null state in `ChunkIdx`.237unsafe fn take_opt_chunked_unchecked<const B: u64>(238&self,239by: &[ChunkId<B>],240avoid_sharing: bool,241) -> Self {242use DataType::*;243match self.dtype() {244dt if dt.is_primitive_numeric() => {245with_match_physical_numeric_polars_type!(self.dtype(), |$T| {246let ca: &ChunkedArray<$T> = self.as_ref().as_ref().as_ref();247ca.take_opt_chunked_unchecked(by, avoid_sharing).into_series()248})249},250Boolean => {251let ca = self.bool().unwrap();252ca.take_opt_chunked_unchecked(by, avoid_sharing)253.into_series()254},255Binary => {256let ca = self.binary().unwrap();257take_opt_chunked_unchecked_binview(ca, by, avoid_sharing).into_series()258},259String => {260let ca = self.str().unwrap();261take_opt_chunked_unchecked_binview(ca, by, avoid_sharing).into_series()262},263List(_) => {264let ca = self.list().unwrap();265ca.take_opt_chunked_unchecked(by, avoid_sharing)266.into_series()267},268#[cfg(feature = "dtype-array")]269Array(_, _) => {270let ca = self.array().unwrap();271ca.take_opt_chunked_unchecked(by, avoid_sharing)272.into_series()273},274#[cfg(feature = "dtype-struct")]275Struct(_) => {276let ca = self.struct_().unwrap();277take_opt_chunked_unchecked_struct(ca, by, avoid_sharing).into_series()278},279#[cfg(feature = "object")]280Object(_) => take_opt_unchecked_object(self, by, avoid_sharing),281#[cfg(feature = "dtype-decimal")]282Decimal(_, _) => {283let ca = self.decimal().unwrap();284let out = ca.phys.take_opt_chunked_unchecked(by, avoid_sharing);285out.into_decimal_unchecked(ca.precision(), ca.scale())286.into_series()287},288#[cfg(feature = "dtype-date")]289Date => {290let ca = self.date().unwrap();291ca.physical()292.take_opt_chunked_unchecked(by, avoid_sharing)293.into_date()294.into_series()295},296#[cfg(feature = "dtype-datetime")]297Datetime(u, z) => {298let ca = self.datetime().unwrap();299ca.physical()300.take_opt_chunked_unchecked(by, avoid_sharing)301.into_datetime(*u, z.clone())302.into_series()303},304#[cfg(feature = "dtype-duration")]305Duration(u) => {306let ca = self.duration().unwrap();307ca.physical()308.take_opt_chunked_unchecked(by, avoid_sharing)309.into_duration(*u)310.into_series()311},312#[cfg(feature = "dtype-time")]313Time => {314let ca = self.time().unwrap();315ca.physical()316.take_opt_chunked_unchecked(by, avoid_sharing)317.into_time()318.into_series()319},320#[cfg(feature = "dtype-categorical")]321Categorical(_, _) | Enum(_, _) => {322with_match_categorical_physical_type!(self.dtype().cat_physical().unwrap(), |$C| {323let ca = self.cat::<$C>().unwrap();324CategoricalChunked::<$C>::from_cats_and_dtype_unchecked(325ca.physical().take_opt_chunked_unchecked(by, avoid_sharing),326self.dtype().clone()327)328.into_series()329})330},331Null => Series::new_null(self.name().clone(), by.len()),332_ => unreachable!(),333}334}335}336337impl<T> TakeChunked for ChunkedArray<T>338where339T: PolarsDataType,340T::Array: Debug,341{342unsafe fn take_chunked_unchecked<const B: u64>(343&self,344by: &[ChunkId<B>],345sorted: IsSorted,346_allow_sharing: bool,347) -> Self {348let arrow_dtype = self.dtype().to_arrow(CompatLevel::newest());349350let mut out = if !self.has_nulls() {351let iter = by.iter().map(|chunk_id| {352debug_assert!(353!chunk_id.is_null(),354"null chunks should not hit this branch"355);356let (chunk_idx, array_idx) = chunk_id.extract();357let arr = self.downcast_get_unchecked(chunk_idx as usize);358arr.value_unchecked(array_idx as usize)359});360361let arr = iter.collect_arr_trusted_with_dtype(arrow_dtype);362ChunkedArray::with_chunk_like(self, arr)363} else {364let iter = by.iter().map(|chunk_id| {365debug_assert!(366!chunk_id.is_null(),367"null chunks should not hit this branch"368);369let (chunk_idx, array_idx) = chunk_id.extract();370let arr = self.downcast_get_unchecked(chunk_idx as usize);371arr.get_unchecked(array_idx as usize)372});373374let arr = iter.collect_arr_trusted_with_dtype(arrow_dtype);375ChunkedArray::with_chunk_like(self, arr)376};377let sorted_flag = _update_gather_sorted_flag(self.is_sorted_flag(), sorted);378out.set_sorted_flag(sorted_flag);379out380}381382// Take function that checks of null state in `ChunkIdx`.383unsafe fn take_opt_chunked_unchecked<const B: u64>(384&self,385by: &[ChunkId<B>],386_allow_sharing: bool,387) -> Self {388let arrow_dtype = self.dtype().to_arrow(CompatLevel::newest());389390if !self.has_nulls() {391let arr = by392.iter()393.map(|chunk_id| {394if chunk_id.is_null() {395None396} else {397let (chunk_idx, array_idx) = chunk_id.extract();398let arr = self.downcast_get_unchecked(chunk_idx as usize);399Some(arr.value_unchecked(array_idx as usize).clone())400}401})402.collect_arr_trusted_with_dtype(arrow_dtype);403ChunkedArray::with_chunk_like(self, arr)404} else {405let arr = by406.iter()407.map(|chunk_id| {408if chunk_id.is_null() {409None410} else {411let (chunk_idx, array_idx) = chunk_id.extract();412let arr = self.downcast_get_unchecked(chunk_idx as usize);413arr.get_unchecked(array_idx as usize)414}415})416.collect_arr_trusted_with_dtype(arrow_dtype);417418ChunkedArray::with_chunk_like(self, arr)419}420}421}422423#[cfg(feature = "object")]424unsafe fn take_unchecked_object<const B: u64>(425s: &Series,426by: &[ChunkId<B>],427_sorted: IsSorted,428) -> Series {429use polars_core::chunked_array::object::registry::get_object_builder;430431let mut builder = get_object_builder(s.name().clone(), by.len());432433by.iter().for_each(|chunk_id| {434let (chunk_idx, array_idx) = chunk_id.extract();435let object = s.get_object_chunked_unchecked(chunk_idx as usize, array_idx as usize);436builder.append_option(object.map(|v| v.as_any()))437});438builder.to_series()439}440441#[cfg(feature = "object")]442unsafe fn take_opt_unchecked_object<const B: u64>(443s: &Series,444by: &[ChunkId<B>],445_allow_sharing: bool,446) -> Series {447use polars_core::chunked_array::object::registry::get_object_builder;448449let mut builder = get_object_builder(s.name().clone(), by.len());450451by.iter().for_each(|chunk_id| {452if chunk_id.is_null() {453builder.append_null()454} else {455let (chunk_idx, array_idx) = chunk_id.extract();456let object = s.get_object_chunked_unchecked(chunk_idx as usize, array_idx as usize);457builder.append_option(object.map(|v| v.as_any()))458}459});460builder.to_series()461}462463unsafe fn take_chunked_unchecked_binview<const B: u64, T, V>(464ca: &ChunkedArray<T>,465by: &[ChunkId<B>],466sorted: IsSorted,467avoid_sharing: bool,468) -> ChunkedArray<T>469where470T: PolarsDataType<Array = BinaryViewArrayGeneric<V>>,471T::Array: Debug,472V: ViewType + ?Sized,473{474if avoid_sharing {475return ca.take_chunked_unchecked(by, sorted, avoid_sharing);476}477478let mut views = Vec::with_capacity(by.len());479let (validity, arc_data_buffers);480481// If we can cheaply clone the list of buffers from the ChunkedArray we will,482// otherwise we will only clone those buffers we need.483if ca.n_chunks() == 1 {484let arr = ca.downcast_iter().next().unwrap();485let arr_views = arr.views();486487validity = if arr.has_nulls() {488let mut validity = BitmapBuilder::with_capacity(by.len());489for id in by.iter() {490let (chunk_idx, array_idx) = id.extract();491debug_assert!(chunk_idx == 0);492if arr.is_null_unchecked(array_idx as usize) {493views.push_unchecked(View::default());494validity.push_unchecked(false);495} else {496views.push_unchecked(*arr_views.get_unchecked(array_idx as usize));497validity.push_unchecked(true);498}499}500Some(validity.freeze())501} else {502for id in by.iter() {503let (chunk_idx, array_idx) = id.extract();504debug_assert!(chunk_idx == 0);505views.push_unchecked(*arr_views.get_unchecked(array_idx as usize));506}507None508};509510arc_data_buffers = arr.data_buffers().clone();511}512// Dedup the buffers while creating the views.513else if by.len() < ca.n_chunks() {514let mut buffer_idxs = PlHashMap::with_capacity(8);515let mut buffers = Vec::with_capacity(8);516517validity = if ca.has_nulls() {518let mut validity = BitmapBuilder::with_capacity(by.len());519for id in by.iter() {520let (chunk_idx, array_idx) = id.extract();521522let arr = ca.downcast_get_unchecked(chunk_idx as usize);523if arr.is_null_unchecked(array_idx as usize) {524views.push_unchecked(View::default());525validity.push_unchecked(false);526} else {527let view = *arr.views().get_unchecked(array_idx as usize);528views.push_unchecked(update_view_and_dedup(529view,530arr.data_buffers(),531&mut buffer_idxs,532&mut buffers,533));534validity.push_unchecked(true);535}536}537Some(validity.freeze())538} else {539for id in by.iter() {540let (chunk_idx, array_idx) = id.extract();541542let arr = ca.downcast_get_unchecked(chunk_idx as usize);543let view = *arr.views().get_unchecked(array_idx as usize);544views.push_unchecked(update_view_and_dedup(545view,546arr.data_buffers(),547&mut buffer_idxs,548&mut buffers,549));550}551None552};553554arc_data_buffers = buffers.into();555}556// Dedup the buffers up front557else {558let (buffers, buffer_offsets) = dedup_buffers_by_arc(ca);559560validity = if ca.has_nulls() {561let mut validity = BitmapBuilder::with_capacity(by.len());562for id in by.iter() {563let (chunk_idx, array_idx) = id.extract();564565let arr = ca.downcast_get_unchecked(chunk_idx as usize);566if arr.is_null_unchecked(array_idx as usize) {567views.push_unchecked(View::default());568validity.push_unchecked(false);569} else {570let view = *arr.views().get_unchecked(array_idx as usize);571let view = rewrite_view(view, chunk_idx, &buffer_offsets);572views.push_unchecked(view);573validity.push_unchecked(true);574}575}576Some(validity.freeze())577} else {578for id in by.iter() {579let (chunk_idx, array_idx) = id.extract();580581let arr = ca.downcast_get_unchecked(chunk_idx as usize);582let view = *arr.views().get_unchecked(array_idx as usize);583let view = rewrite_view(view, chunk_idx, &buffer_offsets);584views.push_unchecked(view);585}586None587};588589arc_data_buffers = buffers.into();590};591592let arr = BinaryViewArrayGeneric::<V>::new_unchecked_unknown_md(593V::DATA_TYPE,594views.into(),595arc_data_buffers,596validity,597None,598);599600let mut out = ChunkedArray::with_chunk(ca.name().clone(), arr.maybe_gc());601let sorted_flag = _update_gather_sorted_flag(ca.is_sorted_flag(), sorted);602out.set_sorted_flag(sorted_flag);603out604}605606#[allow(clippy::unnecessary_cast)]607#[inline(always)]608unsafe fn rewrite_view(mut view: View, chunk_idx: IdxSize, buffer_offsets: &[u32]) -> View {609if view.length > 12 {610let base_offset = *buffer_offsets.get_unchecked(chunk_idx as usize);611view.buffer_idx += base_offset;612}613view614}615616unsafe fn update_view_and_dedup(617mut view: View,618orig_buffers: &[Buffer<u8>],619buffer_idxs: &mut PlHashMap<(*const u8, usize), u32>,620buffers: &mut Vec<Buffer<u8>>,621) -> View {622if view.length > 12 {623// Dedup on pointer + length.624let orig_buffer = orig_buffers.get_unchecked(view.buffer_idx as usize);625view.buffer_idx =626match buffer_idxs.entry((orig_buffer.as_slice().as_ptr(), orig_buffer.len())) {627Entry::Occupied(o) => *o.get(),628Entry::Vacant(v) => {629let buffer_idx = buffers.len() as u32;630buffers.push(orig_buffer.clone());631v.insert(buffer_idx);632buffer_idx633},634};635}636view637}638639fn dedup_buffers_by_arc<T, V>(ca: &ChunkedArray<T>) -> (Vec<Buffer<u8>>, Vec<u32>)640where641T: PolarsDataType<Array = BinaryViewArrayGeneric<V>>,642V: ViewType + ?Sized,643{644// Dedup buffers up front. Note: don't do this during view update, as this is often is much645// more costly.646let mut buffers = Vec::with_capacity(ca.chunks().len());647// Dont need to include the length, as we look at the arc pointers, which are immutable.648let mut buffers_dedup = PlHashMap::with_capacity(ca.chunks().len());649let mut buffer_offsets = Vec::with_capacity(ca.chunks().len() + 1);650651for arr in ca.downcast_iter() {652let data_buffers = arr.data_buffers();653let arc_ptr = data_buffers.as_ptr();654let offset = match buffers_dedup.entry(arc_ptr) {655Entry::Occupied(o) => *o.get(),656Entry::Vacant(v) => {657let offset = buffers.len() as u32;658buffers.extend(data_buffers.iter().cloned());659v.insert(offset);660offset661},662};663buffer_offsets.push(offset);664}665(buffers, buffer_offsets)666}667668unsafe fn take_opt_chunked_unchecked_binview<const B: u64, T, V>(669ca: &ChunkedArray<T>,670by: &[ChunkId<B>],671avoid_sharing: bool,672) -> ChunkedArray<T>673where674T: PolarsDataType<Array = BinaryViewArrayGeneric<V>>,675T::Array: Debug,676V: ViewType + ?Sized,677{678if avoid_sharing {679return ca.take_opt_chunked_unchecked(by, avoid_sharing);680}681682let mut views = Vec::with_capacity(by.len());683let mut validity = BitmapBuilder::with_capacity(by.len());684685// If we can cheaply clone the list of buffers from the ChunkedArray we will,686// otherwise we will only clone those buffers we need.687let arc_data_buffers = if ca.n_chunks() == 1 {688let arr = ca.downcast_iter().next().unwrap();689let arr_views = arr.views();690691if arr.has_nulls() {692for id in by.iter() {693let (chunk_idx, array_idx) = id.extract();694debug_assert!(id.is_null() || chunk_idx == 0);695if id.is_null() || arr.is_null_unchecked(array_idx as usize) {696views.push_unchecked(View::default());697validity.push_unchecked(false);698} else {699views.push_unchecked(*arr_views.get_unchecked(array_idx as usize));700validity.push_unchecked(true);701}702}703} else {704for id in by.iter() {705let (chunk_idx, array_idx) = id.extract();706debug_assert!(id.is_null() || chunk_idx == 0);707if id.is_null() {708views.push_unchecked(View::default());709validity.push_unchecked(false);710} else {711views.push_unchecked(*arr_views.get_unchecked(array_idx as usize));712validity.push_unchecked(true);713}714}715}716717arr.data_buffers().clone()718}719// Dedup the buffers while creating the views.720else if by.len() < ca.n_chunks() {721let mut buffer_idxs = PlHashMap::with_capacity(8);722let mut buffers = Vec::with_capacity(8);723724if ca.has_nulls() {725for id in by.iter() {726let (chunk_idx, array_idx) = id.extract();727728if id.is_null() {729views.push_unchecked(View::default());730validity.push_unchecked(false);731} else {732let arr = ca.downcast_get_unchecked(chunk_idx as usize);733if arr.is_null_unchecked(array_idx as usize) {734views.push_unchecked(View::default());735validity.push_unchecked(false);736} else {737let view = *arr.views().get_unchecked(array_idx as usize);738views.push_unchecked(update_view_and_dedup(739view,740arr.data_buffers(),741&mut buffer_idxs,742&mut buffers,743));744validity.push_unchecked(true);745}746}747}748} else {749for id in by.iter() {750let (chunk_idx, array_idx) = id.extract();751752if id.is_null() {753views.push_unchecked(View::default());754validity.push_unchecked(false);755} else {756let arr = ca.downcast_get_unchecked(chunk_idx as usize);757let view = *arr.views().get_unchecked(array_idx as usize);758views.push_unchecked(update_view_and_dedup(759view,760arr.data_buffers(),761&mut buffer_idxs,762&mut buffers,763));764validity.push_unchecked(true);765}766}767};768769buffers.into()770}771// Dedup the buffers up front772else {773let (buffers, buffer_offsets) = dedup_buffers_by_arc(ca);774775if ca.has_nulls() {776for id in by.iter() {777let (chunk_idx, array_idx) = id.extract();778779if id.is_null() {780views.push_unchecked(View::default());781validity.push_unchecked(false);782} else {783let arr = ca.downcast_get_unchecked(chunk_idx as usize);784if arr.is_null_unchecked(array_idx as usize) {785views.push_unchecked(View::default());786validity.push_unchecked(false);787} else {788let view = *arr.views().get_unchecked(array_idx as usize);789let view = rewrite_view(view, chunk_idx, &buffer_offsets);790views.push_unchecked(view);791validity.push_unchecked(true);792}793}794}795} else {796for id in by.iter() {797let (chunk_idx, array_idx) = id.extract();798799if id.is_null() {800views.push_unchecked(View::default());801validity.push_unchecked(false);802} else {803let arr = ca.downcast_get_unchecked(chunk_idx as usize);804let view = *arr.views().get_unchecked(array_idx as usize);805let view = rewrite_view(view, chunk_idx, &buffer_offsets);806views.push_unchecked(view);807validity.push_unchecked(true);808}809}810};811812buffers.into()813};814815let arr = BinaryViewArrayGeneric::<V>::new_unchecked_unknown_md(816V::DATA_TYPE,817views.into(),818arc_data_buffers,819Some(validity.freeze()),820None,821);822823ChunkedArray::with_chunk(ca.name().clone(), arr.maybe_gc())824}825826#[cfg(feature = "dtype-struct")]827unsafe fn take_chunked_unchecked_struct<const B: u64>(828ca: &StructChunked,829by: &[ChunkId<B>],830sorted: IsSorted,831avoid_sharing: bool,832) -> StructChunked {833let fields = ca834.fields_as_series()835.iter()836.map(|s| s.take_chunked_unchecked(by, sorted, avoid_sharing))837.collect::<Vec<_>>();838let mut out = StructChunked::from_series(ca.name().clone(), by.len(), fields.iter()).unwrap();839840if !ca.has_nulls() {841return out;842}843844let mut validity = BitmapBuilder::with_capacity(by.len());845if ca.n_chunks() == 1 {846let arr = ca.downcast_as_array();847let bitmap = arr.validity().unwrap();848for id in by.iter() {849let (chunk_idx, array_idx) = id.extract();850debug_assert!(chunk_idx == 0);851validity.push_unchecked(bitmap.get_bit_unchecked(array_idx as usize));852}853} else {854for id in by.iter() {855let (chunk_idx, array_idx) = id.extract();856let arr = ca.downcast_get_unchecked(chunk_idx as usize);857if let Some(bitmap) = arr.validity() {858validity.push_unchecked(bitmap.get_bit_unchecked(array_idx as usize));859} else {860validity.push_unchecked(true);861}862}863}864865out.rechunk_mut(); // Should be a no-op.866out.downcast_iter_mut()867.next()868.unwrap()869.set_validity(validity.into_opt_validity());870out871}872873#[cfg(feature = "dtype-struct")]874unsafe fn take_opt_chunked_unchecked_struct<const B: u64>(875ca: &StructChunked,876by: &[ChunkId<B>],877avoid_sharing: bool,878) -> StructChunked {879let fields = ca880.fields_as_series()881.iter()882.map(|s| s.take_opt_chunked_unchecked(by, avoid_sharing))883.collect::<Vec<_>>();884let mut out = StructChunked::from_series(ca.name().clone(), by.len(), fields.iter()).unwrap();885886let mut validity = BitmapBuilder::with_capacity(by.len());887if ca.n_chunks() == 1 {888let arr = ca.downcast_as_array();889if let Some(bitmap) = arr.validity() {890for id in by.iter() {891if id.is_null() {892validity.push_unchecked(false);893} else {894let (chunk_idx, array_idx) = id.extract();895debug_assert!(chunk_idx == 0);896validity.push_unchecked(bitmap.get_bit_unchecked(array_idx as usize));897}898}899} else {900for id in by.iter() {901validity.push_unchecked(!id.is_null());902}903}904} else {905for id in by.iter() {906if id.is_null() {907validity.push_unchecked(false);908} else {909let (chunk_idx, array_idx) = id.extract();910let arr = ca.downcast_get_unchecked(chunk_idx as usize);911if let Some(bitmap) = arr.validity() {912validity.push_unchecked(bitmap.get_bit_unchecked(array_idx as usize));913} else {914validity.push_unchecked(true);915}916}917}918}919920out.rechunk_mut(); // Should be a no-op.921out.downcast_iter_mut()922.next()923.unwrap()924.set_validity(validity.into_opt_validity());925out926}927928#[cfg(test)]929mod test {930use super::*;931932#[test]933fn test_binview_chunked_gather() {934unsafe {935// # Series without nulls;936let mut s_1 = Series::new(937"a".into(),938&["1 loooooooooooong string", "2 loooooooooooong string"],939);940let s_2 = Series::new(941"a".into(),942&["11 loooooooooooong string", "22 loooooooooooong string"],943);944let s_3 = Series::new(945"a".into(),946&[947"111 loooooooooooong string",948"222 loooooooooooong string",949"small", // this tests we don't mess with the inlined view950],951);952s_1.append(&s_2).unwrap();953s_1.append(&s_3).unwrap();954955assert_eq!(s_1.n_chunks(), 3);956957// ## Ids without nulls;958let by: [ChunkId<24>; 7] = [959ChunkId::store(0, 0),960ChunkId::store(0, 1),961ChunkId::store(1, 1),962ChunkId::store(1, 0),963ChunkId::store(2, 0),964ChunkId::store(2, 1),965ChunkId::store(2, 2),966];967968let out = s_1.take_chunked_unchecked(&by, IsSorted::Not, true);969let idx = IdxCa::new("".into(), [0, 1, 3, 2, 4, 5, 6]);970let expected = s_1.rechunk().take(&idx).unwrap();971assert!(out.equals(&expected));972973// ## Ids with nulls;974let by: [ChunkId<24>; 4] = [975ChunkId::null(),976ChunkId::store(0, 1),977ChunkId::store(1, 1),978ChunkId::store(1, 0),979];980let out = s_1.take_opt_chunked_unchecked(&by, true);981982let idx = IdxCa::new("".into(), [None, Some(1), Some(3), Some(2)]);983let expected = s_1.rechunk().take(&idx).unwrap();984assert!(out.equals_missing(&expected));985986// # Series with nulls;987let mut s_1 = Series::new(988"a".into(),989&["1 loooooooooooong string 1", "2 loooooooooooong string 2"],990);991let s_2 = Series::new("a".into(), &[Some("11 loooooooooooong string 11"), None]);992s_1.append(&s_2).unwrap();993994// ## Ids without nulls;995let by: [ChunkId<24>; 4] = [996ChunkId::store(0, 0),997ChunkId::store(0, 1),998ChunkId::store(1, 1),999ChunkId::store(1, 0),1000];10011002let out = s_1.take_chunked_unchecked(&by, IsSorted::Not, true);1003let idx = IdxCa::new("".into(), [0, 1, 3, 2]);1004let expected = s_1.rechunk().take(&idx).unwrap();1005assert!(out.equals_missing(&expected));10061007// ## Ids with nulls;1008let by: [ChunkId<24>; 4] = [1009ChunkId::null(),1010ChunkId::store(0, 1),1011ChunkId::store(1, 1),1012ChunkId::store(1, 0),1013];1014let out = s_1.take_opt_chunked_unchecked(&by, true);10151016let idx = IdxCa::new("".into(), [None, Some(1), Some(3), Some(2)]);1017let expected = s_1.rechunk().take(&idx).unwrap();1018assert!(out.equals_missing(&expected));1019}1020}10211022#[test]1023#[cfg(feature = "dtype-categorical")]1024fn test_list_categorical_dtype_preserved_after_take() {1025use polars_core::prelude::*;10261027unsafe {1028// Create List(String) and convert to List(Categorical)1029let mut builder = ListStringChunkedBuilder::new("a".into(), 2, 3);1030builder.append_values_iter(["a", "b"].iter().copied());1031builder.append_values_iter(["c", "d"].iter().copied());1032let list_str = builder.finish().into_series();10331034let list_cat = list_str1035.list()1036.unwrap()1037.apply_to_inner(&|s| s.cast(&DataType::from_categories(Categories::global())))1038.unwrap()1039.into_series();10401041// Append to create chunked series1042let mut chunked = list_cat.clone();1043chunked.append(&list_cat).unwrap();1044assert_eq!(chunked.n_chunks(), 2);10451046// Perform chunked take1047let by: [ChunkId<24>; 2] = [ChunkId::store(0, 0), ChunkId::store(1, 0)];1048let out = chunked.take_chunked_unchecked(&by, IsSorted::Not, false);10491050// Verify the Polars dtype is preserved1051// The bug was that List(Categorical) was becoming List(UInt32) after take1052assert!(1053matches!(out.dtype(), DataType::List(inner) if matches!(inner.as_ref(), DataType::Categorical(_, _))),1054"List(Categorical) dtype should be preserved after take_chunked_unchecked. Got: {:?}",1055out.dtype()1056);1057}1058}1059}106010611062