// Path: blob/main/crates/polars-ops/src/chunked_array/gather/chunked.rs
// 6939 views
#![allow(unsafe_op_in_unsafe_fn)]1use std::fmt::Debug;23use arrow::array::{Array, BinaryViewArrayGeneric, View, ViewType};4use arrow::bitmap::BitmapBuilder;5use arrow::buffer::Buffer;6use arrow::legacy::trusted_len::TrustedLenPush;7use hashbrown::hash_map::Entry;8use polars_core::prelude::gather::_update_gather_sorted_flag;9use polars_core::prelude::*;10use polars_core::series::IsSorted;11use polars_core::utils::Container;12use polars_core::{with_match_categorical_physical_type, with_match_physical_numeric_polars_type};1314use crate::frame::IntoDf;1516/// Gather by [`ChunkId`]17pub trait TakeChunked {18/// Gathers elements from a ChunkedArray, specifying for each element a19/// chunk index and index within that chunk through ChunkId. If20/// avoid_sharing is true the returned data should not share references21/// with the original array (like shared buffers in views).22///23/// # Safety24/// This function doesn't do any bound checks.25unsafe fn take_chunked_unchecked<const B: u64>(26&self,27by: &[ChunkId<B>],28sorted: IsSorted,29avoid_sharing: bool,30) -> Self;3132/// # Safety33/// This function doesn't do any bound checks.34unsafe fn take_opt_chunked_unchecked<const B: u64>(35&self,36by: &[ChunkId<B>],37avoid_sharing: bool,38) -> Self;39}4041impl TakeChunked for DataFrame {42/// Take elements by a slice of [`ChunkId`]s.43///44/// # Safety45/// Does not do any bound checks.46/// `sorted` indicates if the chunks are sorted.47unsafe fn take_chunked_unchecked<const B: u64>(48&self,49idx: &[ChunkId<B>],50sorted: IsSorted,51avoid_sharing: bool,52) -> DataFrame {53let cols = self54.to_df()55._apply_columns(&|s| s.take_chunked_unchecked(idx, sorted, avoid_sharing));5657unsafe { DataFrame::new_no_checks_height_from_first(cols) }58}5960/// Take elements by a slice of optional [`ChunkId`]s.61///62/// # Safety63/// Does not do any bound checks.64unsafe fn take_opt_chunked_unchecked<const B: u64>(65&self,66idx: &[ChunkId<B>],67avoid_sharing: bool,68) -> DataFrame {69let cols = 
self70.to_df()71._apply_columns(&|s| s.take_opt_chunked_unchecked(idx, avoid_sharing));7273unsafe { DataFrame::new_no_checks_height_from_first(cols) }74}75}7677pub trait TakeChunkedHorPar: IntoDf {78/// # Safety79/// Doesn't perform any bound checks80unsafe fn _take_chunked_unchecked_hor_par<const B: u64>(81&self,82idx: &[ChunkId<B>],83sorted: IsSorted,84) -> DataFrame {85let cols = self86.to_df()87._apply_columns_par(&|s| s.take_chunked_unchecked(idx, sorted, false));8889unsafe { DataFrame::new_no_checks_height_from_first(cols) }90}9192/// # Safety93/// Doesn't perform any bound checks94///95/// Check for null state in `ChunkId`.96unsafe fn _take_opt_chunked_unchecked_hor_par<const B: u64>(97&self,98idx: &[ChunkId<B>],99) -> DataFrame {100let cols = self101.to_df()102._apply_columns_par(&|s| s.take_opt_chunked_unchecked(idx, false));103104unsafe { DataFrame::new_no_checks_height_from_first(cols) }105}106}107108impl TakeChunkedHorPar for DataFrame {}109110impl TakeChunked for Column {111unsafe fn take_chunked_unchecked<const B: u64>(112&self,113by: &[ChunkId<B>],114sorted: IsSorted,115avoid_sharing: bool,116) -> Self {117// @scalar-opt118let s = self.as_materialized_series();119let s = unsafe { s.take_chunked_unchecked(by, sorted, avoid_sharing) };120s.into_column()121}122123unsafe fn take_opt_chunked_unchecked<const B: u64>(124&self,125by: &[ChunkId<B>],126avoid_sharing: bool,127) -> Self {128// @scalar-opt129let s = self.as_materialized_series();130let s = unsafe { s.take_opt_chunked_unchecked(by, avoid_sharing) };131s.into_column()132}133}134135impl TakeChunked for Series {136unsafe fn take_chunked_unchecked<const B: u64>(137&self,138by: &[ChunkId<B>],139sorted: IsSorted,140avoid_sharing: bool,141) -> Self {142use DataType::*;143match self.dtype() {144dt if dt.is_primitive_numeric() => {145with_match_physical_numeric_polars_type!(self.dtype(), |$T| {146let ca: &ChunkedArray<$T> = self.as_ref().as_ref().as_ref();147ca.take_chunked_unchecked(by, sorted, 
avoid_sharing).into_series()148})149},150Boolean => {151let ca = self.bool().unwrap();152ca.take_chunked_unchecked(by, sorted, avoid_sharing)153.into_series()154},155Binary => {156let ca = self.binary().unwrap();157take_chunked_unchecked_binview(ca, by, sorted, avoid_sharing).into_series()158},159String => {160let ca = self.str().unwrap();161take_chunked_unchecked_binview(ca, by, sorted, avoid_sharing).into_series()162},163List(_) => {164let ca = self.list().unwrap();165ca.take_chunked_unchecked(by, sorted, avoid_sharing)166.into_series()167},168#[cfg(feature = "dtype-array")]169Array(_, _) => {170let ca = self.array().unwrap();171ca.take_chunked_unchecked(by, sorted, avoid_sharing)172.into_series()173},174#[cfg(feature = "dtype-struct")]175Struct(_) => {176let ca = self.struct_().unwrap();177take_chunked_unchecked_struct(ca, by, sorted, avoid_sharing).into_series()178},179#[cfg(feature = "object")]180Object(_) => take_unchecked_object(self, by, sorted),181#[cfg(feature = "dtype-decimal")]182Decimal(_, _) => {183let ca = self.decimal().unwrap();184let out = ca.phys.take_chunked_unchecked(by, sorted, avoid_sharing);185out.into_decimal_unchecked(ca.precision(), ca.scale())186.into_series()187},188#[cfg(feature = "dtype-date")]189Date => {190let ca = self.date().unwrap();191ca.physical()192.take_chunked_unchecked(by, sorted, avoid_sharing)193.into_date()194.into_series()195},196#[cfg(feature = "dtype-datetime")]197Datetime(u, z) => {198let ca = self.datetime().unwrap();199ca.physical()200.take_chunked_unchecked(by, sorted, avoid_sharing)201.into_datetime(*u, z.clone())202.into_series()203},204#[cfg(feature = "dtype-duration")]205Duration(u) => {206let ca = self.duration().unwrap();207ca.physical()208.take_chunked_unchecked(by, sorted, avoid_sharing)209.into_duration(*u)210.into_series()211},212#[cfg(feature = "dtype-time")]213Time => {214let ca = self.time().unwrap();215ca.physical()216.take_chunked_unchecked(by, sorted, 
avoid_sharing)217.into_time()218.into_series()219},220#[cfg(feature = "dtype-categorical")]221Categorical(_, _) | Enum(_, _) => {222with_match_categorical_physical_type!(self.dtype().cat_physical().unwrap(), |$C| {223let ca = self.cat::<$C>().unwrap();224CategoricalChunked::<$C>::from_cats_and_dtype_unchecked(225ca.physical().take_chunked_unchecked(by, sorted, avoid_sharing),226self.dtype().clone()227)228.into_series()229})230},231Null => Series::new_null(self.name().clone(), by.len()),232_ => unreachable!(),233}234}235236/// Take function that checks of null state in `ChunkIdx`.237unsafe fn take_opt_chunked_unchecked<const B: u64>(238&self,239by: &[ChunkId<B>],240avoid_sharing: bool,241) -> Self {242use DataType::*;243match self.dtype() {244dt if dt.is_primitive_numeric() => {245with_match_physical_numeric_polars_type!(self.dtype(), |$T| {246let ca: &ChunkedArray<$T> = self.as_ref().as_ref().as_ref();247ca.take_opt_chunked_unchecked(by, avoid_sharing).into_series()248})249},250Boolean => {251let ca = self.bool().unwrap();252ca.take_opt_chunked_unchecked(by, avoid_sharing)253.into_series()254},255Binary => {256let ca = self.binary().unwrap();257take_opt_chunked_unchecked_binview(ca, by, avoid_sharing).into_series()258},259String => {260let ca = self.str().unwrap();261take_opt_chunked_unchecked_binview(ca, by, avoid_sharing).into_series()262},263List(_) => {264let ca = self.list().unwrap();265ca.take_opt_chunked_unchecked(by, avoid_sharing)266.into_series()267},268#[cfg(feature = "dtype-array")]269Array(_, _) => {270let ca = self.array().unwrap();271ca.take_opt_chunked_unchecked(by, avoid_sharing)272.into_series()273},274#[cfg(feature = "dtype-struct")]275Struct(_) => {276let ca = self.struct_().unwrap();277take_opt_chunked_unchecked_struct(ca, by, avoid_sharing).into_series()278},279#[cfg(feature = "object")]280Object(_) => take_opt_unchecked_object(self, by, avoid_sharing),281#[cfg(feature = "dtype-decimal")]282Decimal(_, _) => {283let ca = 
self.decimal().unwrap();284let out = ca.phys.take_opt_chunked_unchecked(by, avoid_sharing);285out.into_decimal_unchecked(ca.precision(), ca.scale())286.into_series()287},288#[cfg(feature = "dtype-date")]289Date => {290let ca = self.date().unwrap();291ca.physical()292.take_opt_chunked_unchecked(by, avoid_sharing)293.into_date()294.into_series()295},296#[cfg(feature = "dtype-datetime")]297Datetime(u, z) => {298let ca = self.datetime().unwrap();299ca.physical()300.take_opt_chunked_unchecked(by, avoid_sharing)301.into_datetime(*u, z.clone())302.into_series()303},304#[cfg(feature = "dtype-duration")]305Duration(u) => {306let ca = self.duration().unwrap();307ca.physical()308.take_opt_chunked_unchecked(by, avoid_sharing)309.into_duration(*u)310.into_series()311},312#[cfg(feature = "dtype-time")]313Time => {314let ca = self.time().unwrap();315ca.physical()316.take_opt_chunked_unchecked(by, avoid_sharing)317.into_time()318.into_series()319},320#[cfg(feature = "dtype-categorical")]321Categorical(_, _) | Enum(_, _) => {322with_match_categorical_physical_type!(self.dtype().cat_physical().unwrap(), |$C| {323let ca = self.cat::<$C>().unwrap();324CategoricalChunked::<$C>::from_cats_and_dtype_unchecked(325ca.physical().take_opt_chunked_unchecked(by, avoid_sharing),326self.dtype().clone()327)328.into_series()329})330},331Null => Series::new_null(self.name().clone(), by.len()),332_ => unreachable!(),333}334}335}336337impl<T> TakeChunked for ChunkedArray<T>338where339T: PolarsDataType,340T::Array: Debug,341{342unsafe fn take_chunked_unchecked<const B: u64>(343&self,344by: &[ChunkId<B>],345sorted: IsSorted,346_allow_sharing: bool,347) -> Self {348let arrow_dtype = self.dtype().to_arrow(CompatLevel::newest());349350let mut out = if !self.has_nulls() {351let iter = by.iter().map(|chunk_id| {352debug_assert!(353!chunk_id.is_null(),354"null chunks should not hit this branch"355);356let (chunk_idx, array_idx) = chunk_id.extract();357let arr = self.downcast_get_unchecked(chunk_idx as 
usize);358arr.value_unchecked(array_idx as usize)359});360361let arr = iter.collect_arr_trusted_with_dtype(arrow_dtype);362ChunkedArray::with_chunk(self.name().clone(), arr)363} else {364let iter = by.iter().map(|chunk_id| {365debug_assert!(366!chunk_id.is_null(),367"null chunks should not hit this branch"368);369let (chunk_idx, array_idx) = chunk_id.extract();370let arr = self.downcast_get_unchecked(chunk_idx as usize);371arr.get_unchecked(array_idx as usize)372});373374let arr = iter.collect_arr_trusted_with_dtype(arrow_dtype);375ChunkedArray::with_chunk(self.name().clone(), arr)376};377let sorted_flag = _update_gather_sorted_flag(self.is_sorted_flag(), sorted);378out.set_sorted_flag(sorted_flag);379out380}381382// Take function that checks of null state in `ChunkIdx`.383unsafe fn take_opt_chunked_unchecked<const B: u64>(384&self,385by: &[ChunkId<B>],386_allow_sharing: bool,387) -> Self {388let arrow_dtype = self.dtype().to_arrow(CompatLevel::newest());389390if !self.has_nulls() {391let arr = by392.iter()393.map(|chunk_id| {394if chunk_id.is_null() {395None396} else {397let (chunk_idx, array_idx) = chunk_id.extract();398let arr = self.downcast_get_unchecked(chunk_idx as usize);399Some(arr.value_unchecked(array_idx as usize).clone())400}401})402.collect_arr_trusted_with_dtype(arrow_dtype);403404ChunkedArray::with_chunk(self.name().clone(), arr)405} else {406let arr = by407.iter()408.map(|chunk_id| {409if chunk_id.is_null() {410None411} else {412let (chunk_idx, array_idx) = chunk_id.extract();413let arr = self.downcast_get_unchecked(chunk_idx as usize);414arr.get_unchecked(array_idx as usize)415}416})417.collect_arr_trusted_with_dtype(arrow_dtype);418419ChunkedArray::with_chunk(self.name().clone(), arr)420}421}422}423424#[cfg(feature = "object")]425unsafe fn take_unchecked_object<const B: u64>(426s: &Series,427by: &[ChunkId<B>],428_sorted: IsSorted,429) -> Series {430use polars_core::chunked_array::object::registry::get_object_builder;431432let mut builder = 
get_object_builder(s.name().clone(), by.len());433434by.iter().for_each(|chunk_id| {435let (chunk_idx, array_idx) = chunk_id.extract();436let object = s.get_object_chunked_unchecked(chunk_idx as usize, array_idx as usize);437builder.append_option(object.map(|v| v.as_any()))438});439builder.to_series()440}441442#[cfg(feature = "object")]443unsafe fn take_opt_unchecked_object<const B: u64>(444s: &Series,445by: &[ChunkId<B>],446_allow_sharing: bool,447) -> Series {448use polars_core::chunked_array::object::registry::get_object_builder;449450let mut builder = get_object_builder(s.name().clone(), by.len());451452by.iter().for_each(|chunk_id| {453if chunk_id.is_null() {454builder.append_null()455} else {456let (chunk_idx, array_idx) = chunk_id.extract();457let object = s.get_object_chunked_unchecked(chunk_idx as usize, array_idx as usize);458builder.append_option(object.map(|v| v.as_any()))459}460});461builder.to_series()462}463464unsafe fn take_chunked_unchecked_binview<const B: u64, T, V>(465ca: &ChunkedArray<T>,466by: &[ChunkId<B>],467sorted: IsSorted,468avoid_sharing: bool,469) -> ChunkedArray<T>470where471T: PolarsDataType<Array = BinaryViewArrayGeneric<V>>,472T::Array: Debug,473V: ViewType + ?Sized,474{475if avoid_sharing {476return ca.take_chunked_unchecked(by, sorted, avoid_sharing);477}478479let mut views = Vec::with_capacity(by.len());480let (validity, arc_data_buffers);481482// If we can cheaply clone the list of buffers from the ChunkedArray we will,483// otherwise we will only clone those buffers we need.484if ca.n_chunks() == 1 {485let arr = ca.downcast_iter().next().unwrap();486let arr_views = arr.views();487488validity = if arr.has_nulls() {489let mut validity = BitmapBuilder::with_capacity(by.len());490for id in by.iter() {491let (chunk_idx, array_idx) = id.extract();492debug_assert!(chunk_idx == 0);493if arr.is_null_unchecked(array_idx as usize) {494views.push_unchecked(View::default());495validity.push_unchecked(false);496} else 
{497views.push_unchecked(*arr_views.get_unchecked(array_idx as usize));498validity.push_unchecked(true);499}500}501Some(validity.freeze())502} else {503for id in by.iter() {504let (chunk_idx, array_idx) = id.extract();505debug_assert!(chunk_idx == 0);506views.push_unchecked(*arr_views.get_unchecked(array_idx as usize));507}508None509};510511arc_data_buffers = arr.data_buffers().clone();512}513// Dedup the buffers while creating the views.514else if by.len() < ca.n_chunks() {515let mut buffer_idxs = PlHashMap::with_capacity(8);516let mut buffers = Vec::with_capacity(8);517518validity = if ca.has_nulls() {519let mut validity = BitmapBuilder::with_capacity(by.len());520for id in by.iter() {521let (chunk_idx, array_idx) = id.extract();522523let arr = ca.downcast_get_unchecked(chunk_idx as usize);524if arr.is_null_unchecked(array_idx as usize) {525views.push_unchecked(View::default());526validity.push_unchecked(false);527} else {528let view = *arr.views().get_unchecked(array_idx as usize);529views.push_unchecked(update_view_and_dedup(530view,531arr.data_buffers(),532&mut buffer_idxs,533&mut buffers,534));535validity.push_unchecked(true);536}537}538Some(validity.freeze())539} else {540for id in by.iter() {541let (chunk_idx, array_idx) = id.extract();542543let arr = ca.downcast_get_unchecked(chunk_idx as usize);544let view = *arr.views().get_unchecked(array_idx as usize);545views.push_unchecked(update_view_and_dedup(546view,547arr.data_buffers(),548&mut buffer_idxs,549&mut buffers,550));551}552None553};554555arc_data_buffers = buffers.into();556}557// Dedup the buffers up front558else {559let (buffers, buffer_offsets) = dedup_buffers_by_arc(ca);560561validity = if ca.has_nulls() {562let mut validity = BitmapBuilder::with_capacity(by.len());563for id in by.iter() {564let (chunk_idx, array_idx) = id.extract();565566let arr = ca.downcast_get_unchecked(chunk_idx as usize);567if arr.is_null_unchecked(array_idx as usize) 
{568views.push_unchecked(View::default());569validity.push_unchecked(false);570} else {571let view = *arr.views().get_unchecked(array_idx as usize);572let view = rewrite_view(view, chunk_idx, &buffer_offsets);573views.push_unchecked(view);574validity.push_unchecked(true);575}576}577Some(validity.freeze())578} else {579for id in by.iter() {580let (chunk_idx, array_idx) = id.extract();581582let arr = ca.downcast_get_unchecked(chunk_idx as usize);583let view = *arr.views().get_unchecked(array_idx as usize);584let view = rewrite_view(view, chunk_idx, &buffer_offsets);585views.push_unchecked(view);586}587None588};589590arc_data_buffers = buffers.into();591};592593let arr = BinaryViewArrayGeneric::<V>::new_unchecked_unknown_md(594V::DATA_TYPE,595views.into(),596arc_data_buffers,597validity,598None,599);600601let mut out = ChunkedArray::with_chunk(ca.name().clone(), arr.maybe_gc());602let sorted_flag = _update_gather_sorted_flag(ca.is_sorted_flag(), sorted);603out.set_sorted_flag(sorted_flag);604out605}606607#[allow(clippy::unnecessary_cast)]608#[inline(always)]609unsafe fn rewrite_view(mut view: View, chunk_idx: IdxSize, buffer_offsets: &[u32]) -> View {610if view.length > 12 {611let base_offset = *buffer_offsets.get_unchecked(chunk_idx as usize);612view.buffer_idx += base_offset;613}614view615}616617unsafe fn update_view_and_dedup(618mut view: View,619orig_buffers: &[Buffer<u8>],620buffer_idxs: &mut PlHashMap<(*const u8, usize), u32>,621buffers: &mut Vec<Buffer<u8>>,622) -> View {623if view.length > 12 {624// Dedup on pointer + length.625let orig_buffer = orig_buffers.get_unchecked(view.buffer_idx as usize);626view.buffer_idx =627match buffer_idxs.entry((orig_buffer.as_slice().as_ptr(), orig_buffer.len())) {628Entry::Occupied(o) => *o.get(),629Entry::Vacant(v) => {630let buffer_idx = buffers.len() as u32;631buffers.push(orig_buffer.clone());632v.insert(buffer_idx);633buffer_idx634},635};636}637view638}639640fn dedup_buffers_by_arc<T, V>(ca: &ChunkedArray<T>) -> 
(Vec<Buffer<u8>>, Vec<u32>)641where642T: PolarsDataType<Array = BinaryViewArrayGeneric<V>>,643V: ViewType + ?Sized,644{645// Dedup buffers up front. Note: don't do this during view update, as this is often is much646// more costly.647let mut buffers = Vec::with_capacity(ca.chunks().len());648// Dont need to include the length, as we look at the arc pointers, which are immutable.649let mut buffers_dedup = PlHashMap::with_capacity(ca.chunks().len());650let mut buffer_offsets = Vec::with_capacity(ca.chunks().len() + 1);651652for arr in ca.downcast_iter() {653let data_buffers = arr.data_buffers();654let arc_ptr = data_buffers.as_ptr();655let offset = match buffers_dedup.entry(arc_ptr) {656Entry::Occupied(o) => *o.get(),657Entry::Vacant(v) => {658let offset = buffers.len() as u32;659buffers.extend(data_buffers.iter().cloned());660v.insert(offset);661offset662},663};664buffer_offsets.push(offset);665}666(buffers, buffer_offsets)667}668669unsafe fn take_opt_chunked_unchecked_binview<const B: u64, T, V>(670ca: &ChunkedArray<T>,671by: &[ChunkId<B>],672avoid_sharing: bool,673) -> ChunkedArray<T>674where675T: PolarsDataType<Array = BinaryViewArrayGeneric<V>>,676T::Array: Debug,677V: ViewType + ?Sized,678{679if avoid_sharing {680return ca.take_opt_chunked_unchecked(by, avoid_sharing);681}682683let mut views = Vec::with_capacity(by.len());684let mut validity = BitmapBuilder::with_capacity(by.len());685686// If we can cheaply clone the list of buffers from the ChunkedArray we will,687// otherwise we will only clone those buffers we need.688let arc_data_buffers = if ca.n_chunks() == 1 {689let arr = ca.downcast_iter().next().unwrap();690let arr_views = arr.views();691692if arr.has_nulls() {693for id in by.iter() {694let (chunk_idx, array_idx) = id.extract();695debug_assert!(id.is_null() || chunk_idx == 0);696if id.is_null() || arr.is_null_unchecked(array_idx as usize) {697views.push_unchecked(View::default());698validity.push_unchecked(false);699} else 
{700views.push_unchecked(*arr_views.get_unchecked(array_idx as usize));701validity.push_unchecked(true);702}703}704} else {705for id in by.iter() {706let (chunk_idx, array_idx) = id.extract();707debug_assert!(id.is_null() || chunk_idx == 0);708if id.is_null() {709views.push_unchecked(View::default());710validity.push_unchecked(false);711} else {712views.push_unchecked(*arr_views.get_unchecked(array_idx as usize));713validity.push_unchecked(true);714}715}716}717718arr.data_buffers().clone()719}720// Dedup the buffers while creating the views.721else if by.len() < ca.n_chunks() {722let mut buffer_idxs = PlHashMap::with_capacity(8);723let mut buffers = Vec::with_capacity(8);724725if ca.has_nulls() {726for id in by.iter() {727let (chunk_idx, array_idx) = id.extract();728729if id.is_null() {730views.push_unchecked(View::default());731validity.push_unchecked(false);732} else {733let arr = ca.downcast_get_unchecked(chunk_idx as usize);734if arr.is_null_unchecked(array_idx as usize) {735views.push_unchecked(View::default());736validity.push_unchecked(false);737} else {738let view = *arr.views().get_unchecked(array_idx as usize);739views.push_unchecked(update_view_and_dedup(740view,741arr.data_buffers(),742&mut buffer_idxs,743&mut buffers,744));745validity.push_unchecked(true);746}747}748}749} else {750for id in by.iter() {751let (chunk_idx, array_idx) = id.extract();752753if id.is_null() {754views.push_unchecked(View::default());755validity.push_unchecked(false);756} else {757let arr = ca.downcast_get_unchecked(chunk_idx as usize);758let view = *arr.views().get_unchecked(array_idx as usize);759views.push_unchecked(update_view_and_dedup(760view,761arr.data_buffers(),762&mut buffer_idxs,763&mut buffers,764));765validity.push_unchecked(true);766}767}768};769770buffers.into()771}772// Dedup the buffers up front773else {774let (buffers, buffer_offsets) = dedup_buffers_by_arc(ca);775776if ca.has_nulls() {777for id in by.iter() {778let (chunk_idx, array_idx) = 
id.extract();779780if id.is_null() {781views.push_unchecked(View::default());782validity.push_unchecked(false);783} else {784let arr = ca.downcast_get_unchecked(chunk_idx as usize);785if arr.is_null_unchecked(array_idx as usize) {786views.push_unchecked(View::default());787validity.push_unchecked(false);788} else {789let view = *arr.views().get_unchecked(array_idx as usize);790let view = rewrite_view(view, chunk_idx, &buffer_offsets);791views.push_unchecked(view);792validity.push_unchecked(true);793}794}795}796} else {797for id in by.iter() {798let (chunk_idx, array_idx) = id.extract();799800if id.is_null() {801views.push_unchecked(View::default());802validity.push_unchecked(false);803} else {804let arr = ca.downcast_get_unchecked(chunk_idx as usize);805let view = *arr.views().get_unchecked(array_idx as usize);806let view = rewrite_view(view, chunk_idx, &buffer_offsets);807views.push_unchecked(view);808validity.push_unchecked(true);809}810}811};812813buffers.into()814};815816let arr = BinaryViewArrayGeneric::<V>::new_unchecked_unknown_md(817V::DATA_TYPE,818views.into(),819arc_data_buffers,820Some(validity.freeze()),821None,822);823824ChunkedArray::with_chunk(ca.name().clone(), arr.maybe_gc())825}826827#[cfg(feature = "dtype-struct")]828unsafe fn take_chunked_unchecked_struct<const B: u64>(829ca: &StructChunked,830by: &[ChunkId<B>],831sorted: IsSorted,832avoid_sharing: bool,833) -> StructChunked {834let fields = ca835.fields_as_series()836.iter()837.map(|s| s.take_chunked_unchecked(by, sorted, avoid_sharing))838.collect::<Vec<_>>();839let mut out = StructChunked::from_series(ca.name().clone(), by.len(), fields.iter()).unwrap();840841if !ca.has_nulls() {842return out;843}844845let mut validity = BitmapBuilder::with_capacity(by.len());846if ca.n_chunks() == 1 {847let arr = ca.downcast_as_array();848let bitmap = arr.validity().unwrap();849for id in by.iter() {850let (chunk_idx, array_idx) = id.extract();851debug_assert!(chunk_idx == 
0);852validity.push_unchecked(bitmap.get_bit_unchecked(array_idx as usize));853}854} else {855for id in by.iter() {856let (chunk_idx, array_idx) = id.extract();857let arr = ca.downcast_get_unchecked(chunk_idx as usize);858if let Some(bitmap) = arr.validity() {859validity.push_unchecked(bitmap.get_bit_unchecked(array_idx as usize));860} else {861validity.push_unchecked(true);862}863}864}865866out.rechunk_mut(); // Should be a no-op.867out.downcast_iter_mut()868.next()869.unwrap()870.set_validity(validity.into_opt_validity());871out872}873874#[cfg(feature = "dtype-struct")]875unsafe fn take_opt_chunked_unchecked_struct<const B: u64>(876ca: &StructChunked,877by: &[ChunkId<B>],878avoid_sharing: bool,879) -> StructChunked {880let fields = ca881.fields_as_series()882.iter()883.map(|s| s.take_opt_chunked_unchecked(by, avoid_sharing))884.collect::<Vec<_>>();885let mut out = StructChunked::from_series(ca.name().clone(), by.len(), fields.iter()).unwrap();886887let mut validity = BitmapBuilder::with_capacity(by.len());888if ca.n_chunks() == 1 {889let arr = ca.downcast_as_array();890if let Some(bitmap) = arr.validity() {891for id in by.iter() {892if id.is_null() {893validity.push_unchecked(false);894} else {895let (chunk_idx, array_idx) = id.extract();896debug_assert!(chunk_idx == 0);897validity.push_unchecked(bitmap.get_bit_unchecked(array_idx as usize));898}899}900} else {901for id in by.iter() {902validity.push_unchecked(!id.is_null());903}904}905} else {906for id in by.iter() {907if id.is_null() {908validity.push_unchecked(false);909} else {910let (chunk_idx, array_idx) = id.extract();911let arr = ca.downcast_get_unchecked(chunk_idx as usize);912if let Some(bitmap) = arr.validity() {913validity.push_unchecked(bitmap.get_bit_unchecked(array_idx as usize));914} else {915validity.push_unchecked(true);916}917}918}919}920921out.rechunk_mut(); // Should be a 
no-op.922out.downcast_iter_mut()923.next()924.unwrap()925.set_validity(validity.into_opt_validity());926out927}928929#[cfg(test)]930mod test {931use super::*;932933#[test]934fn test_binview_chunked_gather() {935unsafe {936// # Series without nulls;937let mut s_1 = Series::new(938"a".into(),939&["1 loooooooooooong string", "2 loooooooooooong string"],940);941let s_2 = Series::new(942"a".into(),943&["11 loooooooooooong string", "22 loooooooooooong string"],944);945let s_3 = Series::new(946"a".into(),947&[948"111 loooooooooooong string",949"222 loooooooooooong string",950"small", // this tests we don't mess with the inlined view951],952);953s_1.append(&s_2).unwrap();954s_1.append(&s_3).unwrap();955956assert_eq!(s_1.n_chunks(), 3);957958// ## Ids without nulls;959let by: [ChunkId<24>; 7] = [960ChunkId::store(0, 0),961ChunkId::store(0, 1),962ChunkId::store(1, 1),963ChunkId::store(1, 0),964ChunkId::store(2, 0),965ChunkId::store(2, 1),966ChunkId::store(2, 2),967];968969let out = s_1.take_chunked_unchecked(&by, IsSorted::Not, true);970let idx = IdxCa::new("".into(), [0, 1, 3, 2, 4, 5, 6]);971let expected = s_1.rechunk().take(&idx).unwrap();972assert!(out.equals(&expected));973974// ## Ids with nulls;975let by: [ChunkId<24>; 4] = [976ChunkId::null(),977ChunkId::store(0, 1),978ChunkId::store(1, 1),979ChunkId::store(1, 0),980];981let out = s_1.take_opt_chunked_unchecked(&by, true);982983let idx = IdxCa::new("".into(), [None, Some(1), Some(3), Some(2)]);984let expected = s_1.rechunk().take(&idx).unwrap();985assert!(out.equals_missing(&expected));986987// # Series with nulls;988let mut s_1 = Series::new(989"a".into(),990&["1 loooooooooooong string 1", "2 loooooooooooong string 2"],991);992let s_2 = Series::new("a".into(), &[Some("11 loooooooooooong string 11"), None]);993s_1.append(&s_2).unwrap();994995// ## Ids without nulls;996let by: [ChunkId<24>; 4] = [997ChunkId::store(0, 0),998ChunkId::store(0, 1),999ChunkId::store(1, 1),1000ChunkId::store(1, 0),1001];10021003let out = 
s_1.take_chunked_unchecked(&by, IsSorted::Not, true);1004let idx = IdxCa::new("".into(), [0, 1, 3, 2]);1005let expected = s_1.rechunk().take(&idx).unwrap();1006assert!(out.equals_missing(&expected));10071008// ## Ids with nulls;1009let by: [ChunkId<24>; 4] = [1010ChunkId::null(),1011ChunkId::store(0, 1),1012ChunkId::store(1, 1),1013ChunkId::store(1, 0),1014];1015let out = s_1.take_opt_chunked_unchecked(&by, true);10161017let idx = IdxCa::new("".into(), [None, Some(1), Some(3), Some(2)]);1018let expected = s_1.rechunk().take(&idx).unwrap();1019assert!(out.equals_missing(&expected));1020}1021}1022}102310241025