// Path: blob/main/crates/polars-core/src/chunked_array/mod.rs
//! The typed heart of every Series column.1#![allow(unsafe_op_in_unsafe_fn)]2use std::sync::Arc;34use arrow::array::*;5use arrow::bitmap::Bitmap;6use arrow::compute::concatenate::concatenate_unchecked;7use polars_compute::filter::filter_with_bitmap;89use crate::prelude::{ChunkTakeUnchecked, *};1011pub mod ops;12#[macro_use]13pub mod arithmetic;14pub mod builder;15pub mod cast;16pub mod collect;17pub mod comparison;18pub mod flags;19pub mod float;20pub mod iterator;21#[cfg(feature = "ndarray")]22pub(crate) mod ndarray;2324pub mod arg_min_max;25#[cfg(feature = "dtype-array")]26pub(crate) mod array;27mod binary;28mod binary_offset;29mod bitwise;30#[cfg(feature = "object")]31mod drop;32mod from;33mod from_iterator;34pub mod from_iterator_par;35pub(crate) mod list;36pub(crate) mod logical;37#[cfg(feature = "object")]38pub mod object;39#[cfg(feature = "random")]40mod random;41#[cfg(feature = "dtype-struct")]42mod struct_;43#[cfg(any(44feature = "temporal",45feature = "dtype-datetime",46feature = "dtype-date"47))]48pub mod temporal;49mod to_vec;50mod trusted_len;51pub(crate) use arg_min_max::*;52use arrow::legacy::prelude::*;53#[cfg(feature = "dtype-struct")]54pub use struct_::StructChunked;5556use self::flags::{StatisticsFlags, StatisticsFlagsIM};57use crate::series::IsSorted;58use crate::utils::{first_non_null, first_null, last_non_null};5960pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;6162/// # ChunkedArray63///64/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. 
This allows65/// us to apply closures to the data and collect the results to a [`ChunkedArray`] of the same type `T`.66/// Below we use an apply to use the cosine function to the values of a [`ChunkedArray`].67///68/// ```rust69/// # use polars_core::prelude::*;70/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {71/// ca.apply_values(|v| v.cos())72/// }73/// ```74///75/// ## Conversion between Series and ChunkedArrays76/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.77///78/// ```rust79/// # use polars_core::prelude::*;80/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked>{81/// series.i32()82/// }83///84/// fn to_series(ca: Int32Chunked) -> Series {85/// ca.into_series()86/// }87/// ```88///89/// # Iterators90///91/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)92/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby93/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).94///95/// ```rust96/// # use polars_core::prelude::*;97///98/// fn iter_forward(ca: &Float32Chunked) {99/// ca.iter()100/// .for_each(|opt_v| println!("{:?}", opt_v))101/// }102///103/// fn iter_backward(ca: &Float32Chunked) {104/// ca.iter()105/// .rev()106/// .for_each(|opt_v| println!("{:?}", opt_v))107/// }108/// ```109///110/// # Memory layout111///112/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as backend for the memory layout.113/// Arrows memory is immutable which makes it possible to make multiple zero copy (sub)-views from a single array.114///115/// To be able to append data, Polars uses chunks to append new memory locations, hence the [`ChunkedArray<T>`] data structure.116/// Appends are cheap, because it will not lead to a full reallocation of the whole array (as could be the case with a Rust 
Vec).117///118/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access because we have an extra indirection119/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.120/// When multiplying two [`ChunkedArray`]s with different chunk sizes they cannot utilize [SIMD](https://en.wikipedia.org/wiki/SIMD) for instance.121///122/// If you want to have predictable performance123/// (no unexpected re-allocation of memory), it is advised to call the [`ChunkedArray::rechunk`] after124/// multiple append operations.125///126/// See also [`ChunkedArray::extend`] for appends within a chunk.127///128/// # Invariants129/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].130/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]131/// chunks.132/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the133/// logical type given by the datatype.134///135/// [`List`]: crate::datatypes::DataType::List136pub struct ChunkedArray<T: PolarsDataType> {137pub(crate) field: Arc<Field>,138pub(crate) chunks: Vec<ArrayRef>,139140pub(crate) flags: StatisticsFlagsIM,141142length: usize,143null_count: usize,144_pd: std::marker::PhantomData<T>,145}146147impl<T: PolarsDataType> ChunkedArray<T> {148fn should_rechunk(&self) -> bool {149self.chunks.len() > 1 && self.chunks.len() > self.len() / 3150}151152fn optional_rechunk(mut self) -> Self {153// Rechunk if we have many small chunks.154if self.should_rechunk() {155self.rechunk_mut()156}157self158}159160pub(crate) fn as_any(&self) -> &dyn std::any::Any {161self162}163164/// Series to [`ChunkedArray<T>`]165pub fn unpack_series_matching_type<'a>(166&self,167series: &'a Series,168) -> PolarsResult<&'a ChunkedArray<T>> {169polars_ensure!(170self.dtype() == series.dtype(),171SchemaMismatch: "cannot unpack series of type `{}` into 
`{}`",172series.dtype(),173self.dtype(),174);175176// SAFETY: dtype will be correct.177Ok(unsafe { self.unpack_series_matching_physical_type(series) })178}179180/// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.181///182/// If you want to explicitly the `length` and `null_count`, look at183/// [`ChunkedArray::new_with_dims`]184fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {185unsafe {186let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);187chunked_arr.compute_len();188chunked_arr189}190}191192/// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.193/// # Safety194/// The length and null_count must be correct.195pub unsafe fn new_with_dims(196field: Arc<Field>,197chunks: Vec<ArrayRef>,198length: usize,199null_count: usize,200) -> Self {201Self {202field,203chunks,204flags: StatisticsFlagsIM::empty(),205206_pd: Default::default(),207length,208null_count,209}210}211212pub(crate) fn is_sorted_ascending_flag(&self) -> bool {213self.get_flags().is_sorted_ascending()214}215216pub(crate) fn is_sorted_descending_flag(&self) -> bool {217self.get_flags().is_sorted_descending()218}219220/// Whether `self` is sorted in any direction.221pub(crate) fn is_sorted_any(&self) -> bool {222self.get_flags().is_sorted_any()223}224225pub fn unset_fast_explode_list(&mut self) {226self.set_fast_explode_list(false)227}228229pub fn set_fast_explode_list(&mut self, value: bool) {230let mut flags = self.flags.get_mut();231flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);232self.flags.set_mut(flags);233}234235pub fn get_fast_explode_list(&self) -> bool {236self.get_flags().can_fast_explode_list()237}238239pub fn get_flags(&self) -> StatisticsFlags {240self.flags.get()241}242243/// Set flags for the [`ChunkedArray`]244pub fn set_flags(&mut self, flags: StatisticsFlags) {245self.flags = StatisticsFlagsIM::new(flags);246}247248pub fn is_sorted_flag(&self) -> IsSorted 
{249self.get_flags().is_sorted()250}251252pub fn retain_flags_from<U: PolarsDataType>(253&mut self,254from: &ChunkedArray<U>,255retain_flags: StatisticsFlags,256) {257let flags = from.flags.get();258// Try to avoid write contention.259if !flags.is_empty() {260self.set_flags(flags & retain_flags)261}262}263264/// Set the 'sorted' bit meta info.265pub fn set_sorted_flag(&mut self, sorted: IsSorted) {266let mut flags = self.flags.get_mut();267flags.set_sorted(sorted);268self.flags.set_mut(flags);269}270271/// Set the 'sorted' bit meta info.272pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {273let mut out = self.clone();274out.set_sorted_flag(sorted);275out276}277278pub fn first_null(&self) -> Option<usize> {279if self.null_count() == 0 {280None281}282// We now know there is at least 1 non-null item in the array, and self.len() > 0283else if self.null_count() == self.len() {284Some(0)285} else if self.is_sorted_any() {286let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {287// nulls are all at the start2880289} else {290// nulls are all at the end291self.null_count()292};293294debug_assert!(295// If we are lucky this catches something.296unsafe { self.get_unchecked(out) }.is_some(),297"incorrect sorted flag"298);299300Some(out)301} else {302first_null(self.chunks().iter().map(|arr| arr.as_ref()))303}304}305306/// Get the index of the first non null value in this [`ChunkedArray`].307pub fn first_non_null(&self) -> Option<usize> {308if self.null_count() == self.len() {309None310}311// We now know there is at least 1 non-null item in the array, and self.len() > 0312else if self.null_count() == 0 {313Some(0)314} else if self.is_sorted_any() {315let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {316// nulls are all at the start317self.null_count()318} else {319// nulls are all at the end3200321};322323debug_assert!(324// If we are lucky this catches something.325unsafe { self.get_unchecked(out) 
}.is_some(),326"incorrect sorted flag"327);328329Some(out)330} else {331first_non_null(self.chunks().iter().map(|arr| arr.as_ref()))332}333}334335/// Get the index of the last non null value in this [`ChunkedArray`].336pub fn last_non_null(&self) -> Option<usize> {337if self.null_count() == self.len() {338None339}340// We now know there is at least 1 non-null item in the array, and self.len() > 0341else if self.null_count() == 0 {342Some(self.len() - 1)343} else if self.is_sorted_any() {344let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {345// nulls are all at the start346self.len() - 1347} else {348// nulls are all at the end349self.len() - self.null_count() - 1350};351352debug_assert!(353// If we are lucky this catches something.354unsafe { self.get_unchecked(out) }.is_some(),355"incorrect sorted flag"356);357358Some(out)359} else {360last_non_null(self.chunks().iter().map(|arr| arr.as_ref()), self.len())361}362}363364pub fn drop_nulls(&self) -> Self {365if self.null_count() == 0 {366self.clone()367} else {368let chunks = self369.downcast_iter()370.map(|arr| {371if arr.null_count() == 0 {372arr.to_boxed()373} else {374filter_with_bitmap(arr, arr.validity().unwrap())375}376})377.collect();378unsafe {379Self::new_with_dims(380self.field.clone(),381chunks,382self.len() - self.null_count(),3830,384)385}386}387}388389/// Get the buffer of bits representing null values390#[inline]391#[allow(clippy::type_complexity)]392pub fn iter_validities(393&self,394) -> impl ExactSizeIterator<Item = Option<&Bitmap>> + DoubleEndedIterator {395fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {396arr.validity()397}398self.chunks.iter().map(to_validity)399}400401#[inline]402/// Return if any the chunks in this [`ChunkedArray`] have nulls.403pub fn has_nulls(&self) -> bool {404self.null_count > 0405}406407/// Shrink the capacity of this array to fit its length.408pub fn shrink_to_fit(&mut self) {409self.chunks = 
vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];410}411412pub fn clear(&self) -> Self {413// SAFETY: we keep the correct dtype414let mut ca = unsafe {415self.copy_with_chunks(vec![new_empty_array(416self.chunks.first().unwrap().dtype().clone(),417)])418};419420use StatisticsFlags as F;421ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);422ca423}424425/// Unpack a [`Series`] to the same physical type.426///427/// # Safety428///429/// This is unsafe as the dtype may be incorrect and430/// is assumed to be correct in other safe code.431pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(432&self,433series: &'a Series,434) -> &'a ChunkedArray<T> {435let series_trait = &**series;436if self.dtype() == series.dtype() {437&*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)438} else {439use DataType::*;440match (self.dtype(), series.dtype()) {441(Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {442&*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)443},444_ => panic!(445"cannot unpack series {:?} into matching type {:?}",446series,447self.dtype()448),449}450}451}452453/// Returns an iterator over the lengths of the chunks of the array.454pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {455self.chunks.iter().map(|chunk| chunk.len())456}457458/// A reference to the chunks459#[inline]460pub fn chunks(&self) -> &Vec<ArrayRef> {461&self.chunks462}463464/// A mutable reference to the chunks465///466/// # Safety467/// The caller must ensure to not change the [`DataType`] or `length` of any of the chunks.468/// And the `null_count` remains correct.469#[inline]470pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {471&mut self.chunks472}473474/// Returns true if contains a single chunk and has no null values475pub fn is_optimal_aligned(&self) -> bool {476self.chunks.len() == 1 && self.null_count() == 0477}478479/// Create a new [`ChunkedArray`] from self, where the chunks are 
replaced.480///481/// # Safety482/// The caller must ensure the dtypes of the chunks are correct483unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {484Self::new_with_compute_len(self.field.clone(), chunks)485}486487/// Get data type of [`ChunkedArray`].488pub fn dtype(&self) -> &DataType {489self.field.dtype()490}491492pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {493self.field = Arc::new(Field::new(self.name().clone(), dtype))494}495496/// Name of the [`ChunkedArray`].497pub fn name(&self) -> &PlSmallStr {498self.field.name()499}500501/// Get a reference to the field.502pub fn ref_field(&self) -> &Field {503&self.field504}505506/// Rename this [`ChunkedArray`].507pub fn rename(&mut self, name: PlSmallStr) {508self.field = Arc::new(Field::new(name, self.field.dtype().clone()));509}510511/// Return this [`ChunkedArray`] with a new name.512pub fn with_name(mut self, name: PlSmallStr) -> Self {513self.rename(name);514self515}516}517518impl<T> ChunkedArray<T>519where520T: PolarsDataType,521{522/// Get a single value from this [`ChunkedArray`]. If the return values is `None` this523/// indicates a NULL value.524///525/// # Panics526/// This function will panic if `idx` is out of bounds.527#[inline]528pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {529let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);530assert!(531chunk_idx < self.chunks().len(),532"index: {} out of bounds for len: {}",533idx,534self.len()535);536unsafe {537let arr = self.downcast_get_unchecked(chunk_idx);538assert!(539arr_idx < arr.len(),540"index: {} out of bounds for len: {}",541idx,542self.len()543);544arr.get_unchecked(arr_idx)545}546}547548/// Get a single value from this [`ChunkedArray`]. 
If the return values is `None` this549/// indicates a NULL value.550///551/// # Safety552/// It is the callers responsibility that the `idx < self.len()`.553#[inline]554pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {555let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);556557unsafe {558// SAFETY: up to the caller to make sure the index is valid.559self.downcast_get_unchecked(chunk_idx)560.get_unchecked(arr_idx)561}562}563564/// Get a single value from this [`ChunkedArray`]. Null values are ignored and the returned565/// value could be garbage if it was masked out by NULL. Note that the value always is initialized.566///567/// # Safety568/// It is the callers responsibility that the `idx < self.len()`.569#[inline]570pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {571let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);572573unsafe {574// SAFETY: up to the caller to make sure the index is valid.575self.downcast_get_unchecked(chunk_idx)576.value_unchecked(arr_idx)577}578}579580/// # Panics581/// Panics if the [`ChunkedArray`] is empty.582#[inline]583pub fn first(&self) -> Option<T::Physical<'_>> {584self.iter().next().unwrap()585}586587/// # Panics588/// Panics if the [`ChunkedArray`] is empty.589#[inline]590pub fn last(&self) -> Option<T::Physical<'_>> {591let arr = self592.downcast_iter()593.rev()594.find(|arr| !arr.is_empty())595.unwrap();596unsafe { arr.get_unchecked(arr.len() - 1) }597}598599pub fn set_validity(&mut self, validity: &Bitmap) {600assert_eq!(self.len(), validity.len());601let mut i = 0;602for chunk in unsafe { self.chunks_mut() } {603*chunk = chunk.with_validity(Some(validity.clone().sliced(i, chunk.len())));604i += chunk.len();605}606self.null_count = validity.unset_bits();607self.set_fast_explode_list(false);608}609}610611impl<T> ChunkedArray<T>612where613T: PolarsDataType,614ChunkedArray<T>: ChunkTakeUnchecked<[IdxSize]>,615{616/// Deposit values into nulls with a certain validity 
mask.617pub fn deposit(&self, validity: &Bitmap) -> Self {618let set_bits = validity.set_bits();619620assert_eq!(self.null_count(), 0);621assert_eq!(self.len(), set_bits);622623if set_bits == validity.len() {624return self.clone();625}626627if set_bits == 0 {628return Self::full_null_like(self, validity.len());629}630631let mut null_mask = validity.clone();632633let mut gather_idxs = Vec::with_capacity(validity.len());634let leading_nulls = null_mask.take_leading_zeros();635gather_idxs.extend(std::iter::repeat_n(0, leading_nulls + 1));636637let mut i = 0 as IdxSize;638gather_idxs.extend(null_mask.iter().skip(1).map(|v| {639i += IdxSize::from(v);640i641}));642643let mut ca = unsafe { ChunkTakeUnchecked::take_unchecked(self, &gather_idxs) };644ca.set_validity(validity);645ca646}647}648649impl ListChunked {650#[inline]651pub fn get_as_series(&self, idx: usize) -> Option<Series> {652unsafe {653Some(Series::from_chunks_and_dtype_unchecked(654self.name().clone(),655vec![self.get(idx)?],656&self.inner_dtype().to_physical(),657))658}659}660661pub fn has_empty_lists(&self) -> bool {662for arr in self.downcast_iter() {663if arr.is_empty() {664continue;665}666667if match arr.validity() {668None => arr.offsets().lengths().any(|l| l == 0),669Some(validity) => arr670.offsets()671.lengths()672.enumerate()673.any(|(i, l)| l == 0 && unsafe { validity.get_bit_unchecked(i) }),674} {675return true;676}677}678679false680}681682pub fn has_masked_out_values(&self) -> bool {683for arr in self.downcast_iter() {684if arr.is_empty() {685continue;686}687688if *arr.offsets().first() != 0 || *arr.offsets().last() != arr.values().len() as i64 {689return true;690}691692let Some(validity) = arr.validity() else {693continue;694};695if validity.set_bits() == 0 {696continue;697}698699// @Performance: false_idx_iter700for i in (!validity).true_idx_iter() {701if arr.offsets().length_at(i) > 0 {702return true;703}704}705}706707false708}709}710711#[cfg(feature = "dtype-array")]712impl ArrayChunked 
{713#[inline]714pub fn get_as_series(&self, idx: usize) -> Option<Series> {715unsafe {716Some(Series::from_chunks_and_dtype_unchecked(717self.name().clone(),718vec![self.get(idx)?],719&self.inner_dtype().to_physical(),720))721}722}723724pub fn from_aligned_values(725name: PlSmallStr,726inner_dtype: &DataType,727width: usize,728chunks: Vec<ArrayRef>,729length: usize,730) -> Self {731let dtype = DataType::Array(Box::new(inner_dtype.clone()), width);732let arrow_dtype = dtype.to_arrow(CompatLevel::newest());733let field = Arc::new(Field::new(name, dtype));734if width == 0 {735use arrow::array::builder::{ArrayBuilder, make_builder};736let values = make_builder(&inner_dtype.to_arrow(CompatLevel::newest())).freeze();737return ArrayChunked::new_with_compute_len(738field,739vec![FixedSizeListArray::new(arrow_dtype, length, values, None).into_boxed()],740);741}742let mut total_len = 0;743let chunks = chunks744.into_iter()745.map(|chunk| {746debug_assert_eq!(chunk.len() % width, 0);747let chunk_len = chunk.len() / width;748total_len += chunk_len;749FixedSizeListArray::new(arrow_dtype.clone(), chunk_len, chunk, None).into_boxed()750})751.collect();752debug_assert_eq!(total_len, length);753754unsafe { Self::new_with_dims(field, chunks, length, 0) }755}756757/// Turn the ArrayChunked into the ListChunked with the same items.758///759/// This will always zero copy the values into the ListChunked.760pub fn to_list(&self) -> ListChunked {761let inner_dtype = self.inner_dtype();762let chunks = self763.downcast_iter()764.map(|chunk| {765use arrow::offset::OffsetsBuffer;766767let inner_dtype = chunk.dtype().inner_dtype().unwrap();768let dtype = inner_dtype.clone().to_large_list(true);769770let offsets = (0..=chunk.len())771.map(|i| (i * self.width()) as i64)772.collect::<Vec<i64>>();773774// SAFETY: We created our offsets in ascending manner.775let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) 
};776777ListArray::<i64>::new(778dtype,779offsets,780chunk.values().clone(),781chunk.validity().cloned(),782)783.into_boxed()784})785.collect();786787// SAFETY: All the items were mapped 1-1 with the validity staying the same.788let mut ca = unsafe {789ListChunked::new_with_dims(790Arc::new(Field::new(791self.name().clone(),792DataType::List(Box::new(inner_dtype.clone())),793)),794chunks,795self.len(),796self.null_count(),797)798};799ca.set_fast_explode_list(!self.has_nulls());800ca801}802}803804impl<T> ChunkedArray<T>805where806T: PolarsDataType,807{808/// Should be used to match the chunk_id of another [`ChunkedArray`].809/// # Panics810/// It is the callers responsibility to ensure that this [`ChunkedArray`] has a single chunk.811pub fn match_chunks<I>(&self, chunk_id: I) -> Self812where813I: Iterator<Item = usize>,814{815debug_assert!(self.chunks.len() == 1);816// Takes a ChunkedArray containing a single chunk.817let slice = |ca: &Self| {818let array = &ca.chunks[0];819820let mut offset = 0;821let chunks = chunk_id822.map(|len| {823// SAFETY: within bounds.824debug_assert!((offset + len) <= array.len());825let out = unsafe { array.sliced_unchecked(offset, len) };826offset += len;827out828})829.collect();830831debug_assert_eq!(offset, array.len());832833// SAFETY: We just slice the original chunks, their type will not change.834unsafe {835Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())836}837};838839if self.chunks.len() != 1 {840let out = self.rechunk();841slice(&out)842} else {843slice(self)844}845}846}847848impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {849fn as_ref_dtype(&self) -> &DataType {850self.dtype()851}852}853854pub(crate) trait AsSinglePtr: AsRefDataType {855/// Rechunk and return a ptr to the start of the array856fn as_single_ptr(&mut self) -> PolarsResult<usize> {857polars_bail!(opq = as_single_ptr, self.as_ref_dtype());858}859}860861impl<T> AsSinglePtr for ChunkedArray<T>862where863T: 
PolarsNumericType,864{865fn as_single_ptr(&mut self) -> PolarsResult<usize> {866self.rechunk_mut();867let a = self.data_views().next().unwrap();868let ptr = a.as_ptr();869Ok(ptr as usize)870}871}872873impl AsSinglePtr for BooleanChunked {}874impl AsSinglePtr for ListChunked {}875#[cfg(feature = "dtype-array")]876impl AsSinglePtr for ArrayChunked {}877impl AsSinglePtr for StringChunked {}878impl AsSinglePtr for BinaryChunked {}879#[cfg(feature = "object")]880impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}881882pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {883SingleNoNull(&'a T::Array),884Single(&'a T::Array),885MultiNoNull(&'a ChunkedArray<T>),886Multi(&'a ChunkedArray<T>),887}888889impl<T> ChunkedArray<T>890where891T: PolarsDataType,892{893pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {894if self.chunks.len() == 1 {895let arr = self.downcast_iter().next().unwrap();896return if arr.null_count() == 0 {897ChunkedArrayLayout::SingleNoNull(arr)898} else {899ChunkedArrayLayout::Single(arr)900};901}902903if self.downcast_iter().all(|a| a.null_count() == 0) {904ChunkedArrayLayout::MultiNoNull(self)905} else {906ChunkedArrayLayout::Multi(self)907}908}909}910911impl<T> ChunkedArray<T>912where913T: PolarsNumericType,914{915/// Returns the values of the array as a contiguous slice.916pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {917polars_ensure!(918self.chunks.len() == 1 && self.chunks[0].null_count() == 0,919ComputeError: "chunked array is not contiguous"920);921Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())922}923924/// Returns the values of the array as a contiguous mutable slice.925pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {926if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {927// SAFETY, we will not swap the PrimitiveArray.928let arr = unsafe { self.downcast_iter_mut().next().unwrap() };929arr.get_mut_values()930} else {931None932}933}934935/// Get slices of the underlying arrow 
data.936/// NOTE: null values should be taken into account by the user of these slices as they are handled937/// separately938pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {939self.downcast_iter().map(|arr| arr.values().as_slice())940}941942#[allow(clippy::wrong_self_convention)]943pub fn into_no_null_iter(944&self,945) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen946{947// .copied was significantly slower in benchmark, next call did not inline?948#[allow(clippy::map_clone)]949// we know the iterators len950unsafe {951self.data_views()952.flatten()953.map(|v| *v)954.trust_my_length(self.len())955}956}957}958959impl<T: PolarsDataType> Clone for ChunkedArray<T> {960fn clone(&self) -> Self {961ChunkedArray {962field: self.field.clone(),963chunks: self.chunks.clone(),964flags: self.flags.clone(),965966_pd: Default::default(),967length: self.length,968null_count: self.null_count,969}970}971}972973impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {974fn as_ref(&self) -> &ChunkedArray<T> {975self976}977}978979impl ValueSize for ListChunked {980fn get_values_size(&self) -> usize {981self.chunks982.iter()983.fold(0usize, |acc, arr| acc + arr.get_values_size())984}985}986987#[cfg(feature = "dtype-array")]988impl ValueSize for ArrayChunked {989fn get_values_size(&self) -> usize {990self.chunks991.iter()992.fold(0usize, |acc, arr| acc + arr.get_values_size())993}994}995impl ValueSize for StringChunked {996fn get_values_size(&self) -> usize {997self.chunks998.iter()999.fold(0usize, |acc, arr| acc + arr.get_values_size())1000}1001}10021003impl ValueSize for BinaryOffsetChunked {1004fn get_values_size(&self) -> usize {1005self.chunks1006.iter()1007.fold(0usize, |acc, arr| acc + arr.get_values_size())1008}1009}10101011pub(crate) fn to_primitive<T: PolarsNumericType>(1012values: Vec<T::Native>,1013validity: Option<Bitmap>,1014) -> PrimitiveArray<T::Native> 
{1015PrimitiveArray::new(1016T::get_static_dtype().to_arrow(CompatLevel::newest()),1017values.into(),1018validity,1019)1020}10211022pub(crate) fn to_array<T: PolarsNumericType>(1023values: Vec<T::Native>,1024validity: Option<Bitmap>,1025) -> ArrayRef {1026Box::new(to_primitive::<T>(values, validity))1027}10281029impl<T: PolarsDataType> Default for ChunkedArray<T> {1030fn default() -> Self {1031let dtype = T::get_static_dtype();1032let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());1033ChunkedArray {1034field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),1035// Invariant: always has 1 chunk.1036chunks: vec![new_empty_array(arrow_dtype)],1037flags: StatisticsFlagsIM::empty(),10381039_pd: Default::default(),1040length: 0,1041null_count: 0,1042}1043}1044}10451046#[cfg(test)]1047pub(crate) mod test {1048use crate::prelude::*;10491050pub(crate) fn get_chunked_array() -> Int32Chunked {1051ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])1052}10531054#[test]1055fn test_sort() {1056let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);1057let b = a1058.sort(false)1059.into_iter()1060.map(|opt| opt.unwrap())1061.collect::<Vec<_>>();1062assert_eq!(b, [1, 2, 3, 9]);1063let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);1064let a = a.sort(false);1065let b = a.into_iter().collect::<Vec<_>>();1066assert_eq!(b, [Some("a"), Some("b"), Some("c")]);1067assert!(a.is_sorted_ascending_flag());1068}10691070#[test]1071fn arithmetic() {1072let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);1073let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);10741075// Not really asserting anything here but still making sure the code is exercised1076// This (and more) is properly tested from the integration test suite and Python bindings.1077println!("{:?}", a + b);1078println!("{:?}", a - b);1079println!("{:?}", a * b);1080println!("{:?}", a / b);1081}10821083#[test]1084fn iter() 
{1085let s1 = get_chunked_array();1086// sum1087assert_eq!(s1.into_iter().fold(0, |acc, val| { acc + val.unwrap() }), 6)1088}10891090#[test]1091fn limit() {1092let a = get_chunked_array();1093let b = a.limit(2);1094println!("{b:?}");1095assert_eq!(b.len(), 2)1096}10971098#[test]1099fn filter() {1100let a = get_chunked_array();1101let b = a1102.filter(&BooleanChunked::new(1103PlSmallStr::from_static("filter"),1104&[true, false, false],1105))1106.unwrap();1107assert_eq!(b.len(), 1);1108assert_eq!(b.into_iter().next(), Some(Some(1)));1109}11101111#[test]1112fn aggregates() {1113let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);1114assert_eq!(a.max(), Some(100));1115assert_eq!(a.min(), Some(1));1116assert_eq!(a.sum(), Some(120))1117}11181119#[test]1120fn take() {1121let a = get_chunked_array();1122let new = a.take(&[0 as IdxSize, 1]).unwrap();1123assert_eq!(new.len(), 2)1124}11251126#[test]1127fn cast() {1128let a = get_chunked_array();1129let b = a.cast(&DataType::Int64).unwrap();1130assert_eq!(b.dtype(), &DataType::Int64)1131}11321133fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])1134where1135T: PolarsNumericType,1136{1137assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)1138}11391140#[test]1141fn slice() {1142let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);1143let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);1144first.append(&second).unwrap();1145assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);1146assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);1147assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);1148assert_slice_equal(&first.slice(3, 2), &[3, 4]);1149assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);1150assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);1151assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);11521153assert_eq!(first.slice(-7, 2).len(), 1);1154assert_eq!(first.slice(-3, 4).len(), 
3);1155assert_eq!(first.slice(3, 4).len(), 3);1156assert_eq!(first.slice(10, 4).len(), 0);1157}11581159#[test]1160fn sorting() {1161let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);1162let sorted = s.sort(false);1163assert_slice_equal(&sorted, &[2, 4, 9]);1164let sorted = s.sort(true);1165assert_slice_equal(&sorted, &[9, 4, 2]);11661167let s: StringChunked = ["b", "a", "z"].iter().collect();1168let sorted = s.sort(false);1169assert_eq!(1170sorted.into_iter().collect::<Vec<_>>(),1171&[Some("a"), Some("b"), Some("z")]1172);1173let sorted = s.sort(true);1174assert_eq!(1175sorted.into_iter().collect::<Vec<_>>(),1176&[Some("z"), Some("b"), Some("a")]1177);1178let s: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();1179let sorted = s.sort(false);1180assert_eq!(1181sorted.into_iter().collect::<Vec<_>>(),1182&[None, Some("b"), Some("z")]1183);1184}11851186#[test]1187fn reverse() {1188let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);1189// path with continuous slice1190assert_slice_equal(&s.reverse(), &[3, 2, 1]);1191// path with options1192let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);1193assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]);1194let s = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);1195assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]);11961197let s = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);1198assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]);11991200let s = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);1201assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]);1202}12031204#[test]1205#[cfg(feature = "dtype-categorical")]1206fn test_iter_categorical() {1207let ca = StringChunked::new(1208PlSmallStr::EMPTY,1209&[Some("foo"), None, Some("bar"), Some("ham")],1210);1211let cats = Categories::new(1212PlSmallStr::EMPTY,1213PlSmallStr::EMPTY,1214CategoricalPhysical::U32,1215);1216let ca = 
ca.cast(&DataType::from_categories(cats)).unwrap();1217let ca = ca.cat32().unwrap();1218let v: Vec<_> = ca.physical().into_iter().collect();1219assert_eq!(v, &[Some(0), None, Some(1), Some(2)]);1220}12211222#[test]1223#[ignore]1224fn test_shrink_to_fit() {1225let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);1226builder.append_value("foo");1227let mut arr = builder.finish();1228let before = arr1229.chunks()1230.iter()1231.map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))1232.sum::<usize>();1233arr.shrink_to_fit();1234let after = arr1235.chunks()1236.iter()1237.map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))1238.sum::<usize>();1239assert!(before > after);1240}1241}124212431244