Path: blob/main/crates/polars-core/src/chunked_array/mod.rs
6940 views
//! The typed heart of every Series column.1#![allow(unsafe_op_in_unsafe_fn)]2use std::iter::Map;3use std::sync::Arc;45use arrow::array::*;6use arrow::bitmap::Bitmap;7use arrow::compute::concatenate::concatenate_unchecked;8use polars_compute::filter::filter_with_bitmap;910use crate::prelude::*;1112pub mod ops;13#[macro_use]14pub mod arithmetic;15pub mod builder;16pub mod cast;17pub mod collect;18pub mod comparison;19pub mod flags;20pub mod float;21pub mod iterator;22#[cfg(feature = "ndarray")]23pub(crate) mod ndarray;2425#[cfg(feature = "dtype-array")]26pub(crate) mod array;27mod binary;28mod binary_offset;29mod bitwise;30#[cfg(feature = "object")]31mod drop;32mod from;33mod from_iterator;34pub mod from_iterator_par;35pub(crate) mod list;36pub(crate) mod logical;37#[cfg(feature = "object")]38pub mod object;39#[cfg(feature = "random")]40mod random;41#[cfg(feature = "dtype-struct")]42mod struct_;43#[cfg(any(44feature = "temporal",45feature = "dtype-datetime",46feature = "dtype-date"47))]48pub mod temporal;49mod to_vec;50mod trusted_len;5152use std::slice::Iter;5354use arrow::legacy::prelude::*;55#[cfg(feature = "dtype-struct")]56pub use struct_::StructChunked;5758use self::flags::{StatisticsFlags, StatisticsFlagsIM};59use crate::series::IsSorted;60use crate::utils::{first_non_null, last_non_null};6162#[cfg(not(feature = "dtype-categorical"))]63pub struct RevMapping {}6465pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;6667/// # ChunkedArray68///69/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. This allows70/// us to apply closures to the data and collect the results to a [`ChunkedArray`] of the same type `T`.71/// Below we use an apply to use the cosine function to the values of a [`ChunkedArray`].72///73/// ```rust74/// # use polars_core::prelude::*;75/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {76/// ca.apply_values(|v| v.cos())77/// }78/// ```79///80/// ## Conversion between Series and ChunkedArrays81/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.82///83/// ```rust84/// # use polars_core::prelude::*;85/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked>{86/// series.i32()87/// }88///89/// fn to_series(ca: Int32Chunked) -> Series {90/// ca.into_series()91/// }92/// ```93///94/// # Iterators95///96/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)97/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby98/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).99///100/// ```rust101/// # use polars_core::prelude::*;102///103/// fn iter_forward(ca: &Float32Chunked) {104/// ca.iter()105/// .for_each(|opt_v| println!("{:?}", opt_v))106/// }107///108/// fn iter_backward(ca: &Float32Chunked) {109/// ca.iter()110/// .rev()111/// .for_each(|opt_v| println!("{:?}", opt_v))112/// }113/// ```114///115/// # Memory layout116///117/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as backend for the memory layout.118/// Arrows memory is immutable which makes it possible to make multiple zero copy (sub)-views from a single array.119///120/// To be able to append data, Polars uses chunks to append new memory locations, hence the [`ChunkedArray<T>`] data structure.121/// Appends are cheap, because it will not lead to a full reallocation of the whole array (as could be the case with a Rust Vec).122///123/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access because we have an extra indirection124/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.125/// When multiplying two [`ChunkedArray`]s with different chunk sizes they cannot utilize [SIMD](https://en.wikipedia.org/wiki/SIMD) for instance.126///127/// If you want to have predictable performance128/// (no unexpected re-allocation of memory), it is advised to call the [`ChunkedArray::rechunk`] after129/// multiple append operations.130///131/// See also [`ChunkedArray::extend`] for appends within a chunk.132///133/// # Invariants134/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].135/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]136/// chunks.137/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the138/// logical type given by the datatype.139///140/// [`List`]: crate::datatypes::DataType::List141pub struct ChunkedArray<T: PolarsDataType> {142pub(crate) field: Arc<Field>,143pub(crate) chunks: Vec<ArrayRef>,144145pub(crate) flags: StatisticsFlagsIM,146147length: usize,148null_count: usize,149_pd: std::marker::PhantomData<T>,150}151152impl<T: PolarsDataType> ChunkedArray<T> {153fn should_rechunk(&self) -> bool {154self.chunks.len() > 1 && self.chunks.len() > self.len() / 3155}156157fn optional_rechunk(mut self) -> Self {158// Rechunk if we have many small chunks.159if self.should_rechunk() {160self.rechunk_mut()161}162self163}164165pub(crate) fn as_any(&self) -> &dyn std::any::Any {166self167}168169/// Series to [`ChunkedArray<T>`]170pub fn unpack_series_matching_type<'a>(171&self,172series: &'a Series,173) -> PolarsResult<&'a ChunkedArray<T>> {174match self.dtype() {175#[cfg(feature = "dtype-decimal")]176DataType::Decimal(_, _) => {177let logical = series.decimal()?;178179let ca = logical.physical();180Ok(ca.as_any().downcast_ref::<ChunkedArray<T>>().unwrap())181},182dt => {183polars_ensure!(184dt == series.dtype(),185SchemaMismatch: "cannot unpack series of type `{}` into `{}`",186series.dtype(),187dt,188);189190// SAFETY:191// dtype will be correct.192Ok(unsafe { self.unpack_series_matching_physical_type(series) })193},194}195}196197/// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.198///199/// If you want to explicitly the `length` and `null_count`, look at200/// [`ChunkedArray::new_with_dims`]201fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {202unsafe {203let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);204chunked_arr.compute_len();205chunked_arr206}207}208209/// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.210/// # Safety211/// The length and null_count must be correct.212pub unsafe fn new_with_dims(213field: Arc<Field>,214chunks: Vec<ArrayRef>,215length: usize,216null_count: usize,217) -> Self {218Self {219field,220chunks,221flags: StatisticsFlagsIM::empty(),222223_pd: Default::default(),224length,225null_count,226}227}228229pub(crate) fn is_sorted_ascending_flag(&self) -> bool {230self.get_flags().is_sorted_ascending()231}232233pub(crate) fn is_sorted_descending_flag(&self) -> bool {234self.get_flags().is_sorted_descending()235}236237/// Whether `self` is sorted in any direction.238pub(crate) fn is_sorted_any(&self) -> bool {239self.get_flags().is_sorted_any()240}241242pub fn unset_fast_explode_list(&mut self) {243self.set_fast_explode_list(false)244}245246pub fn set_fast_explode_list(&mut self, value: bool) {247let mut flags = self.flags.get_mut();248flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);249self.flags.set_mut(flags);250}251252pub fn get_fast_explode_list(&self) -> bool {253self.get_flags().can_fast_explode_list()254}255256pub fn get_flags(&self) -> StatisticsFlags {257self.flags.get()258}259260/// Set flags for the [`ChunkedArray`]261pub fn set_flags(&mut self, flags: StatisticsFlags) {262self.flags = StatisticsFlagsIM::new(flags);263}264265pub fn is_sorted_flag(&self) -> IsSorted {266self.get_flags().is_sorted()267}268269pub fn retain_flags_from<U: PolarsDataType>(270&mut self,271from: &ChunkedArray<U>,272retain_flags: StatisticsFlags,273) {274let flags = from.flags.get();275// Try to avoid write contention.276if !flags.is_empty() {277self.set_flags(flags & retain_flags)278}279}280281/// Set the 'sorted' bit meta info.282pub fn set_sorted_flag(&mut self, sorted: IsSorted) {283let mut flags = self.flags.get_mut();284flags.set_sorted(sorted);285self.flags.set_mut(flags);286}287288/// Set the 'sorted' bit meta info.289pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {290let mut out = self.clone();291out.set_sorted_flag(sorted);292out293}294295/// Get the index of the first non null value in this [`ChunkedArray`].296pub fn first_non_null(&self) -> Option<usize> {297if self.null_count() == self.len() {298None299}300// We now know there is at least 1 non-null item in the array, and self.len() > 0301else if self.null_count() == 0 {302Some(0)303} else if self.is_sorted_any() {304let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {305// nulls are all at the start306self.null_count()307} else {308// nulls are all at the end3090310};311312debug_assert!(313// If we are lucky this catches something.314unsafe { self.get_unchecked(out) }.is_some(),315"incorrect sorted flag"316);317318Some(out)319} else {320first_non_null(self.iter_validities())321}322}323324/// Get the index of the last non null value in this [`ChunkedArray`].325pub fn last_non_null(&self) -> Option<usize> {326if self.null_count() == self.len() {327None328}329// We now know there is at least 1 non-null item in the array, and self.len() > 0330else if self.null_count() == 0 {331Some(self.len() - 1)332} else if self.is_sorted_any() {333let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {334// nulls are all at the start335self.len() - 1336} else {337// nulls are all at the end338self.len() - self.null_count() - 1339};340341debug_assert!(342// If we are lucky this catches something.343unsafe { self.get_unchecked(out) }.is_some(),344"incorrect sorted flag"345);346347Some(out)348} else {349last_non_null(self.iter_validities(), self.len())350}351}352353pub fn drop_nulls(&self) -> Self {354if self.null_count() == 0 {355self.clone()356} else {357let chunks = self358.downcast_iter()359.map(|arr| {360if arr.null_count() == 0 {361arr.to_boxed()362} else {363filter_with_bitmap(arr, arr.validity().unwrap())364}365})366.collect();367unsafe {368Self::new_with_dims(369self.field.clone(),370chunks,371self.len() - self.null_count(),3720,373)374}375}376}377378/// Get the buffer of bits representing null values379#[inline]380#[allow(clippy::type_complexity)]381pub fn iter_validities(&self) -> Map<Iter<'_, ArrayRef>, fn(&ArrayRef) -> Option<&Bitmap>> {382fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {383arr.validity()384}385self.chunks.iter().map(to_validity)386}387388#[inline]389/// Return if any the chunks in this [`ChunkedArray`] have nulls.390pub fn has_nulls(&self) -> bool {391self.null_count > 0392}393394/// Shrink the capacity of this array to fit its length.395pub fn shrink_to_fit(&mut self) {396self.chunks = vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];397}398399pub fn clear(&self) -> Self {400// SAFETY: we keep the correct dtype401let mut ca = unsafe {402self.copy_with_chunks(vec![new_empty_array(403self.chunks.first().unwrap().dtype().clone(),404)])405};406407use StatisticsFlags as F;408ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);409ca410}411412/// Unpack a [`Series`] to the same physical type.413///414/// # Safety415///416/// This is unsafe as the dtype may be incorrect and417/// is assumed to be correct in other safe code.418pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(419&self,420series: &'a Series,421) -> &'a ChunkedArray<T> {422let series_trait = &**series;423if self.dtype() == series.dtype() {424&*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)425} else {426use DataType::*;427match (self.dtype(), series.dtype()) {428(Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {429&*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)430},431_ => panic!(432"cannot unpack series {:?} into matching type {:?}",433series,434self.dtype()435),436}437}438}439440/// Returns an iterator over the lengths of the chunks of the array.441pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {442self.chunks.iter().map(|chunk| chunk.len())443}444445/// A reference to the chunks446#[inline]447pub fn chunks(&self) -> &Vec<ArrayRef> {448&self.chunks449}450451/// A mutable reference to the chunks452///453/// # Safety454/// The caller must ensure to not change the [`DataType`] or `length` of any of the chunks.455/// And the `null_count` remains correct.456#[inline]457pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {458&mut self.chunks459}460461/// Returns true if contains a single chunk and has no null values462pub fn is_optimal_aligned(&self) -> bool {463self.chunks.len() == 1 && self.null_count() == 0464}465466/// Create a new [`ChunkedArray`] from self, where the chunks are replaced.467///468/// # Safety469/// The caller must ensure the dtypes of the chunks are correct470unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {471Self::new_with_compute_len(self.field.clone(), chunks)472}473474/// Get data type of [`ChunkedArray`].475pub fn dtype(&self) -> &DataType {476self.field.dtype()477}478479pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {480self.field = Arc::new(Field::new(self.name().clone(), dtype))481}482483/// Name of the [`ChunkedArray`].484pub fn name(&self) -> &PlSmallStr {485self.field.name()486}487488/// Get a reference to the field.489pub fn ref_field(&self) -> &Field {490&self.field491}492493/// Rename this [`ChunkedArray`].494pub fn rename(&mut self, name: PlSmallStr) {495self.field = Arc::new(Field::new(name, self.field.dtype().clone()));496}497498/// Return this [`ChunkedArray`] with a new name.499pub fn with_name(mut self, name: PlSmallStr) -> Self {500self.rename(name);501self502}503}504505impl<T> ChunkedArray<T>506where507T: PolarsDataType,508{509/// Get a single value from this [`ChunkedArray`]. If the return values is `None` this510/// indicates a NULL value.511///512/// # Panics513/// This function will panic if `idx` is out of bounds.514#[inline]515pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {516let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);517assert!(518chunk_idx < self.chunks().len(),519"index: {} out of bounds for len: {}",520idx,521self.len()522);523unsafe {524let arr = self.downcast_get_unchecked(chunk_idx);525assert!(526arr_idx < arr.len(),527"index: {} out of bounds for len: {}",528idx,529self.len()530);531arr.get_unchecked(arr_idx)532}533}534535/// Get a single value from this [`ChunkedArray`]. If the return values is `None` this536/// indicates a NULL value.537///538/// # Safety539/// It is the callers responsibility that the `idx < self.len()`.540#[inline]541pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {542let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);543544unsafe {545// SAFETY: up to the caller to make sure the index is valid.546self.downcast_get_unchecked(chunk_idx)547.get_unchecked(arr_idx)548}549}550551/// Get a single value from this [`ChunkedArray`]. Null values are ignored and the returned552/// value could be garbage if it was masked out by NULL. Note that the value always is initialized.553///554/// # Safety555/// It is the callers responsibility that the `idx < self.len()`.556#[inline]557pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {558let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);559560unsafe {561// SAFETY: up to the caller to make sure the index is valid.562self.downcast_get_unchecked(chunk_idx)563.value_unchecked(arr_idx)564}565}566567#[inline]568pub fn first(&self) -> Option<T::Physical<'_>> {569unsafe {570let arr = self.downcast_get_unchecked(0);571arr.get_unchecked(0)572}573}574575#[inline]576pub fn last(&self) -> Option<T::Physical<'_>> {577unsafe {578let arr = self.downcast_get_unchecked(self.chunks.len().checked_sub(1)?);579arr.get_unchecked(arr.len().checked_sub(1)?)580}581}582}583584impl ListChunked {585#[inline]586pub fn get_as_series(&self, idx: usize) -> Option<Series> {587unsafe {588Some(Series::from_chunks_and_dtype_unchecked(589self.name().clone(),590vec![self.get(idx)?],591&self.inner_dtype().to_physical(),592))593}594}595}596597#[cfg(feature = "dtype-array")]598impl ArrayChunked {599#[inline]600pub fn get_as_series(&self, idx: usize) -> Option<Series> {601unsafe {602Some(Series::from_chunks_and_dtype_unchecked(603self.name().clone(),604vec![self.get(idx)?],605&self.inner_dtype().to_physical(),606))607}608}609}610611impl<T> ChunkedArray<T>612where613T: PolarsDataType,614{615/// Should be used to match the chunk_id of another [`ChunkedArray`].616/// # Panics617/// It is the callers responsibility to ensure that this [`ChunkedArray`] has a single chunk.618pub fn match_chunks<I>(&self, chunk_id: I) -> Self619where620I: Iterator<Item = usize>,621{622debug_assert!(self.chunks.len() == 1);623// Takes a ChunkedArray containing a single chunk.624let slice = |ca: &Self| {625let array = &ca.chunks[0];626627let mut offset = 0;628let chunks = chunk_id629.map(|len| {630// SAFETY: within bounds.631debug_assert!((offset + len) <= array.len());632let out = unsafe { array.sliced_unchecked(offset, len) };633offset += len;634out635})636.collect();637638debug_assert_eq!(offset, array.len());639640// SAFETY: We just slice the original chunks, their type will not change.641unsafe {642Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())643}644};645646if self.chunks.len() != 1 {647let out = self.rechunk();648slice(&out)649} else {650slice(self)651}652}653}654655impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {656fn as_ref_dtype(&self) -> &DataType {657self.dtype()658}659}660661pub(crate) trait AsSinglePtr: AsRefDataType {662/// Rechunk and return a ptr to the start of the array663fn as_single_ptr(&mut self) -> PolarsResult<usize> {664polars_bail!(opq = as_single_ptr, self.as_ref_dtype());665}666}667668impl<T> AsSinglePtr for ChunkedArray<T>669where670T: PolarsNumericType,671{672fn as_single_ptr(&mut self) -> PolarsResult<usize> {673self.rechunk_mut();674let a = self.data_views().next().unwrap();675let ptr = a.as_ptr();676Ok(ptr as usize)677}678}679680impl AsSinglePtr for BooleanChunked {}681impl AsSinglePtr for ListChunked {}682#[cfg(feature = "dtype-array")]683impl AsSinglePtr for ArrayChunked {}684impl AsSinglePtr for StringChunked {}685impl AsSinglePtr for BinaryChunked {}686#[cfg(feature = "object")]687impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}688689pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {690SingleNoNull(&'a T::Array),691Single(&'a T::Array),692MultiNoNull(&'a ChunkedArray<T>),693Multi(&'a ChunkedArray<T>),694}695696impl<T> ChunkedArray<T>697where698T: PolarsDataType,699{700pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {701if self.chunks.len() == 1 {702let arr = self.downcast_iter().next().unwrap();703return if arr.null_count() == 0 {704ChunkedArrayLayout::SingleNoNull(arr)705} else {706ChunkedArrayLayout::Single(arr)707};708}709710if self.downcast_iter().all(|a| a.null_count() == 0) {711ChunkedArrayLayout::MultiNoNull(self)712} else {713ChunkedArrayLayout::Multi(self)714}715}716}717718impl<T> ChunkedArray<T>719where720T: PolarsNumericType,721{722/// Returns the values of the array as a contiguous slice.723pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {724polars_ensure!(725self.chunks.len() == 1 && self.chunks[0].null_count() == 0,726ComputeError: "chunked array is not contiguous"727);728Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())729}730731/// Returns the values of the array as a contiguous mutable slice.732pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {733if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {734// SAFETY, we will not swap the PrimitiveArray.735let arr = unsafe { self.downcast_iter_mut().next().unwrap() };736arr.get_mut_values()737} else {738None739}740}741742/// Get slices of the underlying arrow data.743/// NOTE: null values should be taken into account by the user of these slices as they are handled744/// separately745pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {746self.downcast_iter().map(|arr| arr.values().as_slice())747}748749#[allow(clippy::wrong_self_convention)]750pub fn into_no_null_iter(751&self,752) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen753{754// .copied was significantly slower in benchmark, next call did not inline?755#[allow(clippy::map_clone)]756// we know the iterators len757unsafe {758self.data_views()759.flatten()760.map(|v| *v)761.trust_my_length(self.len())762}763}764}765766impl<T: PolarsDataType> Clone for ChunkedArray<T> {767fn clone(&self) -> Self {768ChunkedArray {769field: self.field.clone(),770chunks: self.chunks.clone(),771flags: self.flags.clone(),772773_pd: Default::default(),774length: self.length,775null_count: self.null_count,776}777}778}779780impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {781fn as_ref(&self) -> &ChunkedArray<T> {782self783}784}785786impl ValueSize for ListChunked {787fn get_values_size(&self) -> usize {788self.chunks789.iter()790.fold(0usize, |acc, arr| acc + arr.get_values_size())791}792}793794#[cfg(feature = "dtype-array")]795impl ValueSize for ArrayChunked {796fn get_values_size(&self) -> usize {797self.chunks798.iter()799.fold(0usize, |acc, arr| acc + arr.get_values_size())800}801}802impl ValueSize for StringChunked {803fn get_values_size(&self) -> usize {804self.chunks805.iter()806.fold(0usize, |acc, arr| acc + arr.get_values_size())807}808}809810impl ValueSize for BinaryOffsetChunked {811fn get_values_size(&self) -> usize {812self.chunks813.iter()814.fold(0usize, |acc, arr| acc + arr.get_values_size())815}816}817818pub(crate) fn to_primitive<T: PolarsNumericType>(819values: Vec<T::Native>,820validity: Option<Bitmap>,821) -> PrimitiveArray<T::Native> {822PrimitiveArray::new(823T::get_static_dtype().to_arrow(CompatLevel::newest()),824values.into(),825validity,826)827}828829pub(crate) fn to_array<T: PolarsNumericType>(830values: Vec<T::Native>,831validity: Option<Bitmap>,832) -> ArrayRef {833Box::new(to_primitive::<T>(values, validity))834}835836impl<T: PolarsDataType> Default for ChunkedArray<T> {837fn default() -> Self {838let dtype = T::get_static_dtype();839let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());840ChunkedArray {841field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),842// Invariant: always has 1 chunk.843chunks: vec![new_empty_array(arrow_dtype)],844flags: StatisticsFlagsIM::empty(),845846_pd: Default::default(),847length: 0,848null_count: 0,849}850}851}852853#[cfg(test)]854pub(crate) mod test {855use crate::prelude::*;856857pub(crate) fn get_chunked_array() -> Int32Chunked {858ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])859}860861#[test]862fn test_sort() {863let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);864let b = a865.sort(false)866.into_iter()867.map(|opt| opt.unwrap())868.collect::<Vec<_>>();869assert_eq!(b, [1, 2, 3, 9]);870let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);871let a = a.sort(false);872let b = a.into_iter().collect::<Vec<_>>();873assert_eq!(b, [Some("a"), Some("b"), Some("c")]);874assert!(a.is_sorted_ascending_flag());875}876877#[test]878fn arithmetic() {879let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);880let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);881882// Not really asserting anything here but still making sure the code is exercised883// This (and more) is properly tested from the integration test suite and Python bindings.884println!("{:?}", a + b);885println!("{:?}", a - b);886println!("{:?}", a * b);887println!("{:?}", a / b);888}889890#[test]891fn iter() {892let s1 = get_chunked_array();893// sum894assert_eq!(s1.into_iter().fold(0, |acc, val| { acc + val.unwrap() }), 6)895}896897#[test]898fn limit() {899let a = get_chunked_array();900let b = a.limit(2);901println!("{b:?}");902assert_eq!(b.len(), 2)903}904905#[test]906fn filter() {907let a = get_chunked_array();908let b = a909.filter(&BooleanChunked::new(910PlSmallStr::from_static("filter"),911&[true, false, false],912))913.unwrap();914assert_eq!(b.len(), 1);915assert_eq!(b.into_iter().next(), Some(Some(1)));916}917918#[test]919fn aggregates() {920let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);921assert_eq!(a.max(), Some(100));922assert_eq!(a.min(), Some(1));923assert_eq!(a.sum(), Some(120))924}925926#[test]927fn take() {928let a = get_chunked_array();929let new = a.take(&[0 as IdxSize, 1]).unwrap();930assert_eq!(new.len(), 2)931}932933#[test]934fn cast() {935let a = get_chunked_array();936let b = a.cast(&DataType::Int64).unwrap();937assert_eq!(b.dtype(), &DataType::Int64)938}939940fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])941where942T: PolarsNumericType,943{944assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)945}946947#[test]948fn slice() {949let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);950let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);951first.append(&second).unwrap();952assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);953assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);954assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);955assert_slice_equal(&first.slice(3, 2), &[3, 4]);956assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);957assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);958assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);959960assert_eq!(first.slice(-7, 2).len(), 1);961assert_eq!(first.slice(-3, 4).len(), 3);962assert_eq!(first.slice(3, 4).len(), 3);963assert_eq!(first.slice(10, 4).len(), 0);964}965966#[test]967fn sorting() {968let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);969let sorted = s.sort(false);970assert_slice_equal(&sorted, &[2, 4, 9]);971let sorted = s.sort(true);972assert_slice_equal(&sorted, &[9, 4, 2]);973974let s: StringChunked = ["b", "a", "z"].iter().collect();975let sorted = s.sort(false);976assert_eq!(977sorted.into_iter().collect::<Vec<_>>(),978&[Some("a"), Some("b"), Some("z")]979);980let sorted = s.sort(true);981assert_eq!(982sorted.into_iter().collect::<Vec<_>>(),983&[Some("z"), Some("b"), Some("a")]984);985let s: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();986let sorted = s.sort(false);987assert_eq!(988sorted.into_iter().collect::<Vec<_>>(),989&[None, Some("b"), Some("z")]990);991}992993#[test]994fn reverse() {995let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);996// path with continuous slice997assert_slice_equal(&s.reverse(), &[3, 2, 1]);998// path with options999let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);1000assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]);1001let s = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);1002assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]);10031004let s = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);1005assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]);10061007let s = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);1008assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]);1009}10101011#[test]1012#[cfg(feature = "dtype-categorical")]1013fn test_iter_categorical() {1014let ca = StringChunked::new(1015PlSmallStr::EMPTY,1016&[Some("foo"), None, Some("bar"), Some("ham")],1017);1018let cats = Categories::new(1019PlSmallStr::EMPTY,1020PlSmallStr::EMPTY,1021CategoricalPhysical::U32,1022);1023let ca = ca.cast(&DataType::from_categories(cats)).unwrap();1024let ca = ca.cat32().unwrap();1025let v: Vec<_> = ca.physical().into_iter().collect();1026assert_eq!(v, &[Some(0), None, Some(1), Some(2)]);1027}10281029#[test]1030#[ignore]1031fn test_shrink_to_fit() {1032let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);1033builder.append_value("foo");1034let mut arr = builder.finish();1035let before = arr1036.chunks()1037.iter()1038.map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))1039.sum::<usize>();1040arr.shrink_to_fit();1041let after = arr1042.chunks()1043.iter()1044.map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))1045.sum::<usize>();1046assert!(before > after);1047}1048}104910501051