GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-core/src/chunked_array/mod.rs
//! The typed heart of every Series column.
#![allow(unsafe_op_in_unsafe_fn)]
use std::iter::Map;
use std::sync::Arc;

use arrow::array::*;
use arrow::bitmap::Bitmap;
use arrow::compute::concatenate::concatenate_unchecked;
use polars_compute::filter::filter_with_bitmap;

use crate::prelude::*;

pub mod ops;
#[macro_use]
pub mod arithmetic;
pub mod builder;
pub mod cast;
pub mod collect;
pub mod comparison;
pub mod flags;
pub mod float;
pub mod iterator;
#[cfg(feature = "ndarray")]
pub(crate) mod ndarray;

#[cfg(feature = "dtype-array")]
pub(crate) mod array;
mod binary;
mod binary_offset;
mod bitwise;
#[cfg(feature = "object")]
mod drop;
mod from;
mod from_iterator;
pub mod from_iterator_par;
pub(crate) mod list;
pub(crate) mod logical;
#[cfg(feature = "object")]
pub mod object;
#[cfg(feature = "random")]
mod random;
#[cfg(feature = "dtype-struct")]
mod struct_;
#[cfg(any(
    feature = "temporal",
    feature = "dtype-datetime",
    feature = "dtype-date"
))]
pub mod temporal;
mod to_vec;
mod trusted_len;

use std::slice::Iter;

use arrow::legacy::prelude::*;
#[cfg(feature = "dtype-struct")]
pub use struct_::StructChunked;

use self::flags::{StatisticsFlags, StatisticsFlagsIM};
use crate::series::IsSorted;
use crate::utils::{first_non_null, last_non_null};

#[cfg(not(feature = "dtype-categorical"))]
pub struct RevMapping {}

pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;

/// # ChunkedArray
///
/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. This allows
/// us to apply closures to the data and collect the results to a [`ChunkedArray`] of the same type `T`.
/// Below we use `apply_values` to apply the cosine function to the values of a [`ChunkedArray`].
///
/// ```rust
/// # use polars_core::prelude::*;
/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {
///     ca.apply_values(|v| v.cos())
/// }
/// ```
///
/// ## Conversion between Series and ChunkedArrays
/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.
///
/// ```rust
/// # use polars_core::prelude::*;
/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked> {
///     series.i32()
/// }
///
/// fn to_series(ca: Int32Chunked) -> Series {
///     ca.into_series()
/// }
/// ```
///
/// # Iterators
///
/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby
/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).
///
/// ```rust
/// # use polars_core::prelude::*;
///
/// fn iter_forward(ca: &Float32Chunked) {
///     ca.iter()
///         .for_each(|opt_v| println!("{:?}", opt_v))
/// }
///
/// fn iter_backward(ca: &Float32Chunked) {
///     ca.iter()
///         .rev()
///         .for_each(|opt_v| println!("{:?}", opt_v))
/// }
/// ```
///
/// # Memory layout
///
/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as backend for the memory layout.
/// Arrow's memory is immutable, which makes it possible to make multiple zero-copy (sub)-views from a single array.
///
/// To be able to append data, Polars uses chunks to append new memory locations, hence the [`ChunkedArray<T>`] data structure.
/// Appends are cheap, because they will not lead to a full reallocation of the whole array (as could be the case with a Rust Vec).
///
/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access, because we have an extra indirection
/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.
/// For instance, when multiplying two [`ChunkedArray`]s with different chunk sizes, [SIMD](https://en.wikipedia.org/wiki/SIMD) cannot be utilized.
///
/// If you want predictable performance
/// (no unexpected re-allocation of memory), it is advised to call [`ChunkedArray::rechunk`] after
/// multiple append operations.
///
/// See also [`ChunkedArray::extend`] for appends within a chunk.
///
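/// A minimal sketch of that pattern, using only `append` and `rechunk`
/// (both exercised by the tests in this module):
///
/// ```rust
/// # use polars_core::prelude::*;
/// fn append_then_rechunk(a: &mut Int32Chunked, b: &Int32Chunked) -> PolarsResult<()> {
///     // Cheap: `b`'s chunks are added to `a` without copying the existing data.
///     a.append(b)?;
///     // Merge all chunks into one contiguous allocation for predictable access.
///     let _contiguous = a.rechunk();
///     Ok(())
/// }
/// ```
///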
/// # Invariants
/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].
/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]
///   chunks.
/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the
///   logical type given by the datatype.
///
/// [`List`]: crate::datatypes::DataType::List
pub struct ChunkedArray<T: PolarsDataType> {
    pub(crate) field: Arc<Field>,
    pub(crate) chunks: Vec<ArrayRef>,

    pub(crate) flags: StatisticsFlagsIM,

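    // Invariant: `length` and `null_count` are the summed length/null count of
    // all `chunks`; the unsafe constructors below rely on them being correct.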
    length: usize,
    null_count: usize,
    _pd: std::marker::PhantomData<T>,
}

impl<T: PolarsDataType> ChunkedArray<T> {
    fn should_rechunk(&self) -> bool {
        self.chunks.len() > 1 && self.chunks.len() > self.len() / 3
    }

    fn optional_rechunk(mut self) -> Self {
        // Rechunk if we have many small chunks.
        if self.should_rechunk() {
            self.rechunk_mut()
        }
        self
    }

    pub(crate) fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    /// Unpack a [`Series`] to a [`ChunkedArray<T>`] of the matching type.
    pub fn unpack_series_matching_type<'a>(
        &self,
        series: &'a Series,
    ) -> PolarsResult<&'a ChunkedArray<T>> {
        match self.dtype() {
            #[cfg(feature = "dtype-decimal")]
            DataType::Decimal(_, _) => {
                let logical = series.decimal()?;

                let ca = logical.physical();
                Ok(ca.as_any().downcast_ref::<ChunkedArray<T>>().unwrap())
            },
            dt => {
                polars_ensure!(
                    dt == series.dtype(),
                    SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
                    series.dtype(),
                    dt,
                );

                // SAFETY:
                // dtype will be correct.
                Ok(unsafe { self.unpack_series_matching_physical_type(series) })
            },
        }
    }

    /// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
    ///
    /// If you want to explicitly set the `length` and `null_count`, look at
    /// [`ChunkedArray::new_with_dims`].
    fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {
        unsafe {
            let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);
            chunked_arr.compute_len();
            chunked_arr
        }
    }

    /// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.
    /// # Safety
    /// The length and null_count must be correct.
    pub unsafe fn new_with_dims(
        field: Arc<Field>,
        chunks: Vec<ArrayRef>,
        length: usize,
        null_count: usize,
    ) -> Self {
        Self {
            field,
            chunks,
            flags: StatisticsFlagsIM::empty(),

            _pd: Default::default(),
            length,
            null_count,
        }
    }

    pub(crate) fn is_sorted_ascending_flag(&self) -> bool {
        self.get_flags().is_sorted_ascending()
    }

    pub(crate) fn is_sorted_descending_flag(&self) -> bool {
        self.get_flags().is_sorted_descending()
    }

    /// Whether `self` is sorted in any direction.
    pub(crate) fn is_sorted_any(&self) -> bool {
        self.get_flags().is_sorted_any()
    }

    pub fn unset_fast_explode_list(&mut self) {
        self.set_fast_explode_list(false)
    }

    pub fn set_fast_explode_list(&mut self, value: bool) {
        let mut flags = self.flags.get_mut();
        flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);
        self.flags.set_mut(flags);
    }

    pub fn get_fast_explode_list(&self) -> bool {
        self.get_flags().can_fast_explode_list()
    }

    pub fn get_flags(&self) -> StatisticsFlags {
        self.flags.get()
    }

    /// Set flags for the [`ChunkedArray`].
    pub fn set_flags(&mut self, flags: StatisticsFlags) {
        self.flags = StatisticsFlagsIM::new(flags);
    }

    pub fn is_sorted_flag(&self) -> IsSorted {
        self.get_flags().is_sorted()
    }

    pub fn retain_flags_from<U: PolarsDataType>(
        &mut self,
        from: &ChunkedArray<U>,
        retain_flags: StatisticsFlags,
    ) {
        let flags = from.flags.get();
        // Try to avoid write contention.
        if !flags.is_empty() {
            self.set_flags(flags & retain_flags)
        }
    }

    /// Set the 'sorted' bit meta info.
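    ///
    /// A sketch of marking data that is known to be sorted (assuming, as in the
    /// tests below, that `IsSorted` is re-exported through the prelude):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]);
    /// // The data above really is ascending; setting an incorrect flag would be a logic bug.
    /// ca.set_sorted_flag(IsSorted::Ascending);
    /// assert!(matches!(ca.is_sorted_flag(), IsSorted::Ascending));
    /// ```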
    pub fn set_sorted_flag(&mut self, sorted: IsSorted) {
        let mut flags = self.flags.get_mut();
        flags.set_sorted(sorted);
        self.flags.set_mut(flags);
    }

    /// Return a copy of this [`ChunkedArray`] with the 'sorted' bit set.
    pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {
        let mut out = self.clone();
        out.set_sorted_flag(sorted);
        out
    }

    /// Get the index of the first non-null value in this [`ChunkedArray`].
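    ///
    /// A minimal sketch (the `Option`-slice constructor is the same one used in
    /// the tests below):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[None, Some(1), Some(2)]);
    /// assert_eq!(ca.first_non_null(), Some(1));
    /// ```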
    pub fn first_non_null(&self) -> Option<usize> {
        if self.null_count() == self.len() {
            None
        }
        // We now know there is at least 1 non-null item in the array, and self.len() > 0
        else if self.null_count() == 0 {
            Some(0)
        } else if self.is_sorted_any() {
            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
                // nulls are all at the start
                self.null_count()
            } else {
                // nulls are all at the end
                0
            };

            debug_assert!(
                // If we are lucky this catches something.
                unsafe { self.get_unchecked(out) }.is_some(),
                "incorrect sorted flag"
            );

            Some(out)
        } else {
            first_non_null(self.iter_validities())
        }
    }

    /// Get the index of the last non-null value in this [`ChunkedArray`].
    pub fn last_non_null(&self) -> Option<usize> {
        if self.null_count() == self.len() {
            None
        }
        // We now know there is at least 1 non-null item in the array, and self.len() > 0
        else if self.null_count() == 0 {
            Some(self.len() - 1)
        } else if self.is_sorted_any() {
            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
                // nulls are all at the start
                self.len() - 1
            } else {
                // nulls are all at the end
                self.len() - self.null_count() - 1
            };

            debug_assert!(
                // If we are lucky this catches something.
                unsafe { self.get_unchecked(out) }.is_some(),
                "incorrect sorted flag"
            );

            Some(out)
        } else {
            last_non_null(self.iter_validities(), self.len())
        }
    }

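    /// Return a copy of this [`ChunkedArray`] with all null values filtered out
    /// (a cheap clone when there are none).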
    pub fn drop_nulls(&self) -> Self {
        if self.null_count() == 0 {
            self.clone()
        } else {
            let chunks = self
                .downcast_iter()
                .map(|arr| {
                    if arr.null_count() == 0 {
                        arr.to_boxed()
                    } else {
                        filter_with_bitmap(arr, arr.validity().unwrap())
                    }
                })
                .collect();
            unsafe {
                Self::new_with_dims(
                    self.field.clone(),
                    chunks,
                    self.len() - self.null_count(),
                    0,
                )
            }
        }
    }

    /// Get the buffers of bits representing null values.
    #[inline]
    #[allow(clippy::type_complexity)]
    pub fn iter_validities(&self) -> Map<Iter<'_, ArrayRef>, fn(&ArrayRef) -> Option<&Bitmap>> {
        fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {
            arr.validity()
        }
        self.chunks.iter().map(to_validity)
    }

    #[inline]
    /// Return whether any of the chunks in this [`ChunkedArray`] contain nulls.
    pub fn has_nulls(&self) -> bool {
        self.null_count > 0
    }

    /// Shrink the capacity of this array to fit its length.
    pub fn shrink_to_fit(&mut self) {
        self.chunks = vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];
    }

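    /// Return a new, empty [`ChunkedArray`] with this array's name and dtype,
    /// retaining only the sortedness and fast-explode flags.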
    pub fn clear(&self) -> Self {
        // SAFETY: we keep the correct dtype
        let mut ca = unsafe {
            self.copy_with_chunks(vec![new_empty_array(
                self.chunks.first().unwrap().dtype().clone(),
            )])
        };

        use StatisticsFlags as F;
        ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
        ca
    }

    /// Unpack a [`Series`] to the same physical type.
    ///
    /// # Safety
    ///
    /// This is unsafe as the dtype may be incorrect and
    /// is assumed to be correct in other safe code.
    pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(
        &self,
        series: &'a Series,
    ) -> &'a ChunkedArray<T> {
        let series_trait = &**series;
        if self.dtype() == series.dtype() {
            &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
        } else {
            use DataType::*;
            match (self.dtype(), series.dtype()) {
                (Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {
                    &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
                },
                _ => panic!(
                    "cannot unpack series {:?} into matching type {:?}",
                    series,
                    self.dtype()
                ),
            }
        }
    }

    /// Returns an iterator over the lengths of the chunks of the array.
    pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {
        self.chunks.iter().map(|chunk| chunk.len())
    }

    /// A reference to the chunks.
    #[inline]
    pub fn chunks(&self) -> &Vec<ArrayRef> {
        &self.chunks
    }

    /// A mutable reference to the chunks.
    ///
    /// # Safety
    /// The caller must ensure not to change the [`DataType`] or `length` of any of the chunks,
    /// and that the `null_count` remains correct.
    #[inline]
    pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
        &mut self.chunks
    }

    /// Returns true if this [`ChunkedArray`] contains a single chunk and has no null values.
    pub fn is_optimal_aligned(&self) -> bool {
        self.chunks.len() == 1 && self.null_count() == 0
    }

    /// Create a new [`ChunkedArray`] from self, where the chunks are replaced.
    ///
    /// # Safety
    /// The caller must ensure the dtypes of the chunks are correct.
    unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
        Self::new_with_compute_len(self.field.clone(), chunks)
    }

    /// Get the data type of this [`ChunkedArray`].
    pub fn dtype(&self) -> &DataType {
        self.field.dtype()
    }

    pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {
        self.field = Arc::new(Field::new(self.name().clone(), dtype))
    }

    /// Name of the [`ChunkedArray`].
    pub fn name(&self) -> &PlSmallStr {
        self.field.name()
    }

    /// Get a reference to the field.
    pub fn ref_field(&self) -> &Field {
        &self.field
    }

    /// Rename this [`ChunkedArray`].
    pub fn rename(&mut self, name: PlSmallStr) {
        self.field = Arc::new(Field::new(name, self.field.dtype().clone()));
    }

    /// Return this [`ChunkedArray`] with a new name.
    pub fn with_name(mut self, name: PlSmallStr) -> Self {
        self.rename(name);
        self
    }
}

impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
{
    /// Get a single value from this [`ChunkedArray`]. If the return value is `None` this
    /// indicates a NULL value.
    ///
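    /// A minimal sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[Some(1), None]);
    /// assert_eq!(ca.get(0), Some(1));
    /// assert_eq!(ca.get(1), None);
    /// ```
    ///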
    /// # Panics
    /// This function will panic if `idx` is out of bounds.
    #[inline]
    pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
        assert!(
            chunk_idx < self.chunks().len(),
            "index: {} out of bounds for len: {}",
            idx,
            self.len()
        );
        unsafe {
            let arr = self.downcast_get_unchecked(chunk_idx);
            assert!(
                arr_idx < arr.len(),
                "index: {} out of bounds for len: {}",
                idx,
                self.len()
            );
            arr.get_unchecked(arr_idx)
        }
    }

    /// Get a single value from this [`ChunkedArray`]. If the return value is `None` this
    /// indicates a NULL value.
    ///
    /// # Safety
    /// It is the caller's responsibility to ensure that `idx < self.len()`.
    #[inline]
    pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);

        unsafe {
            // SAFETY: up to the caller to make sure the index is valid.
            self.downcast_get_unchecked(chunk_idx)
                .get_unchecked(arr_idx)
        }
    }

    /// Get a single value from this [`ChunkedArray`]. Null values are ignored and the returned
    /// value could be garbage if it was masked out by NULL. Note that the value is always initialized.
    ///
    /// # Safety
    /// It is the caller's responsibility to ensure that `idx < self.len()`.
    #[inline]
    pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);

        unsafe {
            // SAFETY: up to the caller to make sure the index is valid.
            self.downcast_get_unchecked(chunk_idx)
                .value_unchecked(arr_idx)
        }
    }

    #[inline]
    pub fn first(&self) -> Option<T::Physical<'_>> {
        unsafe {
            let arr = self.downcast_get_unchecked(0);
            arr.get_unchecked(0)
        }
    }

    #[inline]
    pub fn last(&self) -> Option<T::Physical<'_>> {
        unsafe {
            let arr = self.downcast_get_unchecked(self.chunks.len().checked_sub(1)?);
            arr.get_unchecked(arr.len().checked_sub(1)?)
        }
    }
}

impl ListChunked {
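    /// Get the list entry at `idx` as a new [`Series`], or `None` if the entry is null.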
    #[inline]
    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
        unsafe {
            Some(Series::from_chunks_and_dtype_unchecked(
                self.name().clone(),
                vec![self.get(idx)?],
                &self.inner_dtype().to_physical(),
            ))
        }
    }
}

#[cfg(feature = "dtype-array")]
impl ArrayChunked {
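    /// Get the fixed-size-array entry at `idx` as a new [`Series`], or `None` if the entry is null.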
    #[inline]
    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
        unsafe {
            Some(Series::from_chunks_and_dtype_unchecked(
                self.name().clone(),
                vec![self.get(idx)?],
                &self.inner_dtype().to_physical(),
            ))
        }
    }
}

impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
{
    /// Should be used to match the chunk lengths (the chunk id) of another [`ChunkedArray`].
    /// # Panics
    /// It is the caller's responsibility to ensure that this [`ChunkedArray`] has a single chunk.
    pub fn match_chunks<I>(&self, chunk_id: I) -> Self
    where
        I: Iterator<Item = usize>,
    {
        debug_assert!(self.chunks.len() == 1);
        // Takes a ChunkedArray containing a single chunk.
        let slice = |ca: &Self| {
            let array = &ca.chunks[0];

            let mut offset = 0;
            let chunks = chunk_id
                .map(|len| {
                    // SAFETY: within bounds.
                    debug_assert!((offset + len) <= array.len());
                    let out = unsafe { array.sliced_unchecked(offset, len) };
                    offset += len;
                    out
                })
                .collect();

            debug_assert_eq!(offset, array.len());

            // SAFETY: We just slice the original chunks, their type will not change.
            unsafe {
                Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())
            }
        };

        if self.chunks.len() != 1 {
            let out = self.rechunk();
            slice(&out)
        } else {
            slice(self)
        }
    }
}

impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {
    fn as_ref_dtype(&self) -> &DataType {
        self.dtype()
    }
}

pub(crate) trait AsSinglePtr: AsRefDataType {
    /// Rechunk and return a pointer to the start of the array.
    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
        polars_bail!(opq = as_single_ptr, self.as_ref_dtype());
    }
}

impl<T> AsSinglePtr for ChunkedArray<T>
where
    T: PolarsNumericType,
{
    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
        self.rechunk_mut();
        let a = self.data_views().next().unwrap();
        let ptr = a.as_ptr();
        Ok(ptr as usize)
    }
}

impl AsSinglePtr for BooleanChunked {}
impl AsSinglePtr for ListChunked {}
#[cfg(feature = "dtype-array")]
impl AsSinglePtr for ArrayChunked {}
impl AsSinglePtr for StringChunked {}
impl AsSinglePtr for BinaryChunked {}
#[cfg(feature = "object")]
impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}

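/// The chunk/null layout of a [`ChunkedArray`], as returned by
/// [`ChunkedArray::layout`]. A sketch of dispatching on it to pick a fast path
/// for a single, null-free chunk (illustrative only; the import path is an
/// assumption):
///
/// ```rust,ignore
/// # use polars_core::prelude::*;
/// use polars_core::chunked_array::ChunkedArrayLayout;
///
/// fn sum_i64(ca: &Int32Chunked) -> i64 {
///     match ca.layout() {
///         // Contiguous and null-free: sum the raw values directly.
///         ChunkedArrayLayout::SingleNoNull(arr) => arr.values().iter().map(|&v| v as i64).sum(),
///         // Otherwise fall back to the generic (validity-aware) iterator.
///         _ => ca.iter().flatten().map(|v| v as i64).sum(),
///     }
/// }
/// ```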
pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {
    SingleNoNull(&'a T::Array),
    Single(&'a T::Array),
    MultiNoNull(&'a ChunkedArray<T>),
    Multi(&'a ChunkedArray<T>),
}

impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
{
    pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {
        if self.chunks.len() == 1 {
            let arr = self.downcast_iter().next().unwrap();
            return if arr.null_count() == 0 {
                ChunkedArrayLayout::SingleNoNull(arr)
            } else {
                ChunkedArrayLayout::Single(arr)
            };
        }

        if self.downcast_iter().all(|a| a.null_count() == 0) {
            ChunkedArrayLayout::MultiNoNull(self)
        } else {
            ChunkedArrayLayout::Multi(self)
        }
    }
}

impl<T> ChunkedArray<T>
where
    T: PolarsNumericType,
{
    /// Returns the values of the array as a contiguous slice.
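    ///
    /// Errors unless the array is a single null-free chunk. A minimal sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]);
    /// assert_eq!(ca.cont_slice().unwrap(), &[1, 2, 3]);
    /// ```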
    pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {
        polars_ensure!(
            self.chunks.len() == 1 && self.chunks[0].null_count() == 0,
            ComputeError: "chunked array is not contiguous"
        );
        Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())
    }

    /// Returns the values of the array as a contiguous mutable slice.
    pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {
        if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {
            // SAFETY: we will not swap the PrimitiveArray.
            let arr = unsafe { self.downcast_iter_mut().next().unwrap() };
            arr.get_mut_values()
        } else {
            None
        }
    }

    /// Get slices of the underlying arrow data.
    /// NOTE: null values should be taken into account by the user of these slices, as they are handled
    /// separately.
    pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {
        self.downcast_iter().map(|arr| arr.values().as_slice())
    }

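    /// Iterate over all physical values in order, ignoring validity. This is
    /// only meaningful when the array contains no null values.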
    #[allow(clippy::wrong_self_convention)]
    pub fn into_no_null_iter(
        &self,
    ) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen
    {
        // .copied was significantly slower in benchmark, next call did not inline?
        #[allow(clippy::map_clone)]
        // We know the iterator's length.
        unsafe {
            self.data_views()
                .flatten()
                .map(|v| *v)
                .trust_my_length(self.len())
        }
    }
}

impl<T: PolarsDataType> Clone for ChunkedArray<T> {
    fn clone(&self) -> Self {
        ChunkedArray {
            field: self.field.clone(),
            chunks: self.chunks.clone(),
            flags: self.flags.clone(),

            _pd: Default::default(),
            length: self.length,
            null_count: self.null_count,
        }
    }
}

impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {
    fn as_ref(&self) -> &ChunkedArray<T> {
        self
    }
}

impl ValueSize for ListChunked {
    fn get_values_size(&self) -> usize {
        self.chunks
            .iter()
            .fold(0usize, |acc, arr| acc + arr.get_values_size())
    }
}

#[cfg(feature = "dtype-array")]
impl ValueSize for ArrayChunked {
    fn get_values_size(&self) -> usize {
        self.chunks
            .iter()
            .fold(0usize, |acc, arr| acc + arr.get_values_size())
    }
}
impl ValueSize for StringChunked {
    fn get_values_size(&self) -> usize {
        self.chunks
            .iter()
            .fold(0usize, |acc, arr| acc + arr.get_values_size())
    }
}

impl ValueSize for BinaryOffsetChunked {
    fn get_values_size(&self) -> usize {
        self.chunks
            .iter()
            .fold(0usize, |acc, arr| acc + arr.get_values_size())
    }
}

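/// Build a [`PrimitiveArray`] from raw values and an optional validity bitmap,
/// using the Arrow dtype that corresponds to `T`.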
pub(crate) fn to_primitive<T: PolarsNumericType>(
    values: Vec<T::Native>,
    validity: Option<Bitmap>,
) -> PrimitiveArray<T::Native> {
    PrimitiveArray::new(
        T::get_static_dtype().to_arrow(CompatLevel::newest()),
        values.into(),
        validity,
    )
}

pub(crate) fn to_array<T: PolarsNumericType>(
    values: Vec<T::Native>,
    validity: Option<Bitmap>,
) -> ArrayRef {
    Box::new(to_primitive::<T>(values, validity))
}

impl<T: PolarsDataType> Default for ChunkedArray<T> {
    fn default() -> Self {
        let dtype = T::get_static_dtype();
        let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
        ChunkedArray {
            field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),
            // Invariant: always has 1 chunk.
            chunks: vec![new_empty_array(arrow_dtype)],
            flags: StatisticsFlagsIM::empty(),

            _pd: Default::default(),
            length: 0,
            null_count: 0,
        }
    }
}

#[cfg(test)]
pub(crate) mod test {
    use crate::prelude::*;

    pub(crate) fn get_chunked_array() -> Int32Chunked {
        ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])
    }

    #[test]
    fn test_sort() {
        let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);
        let b = a
            .sort(false)
            .into_iter()
            .map(|opt| opt.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(b, [1, 2, 3, 9]);
        let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);
        let a = a.sort(false);
        let b = a.into_iter().collect::<Vec<_>>();
        assert_eq!(b, [Some("a"), Some("b"), Some("c")]);
        assert!(a.is_sorted_ascending_flag());
    }

    #[test]
    fn arithmetic() {
        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);
        let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);

        // Not really asserting anything here, but still making sure the code is exercised.
        // This (and more) is properly tested from the integration test suite and Python bindings.
        println!("{:?}", a + b);
        println!("{:?}", a - b);
        println!("{:?}", a * b);
        println!("{:?}", a / b);
    }

    #[test]
    fn iter() {
        let s1 = get_chunked_array();
        // Sum.
        assert_eq!(s1.into_iter().fold(0, |acc, val| { acc + val.unwrap() }), 6)
    }

    #[test]
    fn limit() {
        let a = get_chunked_array();
        let b = a.limit(2);
        println!("{b:?}");
        assert_eq!(b.len(), 2)
    }

    #[test]
    fn filter() {
        let a = get_chunked_array();
        let b = a
            .filter(&BooleanChunked::new(
                PlSmallStr::from_static("filter"),
                &[true, false, false],
            ))
            .unwrap();
        assert_eq!(b.len(), 1);
        assert_eq!(b.into_iter().next(), Some(Some(1)));
    }

    #[test]
    fn aggregates() {
        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);
        assert_eq!(a.max(), Some(100));
        assert_eq!(a.min(), Some(1));
        assert_eq!(a.sum(), Some(120))
    }

    #[test]
    fn take() {
        let a = get_chunked_array();
        let new = a.take(&[0 as IdxSize, 1]).unwrap();
        assert_eq!(new.len(), 2)
    }

    #[test]
    fn cast() {
        let a = get_chunked_array();
        let b = a.cast(&DataType::Int64).unwrap();
        assert_eq!(b.dtype(), &DataType::Int64)
    }

    fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])
    where
        T: PolarsNumericType,
    {
        assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)
    }

    #[test]
    fn slice() {
        let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);
        let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);
        first.append(&second).unwrap();
        assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);
        assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);
        assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);
        assert_slice_equal(&first.slice(3, 2), &[3, 4]);
        assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);
        assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);
        assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);

        assert_eq!(first.slice(-7, 2).len(), 1);
        assert_eq!(first.slice(-3, 4).len(), 3);
        assert_eq!(first.slice(3, 4).len(), 3);
        assert_eq!(first.slice(10, 4).len(), 0);
    }

    #[test]
    fn sorting() {
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);
        let sorted = s.sort(false);
        assert_slice_equal(&sorted, &[2, 4, 9]);
        let sorted = s.sort(true);
        assert_slice_equal(&sorted, &[9, 4, 2]);

        let s: StringChunked = ["b", "a", "z"].iter().collect();
        let sorted = s.sort(false);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[Some("a"), Some("b"), Some("z")]
        );
        let sorted = s.sort(true);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[Some("z"), Some("b"), Some("a")]
        );
        let s: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();
        let sorted = s.sort(false);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[None, Some("b"), Some("z")]
        );
    }

    #[test]
    fn reverse() {
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);
        // Path with contiguous slice.
        assert_slice_equal(&s.reverse(), &[3, 2, 1]);
        // Path with options.
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);
        assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]);
        let s = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);
        assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]);

        let s = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);
        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]);

        let s = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);
        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]);
    }

    #[test]
    #[cfg(feature = "dtype-categorical")]
    fn test_iter_categorical() {
        let ca = StringChunked::new(
            PlSmallStr::EMPTY,
            &[Some("foo"), None, Some("bar"), Some("ham")],
        );
        let cats = Categories::new(
            PlSmallStr::EMPTY,
            PlSmallStr::EMPTY,
            CategoricalPhysical::U32,
        );
        let ca = ca.cast(&DataType::from_categories(cats)).unwrap();
        let ca = ca.cat32().unwrap();
        let v: Vec<_> = ca.physical().into_iter().collect();
        assert_eq!(v, &[Some(0), None, Some(1), Some(2)]);
    }

    #[test]
    #[ignore]
    fn test_shrink_to_fit() {
        let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);
        builder.append_value("foo");
        let mut arr = builder.finish();
        let before = arr
            .chunks()
            .iter()
            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
            .sum::<usize>();
        arr.shrink_to_fit();
        let after = arr
            .chunks()
            .iter()
            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
            .sum::<usize>();
        assert!(before > after);
    }
}