Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-core/src/chunked_array/mod.rs
8407 views
1
//! The typed heart of every Series column.
2
#![allow(unsafe_op_in_unsafe_fn)]
3
use std::sync::Arc;
4
5
use arrow::array::*;
6
use arrow::bitmap::Bitmap;
7
use arrow::compute::concatenate::concatenate_unchecked;
8
use polars_compute::filter::filter_with_bitmap;
9
10
use crate::prelude::{ChunkTakeUnchecked, *};
11
12
pub mod ops;
13
#[macro_use]
14
pub mod arithmetic;
15
pub mod builder;
16
pub mod cast;
17
pub mod collect;
18
pub mod comparison;
19
pub mod flags;
20
pub mod float;
21
pub mod iterator;
22
#[cfg(feature = "ndarray")]
23
pub(crate) mod ndarray;
24
25
pub mod arg_min_max;
26
#[cfg(feature = "dtype-array")]
27
pub(crate) mod array;
28
mod binary;
29
mod binary_offset;
30
mod bitwise;
31
#[cfg(feature = "object")]
32
mod drop;
33
mod from;
34
mod from_iterator;
35
pub mod from_iterator_par;
36
pub(crate) mod list;
37
pub(crate) mod logical;
38
#[cfg(feature = "object")]
39
pub mod object;
40
#[cfg(feature = "random")]
41
mod random;
42
#[cfg(feature = "dtype-struct")]
43
mod struct_;
44
#[cfg(any(
45
feature = "temporal",
46
feature = "dtype-datetime",
47
feature = "dtype-date"
48
))]
49
pub mod temporal;
50
mod to_vec;
51
mod trusted_len;
52
pub(crate) use arg_min_max::*;
53
use arrow::legacy::prelude::*;
54
#[cfg(feature = "dtype-struct")]
55
pub use struct_::StructChunked;
56
57
use self::flags::{StatisticsFlags, StatisticsFlagsIM};
58
use crate::series::IsSorted;
59
use crate::utils::{first_non_null, first_null, last_non_null};
60
61
pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;
62
63
/// # ChunkedArray
64
///
65
/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. This allows
66
/// us to apply closures to the data and collect the results to a [`ChunkedArray`] of the same type `T`.
67
/// Below we use an apply to use the cosine function to the values of a [`ChunkedArray`].
68
///
69
/// ```rust
70
/// # use polars_core::prelude::*;
71
/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {
72
/// ca.apply_values(|v| v.cos())
73
/// }
74
/// ```
75
///
76
/// ## Conversion between Series and ChunkedArrays
77
/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.
78
///
79
/// ```rust
80
/// # use polars_core::prelude::*;
81
/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked>{
82
/// series.i32()
83
/// }
84
///
85
/// fn to_series(ca: Int32Chunked) -> Series {
86
/// ca.into_series()
87
/// }
88
/// ```
89
///
90
/// # Iterators
91
///
92
/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
93
/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby
94
/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).
95
///
96
/// ```rust
97
/// # use polars_core::prelude::*;
98
///
99
/// fn iter_forward(ca: &Float32Chunked) {
100
/// ca.iter()
101
/// .for_each(|opt_v| println!("{:?}", opt_v))
102
/// }
103
///
104
/// fn iter_backward(ca: &Float32Chunked) {
105
/// ca.iter()
106
/// .rev()
107
/// .for_each(|opt_v| println!("{:?}", opt_v))
108
/// }
109
/// ```
110
///
111
/// # Memory layout
112
///
113
/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as backend for the memory layout.
114
/// Arrows memory is immutable which makes it possible to make multiple zero copy (sub)-views from a single array.
115
///
116
/// To be able to append data, Polars uses chunks to append new memory locations, hence the [`ChunkedArray<T>`] data structure.
117
/// Appends are cheap, because it will not lead to a full reallocation of the whole array (as could be the case with a Rust Vec).
118
///
119
/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access because we have an extra indirection
120
/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.
121
/// When multiplying two [`ChunkedArray`]s with different chunk sizes they cannot utilize [SIMD](https://en.wikipedia.org/wiki/SIMD) for instance.
122
///
123
/// If you want to have predictable performance
124
/// (no unexpected re-allocation of memory), it is advised to call the [`ChunkedArray::rechunk`] after
125
/// multiple append operations.
126
///
127
/// See also [`ChunkedArray::extend`] for appends within a chunk.
128
///
129
/// # Invariants
130
/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].
131
/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]
132
/// chunks.
133
/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the
134
/// logical type given by the datatype.
135
///
136
/// [`List`]: crate::datatypes::DataType::List
137
pub struct ChunkedArray<T: PolarsDataType> {
    // Name + datatype of this column; shared cheaply via `Arc`.
    pub(crate) field: Arc<Field>,
    // The Arrow arrays backing this column. Invariant: never empty.
    pub(crate) chunks: Vec<ArrayRef>,

    // Cached statistics (sortedness, fast-explode, ...); see `flags` module.
    pub(crate) flags: StatisticsFlagsIM,

    // Total number of elements across all chunks. Must stay in sync with `chunks`.
    length: usize,
    // Total number of null elements across all chunks. Must stay in sync with `chunks`.
    null_count: usize,
    // Zero-sized marker tying this struct to its logical type `T`.
    _pd: std::marker::PhantomData<T>,
}
147
148
impl<T: PolarsDataType> ChunkedArray<T> {
149
/// Heuristic: rechunking pays off once the chunk count exceeds a third of the
/// element count, i.e. the chunks are tiny on average.
fn should_rechunk(&self) -> bool {
    self.chunks.len() > 1 && self.chunks.len() > self.len() / 3
}

/// Rechunk in place only when `should_rechunk` considers it worthwhile,
/// then return `self`.
fn optional_rechunk(mut self) -> Self {
    // Rechunk if we have many small chunks.
    if self.should_rechunk() {
        self.rechunk_mut()
    }
    self
}

/// Upcast `self` to `&dyn Any` for dynamic downcasting by callers.
pub(crate) fn as_any(&self) -> &dyn std::any::Any {
    self
}
164
165
/// Series to [`ChunkedArray<T>`]
///
/// # Errors
/// Returns a `SchemaMismatch` error when `series` does not have exactly the
/// same [`DataType`] as `self`.
pub fn unpack_series_matching_type<'a>(
    &self,
    series: &'a Series,
) -> PolarsResult<&'a ChunkedArray<T>> {
    polars_ensure!(
        self.dtype() == series.dtype(),
        SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
        series.dtype(),
        self.dtype(),
    );

    // SAFETY: dtype will be correct.
    Ok(unsafe { self.unpack_series_matching_physical_type(series) })
}

/// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
///
/// If you want to explicitly set the `length` and `null_count`, look at
/// [`ChunkedArray::new_with_dims`]
fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {
    unsafe {
        // The placeholder dims (0, 0) are immediately corrected by `compute_len`.
        let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);
        chunked_arr.compute_len();
        chunked_arr
    }
}

/// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.
/// # Safety
/// The length and null_count must be correct.
pub unsafe fn new_with_dims(
    field: Arc<Field>,
    chunks: Vec<ArrayRef>,
    length: usize,
    null_count: usize,
) -> Self {
    Self {
        field,
        chunks,
        // Start with no cached statistics; callers set flags explicitly.
        flags: StatisticsFlagsIM::empty(),

        _pd: Default::default(),
        length,
        null_count,
    }
}
212
213
/// Whether the statistics flags claim this array is sorted ascending.
pub(crate) fn is_sorted_ascending_flag(&self) -> bool {
    self.get_flags().is_sorted_ascending()
}

/// Whether the statistics flags claim this array is sorted descending.
pub(crate) fn is_sorted_descending_flag(&self) -> bool {
    self.get_flags().is_sorted_descending()
}

/// Whether `self` is sorted in any direction.
pub(crate) fn is_sorted_any(&self) -> bool {
    self.get_flags().is_sorted_any()
}

/// Clear the fast-explode-list flag.
pub fn unset_fast_explode_list(&mut self) {
    self.set_fast_explode_list(false)
}

/// Set or clear the fast-explode-list flag, leaving all other flags intact.
pub fn set_fast_explode_list(&mut self, value: bool) {
    let mut flags = self.flags.get_mut();
    flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);
    self.flags.set_mut(flags);
}

/// Whether the fast-explode-list flag is currently set.
pub fn get_fast_explode_list(&self) -> bool {
    self.get_flags().can_fast_explode_list()
}

/// Read the current statistics flags.
pub fn get_flags(&self) -> StatisticsFlags {
    self.flags.get()
}

/// Set flags for the [`ChunkedArray`]
pub fn set_flags(&mut self, flags: StatisticsFlags) {
    self.flags = StatisticsFlagsIM::new(flags);
}

/// The sortedness recorded in the statistics flags.
pub fn is_sorted_flag(&self) -> IsSorted {
    self.get_flags().is_sorted()
}

/// Copy over the flags selected by `retain_flags` from another array,
/// e.g. after an operation that preserves those statistics.
pub fn retain_flags_from<U: PolarsDataType>(
    &mut self,
    from: &ChunkedArray<U>,
    retain_flags: StatisticsFlags,
) {
    let flags = from.flags.get();
    // Try to avoid write contention.
    if !flags.is_empty() {
        self.set_flags(flags & retain_flags)
    }
}

/// Set the 'sorted' bit meta info.
pub fn set_sorted_flag(&mut self, sorted: IsSorted) {
    let mut flags = self.flags.get_mut();
    flags.set_sorted(sorted);
    self.flags.set_mut(flags);
}

/// Set the 'sorted' bit meta info.
pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {
    let mut out = self.clone();
    out.set_sorted_flag(sorted);
    out
}
278
279
pub fn first_null(&self) -> Option<usize> {
280
if self.null_count() == 0 {
281
None
282
}
283
// We now know there is at least 1 non-null item in the array, and self.len() > 0
284
else if self.null_count() == self.len() {
285
Some(0)
286
} else if self.is_sorted_any() {
287
let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
288
// nulls are all at the start
289
0
290
} else {
291
// nulls are all at the end
292
self.null_count()
293
};
294
295
debug_assert!(
296
// If we are lucky this catches something.
297
unsafe { self.get_unchecked(out) }.is_some(),
298
"incorrect sorted flag"
299
);
300
301
Some(out)
302
} else {
303
first_null(self.chunks().iter().map(|arr| arr.as_ref()))
304
}
305
}
306
307
/// Get the index of the first non null value in this [`ChunkedArray`].
pub fn first_non_null(&self) -> Option<usize> {
    if self.null_count() == self.len() {
        None
    }
    // We now know there is at least 1 non-null item in the array, and self.len() > 0
    else if self.null_count() == 0 {
        Some(0)
    } else if self.is_sorted_any() {
        // A sorted array keeps its nulls grouped at one end; probe element 0
        // to learn which end.
        let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
            // nulls are all at the start
            self.null_count()
        } else {
            // nulls are all at the end
            0
        };

        debug_assert!(
            // If we are lucky this catches something.
            unsafe { self.get_unchecked(out) }.is_some(),
            "incorrect sorted flag"
        );

        Some(out)
    } else {
        // No ordering info: scan the chunks.
        first_non_null(self.chunks().iter().map(|arr| arr.as_ref()))
    }
}

/// Get the index of the last non null value in this [`ChunkedArray`].
pub fn last_non_null(&self) -> Option<usize> {
    if self.null_count() == self.len() {
        None
    }
    // We now know there is at least 1 non-null item in the array, and self.len() > 0
    else if self.null_count() == 0 {
        Some(self.len() - 1)
    } else if self.is_sorted_any() {
        // A sorted array keeps its nulls grouped at one end; probe element 0
        // to learn which end.
        let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
            // nulls are all at the start
            self.len() - 1
        } else {
            // nulls are all at the end
            self.len() - self.null_count() - 1
        };

        debug_assert!(
            // If we are lucky this catches something.
            unsafe { self.get_unchecked(out) }.is_some(),
            "incorrect sorted flag"
        );

        Some(out)
    } else {
        // No ordering info: scan the chunks from the back.
        last_non_null(self.chunks().iter().map(|arr| arr.as_ref()), self.len())
    }
}
364
365
/// Return a copy of `self` with all null values removed.
pub fn drop_nulls(&self) -> Self {
    if self.null_count() == 0 {
        self.clone()
    } else {
        let chunks = self
            .downcast_iter()
            .map(|arr| {
                if arr.null_count() == 0 {
                    // Chunk has no nulls; keep it as-is.
                    arr.to_boxed()
                } else {
                    // Keep only the elements whose validity bit is set.
                    filter_with_bitmap(arr, arr.validity().unwrap())
                }
            })
            .collect();
        // SAFETY: after filtering, the length is the old non-null count and
        // there are no nulls left.
        unsafe {
            Self::new_with_dims(
                self.field.clone(),
                chunks,
                self.len() - self.null_count(),
                0,
            )
        }
    }
}

/// Get the buffer of bits representing null values
#[inline]
#[allow(clippy::type_complexity)]
pub fn iter_validities(
    &self,
) -> impl ExactSizeIterator<Item = Option<&Bitmap>> + DoubleEndedIterator {
    // A named fn (not a closure) keeps the iterator type a plain fn-pointer map.
    fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {
        arr.validity()
    }
    self.chunks.iter().map(to_validity)
}

#[inline]
/// Return if any the chunks in this [`ChunkedArray`] have nulls.
pub fn has_nulls(&self) -> bool {
    self.null_count > 0
}

/// Shrink the capacity of this array to fit its length.
pub fn shrink_to_fit(&mut self) {
    // Concatenating into a single freshly-sized chunk drops any over-allocation.
    self.chunks = vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];
}

/// Return an empty array with the same name, dtype and retained flags.
pub fn clear(&self) -> Self {
    // SAFETY: we keep the correct dtype
    let mut ca = unsafe {
        self.copy_with_chunks(vec![new_empty_array(
            self.chunks.first().unwrap().dtype().clone(),
        )])
    };

    use StatisticsFlags as F;
    // An empty array trivially preserves sortedness and fast-explode.
    ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
    ca
}
425
426
/// Unpack a [`Series`] to the same physical type.
///
/// # Safety
///
/// This is unsafe as the dtype may be incorrect and
/// is assumed to be correct in other safe code.
pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(
    &self,
    series: &'a Series,
) -> &'a ChunkedArray<T> {
    let series_trait = &**series;
    if self.dtype() == series.dtype() {
        // NOTE(review): this cast assumes the SeriesTrait implementor is a
        // transparent wrapper around `ChunkedArray<T>` — upheld elsewhere.
        &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
    } else {
        use DataType::*;
        // Logical types whose physical representation matches `T` may also be
        // reinterpreted (Datetime/Duration over Int64, Date over Int32).
        match (self.dtype(), series.dtype()) {
            (Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {
                &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
            },
            _ => panic!(
                "cannot unpack series {:?} into matching type {:?}",
                series,
                self.dtype()
            ),
        }
    }
}
453
454
/// Returns an iterator over the lengths of the chunks of the array.
pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {
    // The non-capturing closure coerces to the fn pointer in `ChunkLenIter`.
    self.chunks.iter().map(|chunk| chunk.len())
}

/// A reference to the chunks
#[inline]
pub fn chunks(&self) -> &Vec<ArrayRef> {
    &self.chunks
}

/// A mutable reference to the chunks
///
/// # Safety
/// The caller must ensure to not change the [`DataType`] or `length` of any of the chunks.
/// And the `null_count` remains correct.
#[inline]
pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
    &mut self.chunks
}

/// Returns true if contains a single chunk and has no null values
pub fn is_optimal_aligned(&self) -> bool {
    self.chunks.len() == 1 && self.null_count() == 0
}

/// Create a new [`ChunkedArray`] from self, where the chunks are replaced.
///
/// # Safety
/// The caller must ensure the dtypes of the chunks are correct
unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
    Self::new_with_compute_len(self.field.clone(), chunks)
}

/// Get data type of [`ChunkedArray`].
pub fn dtype(&self) -> &DataType {
    self.field.dtype()
}

/// Replace the dtype in the field, keeping the name.
/// # Safety (by convention): caller must ensure the new dtype matches the chunks.
pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {
    self.field = Arc::new(Field::new(self.name().clone(), dtype))
}

/// Name of the [`ChunkedArray`].
pub fn name(&self) -> &PlSmallStr {
    self.field.name()
}

/// Get a reference to the field.
pub fn ref_field(&self) -> &Field {
    &self.field
}

/// Rename this [`ChunkedArray`].
pub fn rename(&mut self, name: PlSmallStr) {
    self.field = Arc::new(Field::new(name, self.field.dtype().clone()));
}

/// Return this [`ChunkedArray`] with a new name.
pub fn with_name(mut self, name: PlSmallStr) -> Self {
    self.rename(name);
    self
}
517
}
518
519
impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
{
    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
    /// indicates a NULL value.
    ///
    /// # Panics
    /// This function will panic if `idx` is out of bounds.
    #[inline]
    pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {
        // Translate the global index to (chunk, offset-within-chunk).
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
        assert!(
            chunk_idx < self.chunks().len(),
            "index: {} out of bounds for len: {}",
            idx,
            self.len()
        );
        unsafe {
            let arr = self.downcast_get_unchecked(chunk_idx);
            // Second bounds check: the offset must fall inside the chunk.
            assert!(
                arr_idx < arr.len(),
                "index: {} out of bounds for len: {}",
                idx,
                self.len()
            );
            // SAFETY: both indices were just checked above.
            arr.get_unchecked(arr_idx)
        }
    }

    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
    /// indicates a NULL value.
    ///
    /// # Safety
    /// It is the callers responsibility that the `idx < self.len()`.
    #[inline]
    pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);

        unsafe {
            // SAFETY: up to the caller to make sure the index is valid.
            self.downcast_get_unchecked(chunk_idx)
                .get_unchecked(arr_idx)
        }
    }

    /// Get a single value from this [`ChunkedArray`]. Null values are ignored and the returned
    /// value could be garbage if it was masked out by NULL. Note that the value always is initialized.
    ///
    /// # Safety
    /// It is the callers responsibility that the `idx < self.len()`.
    #[inline]
    pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);

        unsafe {
            // SAFETY: up to the caller to make sure the index is valid.
            self.downcast_get_unchecked(chunk_idx)
                .value_unchecked(arr_idx)
        }
    }

    /// # Panics
    /// Panics if the [`ChunkedArray`] is empty.
    #[inline]
    pub fn first(&self) -> Option<T::Physical<'_>> {
        self.iter().next().unwrap()
    }

    /// # Panics
    /// Panics if the [`ChunkedArray`] is empty.
    #[inline]
    pub fn last(&self) -> Option<T::Physical<'_>> {
        // Skip trailing empty chunks; panics when every chunk is empty.
        let arr = self
            .downcast_iter()
            .rev()
            .find(|arr| !arr.is_empty())
            .unwrap();
        // SAFETY: `arr` is non-empty, so `arr.len() - 1` is in bounds.
        unsafe { arr.get_unchecked(arr.len() - 1) }
    }

    /// Replace the validity (null mask) of every chunk with the matching
    /// slice of `validity`, and update the cached null count.
    ///
    /// # Panics
    /// Panics when `validity.len() != self.len()`.
    pub fn set_validity(&mut self, validity: &Bitmap) {
        assert_eq!(self.len(), validity.len());
        // Running offset of each chunk within the full-length bitmap.
        let mut i = 0;
        for chunk in unsafe { self.chunks_mut() } {
            *chunk = chunk.with_validity(Some(validity.clone().sliced(i, chunk.len())));
            i += chunk.len();
        }
        self.null_count = validity.unset_bits();
        // New nulls may invalidate fast-explode; clear it conservatively.
        self.set_fast_explode_list(false);
    }
}
611
612
impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
    ChunkedArray<T>: ChunkTakeUnchecked<[IdxSize]>,
{
    /// Deposit values into nulls with a certain validity mask.
    ///
    /// `self` must be null-free and contain exactly one value per set bit in
    /// `validity`; the result has `validity.len()` elements, with the unset
    /// positions becoming null.
    ///
    /// # Panics
    /// Panics when `self` has nulls or `self.len() != validity.set_bits()`.
    pub fn deposit(&self, validity: &Bitmap) -> Self {
        let set_bits = validity.set_bits();

        assert_eq!(self.null_count(), 0);
        assert_eq!(self.len(), set_bits);

        // Every bit set: nothing to mask out.
        if set_bits == validity.len() {
            return self.clone();
        }

        // No bits set: the result is all-null.
        if set_bits == 0 {
            return Self::full_null_like(self, validity.len());
        }

        let mut null_mask = validity.clone();

        // Build one gather index per output position. Masked-out positions
        // repeat a valid index; those values are hidden by `set_validity` below.
        let mut gather_idxs = Vec::with_capacity(validity.len());
        let leading_nulls = null_mask.take_leading_zeros();
        // The leading nulls and the first set bit all map to source index 0.
        gather_idxs.extend(std::iter::repeat_n(0, leading_nulls + 1));

        // For every remaining position, advance the source index only on set bits.
        let mut i = 0 as IdxSize;
        gather_idxs.extend(null_mask.iter().skip(1).map(|v| {
            i += IdxSize::from(v);
            i
        }));

        // SAFETY: indices never exceed set_bits - 1, which is within `self`.
        let mut ca = unsafe { ChunkTakeUnchecked::take_unchecked(self, &gather_idxs) };
        ca.set_validity(validity);
        ca
    }
}
649
650
impl ListChunked {
    /// Materialize the list element at `idx` as its own [`Series`], or `None`
    /// when the element is null or out of bounds.
    #[inline]
    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
        unsafe {
            // SAFETY: the chunk returned by `get` has the inner physical dtype.
            Some(Series::from_chunks_and_dtype_unchecked(
                self.name().clone(),
                vec![self.get(idx)?],
                &self.inner_dtype().to_physical(),
            ))
        }
    }

    /// Whether any *valid* (non-null) list element has length zero.
    pub fn has_empty_lists(&self) -> bool {
        for arr in self.downcast_iter() {
            if arr.is_empty() {
                continue;
            }

            if match arr.validity() {
                // No nulls: any zero-length offset span counts.
                None => arr.offsets().lengths().any(|l| l == 0),
                // With nulls: only count zero-length spans that are valid.
                Some(validity) => arr
                    .offsets()
                    .lengths()
                    .enumerate()
                    .any(|(i, l)| l == 0 && unsafe { validity.get_bit_unchecked(i) }),
            } {
                return true;
            }
        }

        false
    }

    /// Whether the values buffer contains entries not reachable through the
    /// offsets/validity (i.e. sliced-away or nulled-out values).
    pub fn has_masked_out_values(&self) -> bool {
        for arr in self.downcast_iter() {
            if arr.is_empty() {
                continue;
            }

            // Offsets that do not span the whole values buffer hide values.
            if *arr.offsets().first() != 0 || *arr.offsets().last() != arr.values().len() as i64 {
                return true;
            }

            let Some(validity) = arr.validity() else {
                continue;
            };
            if validity.set_bits() == 0 {
                continue;
            }

            // A null list element with a non-zero span masks out its values.
            // @Performance: false_idx_iter
            for i in (!validity).true_idx_iter() {
                if arr.offsets().length_at(i) > 0 {
                    return true;
                }
            }
        }

        false
    }
}
711
712
#[cfg(feature = "dtype-array")]
impl ArrayChunked {
    /// Materialize the array element at `idx` as its own [`Series`], or `None`
    /// when the element is null or out of bounds.
    #[inline]
    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
        unsafe {
            // SAFETY: the chunk returned by `get` has the inner physical dtype.
            Some(Series::from_chunks_and_dtype_unchecked(
                self.name().clone(),
                vec![self.get(idx)?],
                &self.inner_dtype().to_physical(),
            ))
        }
    }

    /// Build an `ArrayChunked` of the given `width` from flat value chunks.
    ///
    /// Every chunk's length must be a multiple of `width`, and the resulting
    /// element counts must sum to `length`. No validity is attached.
    pub fn from_aligned_values(
        name: PlSmallStr,
        inner_dtype: &DataType,
        width: usize,
        chunks: Vec<ArrayRef>,
        length: usize,
    ) -> Self {
        let dtype = DataType::Array(Box::new(inner_dtype.clone()), width);
        let arrow_dtype = dtype.to_arrow(CompatLevel::newest());
        let field = Arc::new(Field::new(name, dtype));
        // Width 0: the values buffer is empty regardless of `length`, so a
        // single synthetic chunk suffices.
        if width == 0 {
            use arrow::array::builder::{ArrayBuilder, make_builder};
            let values = make_builder(&inner_dtype.to_arrow(CompatLevel::newest())).freeze();
            return ArrayChunked::new_with_compute_len(
                field,
                vec![FixedSizeListArray::new(arrow_dtype, length, values, None).into_boxed()],
            );
        }
        let mut total_len = 0;
        let chunks = chunks
            .into_iter()
            .map(|chunk| {
                debug_assert_eq!(chunk.len() % width, 0);
                let chunk_len = chunk.len() / width;
                total_len += chunk_len;
                FixedSizeListArray::new(arrow_dtype.clone(), chunk_len, chunk, None).into_boxed()
            })
            .collect();
        debug_assert_eq!(total_len, length);

        // SAFETY: lengths were computed above; no validity, so null_count is 0.
        unsafe { Self::new_with_dims(field, chunks, length, 0) }
    }

    /// Turn the ArrayChunked into the ListChunked with the same items.
    ///
    /// This will always zero copy the values into the ListChunked.
    pub fn to_list(&self) -> ListChunked {
        let inner_dtype = self.inner_dtype();
        let chunks = self
            .downcast_iter()
            .map(|chunk| {
                use arrow::offset::OffsetsBuffer;

                let inner_dtype = chunk.dtype().inner_dtype().unwrap();
                let dtype = inner_dtype.clone().to_large_list(true);

                // Fixed-size elements become evenly spaced list offsets.
                let offsets = (0..=chunk.len())
                    .map(|i| (i * self.width()) as i64)
                    .collect::<Vec<i64>>();

                // SAFETY: We created our offsets in ascending manner.
                let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };

                ListArray::<i64>::new(
                    dtype,
                    offsets,
                    chunk.values().clone(),
                    chunk.validity().cloned(),
                )
                .into_boxed()
            })
            .collect();

        // SAFETY: All the items were mapped 1-1 with the validity staying the same.
        let mut ca = unsafe {
            ListChunked::new_with_dims(
                Arc::new(Field::new(
                    self.name().clone(),
                    DataType::List(Box::new(inner_dtype.clone())),
                )),
                chunks,
                self.len(),
                self.null_count(),
            )
        };
        // Fast explode is only safe when no element is masked out by a null.
        ca.set_fast_explode_list(!self.has_nulls());
        ca
    }
}
804
805
impl<T> ChunkedArray<T>
806
where
807
T: PolarsDataType,
808
{
809
/// Should be used to match the chunk_id of another [`ChunkedArray`].
810
/// # Panics
811
/// It is the callers responsibility to ensure that this [`ChunkedArray`] has a single chunk.
812
pub fn match_chunks<I>(&self, chunk_id: I) -> Self
813
where
814
I: Iterator<Item = usize>,
815
{
816
debug_assert!(self.chunks.len() == 1);
817
// Takes a ChunkedArray containing a single chunk.
818
let slice = |ca: &Self| {
819
let array = &ca.chunks[0];
820
821
let mut offset = 0;
822
let chunks = chunk_id
823
.map(|len| {
824
// SAFETY: within bounds.
825
debug_assert!((offset + len) <= array.len());
826
let out = unsafe { array.sliced_unchecked(offset, len) };
827
offset += len;
828
out
829
})
830
.collect();
831
832
debug_assert_eq!(offset, array.len());
833
834
// SAFETY: We just slice the original chunks, their type will not change.
835
unsafe {
836
Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())
837
}
838
};
839
840
if self.chunks.len() != 1 {
841
let out = self.rechunk();
842
slice(&out)
843
} else {
844
slice(self)
845
}
846
}
847
}
848
849
impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {
850
fn as_ref_dtype(&self) -> &DataType {
851
self.dtype()
852
}
853
}
854
855
pub(crate) trait AsSinglePtr: AsRefDataType {
    /// Rechunk and return a ptr to the start of the array
    ///
    /// The default implementation errors: only dtypes backed by one
    /// contiguous buffer can provide such a pointer (see the numeric impl).
    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
        polars_bail!(opq = as_single_ptr, self.as_ref_dtype());
    }
}
861
862
impl<T> AsSinglePtr for ChunkedArray<T>
where
    T: PolarsNumericType,
{
    /// Rechunk to a single contiguous buffer and return its start address.
    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
        // Force a single chunk so one data view covers all values.
        self.rechunk_mut();
        let a = self.data_views().next().unwrap();
        let ptr = a.as_ptr();
        Ok(ptr as usize)
    }
}
873
874
// Non-numeric chunked arrays are not backed by one contiguous value buffer,
// so they keep the erroring default implementation of `as_single_ptr`.
impl AsSinglePtr for BooleanChunked {}
impl AsSinglePtr for ListChunked {}
#[cfg(feature = "dtype-array")]
impl AsSinglePtr for ArrayChunked {}
impl AsSinglePtr for StringChunked {}
impl AsSinglePtr for BinaryChunked {}
#[cfg(feature = "object")]
impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}
882
883
/// Borrowed view of a [`ChunkedArray`], classified by chunk count and null
/// presence so callers can pick the fastest code path; see [`ChunkedArray::layout`].
pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {
    /// Exactly one chunk, no null values.
    SingleNoNull(&'a T::Array),
    /// Exactly one chunk, may contain nulls.
    Single(&'a T::Array),
    /// Multiple chunks, no null values.
    MultiNoNull(&'a ChunkedArray<T>),
    /// Multiple chunks, may contain nulls.
    Multi(&'a ChunkedArray<T>),
}
889
890
impl<T> ChunkedArray<T>
891
where
892
T: PolarsDataType,
893
{
894
pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {
895
if self.chunks.len() == 1 {
896
let arr = self.downcast_iter().next().unwrap();
897
return if arr.null_count() == 0 {
898
ChunkedArrayLayout::SingleNoNull(arr)
899
} else {
900
ChunkedArrayLayout::Single(arr)
901
};
902
}
903
904
if self.downcast_iter().all(|a| a.null_count() == 0) {
905
ChunkedArrayLayout::MultiNoNull(self)
906
} else {
907
ChunkedArrayLayout::Multi(self)
908
}
909
}
910
}
911
912
impl<T> ChunkedArray<T>
where
    T: PolarsNumericType,
{
    /// Returns the values of the array as a contiguous slice.
    ///
    /// # Errors
    /// Fails unless the array is a single chunk without nulls.
    pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {
        polars_ensure!(
            self.chunks.len() == 1 && self.chunks[0].null_count() == 0,
            ComputeError: "chunked array is not contiguous"
        );
        Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())
    }

    /// Returns the values of the array as a contiguous mutable slice.
    ///
    /// Returns `None` unless the array is a single chunk without nulls
    /// (or when the buffer is shared and cannot be mutated).
    pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {
        if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {
            // SAFETY, we will not swap the PrimitiveArray.
            let arr = unsafe { self.downcast_iter_mut().next().unwrap() };
            arr.get_mut_values()
        } else {
            None
        }
    }

    /// Get slices of the underlying arrow data.
    /// NOTE: null values should be taken into account by the user of these slices as they are handled
    /// separately
    pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {
        self.downcast_iter().map(|arr| arr.values().as_slice())
    }

    /// Iterate over all values, ignoring validity (masked-out slots yield
    /// their underlying — possibly garbage — value).
    #[allow(clippy::wrong_self_convention)]
    pub fn into_no_null_iter(
        &self,
    ) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen
    {
        // .copied was significantly slower in benchmark, next call did not inline?
        #[allow(clippy::map_clone)]
        // we know the iterators len
        unsafe {
            self.data_views()
                .flatten()
                .map(|v| *v)
                .trust_my_length(self.len())
        }
    }
}
959
960
impl<T: PolarsDataType> Clone for ChunkedArray<T> {
961
fn clone(&self) -> Self {
962
ChunkedArray {
963
field: self.field.clone(),
964
chunks: self.chunks.clone(),
965
flags: self.flags.clone(),
966
967
_pd: Default::default(),
968
length: self.length,
969
null_count: self.null_count,
970
}
971
}
972
}
973
974
impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {
    /// Identity conversion so generic code can accept `impl AsRef<ChunkedArray<T>>`.
    #[inline]
    fn as_ref(&self) -> &ChunkedArray<T> {
        self
    }
}
979
980
impl ValueSize for ListChunked {
981
fn get_values_size(&self) -> usize {
982
self.chunks
983
.iter()
984
.fold(0usize, |acc, arr| acc + arr.get_values_size())
985
}
986
}
987
988
#[cfg(feature = "dtype-array")]
989
impl ValueSize for ArrayChunked {
990
fn get_values_size(&self) -> usize {
991
self.chunks
992
.iter()
993
.fold(0usize, |acc, arr| acc + arr.get_values_size())
994
}
995
}
996
impl ValueSize for StringChunked {
997
fn get_values_size(&self) -> usize {
998
self.chunks
999
.iter()
1000
.fold(0usize, |acc, arr| acc + arr.get_values_size())
1001
}
1002
}
1003
1004
impl ValueSize for BinaryOffsetChunked {
1005
fn get_values_size(&self) -> usize {
1006
self.chunks
1007
.iter()
1008
.fold(0usize, |acc, arr| acc + arr.get_values_size())
1009
}
1010
}
1011
1012
pub(crate) fn to_primitive<T: PolarsNumericType>(
1013
values: Vec<T::Native>,
1014
validity: Option<Bitmap>,
1015
) -> PrimitiveArray<T::Native> {
1016
PrimitiveArray::new(
1017
T::get_static_dtype().to_arrow(CompatLevel::newest()),
1018
values.into(),
1019
validity,
1020
)
1021
}
1022
1023
pub(crate) fn to_array<T: PolarsNumericType>(
1024
values: Vec<T::Native>,
1025
validity: Option<Bitmap>,
1026
) -> ArrayRef {
1027
Box::new(to_primitive::<T>(values, validity))
1028
}
1029
1030
impl<T: PolarsDataType> Default for ChunkedArray<T> {
    /// An empty, unnamed array of `T`'s static dtype.
    fn default() -> Self {
        let dtype = T::get_static_dtype();
        // Chunks store the physical representation of logical dtypes.
        let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
        ChunkedArray {
            field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),
            // Invariant: always has 1 chunk.
            chunks: vec![new_empty_array(arrow_dtype)],
            flags: StatisticsFlagsIM::empty(),

            _pd: Default::default(),
            length: 0,
            null_count: 0,
        }
    }
}
1046
1047
#[cfg(test)]
pub(crate) mod test {
    use crate::prelude::*;

    /// Shared fixture: a three-element `Int32Chunked` named "a" holding `[1, 2, 3]`.
    pub(crate) fn get_chunked_array() -> Int32Chunked {
        ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])
    }

    #[test]
    fn test_sort() {
        // Ascending numeric sort: flatten drops the Option wrapper (no nulls here).
        let ints = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);
        let sorted: Vec<_> = ints.sort(false).into_iter().flatten().collect();
        assert_eq!(sorted, [1, 2, 3, 9]);

        // Ascending string sort; the result must also carry the sorted flag.
        let strings = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);
        let strings = strings.sort(false);
        let collected: Vec<_> = strings.into_iter().collect();
        assert_eq!(collected, [Some("a"), Some("b"), Some("c")]);
        assert!(strings.is_sorted_ascending_flag());
    }

    #[test]
    fn arithmetic() {
        let lhs = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);
        let rhs = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);

        // Not really asserting anything here but still making sure the code is exercised
        // This (and more) is properly tested from the integration test suite and Python bindings.
        println!("{:?}", lhs + rhs);
        println!("{:?}", lhs - rhs);
        println!("{:?}", lhs * rhs);
        println!("{:?}", lhs / rhs);
    }

    #[test]
    fn iter() {
        let ca = get_chunked_array();
        // Summing via the iterator: 1 + 2 + 3.
        assert_eq!(ca.into_iter().fold(0, |acc, opt| acc + opt.unwrap()), 6)
    }

    #[test]
    fn limit() {
        let ca = get_chunked_array();
        let limited = ca.limit(2);
        println!("{limited:?}");
        assert_eq!(limited.len(), 2)
    }

    #[test]
    fn filter() {
        let ca = get_chunked_array();
        let mask = BooleanChunked::new(PlSmallStr::from_static("filter"), &[true, false, false]);
        let kept = ca.filter(&mask).unwrap();
        assert_eq!(kept.len(), 1);
        // Only the first element survives the mask.
        assert_eq!(kept.into_iter().next(), Some(Some(1)));
    }

    #[test]
    fn aggregates() {
        let ca = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);
        assert_eq!(ca.max(), Some(100));
        assert_eq!(ca.min(), Some(1));
        assert_eq!(ca.sum(), Some(120))
    }

    #[test]
    fn take() {
        let ca = get_chunked_array();
        let taken = ca.take(&[0 as IdxSize, 1]).unwrap();
        assert_eq!(taken.len(), 2)
    }

    #[test]
    fn cast() {
        let ca = get_chunked_array();
        let widened = ca.cast(&DataType::Int64).unwrap();
        assert_eq!(widened.dtype(), &DataType::Int64)
    }

    /// Helper: assert a numeric ChunkedArray (assumed null-free) equals a slice.
    fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])
    where
        T: PolarsNumericType,
    {
        assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)
    }

    #[test]
    fn slice() {
        // Build a two-chunk array [0..=2] ++ [3..=5] so slicing crosses chunk boundaries.
        let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);
        let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);
        first.append(&second).unwrap();

        // In-bounds slices, including negative (from-the-end) offsets.
        assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);
        assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);
        assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);
        assert_slice_equal(&first.slice(3, 2), &[3, 4]);
        assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);
        assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);
        assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);

        // Out-of-bounds requests are clamped rather than erroring.
        assert_eq!(first.slice(-7, 2).len(), 1);
        assert_eq!(first.slice(-3, 4).len(), 3);
        assert_eq!(first.slice(3, 4).len(), 3);
        assert_eq!(first.slice(10, 4).len(), 0);
    }

    #[test]
    fn sorting() {
        // Numeric: both directions.
        let ca = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);
        assert_slice_equal(&ca.sort(false), &[2, 4, 9]);
        assert_slice_equal(&ca.sort(true), &[9, 4, 2]);

        // Strings: both directions.
        let ca: StringChunked = ["b", "a", "z"].iter().collect();
        let ascending = ca.sort(false);
        assert_eq!(
            ascending.into_iter().collect::<Vec<_>>(),
            &[Some("a"), Some("b"), Some("z")]
        );
        let descending = ca.sort(true);
        assert_eq!(
            descending.into_iter().collect::<Vec<_>>(),
            &[Some("z"), Some("b"), Some("a")]
        );

        // Nulls sort first in ascending order.
        let ca: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();
        let ascending = ca.sort(false);
        assert_eq!(
            ascending.into_iter().collect::<Vec<_>>(),
            &[None, Some("b"), Some("z")]
        );
    }

    #[test]
    fn reverse() {
        // path with continuous slice
        let ca = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);
        assert_slice_equal(&ca.reverse(), &[3, 2, 1]);
        // path with options
        let ca = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);
        assert_eq!(Vec::from(&ca.reverse()), &[Some(3), None, Some(1)]);

        let ca = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);
        assert_eq!(Vec::from(&ca.reverse()), &[Some(false), Some(true)]);

        let ca = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);
        assert_eq!(Vec::from(&ca.reverse()), &[Some("c"), Some("b"), Some("a")]);

        let ca = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);
        assert_eq!(Vec::from(&ca.reverse()), &[Some("c"), None, Some("a")]);
    }

    #[test]
    #[cfg(feature = "dtype-categorical")]
    fn test_iter_categorical() {
        // Cast strings to a categorical dtype and check the physical codes,
        // which are assigned in order of first appearance.
        let strings = StringChunked::new(
            PlSmallStr::EMPTY,
            &[Some("foo"), None, Some("bar"), Some("ham")],
        );
        let categories = Categories::new(
            PlSmallStr::EMPTY,
            PlSmallStr::EMPTY,
            CategoricalPhysical::U32,
        );
        let as_cat = strings.cast(&DataType::from_categories(categories)).unwrap();
        let as_cat = as_cat.cat32().unwrap();
        let codes: Vec<_> = as_cat.physical().into_iter().collect();
        assert_eq!(codes, &[Some(0), None, Some(1), Some(2)]);
    }

    #[test]
    #[ignore]
    fn test_shrink_to_fit() {
        // Over-allocate via the builder, then verify shrink_to_fit reclaims space.
        let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);
        builder.append_value("foo");
        let mut ca = builder.finish();

        // Estimated heap footprint summed over all chunks.
        let heap_size = |ca: &StringChunked| -> usize {
            ca.chunks()
                .iter()
                .map(|chunk| arrow::compute::aggregate::estimated_bytes_size(chunk.as_ref()))
                .sum()
        };

        let before = heap_size(&ca);
        ca.shrink_to_fit();
        let after = heap_size(&ca);
        assert!(before > after);
    }
}