Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-core/src/utils/mod.rs
8420 views
1
mod any_value;
2
use arrow::compute::concatenate::concatenate_validities;
3
use arrow::compute::utils::combine_validities_and;
4
pub mod flatten;
5
pub(crate) mod series;
6
mod supertype;
7
use std::borrow::Cow;
8
use std::ops::{Deref, DerefMut};
9
mod schema;
10
11
pub use any_value::*;
12
use arrow::bitmap::Bitmap;
13
pub use arrow::legacy::utils::*;
14
pub use arrow::trusted_len::TrustMyLength;
15
use flatten::*;
16
use num_traits::{One, Zero};
17
use rayon::prelude::*;
18
pub use schema::*;
19
pub use series::*;
20
pub use supertype::*;
21
pub use {arrow, rayon};
22
23
use crate::POOL;
24
use crate::prelude::*;
25
26
/// Transparent newtype wrapper, useful for working around coherence rules and
/// for attaching local trait impls to foreign types.
#[repr(transparent)]
pub struct Wrap<T>(pub T);

impl<T> Deref for Wrap<T> {
    type Target = T;

    fn deref(&self) -> &Self::Target {
        let Wrap(inner) = self;
        inner
    }
}
35
36
/// Number of partitions to split parallel work into: one per thread in the
/// global rayon thread pool (`POOL`).
#[inline(always)]
pub fn _set_partition_size() -> usize {
    POOL.current_num_threads()
}
40
41
/// Wrapper marking a collection as guaranteed free of null values.
///
/// Exists so specialized trait impls can coexist with the `Option`-based ones,
/// e.g. `impl<T> FromIterator<T::Native> for NoNull<ChunkedArray<T>>` next to
/// the already-present `impl<T> FromIterator<Option<T::Native>> for ChunkedArray<T>`.
pub struct NoNull<T> {
    inner: T,
}

impl<T> NoNull<T> {
    /// Wrap a value.
    pub fn new(inner: T) -> Self {
        Self { inner }
    }

    /// Consume the wrapper, returning the inner value.
    pub fn into_inner(self) -> T {
        self.inner
    }
}

impl<T> Deref for NoNull<T> {
    type Target = T;

    fn deref(&self) -> &Self::Target {
        let Self { inner } = self;
        inner
    }
}

impl<T> DerefMut for NoNull<T> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        let Self { inner } = self;
        inner
    }
}
74
75
/// Estimate a sensible preallocation size from an iterator's `size_hint`:
/// the upper bound when known, otherwise a non-zero lower bound, otherwise 1024.
pub(crate) fn get_iter_capacity<T, I: Iterator<Item = T>>(iter: &I) -> usize {
    let (lower, upper) = iter.size_hint();
    match upper {
        Some(cap) => cap,
        None if lower == 0 => 1024,
        None => lower,
    }
}
82
83
// prefer this one over split_ca, as this can push the null_count into the thread pool
84
// returns an `(offset, length)` tuple
85
#[doc(hidden)]
86
pub fn _split_offsets(len: usize, n: usize) -> Vec<(usize, usize)> {
87
if n == 1 {
88
vec![(0, len)]
89
} else {
90
let chunk_size = len / n;
91
92
(0..n)
93
.map(|partition| {
94
let offset = partition * chunk_size;
95
let len = if partition == (n - 1) {
96
len - offset
97
} else {
98
chunk_size
99
};
100
(partition * chunk_size, len)
101
})
102
.collect_trusted()
103
}
104
}
105
106
#[allow(clippy::len_without_is_empty)]
/// Abstraction over chunked, sliceable collections (`DataFrame`, `Series`,
/// `ChunkedArray`) so the `split*` helpers in this module can be written once.
pub trait Container: Clone {
    /// Slice out `len` elements starting at `offset` (offsets are resolved as in
    /// `slice_offsets`; negative values presumably count from the end — confirm per impl).
    fn slice(&self, offset: i64, len: usize) -> Self;

    /// Split into the parts before and after `offset`.
    fn split_at(&self, offset: i64) -> (Self, Self);

    /// Number of rows/elements.
    fn len(&self) -> usize;

    /// Iterate the underlying chunks, each wrapped as its own single-chunk container.
    fn iter_chunks(&self) -> impl Iterator<Item = Self>;

    /// Whether the chunks are misaligned and must be rechunked before per-chunk work.
    fn should_rechunk(&self) -> bool;

    /// Number of underlying chunks.
    fn n_chunks(&self) -> usize;

    /// Length of every underlying chunk, in order.
    fn chunk_lengths(&self) -> impl Iterator<Item = usize>;
}
122
123
impl Container for DataFrame {
    fn slice(&self, offset: i64, len: usize) -> Self {
        DataFrame::slice(self, offset, len)
    }

    fn split_at(&self, offset: i64) -> (Self, Self) {
        DataFrame::split_at(self, offset)
    }

    // A DataFrame's "length" is its row count.
    fn len(&self) -> usize {
        self.height()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = Self> {
        flatten_df_iter(self)
    }

    fn should_rechunk(&self) -> bool {
        self.should_rechunk()
    }

    // Chunk count is taken from the first column.
    fn n_chunks(&self) -> usize {
        DataFrame::first_col_n_chunks(self)
    }

    fn chunk_lengths(&self) -> impl Iterator<Item = usize> {
        // @scalar-correctness?
        // NOTE(review): chunk layout is read from the first column only — this assumes
        // all columns are chunk-aligned; confirm for frames containing scalar columns.
        self.columns()[0].as_materialized_series().chunk_lengths()
    }
}
153
154
impl<T: PolarsDataType> Container for ChunkedArray<T> {
    fn slice(&self, offset: i64, len: usize) -> Self {
        ChunkedArray::slice(self, offset, len)
    }

    fn split_at(&self, offset: i64) -> (Self, Self) {
        ChunkedArray::split_at(self, offset)
    }

    fn len(&self) -> usize {
        ChunkedArray::len(self)
    }

    // Wrap each physical chunk in a fresh single-chunk array carrying the same name.
    fn iter_chunks(&self) -> impl Iterator<Item = Self> {
        self.downcast_iter()
            .map(|arr| Self::with_chunk(self.name().clone(), arr.clone()))
    }

    fn should_rechunk(&self) -> bool {
        // A lone array has no sibling columns to be misaligned with.
        false
    }

    fn n_chunks(&self) -> usize {
        self.chunks().len()
    }

    fn chunk_lengths(&self) -> impl Iterator<Item = usize> {
        ChunkedArray::chunk_lengths(self)
    }
}
184
185
impl Container for Series {
    // All methods delegate to the inner series implementation (`self.0`).
    fn slice(&self, offset: i64, len: usize) -> Self {
        self.0.slice(offset, len)
    }

    fn split_at(&self, offset: i64) -> (Self, Self) {
        self.0.split_at(offset)
    }

    fn len(&self) -> usize {
        self.0.len()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = Self> {
        (0..self.0.n_chunks()).map(|i| self.select_chunk(i))
    }

    fn should_rechunk(&self) -> bool {
        // A lone series has no sibling columns to be misaligned with.
        false
    }

    fn n_chunks(&self) -> usize {
        self.chunks().len()
    }

    fn chunk_lengths(&self) -> impl Iterator<Item = usize> {
        self.0.chunk_lengths()
    }
}
214
215
/// Split `container` into exactly `target` pieces of roughly `chunk_size` elements each;
/// the final piece absorbs the remainder.
///
/// Assumes `target >= 1` and `chunk_size * (target - 1) <= container.len()` so every
/// `split_at` is in bounds — TODO confirm callers uphold this.
fn split_impl<C: Container>(container: &C, target: usize, chunk_size: usize) -> Vec<C> {
    if target == 1 {
        return vec![container.clone()];
    }
    let mut out = Vec::with_capacity(target);
    let chunk_size = chunk_size as i64;

    // First split
    let (chunk, mut remainder) = container.split_at(chunk_size);
    out.push(chunk);

    // Take the rest of the splits of exactly chunk size, but skip the last remainder as we won't split that.
    for _ in 1..target - 1 {
        let (a, b) = remainder.split_at(chunk_size);
        out.push(a);
        remainder = b
    }
    // This can be slightly larger than `chunk_size`, but is smaller than `2 * chunk_size`.
    out.push(remainder);
    out
}
236
237
/// Splits, but doesn't flatten chunks. E.g. a container can still have multiple chunks.
238
pub fn split<C: Container>(container: &C, target: usize) -> Vec<C> {
239
let total_len = container.len();
240
if total_len == 0 {
241
return vec![container.clone()];
242
}
243
244
let chunk_size = std::cmp::max(total_len / target, 1);
245
246
if container.n_chunks() == target
247
&& container
248
.chunk_lengths()
249
.all(|len| len.abs_diff(chunk_size) < 100)
250
// We cannot get chunks if they are misaligned
251
&& !container.should_rechunk()
252
{
253
return container.iter_chunks().collect();
254
}
255
split_impl(container, target, chunk_size)
256
}
257
258
/// Split a [`Container`] in `target` elements, flattening multi-chunk containers so
/// output pieces follow chunk boundaries. The target doesn't have to be respected;
/// deviation of the target might be done to create more equal size chunks.
pub fn split_and_flatten<C: Container>(container: &C, target: usize) -> Vec<C> {
    let total_len = container.len();
    if total_len == 0 {
        return vec![container.clone()];
    }

    let chunk_size = std::cmp::max(total_len / target, 1);

    // Fast path: existing chunk layout already roughly matches the requested split.
    if container.n_chunks() == target
        && container
            .chunk_lengths()
            .all(|len| len.abs_diff(chunk_size) < 100)
        // We cannot get chunks if they are misaligned
        && !container.should_rechunk()
    {
        return container.iter_chunks().collect();
    }

    if container.n_chunks() == 1 {
        split_impl(container, target, chunk_size)
    } else {
        let mut out = Vec::with_capacity(target);
        let chunks = container.iter_chunks();

        // Walk each physical chunk and carve `chunk_size`-sized pieces off its front.
        'new_chunk: for mut chunk in chunks {
            loop {
                let h = chunk.len();
                if h < chunk_size {
                    // TODO if the chunk is much smaller than chunk size, we should try to merge it with the next one.
                    out.push(chunk);
                    continue 'new_chunk;
                }

                // If a split leads to the next chunk being smaller than 30% take the whole chunk
                if ((h - chunk_size) as f64 / chunk_size as f64) < 0.3 {
                    out.push(chunk);
                    continue 'new_chunk;
                }

                let (a, b) = chunk.split_at(chunk_size as i64);
                out.push(a);
                chunk = b;
            }
        }
        out
    }
}
307
308
/// Split a [`DataFrame`] in `target` elements. The target doesn't have to be respected if not
309
/// strict. Deviation of the target might be done to create more equal size chunks.
310
///
311
/// # Panics
312
/// if chunks are not aligned
313
pub fn split_df_as_ref(df: &DataFrame, target: usize, strict: bool) -> Vec<DataFrame> {
314
if strict {
315
split(df, target)
316
} else {
317
split_and_flatten(df, target)
318
}
319
}
320
321
#[doc(hidden)]
322
/// Split a [`DataFrame`] into `n` parts. We take a `&mut` to be able to repartition/align chunks.
323
/// `strict` in that it respects `n` even if the chunks are suboptimal.
324
pub fn split_df(df: &mut DataFrame, target: usize, strict: bool) -> Vec<DataFrame> {
325
if target == 0 || df.height() == 0 {
326
return vec![df.clone()];
327
}
328
// make sure that chunks are aligned.
329
df.align_chunks_par();
330
split_df_as_ref(df, target, strict)
331
}
332
333
pub fn slice_slice<T>(vals: &[T], offset: i64, len: usize) -> &[T] {
334
let (raw_offset, slice_len) = slice_offsets(offset, len, vals.len());
335
&vals[raw_offset..raw_offset + slice_len]
336
}
337
338
/// Resolve a (possibly negative) `offset` and requested `length` against an array of
/// `array_len` elements, returning the clamped `(start_index, slice_length)`.
///
/// Negative offsets count from the end of the array; both endpoints are clamped into
/// `[0, array_len]`, so the result is always a valid in-bounds range.
///
/// # Panics
/// Panics if `array_len` exceeds `i64::MAX`.
#[inline]
pub fn slice_offsets(offset: i64, length: usize, array_len: usize) -> (usize, usize) {
    let len_i64: i64 = array_len
        .try_into()
        .expect("array length larger than i64::MAX");

    // A negative offset is relative to the end of the array.
    let start = if offset < 0 {
        offset.saturating_add_unsigned(array_len as u64)
    } else {
        offset
    };
    let stop = start.saturating_add_unsigned(length as u64);

    let start = start.clamp(0, len_i64);
    let stop = stop.clamp(0, len_i64);

    (start as usize, (stop - start) as usize)
}
357
358
/// Dispatch on a [`DataType`], invoking `$macro` with the matching Rust physical
/// primitive (`u8`, `i64`, `f64`, …). String and Boolean are routed to their own
/// dedicated macros. Panics on any dtype not covered below.
#[macro_export]
macro_rules! match_dtype_to_physical_apply_macro {
    ($obj:expr, $macro:ident, $macro_string:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{
        match $obj {
            DataType::String => $macro_string!($($opt_args)*),
            DataType::Boolean => $macro_bool!($($opt_args)*),
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => $macro!(u8 $(, $opt_args)*),
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => $macro!(u16 $(, $opt_args)*),
            DataType::UInt32 => $macro!(u32 $(, $opt_args)*),
            DataType::UInt64 => $macro!(u64 $(, $opt_args)*),
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => $macro!(i8 $(, $opt_args)*),
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => $macro!(i16 $(, $opt_args)*),
            DataType::Int32 => $macro!(i32 $(, $opt_args)*),
            DataType::Int64 => $macro!(i64 $(, $opt_args)*),
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => $macro!(i128 $(, $opt_args)*),
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => $macro!(pf16 $(, $opt_args)*),
            DataType::Float32 => $macro!(f32 $(, $opt_args)*),
            DataType::Float64 => $macro!(f64 $(, $opt_args)*),
            dt => panic!("not implemented for dtype {:?}", dt),
        }
    }};
}
387
388
/// Dispatch on a [`DataType`], invoking `$macro` with the matching polars marker type
/// (`UInt8Type`, `Int64Type`, …). String, Binary and Boolean are routed to their own
/// dedicated macros. Panics on any dtype not covered below.
#[macro_export]
macro_rules! match_dtype_to_logical_apply_macro {
    ($obj:expr, $macro:ident, $macro_string:ident, $macro_binary:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{
        match $obj {
            DataType::String => $macro_string!($($opt_args)*),
            DataType::Binary => $macro_binary!($($opt_args)*),
            DataType::Boolean => $macro_bool!($($opt_args)*),
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => $macro!(UInt8Type $(, $opt_args)*),
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => $macro!(UInt16Type $(, $opt_args)*),
            DataType::UInt32 => $macro!(UInt32Type $(, $opt_args)*),
            DataType::UInt64 => $macro!(UInt64Type $(, $opt_args)*),
            #[cfg(feature = "dtype-u128")]
            DataType::UInt128 => $macro!(UInt128Type $(, $opt_args)*),
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => $macro!(Int8Type $(, $opt_args)*),
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => $macro!(Int16Type $(, $opt_args)*),
            DataType::Int32 => $macro!(Int32Type $(, $opt_args)*),
            DataType::Int64 => $macro!(Int64Type $(, $opt_args)*),
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => $macro!(Int128Type $(, $opt_args)*),
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => $macro!(Float16Type $(, $opt_args)*),
            DataType::Float32 => $macro!(Float32Type $(, $opt_args)*),
            DataType::Float64 => $macro!(Float64Type $(, $opt_args)*),
            dt => panic!("not implemented for dtype {:?}", dt),
        }
    }};
}
420
421
/// Dispatch on `$self.dtype()`, invoking `$macro` with the downcasted ChunkedArray
/// (`$self.u8().unwrap()`, …). String and Boolean get dedicated macros.
/// Panics on any dtype not covered below.
#[macro_export]
macro_rules! match_arrow_dtype_apply_macro_ca {
    ($self:expr, $macro:ident, $macro_string:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{
        match $self.dtype() {
            DataType::String => $macro_string!($self.str().unwrap() $(, $opt_args)*),
            DataType::Boolean => $macro_bool!($self.bool().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => $macro!($self.u8().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => $macro!($self.u16().unwrap() $(, $opt_args)*),
            DataType::UInt32 => $macro!($self.u32().unwrap() $(, $opt_args)*),
            DataType::UInt64 => $macro!($self.u64().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-u128")]
            DataType::UInt128 => $macro!($self.u128().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => $macro!($self.i8().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => $macro!($self.i16().unwrap() $(, $opt_args)*),
            DataType::Int32 => $macro!($self.i32().unwrap() $(, $opt_args)*),
            DataType::Int64 => $macro!($self.i64().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => $macro!($self.i128().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => $macro!($self.f16().unwrap() $(, $opt_args)*),
            DataType::Float32 => $macro!($self.f32().unwrap() $(, $opt_args)*),
            DataType::Float64 => $macro!($self.f64().unwrap() $(, $opt_args)*),
            dt => panic!("not implemented for dtype {:?}", dt),
        }
    }};
}
452
453
/// Dispatch on a numeric [`DataType`], binding `$T` to the matching Rust primitive
/// (`i8`..`i128`, `u8`..`u128`, `pf16`/`f32`/`f64`) and expanding `$body` for the
/// active arm. Panics on non-numeric dtypes.
#[macro_export]
macro_rules! with_match_physical_numeric_type {(
    $dtype:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    #[cfg(feature = "dtype-f16")]
    use polars_utils::float16::pf16;
    use $crate::datatypes::DataType::*;
    match $dtype {
        #[cfg(feature = "dtype-i8")]
        Int8 => __with_ty__! { i8 },
        #[cfg(feature = "dtype-i16")]
        Int16 => __with_ty__! { i16 },
        Int32 => __with_ty__! { i32 },
        Int64 => __with_ty__! { i64 },
        #[cfg(feature = "dtype-i128")]
        Int128 => __with_ty__! { i128 },
        #[cfg(feature = "dtype-u8")]
        UInt8 => __with_ty__! { u8 },
        #[cfg(feature = "dtype-u16")]
        UInt16 => __with_ty__! { u16 },
        UInt32 => __with_ty__! { u32 },
        UInt64 => __with_ty__! { u64 },
        #[cfg(feature = "dtype-u128")]
        UInt128 => __with_ty__! { u128 },
        #[cfg(feature = "dtype-f16")]
        Float16 => __with_ty__! { pf16 },
        Float32 => __with_ty__! { f32 },
        Float64 => __with_ty__! { f64 },
        dt => panic!("not implemented for dtype {:?}", dt),
    }
})}
485
486
/// Dispatch on an integer [`DataType`], binding `$T` to the matching Rust integer
/// primitive (`i8`..`i128`, `u8`..`u128`) and expanding `$body` for the active arm.
/// Panics on non-integer dtypes.
//
// NOTE(review): the `pf16` float import that was copy-pasted here from the numeric
// variant has been removed — this integer-only dispatch has no float arms, so the
// import was unused in every expansion.
#[macro_export]
macro_rules! with_match_physical_integer_type {(
    $dtype:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    use $crate::datatypes::DataType::*;
    match $dtype {
        #[cfg(feature = "dtype-i8")]
        Int8 => __with_ty__! { i8 },
        #[cfg(feature = "dtype-i16")]
        Int16 => __with_ty__! { i16 },
        Int32 => __with_ty__! { i32 },
        Int64 => __with_ty__! { i64 },
        #[cfg(feature = "dtype-i128")]
        Int128 => __with_ty__! { i128 },
        #[cfg(feature = "dtype-u8")]
        UInt8 => __with_ty__! { u8 },
        #[cfg(feature = "dtype-u16")]
        UInt16 => __with_ty__! { u16 },
        UInt32 => __with_ty__! { u32 },
        UInt64 => __with_ty__! { u64 },
        #[cfg(feature = "dtype-u128")]
        UInt128 => __with_ty__! { u128 },
        dt => panic!("not implemented for dtype {:?}", dt),
    }
})}
514
515
/// Dispatch on a float [`DataType`], binding `$T` to the matching Rust float
/// primitive (`pf16`/`f32`/`f64`) and expanding `$body` for the active arm.
/// Panics on non-float dtypes.
#[macro_export]
macro_rules! with_match_physical_float_type {(
    $dtype:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    // Gate the `pf16` import on the feature, consistent with the sibling
    // `with_match_physical_numeric_type!`: without `dtype-f16` there is no
    // `Float16` arm and the import would be unused in every expansion.
    #[cfg(feature = "dtype-f16")]
    use polars_utils::float16::pf16;
    use $crate::datatypes::DataType::*;
    match $dtype {
        #[cfg(feature = "dtype-f16")]
        Float16 => __with_ty__! { pf16 },
        Float32 => __with_ty__! { f32 },
        Float64 => __with_ty__! { f64 },
        dt => panic!("not implemented for dtype {:?}", dt),
    }
})}
530
531
/// Dispatch on a float [`DataType`], binding `$T` to the polars marker type
/// (`Float16Type`/`Float32Type`/`Float64Type`). Panics on non-float dtypes.
#[macro_export]
macro_rules! with_match_physical_float_polars_type {(
    $key_type:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    use $crate::datatypes::DataType::*;
    match $key_type {
        #[cfg(feature = "dtype-f16")]
        Float16 => __with_ty__! { Float16Type },
        Float32 => __with_ty__! { Float32Type },
        Float64 => __with_ty__! { Float64Type },
        dt => panic!("not implemented for dtype {:?}", dt),
    }
})}
545
546
/// Dispatch on a numeric [`DataType`], binding `$T` to the polars marker type
/// (`Int8Type`..`UInt128Type`, `Float16Type`..`Float64Type`). Panics on
/// non-numeric dtypes.
#[macro_export]
macro_rules! with_match_physical_numeric_polars_type {(
    $key_type:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    use $crate::datatypes::DataType::*;
    match $key_type {
        #[cfg(feature = "dtype-i8")]
        Int8 => __with_ty__! { Int8Type },
        #[cfg(feature = "dtype-i16")]
        Int16 => __with_ty__! { Int16Type },
        Int32 => __with_ty__! { Int32Type },
        Int64 => __with_ty__! { Int64Type },
        #[cfg(feature = "dtype-i128")]
        Int128 => __with_ty__! { Int128Type },
        #[cfg(feature = "dtype-u8")]
        UInt8 => __with_ty__! { UInt8Type },
        #[cfg(feature = "dtype-u16")]
        UInt16 => __with_ty__! { UInt16Type },
        UInt32 => __with_ty__! { UInt32Type },
        UInt64 => __with_ty__! { UInt64Type },
        #[cfg(feature = "dtype-u128")]
        UInt128 => __with_ty__! { UInt128Type },
        #[cfg(feature = "dtype-f16")]
        Float16 => __with_ty__! { Float16Type },
        Float32 => __with_ty__! { Float32Type },
        Float64 => __with_ty__! { Float64Type },
        dt => panic!("not implemented for dtype {:?}", dt),
    }
})}
576
577
/// Dispatch on an integer [`DataType`], binding `$T` to the polars marker type
/// (`Int8Type`..`UInt128Type`). Panics on non-integer dtypes.
#[macro_export]
macro_rules! with_match_physical_integer_polars_type {(
    $key_type:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    use $crate::datatypes::DataType::*;
    use $crate::datatypes::*;
    match $key_type {
        #[cfg(feature = "dtype-i8")]
        Int8 => __with_ty__! { Int8Type },
        #[cfg(feature = "dtype-i16")]
        Int16 => __with_ty__! { Int16Type },
        Int32 => __with_ty__! { Int32Type },
        Int64 => __with_ty__! { Int64Type },
        #[cfg(feature = "dtype-i128")]
        Int128 => __with_ty__! { Int128Type },
        #[cfg(feature = "dtype-u8")]
        UInt8 => __with_ty__! { UInt8Type },
        #[cfg(feature = "dtype-u16")]
        UInt16 => __with_ty__! { UInt16Type },
        UInt32 => __with_ty__! { UInt32Type },
        UInt64 => __with_ty__! { UInt64Type },
        #[cfg(feature = "dtype-u128")]
        UInt128 => __with_ty__! { UInt128Type },
        dt => panic!("not implemented for dtype {:?}", dt),
    }
})}
604
605
/// Dispatch on a `CategoricalPhysical`, binding `$T` to the matching categorical
/// marker type (`Categorical8Type`/`Categorical16Type`/`Categorical32Type`).
/// Exhaustive over all categorical physical widths — no panic arm needed.
#[macro_export]
macro_rules! with_match_categorical_physical_type {(
    $dtype:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    match $dtype {
        CategoricalPhysical::U8 => __with_ty__! { Categorical8Type },
        CategoricalPhysical::U16 => __with_ty__! { Categorical16Type },
        CategoricalPhysical::U32 => __with_ty__! { Categorical32Type },
    }
})}
616
617
/// Apply a macro on the downcasted ChunkedArray of the *physical* numeric
/// DataTypes (no logical types such as Date/Datetime/Categorical).
/// Panics on any dtype not covered below.
#[macro_export]
macro_rules! downcast_as_macro_arg_physical {
    ($self:expr, $macro:ident $(, $opt_args:expr)*) => {{
        match $self.dtype() {
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => $macro!($self.u8().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => $macro!($self.u16().unwrap() $(, $opt_args)*),
            DataType::UInt32 => $macro!($self.u32().unwrap() $(, $opt_args)*),
            DataType::UInt64 => $macro!($self.u64().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-u128")]
            DataType::UInt128 => $macro!($self.u128().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => $macro!($self.i8().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => $macro!($self.i16().unwrap() $(, $opt_args)*),
            DataType::Int32 => $macro!($self.i32().unwrap() $(, $opt_args)*),
            DataType::Int64 => $macro!($self.i64().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => $macro!($self.i128().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => $macro!($self.f16().unwrap() $(, $opt_args)*),
            DataType::Float32 => $macro!($self.f32().unwrap() $(, $opt_args)*),
            DataType::Float64 => $macro!($self.f64().unwrap() $(, $opt_args)*),
            dt => panic!("not implemented for {:?}", dt),
        }
    }};
}
647
648
/// Apply a macro on the mutably downcasted ChunkedArray of the *physical* numeric
/// DataTypes (no logical types such as Date/Datetime/Categorical).
/// `$macro` receives the marker type and the `&mut` array.
/// Panics on any dtype not covered below.
#[macro_export]
macro_rules! downcast_as_macro_arg_physical_mut {
    ($self:expr, $macro:ident $(, $opt_args:expr)*) => {{
        // clone so that we do not borrow
        match $self.dtype().clone() {
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => {
                let ca: &mut UInt8Chunked = $self.as_mut();
                $macro!(UInt8Type, ca $(, $opt_args)*)
            },
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => {
                let ca: &mut UInt16Chunked = $self.as_mut();
                $macro!(UInt16Type, ca $(, $opt_args)*)
            },
            DataType::UInt32 => {
                let ca: &mut UInt32Chunked = $self.as_mut();
                $macro!(UInt32Type, ca $(, $opt_args)*)
            },
            DataType::UInt64 => {
                let ca: &mut UInt64Chunked = $self.as_mut();
                $macro!(UInt64Type, ca $(, $opt_args)*)
            },
            #[cfg(feature = "dtype-u128")]
            DataType::UInt128 => {
                let ca: &mut UInt128Chunked = $self.as_mut();
                $macro!(UInt128Type, ca $(, $opt_args)*)
            },
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => {
                let ca: &mut Int8Chunked = $self.as_mut();
                $macro!(Int8Type, ca $(, $opt_args)*)
            },
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => {
                let ca: &mut Int16Chunked = $self.as_mut();
                $macro!(Int16Type, ca $(, $opt_args)*)
            },
            DataType::Int32 => {
                let ca: &mut Int32Chunked = $self.as_mut();
                $macro!(Int32Type, ca $(, $opt_args)*)
            },
            DataType::Int64 => {
                let ca: &mut Int64Chunked = $self.as_mut();
                $macro!(Int64Type, ca $(, $opt_args)*)
            },
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => {
                let ca: &mut Int128Chunked = $self.as_mut();
                $macro!(Int128Type, ca $(, $opt_args)*)
            },
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => {
                let ca: &mut Float16Chunked = $self.as_mut();
                $macro!(Float16Type, ca $(, $opt_args)*)
            },
            DataType::Float32 => {
                let ca: &mut Float32Chunked = $self.as_mut();
                $macro!(Float32Type, ca $(, $opt_args)*)
            },
            DataType::Float64 => {
                let ca: &mut Float64Chunked = $self.as_mut();
                $macro!(Float64Type, ca $(, $opt_args)*)
            },
            dt => panic!("not implemented for {:?}", dt),
        }
    }};
}
718
719
/// Dispatch `$method` on the concrete downcasted ChunkedArray behind `$self`,
/// covering all arrow-backed dtypes (numeric, bool, string, temporal, nested).
/// Panics on any dtype not covered below.
#[macro_export]
macro_rules! apply_method_all_arrow_series {
    ($self:expr, $method:ident, $($args:expr),*) => {
        match $self.dtype() {
            DataType::Boolean => $self.bool().unwrap().$method($($args),*),
            DataType::String => $self.str().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => $self.u8().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => $self.u16().unwrap().$method($($args),*),
            DataType::UInt32 => $self.u32().unwrap().$method($($args),*),
            DataType::UInt64 => $self.u64().unwrap().$method($($args),*),
            // BUG FIX: this arm previously invoked `$medthod` (typo, an undeclared
            // metavariable), which failed to expand whenever `dtype-u128` was enabled.
            #[cfg(feature = "dtype-u128")]
            DataType::UInt128 => $self.u128().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => $self.i8().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => $self.i16().unwrap().$method($($args),*),
            DataType::Int32 => $self.i32().unwrap().$method($($args),*),
            DataType::Int64 => $self.i64().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => $self.i128().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => $self.f16().unwrap().$method($($args),*),
            DataType::Float32 => $self.f32().unwrap().$method($($args),*),
            DataType::Float64 => $self.f64().unwrap().$method($($args),*),
            DataType::Time => $self.time().unwrap().$method($($args),*),
            DataType::Date => $self.date().unwrap().$method($($args),*),
            DataType::Datetime(_, _) => $self.datetime().unwrap().$method($($args),*),
            DataType::List(_) => $self.list().unwrap().$method($($args),*),
            DataType::Struct(_) => $self.struct_().unwrap().$method($($args),*),
            dt => panic!("dtype {:?} not supported", dt)
        }
    }
}
754
755
/// Dispatch `$method` on the downcasted integer ChunkedArray behind `$self`.
/// Panics on non-integer dtypes.
#[macro_export]
macro_rules! apply_method_physical_integer {
    ($self:expr, $method:ident, $($args:expr),*) => {
        match $self.dtype() {
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => $self.u8().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => $self.u16().unwrap().$method($($args),*),
            DataType::UInt32 => $self.u32().unwrap().$method($($args),*),
            DataType::UInt64 => $self.u64().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-u128")]
            DataType::UInt128 => $self.u128().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => $self.i8().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => $self.i16().unwrap().$method($($args),*),
            DataType::Int32 => $self.i32().unwrap().$method($($args),*),
            DataType::Int64 => $self.i64().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => $self.i128().unwrap().$method($($args),*),
            dt => panic!("not implemented for dtype {:?}", dt),
        }
    }
}
779
780
// doesn't include Bool and String
/// Dispatch `$method` on the downcasted numeric ChunkedArray behind `$self`:
/// floats are handled here, everything else falls through to
/// `apply_method_physical_integer!` (which panics on non-numeric dtypes).
#[macro_export]
macro_rules! apply_method_physical_numeric {
    ($self:expr, $method:ident, $($args:expr),*) => {
        match $self.dtype() {
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => $self.f16().unwrap().$method($($args),*),
            DataType::Float32 => $self.f32().unwrap().$method($($args),*),
            DataType::Float64 => $self.f64().unwrap().$method($($args),*),
            _ => apply_method_physical_integer!($self, $method, $($args),*),
        }
    }
}
793
794
/// Build a [`DataFrame`] from `name => values` pairs, e.g.
/// `df!["a" => [1, 2, 3], "b" => ["x", "y", "z"]]`.
/// Each pair becomes a `Column` via `Series::new`; the result is whatever
/// `DataFrame::new_infer_height` returns for the collected columns.
#[macro_export]
macro_rules! df {
    ($($col_name:expr => $slice:expr), + $(,)?) => {
        $crate::prelude::DataFrame::new_infer_height(vec![
            $($crate::prelude::Column::from(<$crate::prelude::Series as $crate::prelude::NamedFrom::<_, _>>::new($col_name.into(), $slice)),)+
        ])
    }
}
802
803
/// Resolve two time units to a common unit: the coarser of the two
/// (Milliseconds ⊃ Microseconds ⊃ Nanoseconds).
pub fn get_time_units(tu_l: &TimeUnit, tu_r: &TimeUnit) -> TimeUnit {
    use crate::datatypes::time_unit::TimeUnit::*;
    match (tu_l, tu_r) {
        (Nanoseconds, Microseconds) => Microseconds,
        (_, Milliseconds) => Milliseconds,
        // Every remaining combination is already dominated by the left unit.
        _ => *tu_l,
    }
}
811
812
#[cold]
813
#[inline(never)]
814
fn width_mismatch(df1: &DataFrame, df2: &DataFrame) -> PolarsError {
815
let mut df1_extra = Vec::new();
816
let mut df2_extra = Vec::new();
817
818
let s1 = df1.schema();
819
let s2 = df2.schema();
820
821
s1.field_compare(s2, &mut df1_extra, &mut df2_extra);
822
823
let df1_extra = df1_extra
824
.into_iter()
825
.map(|(_, (n, _))| n.as_str())
826
.collect::<Vec<_>>()
827
.join(", ");
828
let df2_extra = df2_extra
829
.into_iter()
830
.map(|(_, (n, _))| n.as_str())
831
.collect::<Vec<_>>()
832
.join(", ");
833
834
polars_err!(
835
SchemaMismatch: r#"unable to vstack, dataframes have different widths ({} != {}).
836
One dataframe has additional columns: [{df1_extra}].
837
Other dataframe has additional columns: [{df2_extra}]."#,
838
df1.width(),
839
df2.width(),
840
)
841
}
842
843
pub fn accumulate_dataframes_vertical_unchecked_optional<I>(dfs: I) -> Option<DataFrame>
844
where
845
I: IntoIterator<Item = DataFrame>,
846
{
847
let mut iter = dfs.into_iter();
848
let additional = iter.size_hint().0;
849
let mut acc_df = iter.next()?;
850
acc_df.reserve_chunks(additional);
851
852
for df in iter {
853
if acc_df.width() != df.width() {
854
panic!("{}", width_mismatch(&acc_df, &df));
855
}
856
857
acc_df.vstack_mut_owned_unchecked(df);
858
}
859
Some(acc_df)
860
}
861
862
/// This takes ownership of the DataFrame so that drop is called earlier.
863
/// Does not check if schema is correct
864
pub fn accumulate_dataframes_vertical_unchecked<I>(dfs: I) -> DataFrame
865
where
866
I: IntoIterator<Item = DataFrame>,
867
{
868
let mut iter = dfs.into_iter();
869
let additional = iter.size_hint().0;
870
let mut acc_df = iter.next().unwrap();
871
acc_df.reserve_chunks(additional);
872
873
for df in iter {
874
if acc_df.width() != df.width() {
875
panic!("{}", width_mismatch(&acc_df, &df));
876
}
877
878
acc_df.vstack_mut_owned_unchecked(df);
879
}
880
acc_df
881
}
882
883
/// This takes ownership of the DataFrame so that drop is called earlier.
884
/// # Panics
885
/// Panics if `dfs` is empty.
886
pub fn accumulate_dataframes_vertical<I>(dfs: I) -> PolarsResult<DataFrame>
887
where
888
I: IntoIterator<Item = DataFrame>,
889
{
890
let mut iter = dfs.into_iter();
891
let additional = iter.size_hint().0;
892
let mut acc_df = iter.next().unwrap();
893
acc_df.reserve_chunks(additional);
894
for df in iter {
895
if acc_df.width() != df.width() {
896
return Err(width_mismatch(&acc_df, &df));
897
}
898
899
acc_df.vstack_mut_owned(df)?;
900
}
901
902
Ok(acc_df)
903
}
904
905
/// Concat the DataFrames to a single DataFrame.
906
pub fn concat_df<'a, I>(dfs: I) -> PolarsResult<DataFrame>
907
where
908
I: IntoIterator<Item = &'a DataFrame>,
909
{
910
let mut iter = dfs.into_iter();
911
let additional = iter.size_hint().0;
912
let mut acc_df = iter.next().unwrap().clone();
913
acc_df.reserve_chunks(additional);
914
for df in iter {
915
acc_df.vstack_mut(df)?;
916
}
917
Ok(acc_df)
918
}
919
920
/// Concat the DataFrames to a single DataFrame.
921
pub fn concat_df_unchecked<'a, I>(dfs: I) -> DataFrame
922
where
923
I: IntoIterator<Item = &'a DataFrame>,
924
{
925
let mut iter = dfs.into_iter();
926
let additional = iter.size_hint().0;
927
let mut acc_df = iter.next().unwrap().clone();
928
acc_df.reserve_chunks(additional);
929
for df in iter {
930
acc_df.vstack_mut_unchecked(df);
931
}
932
acc_df
933
}
934
935
pub fn accumulate_dataframes_horizontal(dfs: Vec<DataFrame>) -> PolarsResult<DataFrame> {
936
let mut iter = dfs.into_iter();
937
let mut acc_df = iter.next().unwrap();
938
for df in iter {
939
acc_df.hstack_mut(df.columns())?;
940
}
941
Ok(acc_df)
942
}
943
944
/// Ensure the chunks in both ChunkedArrays have the same length.
///
/// Borrows are returned when the chunk layouts already match; otherwise one side is
/// re-chunked to mirror the other's layout.
///
/// # Panics
/// This will panic if `left.len() != right.len()` and array is chunked.
pub fn align_chunks_binary<'a, T, B>(
    left: &'a ChunkedArray<T>,
    right: &'a ChunkedArray<B>,
) -> (Cow<'a, ChunkedArray<T>>, Cow<'a, ChunkedArray<B>>)
where
    B: PolarsDataType,
    T: PolarsDataType,
{
    let assert = || {
        assert_eq!(
            left.len(),
            right.len(),
            "expected arrays of the same length"
        )
    };
    match (left.chunks.len(), right.chunks.len()) {
        // All chunks are equal length
        (1, 1) => (Cow::Borrowed(left), Cow::Borrowed(right)),
        // All chunks are equal length
        (a, b)
            if a == b
                && left
                    .chunk_lengths()
                    .zip(right.chunk_lengths())
                    .all(|(l, r)| l == r) =>
        {
            (Cow::Borrowed(left), Cow::Borrowed(right))
        },
        // Right is a single chunk: re-chunk right to mirror left's layout.
        (_, 1) => {
            assert();
            (
                Cow::Borrowed(left),
                Cow::Owned(right.match_chunks(left.chunk_lengths())),
            )
        },
        // Left is a single chunk: re-chunk left to mirror right's layout.
        (1, _) => {
            assert();
            (
                Cow::Owned(left.match_chunks(right.chunk_lengths())),
                Cow::Borrowed(right),
            )
        },
        // Both are multi-chunk with differing layouts: flatten left, then mirror right.
        (_, _) => {
            assert();
            // could optimize to choose to rechunk a primitive and not a string or list type
            let left = left.rechunk();
            (
                Cow::Owned(left.match_chunks(right.chunk_lengths())),
                Cow::Borrowed(right),
            )
        },
    }
}
1000
1001
/// Ensure the chunks in ChunkedArray and Series have the same length.
1002
/// # Panics
1003
/// This will panic if `left.len() != right.len()` and array is chunked.
1004
pub fn align_chunks_binary_ca_series<'a, T>(
1005
left: &'a ChunkedArray<T>,
1006
right: &'a Series,
1007
) -> (Cow<'a, ChunkedArray<T>>, Cow<'a, Series>)
1008
where
1009
T: PolarsDataType,
1010
{
1011
let assert = || {
1012
assert_eq!(
1013
left.len(),
1014
right.len(),
1015
"expected arrays of the same length"
1016
)
1017
};
1018
match (left.chunks.len(), right.chunks().len()) {
1019
// All chunks are equal length
1020
(1, 1) => (Cow::Borrowed(left), Cow::Borrowed(right)),
1021
// All chunks are equal length
1022
(a, b)
1023
if a == b
1024
&& left
1025
.chunk_lengths()
1026
.zip(right.chunk_lengths())
1027
.all(|(l, r)| l == r) =>
1028
{
1029
assert();
1030
(Cow::Borrowed(left), Cow::Borrowed(right))
1031
},
1032
(_, 1) => (left.rechunk(), Cow::Borrowed(right)),
1033
(1, _) => (Cow::Borrowed(left), Cow::Owned(right.rechunk())),
1034
(_, _) => {
1035
assert();
1036
(left.rechunk(), Cow::Owned(right.rechunk()))
1037
},
1038
}
1039
}
1040
1041
#[cfg(feature = "performant")]
/// Align two owned [`Series`] so both have a compatible chunk layout,
/// collapsing whichever side is multi-chunk when the layouts differ.
pub(crate) fn align_chunks_binary_owned_series(left: Series, right: Series) -> (Series, Series) {
    let n_left = left.chunks().len();
    let n_right = right.chunks().len();

    // Fast path: both single-chunk, or identical chunking on both sides.
    if n_left == 1 && n_right == 1 {
        return (left, right);
    }
    if n_left == n_right
        && left
            .chunk_lengths()
            .zip(right.chunk_lengths())
            .all(|(l, r)| l == r)
    {
        return (left, right);
    }

    // Layouts differ: collapse every multi-chunk side into a single chunk.
    match (n_left, n_right) {
        (_, 1) => (left.rechunk(), right),
        (1, _) => (left, right.rechunk()),
        (_, _) => (left.rechunk(), right.rechunk()),
    }
}
1060
1061
pub(crate) fn align_chunks_binary_owned<T, B>(
1062
left: ChunkedArray<T>,
1063
right: ChunkedArray<B>,
1064
) -> (ChunkedArray<T>, ChunkedArray<B>)
1065
where
1066
B: PolarsDataType,
1067
T: PolarsDataType,
1068
{
1069
match (left.chunks.len(), right.chunks.len()) {
1070
(1, 1) => (left, right),
1071
// All chunks are equal length
1072
(a, b)
1073
if a == b
1074
&& left
1075
.chunk_lengths()
1076
.zip(right.chunk_lengths())
1077
.all(|(l, r)| l == r) =>
1078
{
1079
(left, right)
1080
},
1081
(_, 1) => (left.rechunk().into_owned(), right),
1082
(1, _) => (left, right.rechunk().into_owned()),
1083
(_, _) => (left.rechunk().into_owned(), right.rechunk().into_owned()),
1084
}
1085
}
1086
1087
/// Align three ChunkedArrays so all of them share one chunk layout.
///
/// Single-chunk sides are split (`match_chunks`) to mirror a multi-chunk side;
/// when several sides are multi-chunk with differing layouts, some are first
/// collapsed with `rechunk` and then re-split. Borrows are returned wherever
/// the existing layout can be kept.
///
/// # Panics
/// This will panic if `a.len() != b.len() || b.len() != c.len()` and array is chunked.
#[allow(clippy::type_complexity)]
pub fn align_chunks_ternary<'a, A, B, C>(
    a: &'a ChunkedArray<A>,
    b: &'a ChunkedArray<B>,
    c: &'a ChunkedArray<C>,
) -> (
    Cow<'a, ChunkedArray<A>>,
    Cow<'a, ChunkedArray<B>>,
    Cow<'a, ChunkedArray<C>>,
)
where
    A: PolarsDataType,
    B: PolarsDataType,
    C: PolarsDataType,
{
    // All single-chunk: nothing to align, skip the length assertion too.
    if a.chunks.len() == 1 && b.chunks.len() == 1 && c.chunks.len() == 1 {
        return (Cow::Borrowed(a), Cow::Borrowed(b), Cow::Borrowed(c));
    }

    assert!(
        a.len() == b.len() && b.len() == c.len(),
        "expected arrays of the same length"
    );

    // NOTE: arm order matters; the exact-`1` patterns must be tried before the
    // general equal-layout guard and the catch-all.
    match (a.chunks.len(), b.chunks.len(), c.chunks.len()) {
        // Only `a` is multi-chunk: split `b` and `c` along `a`'s boundaries.
        (_, 1, 1) => (
            Cow::Borrowed(a),
            Cow::Owned(b.match_chunks(a.chunk_lengths())),
            Cow::Owned(c.match_chunks(a.chunk_lengths())),
        ),
        // Only `c` is multi-chunk: split `a` and `b` along `c`'s boundaries.
        (1, 1, _) => (
            Cow::Owned(a.match_chunks(c.chunk_lengths())),
            Cow::Owned(b.match_chunks(c.chunk_lengths())),
            Cow::Borrowed(c),
        ),
        // Only `b` is multi-chunk: split `a` and `c` along `b`'s boundaries.
        (1, _, 1) => (
            Cow::Owned(a.match_chunks(b.chunk_lengths())),
            Cow::Borrowed(b),
            Cow::Owned(c.match_chunks(b.chunk_lengths())),
        ),
        // `b` and `c` are multi-chunk: collapse `b`, align everything to `c`.
        (1, _, _) => {
            let b = b.rechunk();
            (
                Cow::Owned(a.match_chunks(c.chunk_lengths())),
                Cow::Owned(b.match_chunks(c.chunk_lengths())),
                Cow::Borrowed(c),
            )
        },
        // `a` and `c` are multi-chunk: collapse `a`, align everything to `c`.
        (_, 1, _) => {
            let a = a.rechunk();
            (
                Cow::Owned(a.match_chunks(c.chunk_lengths())),
                Cow::Owned(b.match_chunks(c.chunk_lengths())),
                Cow::Borrowed(c),
            )
        },
        // `a` and `b` are multi-chunk: collapse `b`, align everything to `a`.
        (_, _, 1) => {
            let b = b.rechunk();
            (
                Cow::Borrowed(a),
                Cow::Owned(b.match_chunks(a.chunk_lengths())),
                Cow::Owned(c.match_chunks(a.chunk_lengths())),
            )
        },
        // All multi-chunk with identical layouts: borrow everything.
        (len_a, len_b, len_c)
            if len_a == len_b
                && len_b == len_c
                && a.chunk_lengths()
                    .zip(b.chunk_lengths())
                    .zip(c.chunk_lengths())
                    .all(|((a, b), c)| a == b && b == c) =>
        {
            (Cow::Borrowed(a), Cow::Borrowed(b), Cow::Borrowed(c))
        },
        // All multi-chunk with differing layouts: collapse `a` and `b`, then
        // re-split both along `c`'s boundaries.
        _ => {
            // could optimize to choose to rechunk a primitive and not a string or list type
            let a = a.rechunk();
            let b = b.rechunk();
            (
                Cow::Owned(a.match_chunks(c.chunk_lengths())),
                Cow::Owned(b.match_chunks(c.chunk_lengths())),
                Cow::Borrowed(c),
            )
        },
    }
}
1175
1176
pub fn binary_concatenate_validities<'a, T, B>(
1177
left: &'a ChunkedArray<T>,
1178
right: &'a ChunkedArray<B>,
1179
) -> Option<Bitmap>
1180
where
1181
B: PolarsDataType,
1182
T: PolarsDataType,
1183
{
1184
let (left, right) = align_chunks_binary(left, right);
1185
let left_validity = concatenate_validities(left.chunks());
1186
let right_validity = concatenate_validities(right.chunks());
1187
combine_validities_and(left_validity.as_ref(), right_validity.as_ref())
1188
}
1189
1190
/// Convenience for `x.into_iter().map(Into::into).collect()` using an `into_vec()` function.
pub trait IntoVec<T> {
    /// Convert `self` into a `Vec<T>`, consuming it.
    fn into_vec(self) -> Vec<T>;
}
1194
1195
impl<I, S> IntoVec<PlSmallStr> for I
1196
where
1197
I: IntoIterator<Item = S>,
1198
S: Into<PlSmallStr>,
1199
{
1200
fn into_vec(self) -> Vec<PlSmallStr> {
1201
self.into_iter().map(|s| s.into()).collect()
1202
}
1203
}
1204
1205
/// This logic is same as the impl on ChunkedArray
1206
/// The difference is that there is less indirection because the caller should preallocate
1207
/// `chunk_lens` once. On the `ChunkedArray` we indirect through an `ArrayRef` which is an indirection
1208
/// and a vtable.
1209
#[inline]
1210
pub(crate) fn index_to_chunked_index<
1211
I: Iterator<Item = Idx>,
1212
Idx: PartialOrd + std::ops::AddAssign + std::ops::SubAssign + Zero + One,
1213
>(
1214
chunk_lens: I,
1215
index: Idx,
1216
) -> (Idx, Idx) {
1217
let mut index_remainder = index;
1218
let mut current_chunk_idx = Zero::zero();
1219
1220
for chunk_len in chunk_lens {
1221
if chunk_len > index_remainder {
1222
break;
1223
} else {
1224
index_remainder -= chunk_len;
1225
current_chunk_idx += One::one();
1226
}
1227
}
1228
(current_chunk_idx, index_remainder)
1229
}
1230
1231
/// Translate an index counted from the back into `(chunk index from the
/// front, offset within that chunk)`.
///
/// `chunk_lens_rev` must yield the chunk lengths in reverse order and
/// `total_chunks` must be the total number of chunks.
pub(crate) fn index_to_chunked_index_rev<
    I: Iterator<Item = Idx>,
    Idx: PartialOrd
        + std::ops::AddAssign
        + std::ops::SubAssign
        + std::ops::Sub<Output = Idx>
        + Zero
        + One
        + Copy
        + std::fmt::Debug,
>(
    chunk_lens_rev: I,
    index_from_back: Idx,
    total_chunks: Idx,
) -> (Idx, Idx) {
    // `index_from_back` must be strictly positive — presumably it counts from
    // the back starting at 1 (i.e. derived from a negative index of at
    // least -1); TODO confirm against callers.
    debug_assert!(index_from_back > Zero::zero(), "at least -1");
    let mut index_remainder = index_from_back;
    let mut current_chunk_idx = One::one();
    let mut current_chunk_len = Zero::zero();

    for chunk_len in chunk_lens_rev {
        current_chunk_len = chunk_len;
        // Stop once this chunk (seen from the back) contains the target.
        if chunk_len >= index_remainder {
            break;
        } else {
            index_remainder -= chunk_len;
            current_chunk_idx += One::one();
        }
    }
    (
        // Convert the from-the-back chunk count into a front-based index.
        total_chunks - current_chunk_idx,
        // Offset of the element within the located chunk.
        current_chunk_len - index_remainder,
    )
}
1265
1266
pub fn first_null<'a, I>(iter: I) -> Option<usize>
1267
where
1268
I: Iterator<Item = &'a dyn Array>,
1269
{
1270
let mut offset = 0;
1271
for arr in iter {
1272
if let Some(mask) = arr.validity() {
1273
let len_mask = mask.len();
1274
let n = mask.leading_ones();
1275
if n < len_mask {
1276
return Some(offset + n);
1277
}
1278
offset += len_mask
1279
} else {
1280
offset += arr.len();
1281
}
1282
}
1283
None
1284
}
1285
1286
pub fn first_non_null<'a, I>(iter: I) -> Option<usize>
1287
where
1288
I: Iterator<Item = &'a dyn Array>,
1289
{
1290
let mut offset = 0;
1291
for arr in iter {
1292
if let Some(mask) = arr.validity() {
1293
let len_mask = mask.len();
1294
let n = mask.leading_zeros();
1295
if n < len_mask {
1296
return Some(offset + n);
1297
}
1298
offset += len_mask
1299
} else if !arr.is_empty() {
1300
return Some(offset);
1301
}
1302
}
1303
None
1304
}
1305
1306
pub fn last_non_null<'a, I>(iter: I, len: usize) -> Option<usize>
1307
where
1308
I: DoubleEndedIterator<Item = &'a dyn Array>,
1309
{
1310
if len == 0 {
1311
return None;
1312
}
1313
let mut offset = 0;
1314
for arr in iter.rev() {
1315
if let Some(mask) = arr.validity() {
1316
let len_mask = mask.len();
1317
let n = mask.trailing_zeros();
1318
if n < len_mask {
1319
return Some(len - offset - n - 1);
1320
}
1321
offset += len_mask;
1322
} else if !arr.is_empty() {
1323
return Some(len - offset - 1);
1324
}
1325
}
1326
None
1327
}
1328
1329
/// ensure that nulls are propagated to both arrays
1330
pub fn coalesce_nulls<'a, T: PolarsDataType>(
1331
a: &'a ChunkedArray<T>,
1332
b: &'a ChunkedArray<T>,
1333
) -> (Cow<'a, ChunkedArray<T>>, Cow<'a, ChunkedArray<T>>) {
1334
if a.null_count() > 0 || b.null_count() > 0 {
1335
let (a, b) = align_chunks_binary(a, b);
1336
let mut b = b.into_owned();
1337
let a = a.coalesce_nulls(b.chunks());
1338
1339
for arr in a.chunks().iter() {
1340
for arr_b in unsafe { b.chunks_mut() } {
1341
*arr_b = arr_b.with_validity(arr.validity().cloned())
1342
}
1343
}
1344
b.compute_len();
1345
(Cow::Owned(a), Cow::Owned(b))
1346
} else {
1347
(Cow::Borrowed(a), Cow::Borrowed(b))
1348
}
1349
}
1350
1351
pub fn coalesce_nulls_columns(a: &Column, b: &Column) -> (Column, Column) {
1352
if a.null_count() > 0 || b.null_count() > 0 {
1353
let mut a = a.as_materialized_series().rechunk();
1354
let mut b = b.as_materialized_series().rechunk();
1355
for (arr_a, arr_b) in unsafe { a.chunks_mut().iter_mut().zip(b.chunks_mut()) } {
1356
let validity = match (arr_a.validity(), arr_b.validity()) {
1357
(None, Some(b)) => Some(b.clone()),
1358
(Some(a), Some(b)) => Some(a & b),
1359
(Some(a), None) => Some(a.clone()),
1360
(None, None) => None,
1361
};
1362
*arr_a = arr_a.with_validity(validity.clone());
1363
*arr_b = arr_b.with_validity(validity);
1364
}
1365
a.compute_len();
1366
b.compute_len();
1367
(a.into(), b.into())
1368
} else {
1369
(a.clone(), b.clone())
1370
}
1371
}
1372
1373
#[cfg(test)]
mod test {
    use super::*;

    // `split` distributes the rows over `n` pieces, giving the remainder to
    // the last piece: 10 rows over 3 pieces -> 3, 3, 4.
    #[test]
    fn test_split() {
        let ca: Int32Chunked = (0..10).collect_ca("a".into());

        let out = split(&ca, 3);
        assert_eq!(out[0].len(), 3);
        assert_eq!(out[1].len(), 3);
        assert_eq!(out[2].len(), 4);
    }

    // After `align_chunks_binary` both arrays must expose identical chunk
    // layouts, for both the (multi, 1) and (1, multi) starting shapes.
    #[test]
    fn test_align_chunks() -> PolarsResult<()> {
        // left: one chunk of 4; right: chunks [1, 3].
        let a = Int32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3, 4]);
        let mut b = Int32Chunked::new(PlSmallStr::EMPTY, &[1]);
        let b2 = Int32Chunked::new(PlSmallStr::EMPTY, &[2, 3, 4]);

        b.append(&b2)?;
        let (a, b) = align_chunks_binary(&a, &b);
        assert_eq!(
            a.chunk_lengths().collect::<Vec<_>>(),
            b.chunk_lengths().collect::<Vec<_>>()
        );

        // left: one chunk of 4; right: four chunks of 1.
        let a = Int32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3, 4]);
        let mut b = Int32Chunked::new(PlSmallStr::EMPTY, &[1]);
        let b1 = b.clone();
        b.append(&b1)?;
        b.append(&b1)?;
        b.append(&b1)?;
        let (a, b) = align_chunks_binary(&a, &b);
        assert_eq!(
            a.chunk_lengths().collect::<Vec<_>>(),
            b.chunk_lengths().collect::<Vec<_>>()
        );

        Ok(())
    }
}
1415
1416