Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-core/src/utils/mod.rs
8420 views
1
mod any_value;
2
use arrow::compute::concatenate::concatenate_validities;
3
use arrow::compute::utils::combine_validities_and;
4
pub mod flatten;
5
pub(crate) mod series;
6
mod supertype;
7
use std::borrow::Cow;
8
use std::ops::{Deref, DerefMut};
9
mod schema;
10
11
pub use any_value::*;
12
use arrow::bitmap::Bitmap;
13
pub use arrow::legacy::utils::*;
14
pub use arrow::trusted_len::TrustMyLength;
15
use flatten::*;
16
use num_traits::{One, Zero};
17
use rayon::prelude::*;
18
pub use schema::*;
19
pub use series::*;
20
pub use supertype::*;
21
pub use {arrow, rayon};
22
23
use crate::POOL;
24
use crate::prelude::*;
25
26
/// Transparent newtype wrapper, useful for working around coherence rules and
/// for attaching local trait impls to foreign types.
#[repr(transparent)]
pub struct Wrap<T>(pub T);

impl<T> Deref for Wrap<T> {
    type Target = T;

    fn deref(&self) -> &Self::Target {
        let Wrap(inner) = self;
        inner
    }
}
35
36
/// Number of partitions to split parallel work into: one per thread in the
/// global rayon thread pool (`POOL`).
#[inline(always)]
pub fn _set_partition_size() -> usize {
    POOL.current_num_threads()
}
40
41
/// Wrapper marking a collection as guaranteed free of null values.
///
/// Exists so specialized trait impls can coexist with the `Option`-based ones,
/// e.g. `impl<T> FromIterator<T::Native> for NoNull<ChunkedArray<T>>` next to
/// the already-present `impl<T> FromIterator<Option<T::Native>> for ChunkedArray<T>`.
pub struct NoNull<T> {
    inner: T,
}

impl<T> NoNull<T> {
    /// Wrap a value.
    pub fn new(inner: T) -> Self {
        Self { inner }
    }

    /// Consume the wrapper, returning the inner value.
    pub fn into_inner(self) -> T {
        self.inner
    }
}

impl<T> Deref for NoNull<T> {
    type Target = T;

    fn deref(&self) -> &Self::Target {
        let Self { inner } = self;
        inner
    }
}

impl<T> DerefMut for NoNull<T> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        let Self { inner } = self;
        inner
    }
}
74
75
/// Estimate a sensible preallocation size from an iterator's `size_hint`:
/// the upper bound when known, otherwise a non-zero lower bound, otherwise 1024.
pub(crate) fn get_iter_capacity<T, I: Iterator<Item = T>>(iter: &I) -> usize {
    let (lower, upper) = iter.size_hint();
    match upper {
        Some(cap) => cap,
        None if lower == 0 => 1024,
        None => lower,
    }
}
82
83
// prefer this one over split_ca, as this can push the null_count into the thread pool
84
// returns an `(offset, length)` tuple
85
#[doc(hidden)]
86
pub fn _split_offsets(len: usize, n: usize) -> Vec<(usize, usize)> {
87
if n == 1 {
88
vec![(0, len)]
89
} else {
90
let chunk_size = len / n;
91
92
(0..n)
93
.map(|partition| {
94
let offset = partition * chunk_size;
95
let len = if partition == (n - 1) {
96
len - offset
97
} else {
98
chunk_size
99
};
100
(partition * chunk_size, len)
101
})
102
.collect_trusted()
103
}
104
}
105
106
#[allow(clippy::len_without_is_empty)]
/// Abstraction over chunked, sliceable collections (`DataFrame`, `Series`,
/// `ChunkedArray`) so the `split*` helpers in this module can be written once.
pub trait Container: Clone {
    /// Slice out `len` elements starting at `offset` (offsets are resolved as in
    /// `slice_offsets`; negative values presumably count from the end — confirm per impl).
    fn slice(&self, offset: i64, len: usize) -> Self;

    /// Split into the parts before and after `offset`.
    fn split_at(&self, offset: i64) -> (Self, Self);

    /// Number of rows/elements.
    fn len(&self) -> usize;

    /// Iterate the underlying chunks, each wrapped as its own single-chunk container.
    fn iter_chunks(&self) -> impl Iterator<Item = Self>;

    /// Whether the chunks are misaligned and must be rechunked before per-chunk work.
    fn should_rechunk(&self) -> bool;

    /// Number of underlying chunks.
    fn n_chunks(&self) -> usize;

    /// Length of every underlying chunk, in order.
    fn chunk_lengths(&self) -> impl Iterator<Item = usize>;
}
122
123
impl Container for DataFrame {
    fn slice(&self, offset: i64, len: usize) -> Self {
        DataFrame::slice(self, offset, len)
    }

    fn split_at(&self, offset: i64) -> (Self, Self) {
        DataFrame::split_at(self, offset)
    }

    // A DataFrame's "length" is its row count.
    fn len(&self) -> usize {
        self.height()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = Self> {
        flatten_df_iter(self)
    }

    fn should_rechunk(&self) -> bool {
        self.should_rechunk()
    }

    // Chunk count is taken from the first column.
    fn n_chunks(&self) -> usize {
        DataFrame::first_col_n_chunks(self)
    }

    fn chunk_lengths(&self) -> impl Iterator<Item = usize> {
        // @scalar-correctness?
        // NOTE(review): chunk layout is read from the first column only — this assumes
        // all columns are chunk-aligned; confirm for frames containing scalar columns.
        self.columns()[0].as_materialized_series().chunk_lengths()
    }
}
153
154
impl<T: PolarsDataType> Container for ChunkedArray<T> {
    fn slice(&self, offset: i64, len: usize) -> Self {
        ChunkedArray::slice(self, offset, len)
    }

    fn split_at(&self, offset: i64) -> (Self, Self) {
        ChunkedArray::split_at(self, offset)
    }

    fn len(&self) -> usize {
        ChunkedArray::len(self)
    }

    // Wrap each physical chunk in a fresh single-chunk array carrying the same name.
    fn iter_chunks(&self) -> impl Iterator<Item = Self> {
        self.downcast_iter()
            .map(|arr| Self::with_chunk(self.name().clone(), arr.clone()))
    }

    fn should_rechunk(&self) -> bool {
        // A lone array has no sibling columns to be misaligned with.
        false
    }

    fn n_chunks(&self) -> usize {
        self.chunks().len()
    }

    fn chunk_lengths(&self) -> impl Iterator<Item = usize> {
        ChunkedArray::chunk_lengths(self)
    }
}
184
185
impl Container for Series {
    // All methods delegate to the inner series implementation (`self.0`).
    fn slice(&self, offset: i64, len: usize) -> Self {
        self.0.slice(offset, len)
    }

    fn split_at(&self, offset: i64) -> (Self, Self) {
        self.0.split_at(offset)
    }

    fn len(&self) -> usize {
        self.0.len()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = Self> {
        (0..self.0.n_chunks()).map(|i| self.select_chunk(i))
    }

    fn should_rechunk(&self) -> bool {
        // A lone series has no sibling columns to be misaligned with.
        false
    }

    fn n_chunks(&self) -> usize {
        self.chunks().len()
    }

    fn chunk_lengths(&self) -> impl Iterator<Item = usize> {
        self.0.chunk_lengths()
    }
}
214
215
/// Split `container` into exactly `target` pieces of roughly `chunk_size` elements each;
/// the final piece absorbs the remainder.
///
/// Assumes `target >= 1` and `chunk_size * (target - 1) <= container.len()` so every
/// `split_at` is in bounds — TODO confirm callers uphold this.
fn split_impl<C: Container>(container: &C, target: usize, chunk_size: usize) -> Vec<C> {
    if target == 1 {
        return vec![container.clone()];
    }
    let mut out = Vec::with_capacity(target);
    let chunk_size = chunk_size as i64;

    // First split
    let (chunk, mut remainder) = container.split_at(chunk_size);
    out.push(chunk);

    // Take the rest of the splits of exactly chunk size, but skip the last remainder as we won't split that.
    for _ in 1..target - 1 {
        let (a, b) = remainder.split_at(chunk_size);
        out.push(a);
        remainder = b
    }
    // This can be slightly larger than `chunk_size`, but is smaller than `2 * chunk_size`.
    out.push(remainder);
    out
}
236
237
/// Splits, but doesn't flatten chunks. E.g. a container can still have multiple chunks.
238
pub fn split<C: Container>(container: &C, target: usize) -> Vec<C> {
239
let total_len = container.len();
240
if total_len == 0 {
241
return vec![container.clone()];
242
}
243
244
let chunk_size = std::cmp::max(total_len / target, 1);
245
246
if container.n_chunks() == target
247
&& container
248
.chunk_lengths()
249
.all(|len| len.abs_diff(chunk_size) < 100)
250
// We cannot get chunks if they are misaligned
251
&& !container.should_rechunk()
252
{
253
return container.iter_chunks().collect();
254
}
255
split_impl(container, target, chunk_size)
256
}
257
258
/// Split a [`Container`] in `target` elements, flattening multi-chunk containers so
/// output pieces follow chunk boundaries. The target doesn't have to be respected;
/// deviation of the target might be done to create more equal size chunks.
pub fn split_and_flatten<C: Container>(container: &C, target: usize) -> Vec<C> {
    let total_len = container.len();
    if total_len == 0 {
        return vec![container.clone()];
    }

    let chunk_size = std::cmp::max(total_len / target, 1);

    // Fast path: existing chunk layout already roughly matches the requested split.
    if container.n_chunks() == target
        && container
            .chunk_lengths()
            .all(|len| len.abs_diff(chunk_size) < 100)
        // We cannot get chunks if they are misaligned
        && !container.should_rechunk()
    {
        return container.iter_chunks().collect();
    }

    if container.n_chunks() == 1 {
        split_impl(container, target, chunk_size)
    } else {
        let mut out = Vec::with_capacity(target);
        let chunks = container.iter_chunks();

        // Walk each physical chunk and carve `chunk_size`-sized pieces off its front.
        'new_chunk: for mut chunk in chunks {
            loop {
                let h = chunk.len();
                if h < chunk_size {
                    // TODO if the chunk is much smaller than chunk size, we should try to merge it with the next one.
                    out.push(chunk);
                    continue 'new_chunk;
                }

                // If a split leads to the next chunk being smaller than 30% take the whole chunk
                if ((h - chunk_size) as f64 / chunk_size as f64) < 0.3 {
                    out.push(chunk);
                    continue 'new_chunk;
                }

                let (a, b) = chunk.split_at(chunk_size as i64);
                out.push(a);
                chunk = b;
            }
        }
        out
    }
}
307
308
/// Split a [`DataFrame`] in `target` elements. The target doesn't have to be respected if not
309
/// strict. Deviation of the target might be done to create more equal size chunks.
310
///
311
/// # Panics
312
/// if chunks are not aligned
313
pub fn split_df_as_ref(df: &DataFrame, target: usize, strict: bool) -> Vec<DataFrame> {
314
if strict {
315
split(df, target)
316
} else {
317
split_and_flatten(df, target)
318
}
319
}
320
321
#[doc(hidden)]
322
/// Split a [`DataFrame`] into `n` parts. We take a `&mut` to be able to repartition/align chunks.
323
/// `strict` in that it respects `n` even if the chunks are suboptimal.
324
pub fn split_df(df: &mut DataFrame, target: usize, strict: bool) -> Vec<DataFrame> {
325
if target == 0 || df.height() == 0 {
326
return vec![df.clone()];
327
}
328
// make sure that chunks are aligned.
329
df.align_chunks_par();
330
split_df_as_ref(df, target, strict)
331
}
332
333
pub fn slice_slice<T>(vals: &[T], offset: i64, len: usize) -> &[T] {
334
let (raw_offset, slice_len) = slice_offsets(offset, len, vals.len());
335
&vals[raw_offset..raw_offset + slice_len]
336
}
337
338
/// Resolve a (possibly negative) `offset` and requested `length` against an array of
/// `array_len` elements, returning the clamped `(start_index, slice_length)`.
///
/// Negative offsets count from the end of the array; both endpoints are clamped into
/// `[0, array_len]`, so the result is always a valid in-bounds range.
///
/// # Panics
/// Panics if `array_len` exceeds `i64::MAX`.
#[inline]
pub fn slice_offsets(offset: i64, length: usize, array_len: usize) -> (usize, usize) {
    let len_i64: i64 = array_len
        .try_into()
        .expect("array length larger than i64::MAX");

    // A negative offset is relative to the end of the array.
    let start = if offset < 0 {
        offset.saturating_add_unsigned(array_len as u64)
    } else {
        offset
    };
    let stop = start.saturating_add_unsigned(length as u64);

    let start = start.clamp(0, len_i64);
    let stop = stop.clamp(0, len_i64);

    (start as usize, (stop - start) as usize)
}
357
358
/// Dispatch on a [`DataType`], invoking `$macro` with the matching Rust physical
/// primitive (`u8`, `i64`, `f64`, …). String and Boolean are routed to their own
/// dedicated macros. Panics on any dtype not covered below.
#[macro_export]
macro_rules! match_dtype_to_physical_apply_macro {
    ($obj:expr, $macro:ident, $macro_string:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{
        match $obj {
            DataType::String => $macro_string!($($opt_args)*),
            DataType::Boolean => $macro_bool!($($opt_args)*),
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => $macro!(u8 $(, $opt_args)*),
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => $macro!(u16 $(, $opt_args)*),
            DataType::UInt32 => $macro!(u32 $(, $opt_args)*),
            DataType::UInt64 => $macro!(u64 $(, $opt_args)*),
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => $macro!(i8 $(, $opt_args)*),
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => $macro!(i16 $(, $opt_args)*),
            DataType::Int32 => $macro!(i32 $(, $opt_args)*),
            DataType::Int64 => $macro!(i64 $(, $opt_args)*),
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => $macro!(i128 $(, $opt_args)*),
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => $macro!(pf16 $(, $opt_args)*),
            DataType::Float32 => $macro!(f32 $(, $opt_args)*),
            DataType::Float64 => $macro!(f64 $(, $opt_args)*),
            dt => panic!("not implemented for dtype {:?}", dt),
        }
    }};
}
387
388
/// Dispatch on a [`DataType`], invoking `$macro` with the matching polars marker type
/// (`UInt8Type`, `Int64Type`, …). String, Binary and Boolean are routed to their own
/// dedicated macros. Panics on any dtype not covered below.
#[macro_export]
macro_rules! match_dtype_to_logical_apply_macro {
    ($obj:expr, $macro:ident, $macro_string:ident, $macro_binary:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{
        match $obj {
            DataType::String => $macro_string!($($opt_args)*),
            DataType::Binary => $macro_binary!($($opt_args)*),
            DataType::Boolean => $macro_bool!($($opt_args)*),
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => $macro!(UInt8Type $(, $opt_args)*),
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => $macro!(UInt16Type $(, $opt_args)*),
            DataType::UInt32 => $macro!(UInt32Type $(, $opt_args)*),
            DataType::UInt64 => $macro!(UInt64Type $(, $opt_args)*),
            #[cfg(feature = "dtype-u128")]
            DataType::UInt128 => $macro!(UInt128Type $(, $opt_args)*),
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => $macro!(Int8Type $(, $opt_args)*),
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => $macro!(Int16Type $(, $opt_args)*),
            DataType::Int32 => $macro!(Int32Type $(, $opt_args)*),
            DataType::Int64 => $macro!(Int64Type $(, $opt_args)*),
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => $macro!(Int128Type $(, $opt_args)*),
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => $macro!(Float16Type $(, $opt_args)*),
            DataType::Float32 => $macro!(Float32Type $(, $opt_args)*),
            DataType::Float64 => $macro!(Float64Type $(, $opt_args)*),
            dt => panic!("not implemented for dtype {:?}", dt),
        }
    }};
}
420
421
/// Dispatch on `$self.dtype()`, invoking `$macro` with the downcasted ChunkedArray
/// (`$self.u8().unwrap()`, …). String and Boolean get dedicated macros.
/// Panics on any dtype not covered below.
#[macro_export]
macro_rules! match_arrow_dtype_apply_macro_ca {
    ($self:expr, $macro:ident, $macro_string:ident, $macro_bool:ident $(, $opt_args:expr)*) => {{
        match $self.dtype() {
            DataType::String => $macro_string!($self.str().unwrap() $(, $opt_args)*),
            DataType::Boolean => $macro_bool!($self.bool().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => $macro!($self.u8().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => $macro!($self.u16().unwrap() $(, $opt_args)*),
            DataType::UInt32 => $macro!($self.u32().unwrap() $(, $opt_args)*),
            DataType::UInt64 => $macro!($self.u64().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-u128")]
            DataType::UInt128 => $macro!($self.u128().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => $macro!($self.i8().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => $macro!($self.i16().unwrap() $(, $opt_args)*),
            DataType::Int32 => $macro!($self.i32().unwrap() $(, $opt_args)*),
            DataType::Int64 => $macro!($self.i64().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => $macro!($self.i128().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => $macro!($self.f16().unwrap() $(, $opt_args)*),
            DataType::Float32 => $macro!($self.f32().unwrap() $(, $opt_args)*),
            DataType::Float64 => $macro!($self.f64().unwrap() $(, $opt_args)*),
            dt => panic!("not implemented for dtype {:?}", dt),
        }
    }};
}
452
453
/// Dispatch on a numeric [`DataType`], binding `$T` to the matching Rust primitive
/// (`i8`..`i128`, `u8`..`u128`, `pf16`/`f32`/`f64`) and expanding `$body` for the
/// active arm. Panics on non-numeric dtypes.
#[macro_export]
macro_rules! with_match_physical_numeric_type {(
    $dtype:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    #[cfg(feature = "dtype-f16")]
    use polars_utils::float16::pf16;
    use $crate::datatypes::DataType::*;
    match $dtype {
        #[cfg(feature = "dtype-i8")]
        Int8 => __with_ty__! { i8 },
        #[cfg(feature = "dtype-i16")]
        Int16 => __with_ty__! { i16 },
        Int32 => __with_ty__! { i32 },
        Int64 => __with_ty__! { i64 },
        #[cfg(feature = "dtype-i128")]
        Int128 => __with_ty__! { i128 },
        #[cfg(feature = "dtype-u8")]
        UInt8 => __with_ty__! { u8 },
        #[cfg(feature = "dtype-u16")]
        UInt16 => __with_ty__! { u16 },
        UInt32 => __with_ty__! { u32 },
        UInt64 => __with_ty__! { u64 },
        #[cfg(feature = "dtype-u128")]
        UInt128 => __with_ty__! { u128 },
        #[cfg(feature = "dtype-f16")]
        Float16 => __with_ty__! { pf16 },
        Float32 => __with_ty__! { f32 },
        Float64 => __with_ty__! { f64 },
        dt => panic!("not implemented for dtype {:?}", dt),
    }
})}
485
486
/// Dispatch on an integer [`DataType`], binding `$T` to the matching Rust integer
/// primitive (`i8`..`i128`, `u8`..`u128`) and expanding `$body` for the active arm.
/// Panics on non-integer dtypes.
//
// NOTE(review): the `pf16` float import that was copy-pasted here from the numeric
// variant has been removed — this integer-only dispatch has no float arms, so the
// import was unused in every expansion.
#[macro_export]
macro_rules! with_match_physical_integer_type {(
    $dtype:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    use $crate::datatypes::DataType::*;
    match $dtype {
        #[cfg(feature = "dtype-i8")]
        Int8 => __with_ty__! { i8 },
        #[cfg(feature = "dtype-i16")]
        Int16 => __with_ty__! { i16 },
        Int32 => __with_ty__! { i32 },
        Int64 => __with_ty__! { i64 },
        #[cfg(feature = "dtype-i128")]
        Int128 => __with_ty__! { i128 },
        #[cfg(feature = "dtype-u8")]
        UInt8 => __with_ty__! { u8 },
        #[cfg(feature = "dtype-u16")]
        UInt16 => __with_ty__! { u16 },
        UInt32 => __with_ty__! { u32 },
        UInt64 => __with_ty__! { u64 },
        #[cfg(feature = "dtype-u128")]
        UInt128 => __with_ty__! { u128 },
        dt => panic!("not implemented for dtype {:?}", dt),
    }
})}
514
515
/// Dispatch on a float [`DataType`], binding `$T` to the matching Rust float
/// primitive (`pf16`/`f32`/`f64`) and expanding `$body` for the active arm.
/// Panics on non-float dtypes.
#[macro_export]
macro_rules! with_match_physical_float_type {(
    $dtype:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    // Gate the `pf16` import on the feature, consistent with the sibling
    // `with_match_physical_numeric_type!`: without `dtype-f16` there is no
    // `Float16` arm and the import would be unused in every expansion.
    #[cfg(feature = "dtype-f16")]
    use polars_utils::float16::pf16;
    use $crate::datatypes::DataType::*;
    match $dtype {
        #[cfg(feature = "dtype-f16")]
        Float16 => __with_ty__! { pf16 },
        Float32 => __with_ty__! { f32 },
        Float64 => __with_ty__! { f64 },
        dt => panic!("not implemented for dtype {:?}", dt),
    }
})}
530
531
/// Dispatch on a float [`DataType`], binding `$T` to the polars marker type
/// (`Float16Type`/`Float32Type`/`Float64Type`). Panics on non-float dtypes.
#[macro_export]
macro_rules! with_match_physical_float_polars_type {(
    $key_type:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    use $crate::datatypes::DataType::*;
    match $key_type {
        #[cfg(feature = "dtype-f16")]
        Float16 => __with_ty__! { Float16Type },
        Float32 => __with_ty__! { Float32Type },
        Float64 => __with_ty__! { Float64Type },
        dt => panic!("not implemented for dtype {:?}", dt),
    }
})}
545
546
/// Dispatch on a numeric [`DataType`], binding `$T` to the polars marker type
/// (`Int8Type`..`UInt128Type`, `Float16Type`..`Float64Type`). Panics on
/// non-numeric dtypes.
#[macro_export]
macro_rules! with_match_physical_numeric_polars_type {(
    $key_type:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    use $crate::datatypes::DataType::*;
    match $key_type {
        #[cfg(feature = "dtype-i8")]
        Int8 => __with_ty__! { Int8Type },
        #[cfg(feature = "dtype-i16")]
        Int16 => __with_ty__! { Int16Type },
        Int32 => __with_ty__! { Int32Type },
        Int64 => __with_ty__! { Int64Type },
        #[cfg(feature = "dtype-i128")]
        Int128 => __with_ty__! { Int128Type },
        #[cfg(feature = "dtype-u8")]
        UInt8 => __with_ty__! { UInt8Type },
        #[cfg(feature = "dtype-u16")]
        UInt16 => __with_ty__! { UInt16Type },
        UInt32 => __with_ty__! { UInt32Type },
        UInt64 => __with_ty__! { UInt64Type },
        #[cfg(feature = "dtype-u128")]
        UInt128 => __with_ty__! { UInt128Type },
        #[cfg(feature = "dtype-f16")]
        Float16 => __with_ty__! { Float16Type },
        Float32 => __with_ty__! { Float32Type },
        Float64 => __with_ty__! { Float64Type },
        dt => panic!("not implemented for dtype {:?}", dt),
    }
})}
576
577
/// Dispatch on an integer [`DataType`], binding `$T` to the polars marker type
/// (`Int8Type`..`UInt128Type`). Panics on non-integer dtypes.
#[macro_export]
macro_rules! with_match_physical_integer_polars_type {(
    $key_type:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    use $crate::datatypes::DataType::*;
    use $crate::datatypes::*;
    match $key_type {
        #[cfg(feature = "dtype-i8")]
        Int8 => __with_ty__! { Int8Type },
        #[cfg(feature = "dtype-i16")]
        Int16 => __with_ty__! { Int16Type },
        Int32 => __with_ty__! { Int32Type },
        Int64 => __with_ty__! { Int64Type },
        #[cfg(feature = "dtype-i128")]
        Int128 => __with_ty__! { Int128Type },
        #[cfg(feature = "dtype-u8")]
        UInt8 => __with_ty__! { UInt8Type },
        #[cfg(feature = "dtype-u16")]
        UInt16 => __with_ty__! { UInt16Type },
        UInt32 => __with_ty__! { UInt32Type },
        UInt64 => __with_ty__! { UInt64Type },
        #[cfg(feature = "dtype-u128")]
        UInt128 => __with_ty__! { UInt128Type },
        dt => panic!("not implemented for dtype {:?}", dt),
    }
})}
604
605
/// Dispatch on a `CategoricalPhysical`, binding `$T` to the matching categorical
/// marker type (`Categorical8Type`/`Categorical16Type`/`Categorical32Type`).
/// Exhaustive over all categorical physical widths — no panic arm needed.
#[macro_export]
macro_rules! with_match_categorical_physical_type {(
    $dtype:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
    macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
    match $dtype {
        CategoricalPhysical::U8 => __with_ty__! { Categorical8Type },
        CategoricalPhysical::U16 => __with_ty__! { Categorical16Type },
        CategoricalPhysical::U32 => __with_ty__! { Categorical32Type },
    }
})}
616
617
/// Apply a macro on the downcasted ChunkedArray of the *physical* numeric
/// DataTypes (no logical types such as Date/Datetime/Categorical).
/// Panics on any dtype not covered below.
#[macro_export]
macro_rules! downcast_as_macro_arg_physical {
    ($self:expr, $macro:ident $(, $opt_args:expr)*) => {{
        match $self.dtype() {
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => $macro!($self.u8().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => $macro!($self.u16().unwrap() $(, $opt_args)*),
            DataType::UInt32 => $macro!($self.u32().unwrap() $(, $opt_args)*),
            DataType::UInt64 => $macro!($self.u64().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-u128")]
            DataType::UInt128 => $macro!($self.u128().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => $macro!($self.i8().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => $macro!($self.i16().unwrap() $(, $opt_args)*),
            DataType::Int32 => $macro!($self.i32().unwrap() $(, $opt_args)*),
            DataType::Int64 => $macro!($self.i64().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => $macro!($self.i128().unwrap() $(, $opt_args)*),
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => $macro!($self.f16().unwrap() $(, $opt_args)*),
            DataType::Float32 => $macro!($self.f32().unwrap() $(, $opt_args)*),
            DataType::Float64 => $macro!($self.f64().unwrap() $(, $opt_args)*),
            dt => panic!("not implemented for {:?}", dt),
        }
    }};
}
647
648
/// Apply a macro on the mutably downcasted ChunkedArray of the *physical* numeric
/// DataTypes (no logical types such as Date/Datetime/Categorical).
/// `$macro` receives the marker type and the `&mut` array.
/// Panics on any dtype not covered below.
#[macro_export]
macro_rules! downcast_as_macro_arg_physical_mut {
    ($self:expr, $macro:ident $(, $opt_args:expr)*) => {{
        // clone so that we do not borrow
        match $self.dtype().clone() {
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => {
                let ca: &mut UInt8Chunked = $self.as_mut();
                $macro!(UInt8Type, ca $(, $opt_args)*)
            },
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => {
                let ca: &mut UInt16Chunked = $self.as_mut();
                $macro!(UInt16Type, ca $(, $opt_args)*)
            },
            DataType::UInt32 => {
                let ca: &mut UInt32Chunked = $self.as_mut();
                $macro!(UInt32Type, ca $(, $opt_args)*)
            },
            DataType::UInt64 => {
                let ca: &mut UInt64Chunked = $self.as_mut();
                $macro!(UInt64Type, ca $(, $opt_args)*)
            },
            #[cfg(feature = "dtype-u128")]
            DataType::UInt128 => {
                let ca: &mut UInt128Chunked = $self.as_mut();
                $macro!(UInt128Type, ca $(, $opt_args)*)
            },
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => {
                let ca: &mut Int8Chunked = $self.as_mut();
                $macro!(Int8Type, ca $(, $opt_args)*)
            },
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => {
                let ca: &mut Int16Chunked = $self.as_mut();
                $macro!(Int16Type, ca $(, $opt_args)*)
            },
            DataType::Int32 => {
                let ca: &mut Int32Chunked = $self.as_mut();
                $macro!(Int32Type, ca $(, $opt_args)*)
            },
            DataType::Int64 => {
                let ca: &mut Int64Chunked = $self.as_mut();
                $macro!(Int64Type, ca $(, $opt_args)*)
            },
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => {
                let ca: &mut Int128Chunked = $self.as_mut();
                $macro!(Int128Type, ca $(, $opt_args)*)
            },
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => {
                let ca: &mut Float16Chunked = $self.as_mut();
                $macro!(Float16Type, ca $(, $opt_args)*)
            },
            DataType::Float32 => {
                let ca: &mut Float32Chunked = $self.as_mut();
                $macro!(Float32Type, ca $(, $opt_args)*)
            },
            DataType::Float64 => {
                let ca: &mut Float64Chunked = $self.as_mut();
                $macro!(Float64Type, ca $(, $opt_args)*)
            },
            dt => panic!("not implemented for {:?}", dt),
        }
    }};
}
718
719
/// Dispatch `$method` on the concrete downcasted ChunkedArray behind `$self`,
/// covering all arrow-backed dtypes (numeric, bool, string, temporal, nested).
/// Panics on any dtype not covered below.
#[macro_export]
macro_rules! apply_method_all_arrow_series {
    ($self:expr, $method:ident, $($args:expr),*) => {
        match $self.dtype() {
            DataType::Boolean => $self.bool().unwrap().$method($($args),*),
            DataType::String => $self.str().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => $self.u8().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => $self.u16().unwrap().$method($($args),*),
            DataType::UInt32 => $self.u32().unwrap().$method($($args),*),
            DataType::UInt64 => $self.u64().unwrap().$method($($args),*),
            // BUG FIX: this arm previously invoked `$medthod` (typo, an undeclared
            // metavariable), which failed to expand whenever `dtype-u128` was enabled.
            #[cfg(feature = "dtype-u128")]
            DataType::UInt128 => $self.u128().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => $self.i8().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => $self.i16().unwrap().$method($($args),*),
            DataType::Int32 => $self.i32().unwrap().$method($($args),*),
            DataType::Int64 => $self.i64().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => $self.i128().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => $self.f16().unwrap().$method($($args),*),
            DataType::Float32 => $self.f32().unwrap().$method($($args),*),
            DataType::Float64 => $self.f64().unwrap().$method($($args),*),
            DataType::Time => $self.time().unwrap().$method($($args),*),
            DataType::Date => $self.date().unwrap().$method($($args),*),
            DataType::Datetime(_, _) => $self.datetime().unwrap().$method($($args),*),
            DataType::List(_) => $self.list().unwrap().$method($($args),*),
            DataType::Struct(_) => $self.struct_().unwrap().$method($($args),*),
            dt => panic!("dtype {:?} not supported", dt)
        }
    }
}
754
755
/// Dispatch `$method` on the downcasted integer ChunkedArray behind `$self`.
/// Panics on non-integer dtypes.
#[macro_export]
macro_rules! apply_method_physical_integer {
    ($self:expr, $method:ident, $($args:expr),*) => {
        match $self.dtype() {
            #[cfg(feature = "dtype-u8")]
            DataType::UInt8 => $self.u8().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-u16")]
            DataType::UInt16 => $self.u16().unwrap().$method($($args),*),
            DataType::UInt32 => $self.u32().unwrap().$method($($args),*),
            DataType::UInt64 => $self.u64().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-u128")]
            DataType::UInt128 => $self.u128().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-i8")]
            DataType::Int8 => $self.i8().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-i16")]
            DataType::Int16 => $self.i16().unwrap().$method($($args),*),
            DataType::Int32 => $self.i32().unwrap().$method($($args),*),
            DataType::Int64 => $self.i64().unwrap().$method($($args),*),
            #[cfg(feature = "dtype-i128")]
            DataType::Int128 => $self.i128().unwrap().$method($($args),*),
            dt => panic!("not implemented for dtype {:?}", dt),
        }
    }
}
779
780
// doesn't include Bool and String
/// Dispatch `$method` on the downcasted numeric ChunkedArray behind `$self`:
/// floats are handled here, everything else falls through to
/// `apply_method_physical_integer!` (which panics on non-numeric dtypes).
#[macro_export]
macro_rules! apply_method_physical_numeric {
    ($self:expr, $method:ident, $($args:expr),*) => {
        match $self.dtype() {
            #[cfg(feature = "dtype-f16")]
            DataType::Float16 => $self.f16().unwrap().$method($($args),*),
            DataType::Float32 => $self.f32().unwrap().$method($($args),*),
            DataType::Float64 => $self.f64().unwrap().$method($($args),*),
            _ => apply_method_physical_integer!($self, $method, $($args),*),
        }
    }
}
793
794
/// Build a [`DataFrame`] from `name => values` pairs, e.g.
/// `df!["a" => [1, 2, 3], "b" => ["x", "y", "z"]]`.
/// Each pair becomes a `Column` via `Series::new`; the result is whatever
/// `DataFrame::new_infer_height` returns for the collected columns.
#[macro_export]
macro_rules! df {
    ($($col_name:expr => $slice:expr), + $(,)?) => {
        $crate::prelude::DataFrame::new_infer_height(vec![
            $($crate::prelude::Column::from(<$crate::prelude::Series as $crate::prelude::NamedFrom::<_, _>>::new($col_name.into(), $slice)),)+
        ])
    }
}
802
803
/// Resolve two time units to a common unit: the coarser of the two
/// (Milliseconds ⊃ Microseconds ⊃ Nanoseconds).
pub fn get_time_units(tu_l: &TimeUnit, tu_r: &TimeUnit) -> TimeUnit {
    use crate::datatypes::time_unit::TimeUnit::*;
    match (tu_l, tu_r) {
        (Nanoseconds, Microseconds) => Microseconds,
        (_, Milliseconds) => Milliseconds,
        // Every remaining combination is already dominated by the left unit.
        _ => *tu_l,
    }
}
811
812
#[cold]
813
#[inline(never)]
814
fn width_mismatch(df1: &DataFrame, df2: &DataFrame) -> PolarsError {
815
let mut df1_extra = Vec::new();
816
let mut df2_extra = Vec::new();
817
818
let s1 = df1.schema();
819
let s2 = df2.schema();
820
821
s1.field_compare(s2, &mut df1_extra, &mut df2_extra);
822
823
let df1_extra = df1_extra
824
.into_iter()
825
.map(|(_, (n, _))| n.as_str())
826
.collect::<Vec<_>>()
827
.join(", ");
828
let df2_extra = df2_extra
829
.into_iter()
830
.map(|(_, (n, _))| n.as_str())
831
.collect::<Vec<_>>()
832
.join(", ");
833
834
polars_err!(
835
SchemaMismatch: r#"unable to vstack, dataframes have different widths ({} != {}).
836
One dataframe has additional columns: [{df1_extra}].
837
Other dataframe has additional columns: [{df2_extra}]."#,
838
df1.width(),
839
df2.width(),
840
)
841
}
842
843
pub fn accumulate_dataframes_vertical_unchecked_optional<I>(dfs: I) -> Option<DataFrame>
844
where
845
I: IntoIterator<Item = DataFrame>,
846
{
847
let mut iter = dfs.into_iter();
848
let additional = iter.size_hint().0;
849
let mut acc_df = iter.next()?;
850
acc_df.reserve_chunks(additional);
851
852
for df in iter {
853
if acc_df.width() != df.width() {
854
panic!("{}", width_mismatch(&acc_df, &df));
855
}
856
857
acc_df.vstack_mut_owned_unchecked(df);
858
}
859
Some(acc_df)
860
}
861
862
/// This takes ownership of the DataFrame so that drop is called earlier.
863
/// Does not check if schema is correct
864
pub fn accumulate_dataframes_vertical_unchecked<I>(dfs: I) -> DataFrame
865
where
866
I: IntoIterator<Item = DataFrame>,
867
{
868
let mut iter = dfs.into_iter();
869
let additional = iter.size_hint().0;
870
let mut acc_df = iter.next().unwrap();
871
acc_df.reserve_chunks(additional);
872
873
for df in iter {
874
if acc_df.width() != df.width() {
875
panic!("{}", width_mismatch(&acc_df, &df));
876
}
877
878
acc_df.vstack_mut_owned_unchecked(df);
879
}
880
acc_df
881
}
882
883
/// This takes ownership of the DataFrame so that drop is called earlier.
884
/// # Panics
885
/// Panics if `dfs` is empty.
886
pub fn accumulate_dataframes_vertical<I>(dfs: I) -> PolarsResult<DataFrame>
887
where
888
I: IntoIterator<Item = DataFrame>,
889
{
890
let mut iter = dfs.into_iter();
891
let additional = iter.size_hint().0;
892
let mut acc_df = iter.next().unwrap();
893
acc_df.reserve_chunks(additional);
894
for df in iter {
895
if acc_df.width() != df.width() {
896
return Err(width_mismatch(&acc_df, &df));
897
}
898
899
acc_df.vstack_mut_owned(df)?;
900
}
901
902
Ok(acc_df)
903
}
904
905
/// Concat the DataFrames to a single DataFrame.
906
pub fn concat_df<'a, I>(dfs: I) -> PolarsResult<DataFrame>
907
where
908
I: IntoIterator<Item = &'a DataFrame>,
909
{
910
let mut iter = dfs.into_iter();
911
let additional = iter.size_hint().0;
912
let mut acc_df = iter.next().unwrap().clone();
913
acc_df.reserve_chunks(additional);
914
for df in iter {
915
acc_df.vstack_mut(df)?;
916
}
917
Ok(acc_df)
918
}
919
920
/// Concat the DataFrames to a single DataFrame.
921
pub fn concat_df_unchecked<'a, I>(dfs: I) -> DataFrame
922
where
923
I: IntoIterator<Item = &'a DataFrame>,
924
{
925
let mut iter = dfs.into_iter();
926
let additional = iter.size_hint().0;
927
let mut acc_df = iter.next().unwrap().clone();
928
acc_df.reserve_chunks(additional);
929
for df in iter {
930
acc_df.vstack_mut_unchecked(df);
931
}
932
acc_df
933
}
934
935
pub fn accumulate_dataframes_horizontal(dfs: Vec<DataFrame>) -> PolarsResult<DataFrame> {
936
let mut iter = dfs.into_iter();
937
let mut acc_df = iter.next().unwrap();
938
for df in iter {
939
acc_df.hstack_mut(df.columns())?;
940
}
941
Ok(acc_df)
942
}
943
944
/// Ensure the chunks in both ChunkedArrays have the same length.
///
/// Borrows are returned when the chunk layouts already match; otherwise one side is
/// re-chunked to mirror the other's layout.
///
/// # Panics
/// This will panic if `left.len() != right.len()` and array is chunked.
pub fn align_chunks_binary<'a, T, B>(
    left: &'a ChunkedArray<T>,
    right: &'a ChunkedArray<B>,
) -> (Cow<'a, ChunkedArray<T>>, Cow<'a, ChunkedArray<B>>)
where
    B: PolarsDataType,
    T: PolarsDataType,
{
    let assert = || {
        assert_eq!(
            left.len(),
            right.len(),
            "expected arrays of the same length"
        )
    };
    match (left.chunks.len(), right.chunks.len()) {
        // All chunks are equal length
        (1, 1) => (Cow::Borrowed(left), Cow::Borrowed(right)),
        // All chunks are equal length
        (a, b)
            if a == b
                && left
                    .chunk_lengths()
                    .zip(right.chunk_lengths())
                    .all(|(l, r)| l == r) =>
        {
            (Cow::Borrowed(left), Cow::Borrowed(right))
        },
        // Right is a single chunk: re-chunk right to mirror left's layout.
        (_, 1) => {
            assert();
            (
                Cow::Borrowed(left),
                Cow::Owned(right.match_chunks(left.chunk_lengths())),
            )
        },
        // Left is a single chunk: re-chunk left to mirror right's layout.
        (1, _) => {
            assert();
            (
                Cow::Owned(left.match_chunks(right.chunk_lengths())),
                Cow::Borrowed(right),
            )
        },
        // Both are multi-chunk with differing layouts: flatten left, then mirror right.
        (_, _) => {
            assert();
            // could optimize to choose to rechunk a primitive and not a string or list type
            let left = left.rechunk();
            (
                Cow::Owned(left.match_chunks(right.chunk_lengths())),
                Cow::Borrowed(right),
            )
        },
    }
}
1000
1001
/// Ensure the chunks in ChunkedArray and Series have the same length.
1002
/// # Panics
1003
/// This will panic if `left.len() != right.len()` and array is chunked.
1004
pub fn align_chunks_binary_ca_series<'a, T>(
1005
left: &'a ChunkedArray<T>,
1006
right: &'a Series,
1007
) -> (Cow<'a, ChunkedArray<T>>, Cow<'a, Series>)
1008
where
1009
T: PolarsDataType,
1010
{
1011
let assert = || {
1012
assert_eq!(
1013
left.len(),
1014
right.len(),
1015
"expected arrays of the same length"
1016
)
1017
};
1018
match (left.chunks.len(), right.chunks().len()) {
1019
// All chunks are equal length
1020
(1, 1) => (Cow::Borrowed(left), Cow::Borrowed(right)),
1021
// All chunks are equal length
1022
(a, b)
1023
if a == b
1024
&& left
1025
.chunk_lengths()
1026
.zip(right.chunk_lengths())
1027
.all(|(l, r)| l == r) =>
1028
{
1029
assert();
1030
(Cow::Borrowed(left), Cow::Borrowed(right))
1031
},
1032
(_, 1) => (left.rechunk(), Cow::Borrowed(right)),
1033
(1, _) => (Cow::Borrowed(left), Cow::Owned(right.rechunk())),
1034
(_, _) => {
1035
assert();
1036
(left.rechunk(), Cow::Owned(right.rechunk()))
1037
},
1038
}
1039
}
1040
1041
#[cfg(feature = "performant")]
/// Align two owned [`Series`] so both have a compatible chunk layout,
/// collapsing whichever side is multi-chunk when the layouts differ.
pub(crate) fn align_chunks_binary_owned_series(left: Series, right: Series) -> (Series, Series) {
    let n_left = left.chunks().len();
    let n_right = right.chunks().len();

    // Fast path: both single-chunk, or identical chunking on both sides.
    if n_left == 1 && n_right == 1 {
        return (left, right);
    }
    if n_left == n_right
        && left
            .chunk_lengths()
            .zip(right.chunk_lengths())
            .all(|(l, r)| l == r)
    {
        return (left, right);
    }

    // Layouts differ: collapse every multi-chunk side into a single chunk.
    match (n_left, n_right) {
        (_, 1) => (left.rechunk(), right),
        (1, _) => (left, right.rechunk()),
        (_, _) => (left.rechunk(), right.rechunk()),
    }
}
1060
1061
pub(crate) fn align_chunks_binary_owned<T, B>(
1062
left: ChunkedArray<T>,
1063
right: ChunkedArray<B>,
1064
) -> (ChunkedArray<T>, ChunkedArray<B>)
1065
where
1066
B: PolarsDataType,
1067
T: PolarsDataType,
1068
{
1069
match (left.chunks.len(), right.chunks.len()) {
1070
(1, 1) => (left, right),
1071
// All chunks are equal length
1072
(a, b)
1073
if a == b
1074
&& left
1075
.chunk_lengths()
1076
.zip(right.chunk_lengths())
1077
.all(|(l, r)| l == r) =>
1078
{
1079
(left, right)
1080
},
1081
(_, 1) => (left.rechunk().into_owned(), right),
1082
(1, _) => (left, right.rechunk().into_owned()),
1083
(_, _) => (left.rechunk().into_owned(), right.rechunk().into_owned()),
1084
}
1085
}
1086
1087
/// Align three ChunkedArrays so all of them share one chunk layout.
///
/// Single-chunk sides are split (`match_chunks`) to mirror a multi-chunk side;
/// when several sides are multi-chunk with differing layouts, some are first
/// collapsed with `rechunk` and then re-split. Borrows are returned wherever
/// the existing layout can be kept.
///
/// # Panics
/// This will panic if `a.len() != b.len() || b.len() != c.len()` and array is chunked.
#[allow(clippy::type_complexity)]
pub fn align_chunks_ternary<'a, A, B, C>(
    a: &'a ChunkedArray<A>,
    b: &'a ChunkedArray<B>,
    c: &'a ChunkedArray<C>,
) -> (
    Cow<'a, ChunkedArray<A>>,
    Cow<'a, ChunkedArray<B>>,
    Cow<'a, ChunkedArray<C>>,
)
where
    A: PolarsDataType,
    B: PolarsDataType,
    C: PolarsDataType,
{
    // All single-chunk: nothing to align, skip the length assertion too.
    if a.chunks.len() == 1 && b.chunks.len() == 1 && c.chunks.len() == 1 {
        return (Cow::Borrowed(a), Cow::Borrowed(b), Cow::Borrowed(c));
    }

    assert!(
        a.len() == b.len() && b.len() == c.len(),
        "expected arrays of the same length"
    );

    // NOTE: arm order matters; the exact-`1` patterns must be tried before the
    // general equal-layout guard and the catch-all.
    match (a.chunks.len(), b.chunks.len(), c.chunks.len()) {
        // Only `a` is multi-chunk: split `b` and `c` along `a`'s boundaries.
        (_, 1, 1) => (
            Cow::Borrowed(a),
            Cow::Owned(b.match_chunks(a.chunk_lengths())),
            Cow::Owned(c.match_chunks(a.chunk_lengths())),
        ),
        // Only `c` is multi-chunk: split `a` and `b` along `c`'s boundaries.
        (1, 1, _) => (
            Cow::Owned(a.match_chunks(c.chunk_lengths())),
            Cow::Owned(b.match_chunks(c.chunk_lengths())),
            Cow::Borrowed(c),
        ),
        // Only `b` is multi-chunk: split `a` and `c` along `b`'s boundaries.
        (1, _, 1) => (
            Cow::Owned(a.match_chunks(b.chunk_lengths())),
            Cow::Borrowed(b),
            Cow::Owned(c.match_chunks(b.chunk_lengths())),
        ),
        // `b` and `c` are multi-chunk: collapse `b`, align everything to `c`.
        (1, _, _) => {
            let b = b.rechunk();
            (
                Cow::Owned(a.match_chunks(c.chunk_lengths())),
                Cow::Owned(b.match_chunks(c.chunk_lengths())),
                Cow::Borrowed(c),
            )
        },
        // `a` and `c` are multi-chunk: collapse `a`, align everything to `c`.
        (_, 1, _) => {
            let a = a.rechunk();
            (
                Cow::Owned(a.match_chunks(c.chunk_lengths())),
                Cow::Owned(b.match_chunks(c.chunk_lengths())),
                Cow::Borrowed(c),
            )
        },
        // `a` and `b` are multi-chunk: collapse `b`, align everything to `a`.
        (_, _, 1) => {
            let b = b.rechunk();
            (
                Cow::Borrowed(a),
                Cow::Owned(b.match_chunks(a.chunk_lengths())),
                Cow::Owned(c.match_chunks(a.chunk_lengths())),
            )
        },
        // All multi-chunk with identical layouts: borrow everything.
        (len_a, len_b, len_c)
            if len_a == len_b
                && len_b == len_c
                && a.chunk_lengths()
                    .zip(b.chunk_lengths())
                    .zip(c.chunk_lengths())
                    .all(|((a, b), c)| a == b && b == c) =>
        {
            (Cow::Borrowed(a), Cow::Borrowed(b), Cow::Borrowed(c))
        },
        // All multi-chunk with differing layouts: collapse `a` and `b`, then
        // re-split both along `c`'s boundaries.
        _ => {
            // could optimize to choose to rechunk a primitive and not a string or list type
            let a = a.rechunk();
            let b = b.rechunk();
            (
                Cow::Owned(a.match_chunks(c.chunk_lengths())),
                Cow::Owned(b.match_chunks(c.chunk_lengths())),
                Cow::Borrowed(c),
            )
        },
    }
}
1175
1176
pub fn binary_concatenate_validities<'a, T, B>(
1177
left: &'a ChunkedArray<T>,
1178
right: &'a ChunkedArray<B>,
1179
) -> Option<Bitmap>
1180
where
1181
B: PolarsDataType,
1182
T: PolarsDataType,
1183
{
1184
let (left, right) = align_chunks_binary(left, right);
1185
let left_validity = concatenate_validities(left.chunks());
1186
let right_validity = concatenate_validities(right.chunks());
1187
combine_validities_and(left_validity.as_ref(), right_validity.as_ref())
1188
}
1189
1190
/// Convenience for `x.into_iter().map(Into::into).collect()` using an `into_vec()` function.
pub trait IntoVec<T> {
    /// Convert `self` into a `Vec<T>`, consuming it.
    fn into_vec(self) -> Vec<T>;
}
1194
1195
impl<I, S> IntoVec<PlSmallStr> for I
1196
where
1197
I: IntoIterator<Item = S>,
1198
S: Into<PlSmallStr>,
1199
{
1200
fn into_vec(self) -> Vec<PlSmallStr> {
1201
self.into_iter().map(|s| s.into()).collect()
1202
}
1203
}
1204
1205
/// This logic is same as the impl on ChunkedArray
1206
/// The difference is that there is less indirection because the caller should preallocate
1207
/// `chunk_lens` once. On the `ChunkedArray` we indirect through an `ArrayRef` which is an indirection
1208
/// and a vtable.
1209
#[inline]
1210
pub(crate) fn index_to_chunked_index<
1211
I: Iterator<Item = Idx>,
1212
Idx: PartialOrd + std::ops::AddAssign + std::ops::SubAssign + Zero + One,
1213
>(
1214
chunk_lens: I,
1215
index: Idx,
1216
) -> (Idx, Idx) {
1217
let mut index_remainder = index;
1218
let mut current_chunk_idx = Zero::zero();
1219
1220
for chunk_len in chunk_lens {
1221
if chunk_len > index_remainder {
1222
break;
1223
} else {
1224
index_remainder -= chunk_len;
1225
current_chunk_idx += One::one();
1226
}
1227
}
1228
(current_chunk_idx, index_remainder)
1229
}
1230
1231
/// Translate an index counted from the back into `(chunk index from the
/// front, offset within that chunk)`.
///
/// `chunk_lens_rev` must yield the chunk lengths in reverse order and
/// `total_chunks` must be the total number of chunks.
pub(crate) fn index_to_chunked_index_rev<
    I: Iterator<Item = Idx>,
    Idx: PartialOrd
        + std::ops::AddAssign
        + std::ops::SubAssign
        + std::ops::Sub<Output = Idx>
        + Zero
        + One
        + Copy
        + std::fmt::Debug,
>(
    chunk_lens_rev: I,
    index_from_back: Idx,
    total_chunks: Idx,
) -> (Idx, Idx) {
    // `index_from_back` must be strictly positive — presumably it counts from
    // the back starting at 1 (i.e. derived from a negative index of at
    // least -1); TODO confirm against callers.
    debug_assert!(index_from_back > Zero::zero(), "at least -1");
    let mut index_remainder = index_from_back;
    let mut current_chunk_idx = One::one();
    let mut current_chunk_len = Zero::zero();

    for chunk_len in chunk_lens_rev {
        current_chunk_len = chunk_len;
        // Stop once this chunk (seen from the back) contains the target.
        if chunk_len >= index_remainder {
            break;
        } else {
            index_remainder -= chunk_len;
            current_chunk_idx += One::one();
        }
    }
    (
        // Convert the from-the-back chunk count into a front-based index.
        total_chunks - current_chunk_idx,
        // Offset of the element within the located chunk.
        current_chunk_len - index_remainder,
    )
}
1265
1266
pub fn first_null<'a, I>(iter: I) -> Option<usize>
1267
where
1268
I: Iterator<Item = &'a dyn Array>,
1269
{
1270
let mut offset = 0;
1271
for arr in iter {
1272
if let Some(mask) = arr.validity() {
1273
let len_mask = mask.len();
1274
let n = mask.leading_ones();
1275
if n < len_mask {
1276
return Some(offset + n);
1277
}
1278
offset += len_mask
1279
} else {
1280
offset += arr.len();
1281
}
1282
}
1283
None
1284
}
1285
1286
pub fn first_non_null<'a, I>(iter: I) -> Option<usize>
1287
where
1288
I: Iterator<Item = &'a dyn Array>,
1289
{
1290
let mut offset = 0;
1291
for arr in iter {
1292
if let Some(mask) = arr.validity() {
1293
let len_mask = mask.len();
1294
let n = mask.leading_zeros();
1295
if n < len_mask {
1296
return Some(offset + n);
1297
}
1298
offset += len_mask
1299
} else if !arr.is_empty() {
1300
return Some(offset);
1301
}
1302
}
1303
None
1304
}
1305
1306
pub fn last_non_null<'a, I>(iter: I, len: usize) -> Option<usize>
1307
where
1308
I: DoubleEndedIterator<Item = &'a dyn Array>,
1309
{
1310
if len == 0 {
1311
return None;
1312
}
1313
let mut offset = 0;
1314
for arr in iter.rev() {
1315
if let Some(mask) = arr.validity() {
1316
let len_mask = mask.len();
1317
let n = mask.trailing_zeros();
1318
if n < len_mask {
1319
return Some(len - offset - n - 1);
1320
}
1321
offset += len_mask;
1322
} else if !arr.is_empty() {
1323
return Some(len - offset - 1);
1324
}
1325
}
1326
None
1327
}
1328
1329
/// ensure that nulls are propagated to both arrays
1330
pub fn coalesce_nulls<'a, T: PolarsDataType>(
1331
a: &'a ChunkedArray<T>,
1332
b: &'a ChunkedArray<T>,
1333
) -> (Cow<'a, ChunkedArray<T>>, Cow<'a, ChunkedArray<T>>) {
1334
if a.null_count() > 0 || b.null_count() > 0 {
1335
let (a, b) = align_chunks_binary(a, b);
1336
let mut b = b.into_owned();
1337
let a = a.coalesce_nulls(b.chunks());
1338
1339
for arr in a.chunks().iter() {
1340
for arr_b in unsafe { b.chunks_mut() } {
1341
*arr_b = arr_b.with_validity(arr.validity().cloned())
1342
}
1343
}
1344
b.compute_len();
1345
(Cow::Owned(a), Cow::Owned(b))
1346
} else {
1347
(Cow::Borrowed(a), Cow::Borrowed(b))
1348
}
1349
}
1350
1351
pub fn coalesce_nulls_columns(a: &Column, b: &Column) -> (Column, Column) {
1352
if a.null_count() > 0 || b.null_count() > 0 {
1353
let mut a = a.as_materialized_series().rechunk();
1354
let mut b = b.as_materialized_series().rechunk();
1355
for (arr_a, arr_b) in unsafe { a.chunks_mut().iter_mut().zip(b.chunks_mut()) } {
1356
let validity = match (arr_a.validity(), arr_b.validity()) {
1357
(None, Some(b)) => Some(b.clone()),
1358
(Some(a), Some(b)) => Some(a & b),
1359
(Some(a), None) => Some(a.clone()),
1360
(None, None) => None,
1361
};
1362
*arr_a = arr_a.with_validity(validity.clone());
1363
*arr_b = arr_b.with_validity(validity);
1364
}
1365
a.compute_len();
1366
b.compute_len();
1367
(a.into(), b.into())
1368
} else {
1369
(a.clone(), b.clone())
1370
}
1371
}
1372
1373
#[cfg(test)]
mod test {
    use super::*;

    // `split` distributes the rows over `n` pieces, giving the remainder to
    // the last piece: 10 rows over 3 pieces -> 3, 3, 4.
    #[test]
    fn test_split() {
        let ca: Int32Chunked = (0..10).collect_ca("a".into());

        let out = split(&ca, 3);
        assert_eq!(out[0].len(), 3);
        assert_eq!(out[1].len(), 3);
        assert_eq!(out[2].len(), 4);
    }

    // After `align_chunks_binary` both arrays must expose identical chunk
    // layouts, for both the (multi, 1) and (1, multi) starting shapes.
    #[test]
    fn test_align_chunks() -> PolarsResult<()> {
        // left: one chunk of 4; right: chunks [1, 3].
        let a = Int32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3, 4]);
        let mut b = Int32Chunked::new(PlSmallStr::EMPTY, &[1]);
        let b2 = Int32Chunked::new(PlSmallStr::EMPTY, &[2, 3, 4]);

        b.append(&b2)?;
        let (a, b) = align_chunks_binary(&a, &b);
        assert_eq!(
            a.chunk_lengths().collect::<Vec<_>>(),
            b.chunk_lengths().collect::<Vec<_>>()
        );

        // left: one chunk of 4; right: four chunks of 1.
        let a = Int32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3, 4]);
        let mut b = Int32Chunked::new(PlSmallStr::EMPTY, &[1]);
        let b1 = b.clone();
        b.append(&b1)?;
        b.append(&b1)?;
        b.append(&b1)?;
        let (a, b) = align_chunks_binary(&a, &b);
        assert_eq!(
            a.chunk_lengths().collect::<Vec<_>>(),
            b.chunk_lengths().collect::<Vec<_>>()
        );

        Ok(())
    }
}
1415
1416