GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-core/src/frame/mod.rs
1
#![allow(unsafe_op_in_unsafe_fn)]
2
//! DataFrame module.
3
use arrow::datatypes::ArrowSchemaRef;
4
use polars_row::ArrayRef;
5
use polars_utils::UnitVec;
6
use polars_utils::itertools::Itertools;
7
use rayon::prelude::*;
8
9
use crate::chunked_array::flags::StatisticsFlags;
10
#[cfg(feature = "algorithm_group_by")]
11
use crate::chunked_array::ops::unique::is_unique_helper;
12
use crate::prelude::gather::check_bounds_ca;
13
use crate::prelude::*;
14
#[cfg(feature = "row_hash")]
15
use crate::utils::split_df;
16
use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
17
use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
18
19
#[cfg(feature = "dataframe_arithmetic")]
20
mod arithmetic;
21
pub mod builder;
22
mod chunks;
23
pub use chunks::chunk_df_for_writing;
24
mod broadcast;
25
pub mod column;
26
mod dataframe;
27
mod filter;
28
mod projection;
29
pub use dataframe::DataFrame;
30
use filter::filter_zero_width;
31
use projection::{AmortizedColumnSelector, LINEAR_SEARCH_LIMIT};
32
33
pub mod explode;
34
mod from;
35
#[cfg(feature = "algorithm_group_by")]
36
pub mod group_by;
37
pub(crate) mod horizontal;
38
#[cfg(feature = "proptest")]
39
pub mod proptest;
40
#[cfg(any(feature = "rows", feature = "object"))]
41
pub mod row;
42
mod top_k;
43
mod upstream_traits;
44
mod validation;
45
46
use arrow::record_batch::{RecordBatch, RecordBatchT};
47
use polars_utils::pl_str::PlSmallStr;
48
#[cfg(feature = "serde")]
49
use serde::{Deserialize, Serialize};
50
use strum_macros::IntoStaticStr;
51
52
use crate::POOL;
53
#[cfg(feature = "row_hash")]
54
use crate::hashing::_df_rows_to_hashes_threaded_vertical;
55
use crate::prelude::sort::arg_sort;
56
use crate::series::IsSorted;
57
58
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
59
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
60
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
61
#[strum(serialize_all = "snake_case")]
62
pub enum UniqueKeepStrategy {
63
/// Keep the first unique row.
64
First,
65
/// Keep the last unique row.
66
Last,
67
/// Keep None of the unique rows.
68
None,
69
/// Keep any of the unique rows
70
/// This allows more optimizations
71
#[default]
72
Any,
73
}
74
75
impl DataFrame {
76
pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
77
self.columns().iter().map(Column::as_materialized_series)
78
}
79
80
/// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
81
///
82
/// # Implementation
83
/// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
84
/// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
85
/// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
86
///
87
/// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
88
/// However, this function will yield a smaller number. This is because this function returns
89
/// the visible size of the buffer, not its total capacity.
90
///
91
/// FFI buffers are included in this estimation.
92
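/// # Example
///
/// A minimal sketch of querying the estimate; the exact number of bytes depends
/// on the backing buffers, so this only checks that the estimate is non-zero:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("x" => [1, 2, 3], "y" => ["a", "b", "c"])?;
/// // Three integers plus three small strings must occupy at least a few bytes.
/// assert!(df.estimated_size() > 0);
/// # Ok::<(), PolarsError>(())
/// ```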
pub fn estimated_size(&self) -> usize {
93
self.columns().iter().map(Column::estimated_size).sum()
94
}
95
96
pub fn try_apply_columns(
97
&self,
98
func: impl Fn(&Column) -> PolarsResult<Column> + Send + Sync,
99
) -> PolarsResult<Vec<Column>> {
100
return inner(self, &func);
101
102
fn inner(
103
slf: &DataFrame,
104
func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
105
) -> PolarsResult<Vec<Column>> {
106
slf.columns().iter().map(func).collect()
107
}
108
}
109
110
pub fn apply_columns(&self, func: impl Fn(&Column) -> Column + Send + Sync) -> Vec<Column> {
111
return inner(self, &func);
112
113
fn inner(slf: &DataFrame, func: &(dyn Fn(&Column) -> Column + Send + Sync)) -> Vec<Column> {
114
slf.columns().iter().map(func).collect()
115
}
116
}
117
118
pub fn try_apply_columns_par(
119
&self,
120
func: impl Fn(&Column) -> PolarsResult<Column> + Send + Sync,
121
) -> PolarsResult<Vec<Column>> {
122
return inner(self, &func);
123
124
fn inner(
125
slf: &DataFrame,
126
func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
127
) -> PolarsResult<Vec<Column>> {
128
POOL.install(|| slf.columns().par_iter().map(func).collect())
129
}
130
}
131
132
pub fn apply_columns_par(&self, func: impl Fn(&Column) -> Column + Send + Sync) -> Vec<Column> {
133
return inner(self, &func);
134
135
fn inner(slf: &DataFrame, func: &(dyn Fn(&Column) -> Column + Send + Sync)) -> Vec<Column> {
136
POOL.install(|| slf.columns().par_iter().map(func).collect())
137
}
138
}
139
140
/// Reserve additional slots into the chunks of the series.
141
pub(crate) fn reserve_chunks(&mut self, additional: usize) {
142
for s in unsafe { self.columns_mut_retain_schema() } {
143
if let Column::Series(s) = s {
144
// SAFETY:
145
// do not modify the data, simply resize.
146
unsafe { s.chunks_mut().reserve(additional) }
147
}
148
}
149
}
150
pub fn new_from_index(&self, index: usize, height: usize) -> Self {
151
let new_cols = self.apply_columns(|c| c.new_from_index(index, height));
152
153
unsafe { Self::_new_unchecked_impl(height, new_cols).with_schema_from(self) }
154
}
155
156
/// Create a new `DataFrame` with the given schema, only containing nulls.
157
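/// # Example
///
/// A minimal sketch that borrows the schema from an existing frame (assuming the
/// schema can be passed by reference as shown):
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2], "b" => ["x", "y"])?;
/// let nulls = DataFrame::full_null(&df.schema(), 3);
/// assert_eq!(nulls.shape(), (3, 2));
/// assert_eq!(nulls.column("a")?.null_count(), 3);
/// # Ok::<(), PolarsError>(())
/// ```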
pub fn full_null(schema: &Schema, height: usize) -> Self {
158
let columns = schema
159
.iter_fields()
160
.map(|f| Column::full_null(f.name().clone(), height, f.dtype()))
161
.collect();
162
163
unsafe { DataFrame::_new_unchecked_impl(height, columns) }
164
}
165
166
/// Ensure this DataFrame matches the given schema. Casts null columns to
167
/// the expected schema if necessary (but nothing else).
168
pub fn ensure_matches_schema(&mut self, schema: &Schema) -> PolarsResult<()> {
169
let mut did_cast = false;
170
let cached_schema = self.cached_schema().cloned();
171
172
for (col, (name, dt)) in unsafe { self.columns_mut() }.iter_mut().zip(schema.iter()) {
173
polars_ensure!(
174
col.name() == name,
175
SchemaMismatch: "column name mismatch: expected {:?}, found {:?}",
176
name,
177
col.name()
178
);
179
180
let needs_cast = col.dtype().matches_schema_type(dt)?;
181
182
if needs_cast {
183
*col = col.cast(dt)?;
184
did_cast = true;
185
}
186
}
187
188
if !did_cast {
189
unsafe { self.set_opt_schema(cached_schema) };
190
}
191
192
Ok(())
193
}
194
195
/// Add a new column at index 0 that counts the rows.
196
///
197
/// # Example
198
///
199
/// ```
200
/// # use polars_core::prelude::*;
201
/// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
202
/// assert_eq!(df1.shape(), (4, 1));
203
///
204
/// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
205
/// assert_eq!(df2.shape(), (4, 2));
206
/// println!("{}", df2);
207
///
208
/// # Ok::<(), PolarsError>(())
209
/// ```
210
///
211
/// Output:
212
///
213
/// ```text
214
/// shape: (4, 2)
215
/// +-----+----------+
216
/// | Id | Name |
217
/// | --- | --- |
218
/// | u32 | str |
219
/// +=====+==========+
220
/// | 0 | James |
221
/// +-----+----------+
222
/// | 1 | Mary |
223
/// +-----+----------+
224
/// | 2 | John |
225
/// +-----+----------+
226
/// | 3 | Patricia |
227
/// +-----+----------+
228
/// ```
229
pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
230
let mut new_columns = Vec::with_capacity(self.width() + 1);
231
let offset = offset.unwrap_or(0);
232
233
if self.get_column_index(&name).is_some() {
234
polars_bail!(duplicate = name)
235
}
236
237
let col = Column::new_row_index(name, offset, self.height())?;
238
new_columns.push(col);
239
new_columns.extend_from_slice(self.columns());
240
241
Ok(unsafe { DataFrame::new_unchecked(self.height(), new_columns) })
242
}
243
244
/// Add a row index column in place.
245
///
246
/// # Safety
247
/// The caller should ensure the DataFrame does not already contain a column with the given name.
248
///
249
/// # Panics
250
/// Panics if the resulting column would reach or overflow IdxSize::MAX.
251
pub unsafe fn with_row_index_mut(
252
&mut self,
253
name: PlSmallStr,
254
offset: Option<IdxSize>,
255
) -> &mut Self {
256
debug_assert!(
257
self.get_column_index(&name).is_none(),
258
"with_row_index_mut(): column with name {} already exists",
259
&name
260
);
261
262
let offset = offset.unwrap_or(0);
263
let col = Column::new_row_index(name, offset, self.height()).unwrap();
264
265
unsafe { self.columns_mut() }.insert(0, col);
266
self
267
}
268
269
/// Shrink the capacity of this DataFrame to fit its length.
270
pub fn shrink_to_fit(&mut self) {
271
// Don't parallelize this. Memory overhead
272
for s in unsafe { self.columns_mut_retain_schema() } {
273
s.shrink_to_fit();
274
}
275
}
276
277
/// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
278
/// This may lead to more peak memory consumption.
279
pub fn rechunk_mut_par(&mut self) -> &mut Self {
280
if self.columns().iter().any(|c| c.n_chunks() > 1) {
281
POOL.install(|| {
282
unsafe { self.columns_mut_retain_schema() }
283
.par_iter_mut()
284
.for_each(|c| *c = c.rechunk());
285
})
286
}
287
288
self
289
}
290
291
/// Rechunks all columns to only have a single chunk.
292
pub fn rechunk_mut(&mut self) -> &mut Self {
293
// SAFETY: We never adjust the length or names of the columns.
294
let columns = unsafe { self.columns_mut() };
295
296
for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
297
*col = col.rechunk();
298
}
299
300
self
301
}
302
303
/// Returns true if the chunks of the columns do not align and re-chunking should be done.
304
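/// # Example
///
/// A minimal sketch: adding a single-chunk column to a frame whose other column
/// holds two chunks leaves the chunks misaligned, and aligning fixes it:
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df: DataFrame = df!("a" => [1, 2, 3])?;
/// df.vstack_mut(&df!("a" => [4, 5])?)?;
/// // "a" now has two chunks; the new column only has one.
/// df.with_column(Column::new("b".into(), [10, 20, 30, 40, 50]))?;
/// assert!(df.should_rechunk());
/// df.align_chunks();
/// assert!(!df.should_rechunk());
/// # Ok::<(), PolarsError>(())
/// ```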
pub fn should_rechunk(&self) -> bool {
305
// Fast check. It is also needed for correctness, as code below doesn't check if the number
306
// of chunks is equal.
307
if !self
308
.columns()
309
.iter()
310
.filter_map(|c| c.as_series().map(|s| s.n_chunks()))
311
.all_equal()
312
{
313
return true;
314
}
315
316
// From here we check chunk lengths.
317
let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
318
match chunk_lengths.next() {
319
None => false,
320
Some(first_column_chunk_lengths) => {
321
// Fast Path for single Chunk Series
322
if first_column_chunk_lengths.size_hint().0 == 1 {
323
return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
324
}
325
// Always rechunk if we have more chunks than rows.
326
// except when we have an empty df containing a single chunk
327
let height = self.height();
328
let n_chunks = first_column_chunk_lengths.size_hint().0;
329
if n_chunks > height && !(height == 0 && n_chunks == 1) {
330
return true;
331
}
332
// Slow Path for multi Chunk series
333
let v: Vec<_> = first_column_chunk_lengths.collect();
334
for cl in chunk_lengths {
335
if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
336
return true;
337
}
338
}
339
false
340
},
341
}
342
}
343
344
/// Ensure all the chunks in the [`DataFrame`] are aligned.
345
pub fn align_chunks_par(&mut self) -> &mut Self {
346
if self.should_rechunk() {
347
self.rechunk_mut_par()
348
} else {
349
self
350
}
351
}
352
353
/// Ensure all the chunks in the [`DataFrame`] are aligned.
354
pub fn align_chunks(&mut self) -> &mut Self {
355
if self.should_rechunk() {
356
self.rechunk_mut()
357
} else {
358
self
359
}
360
}
361
362
/// # Example
363
///
364
/// ```rust
365
/// # use polars_core::prelude::*;
366
/// let df: DataFrame = df!("Language" => ["Rust", "Python"],
367
/// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
368
///
369
/// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
370
/// # Ok::<(), PolarsError>(())
371
/// ```
372
pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
373
self.columns().iter().map(|s| s.name()).collect()
374
}
375
376
/// Get the [`Vec<PlSmallStr>`] representing the column names.
377
pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
378
self.columns().iter().map(|s| s.name().clone()).collect()
379
}
380
381
/// Set the column names.
382
/// # Example
383
///
384
/// ```rust
385
/// # use polars_core::prelude::*;
386
/// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
387
/// df.set_column_names(&["Set"])?;
388
///
389
/// assert_eq!(df.get_column_names(), &["Set"]);
390
/// # Ok::<(), PolarsError>(())
391
/// ```
392
pub fn set_column_names<T>(&mut self, new_names: &[T]) -> PolarsResult<()>
393
where
394
T: AsRef<str>,
395
{
396
polars_ensure!(
397
new_names.len() == self.width(),
398
ShapeMismatch: "{} column names provided for a DataFrame of width {}",
399
new_names.len(), self.width()
400
);
401
402
validation::ensure_names_unique(new_names)?;
403
404
*unsafe { self.columns_mut() } = std::mem::take(unsafe { self.columns_mut() })
405
.into_iter()
406
.zip(new_names)
407
.map(|(c, name)| c.with_name(PlSmallStr::from_str(name.as_ref())))
408
.collect();
409
410
Ok(())
411
}
412
413
/// Get the data types of the columns in the [`DataFrame`].
414
///
415
/// # Example
416
///
417
/// ```rust
418
/// # use polars_core::prelude::*;
419
/// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
420
/// "Fraction" => [0.965, 0.035])?;
421
///
422
/// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
423
/// # Ok::<(), PolarsError>(())
424
/// ```
425
pub fn dtypes(&self) -> Vec<DataType> {
426
self.columns().iter().map(|s| s.dtype().clone()).collect()
427
}
428
429
/// The number of chunks for the first column.
430
pub fn first_col_n_chunks(&self) -> usize {
431
match self.columns().iter().find_map(|col| col.as_series()) {
432
None if self.width() == 0 => 0,
433
None => 1,
434
Some(s) => s.n_chunks(),
435
}
436
}
437
438
/// The highest number of chunks for any column.
439
pub fn max_n_chunks(&self) -> usize {
440
self.columns()
441
.iter()
442
.map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
443
.max()
444
.unwrap_or(0)
445
}
446
447
/// Generate the schema fields of the [`DataFrame`].
448
///
449
/// # Example
450
///
451
/// ```rust
452
/// # use polars_core::prelude::*;
453
/// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
454
/// "Fraction" => [0.708, 0.292])?;
455
///
456
/// let f1: Field = Field::new("Surface type".into(), DataType::String);
457
/// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
458
///
459
/// assert_eq!(earth.fields(), &[f1, f2]);
460
/// # Ok::<(), PolarsError>(())
461
/// ```
462
pub fn fields(&self) -> Vec<Field> {
463
self.columns()
464
.iter()
465
.map(|s| s.field().into_owned())
466
.collect()
467
}
468
469
/// Add multiple [`Series`] to a [`DataFrame`].
470
/// The added `Series` are required to have the same length.
471
///
472
/// # Example
473
///
474
/// ```rust
475
/// # use polars_core::prelude::*;
476
/// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
477
/// let s1 = Column::new("Proton".into(), [29, 47, 79]);
478
/// let s2 = Column::new("Electron".into(), [29, 47, 79]);
479
///
480
/// let df2: DataFrame = df1.hstack(&[s1, s2])?;
481
/// assert_eq!(df2.shape(), (3, 3));
482
/// println!("{}", df2);
483
/// # Ok::<(), PolarsError>(())
484
/// ```
485
///
486
/// Output:
487
///
488
/// ```text
489
/// shape: (3, 3)
490
/// +---------+--------+----------+
491
/// | Element | Proton | Electron |
492
/// | --- | --- | --- |
493
/// | str | i32 | i32 |
494
/// +=========+========+==========+
495
/// | Copper | 29 | 29 |
496
/// +---------+--------+----------+
497
/// | Silver | 47 | 47 |
498
/// +---------+--------+----------+
499
/// | Gold | 79 | 79 |
500
/// +---------+--------+----------+
501
/// ```
502
pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
503
let mut new_cols = Vec::with_capacity(self.width() + columns.len());
504
505
new_cols.extend(self.columns().iter().cloned());
506
new_cols.extend_from_slice(columns);
507
508
DataFrame::new(self.height(), new_cols)
509
}
510
/// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
511
///
512
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
513
///
514
/// # Example
515
///
516
/// ```rust
517
/// # use polars_core::prelude::*;
518
/// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
519
/// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
520
/// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
521
/// "Melting Point (K)" => [2041.4, 1828.05])?;
522
///
523
/// let df3: DataFrame = df1.vstack(&df2)?;
524
///
525
/// assert_eq!(df3.shape(), (5, 2));
526
/// println!("{}", df3);
527
/// # Ok::<(), PolarsError>(())
528
/// ```
529
///
530
/// Output:
531
///
532
/// ```text
533
/// shape: (5, 2)
534
/// +-----------+-------------------+
535
/// | Element | Melting Point (K) |
536
/// | --- | --- |
537
/// | str | f64 |
538
/// +===========+===================+
539
/// | Copper | 1357.77 |
540
/// +-----------+-------------------+
541
/// | Silver | 1234.93 |
542
/// +-----------+-------------------+
543
/// | Gold | 1337.33 |
544
/// +-----------+-------------------+
545
/// | Platinum | 2041.4 |
546
/// +-----------+-------------------+
547
/// | Palladium | 1828.05 |
548
/// +-----------+-------------------+
549
/// ```
550
pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
551
let mut df = self.clone();
552
df.vstack_mut(other)?;
553
Ok(df)
554
}
555
556
/// Concatenate a [`DataFrame`] to this [`DataFrame`]
557
///
558
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
559
///
560
/// # Example
561
///
562
/// ```rust
563
/// # use polars_core::prelude::*;
564
/// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
565
/// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
566
/// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
567
/// "Melting Point (K)" => [2041.4, 1828.05])?;
568
///
569
/// df1.vstack_mut(&df2)?;
570
///
571
/// assert_eq!(df1.shape(), (5, 2));
572
/// println!("{}", df1);
573
/// # Ok::<(), PolarsError>(())
574
/// ```
575
///
576
/// Output:
577
///
578
/// ```text
579
/// shape: (5, 2)
580
/// +-----------+-------------------+
581
/// | Element | Melting Point (K) |
582
/// | --- | --- |
583
/// | str | f64 |
584
/// +===========+===================+
585
/// | Copper | 1357.77 |
586
/// +-----------+-------------------+
587
/// | Silver | 1234.93 |
588
/// +-----------+-------------------+
589
/// | Gold | 1337.33 |
590
/// +-----------+-------------------+
591
/// | Platinum | 2041.4 |
592
/// +-----------+-------------------+
593
/// | Palladium | 1828.05 |
594
/// +-----------+-------------------+
595
/// ```
596
pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
597
if self.width() != other.width() {
598
polars_ensure!(
599
self.shape() == (0, 0),
600
ShapeMismatch:
601
"unable to append to a DataFrame of shape {:?} with a DataFrame of width {}",
602
self.shape(), other.width(),
603
);
604
605
self.clone_from(other);
606
607
return Ok(self);
608
}
609
610
let new_height = usize::checked_add(self.height(), other.height()).unwrap();
611
612
unsafe { self.columns_mut_retain_schema() }
613
.iter_mut()
614
.zip(other.columns())
615
.try_for_each::<_, PolarsResult<_>>(|(left, right)| {
616
ensure_can_extend(&*left, right)?;
617
left.append(right).map_err(|e| {
618
e.context(format!("failed to vstack column '{}'", right.name()).into())
619
})?;
620
Ok(())
621
})?;
622
623
unsafe { self.set_height(new_height) };
624
625
Ok(self)
626
}
627
628
pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
629
if self.width() != other.width() {
630
polars_ensure!(
631
self.shape() == (0, 0),
632
ShapeMismatch:
633
"unable to append to a DataFrame of width {} with a DataFrame of width {}",
634
self.width(), other.width(),
635
);
636
637
*self = other;
638
639
return Ok(self);
640
}
641
642
let new_height = usize::checked_add(self.height(), other.height()).unwrap();
643
644
unsafe { self.columns_mut_retain_schema() }
645
.iter_mut()
646
.zip(other.into_columns())
647
.try_for_each::<_, PolarsResult<_>>(|(left, right)| {
648
ensure_can_extend(&*left, &right)?;
649
let right_name = right.name().clone();
650
left.append_owned(right).map_err(|e| {
651
e.context(format!("failed to vstack column '{right_name}'").into())
652
})?;
653
Ok(())
654
})?;
655
656
unsafe { self.set_height(new_height) };
657
658
Ok(self)
659
}
660
661
/// Concatenate a [`DataFrame`] to this [`DataFrame`]
662
///
663
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
664
///
665
/// # Panics
666
/// Panics if the schemas don't match.
667
pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) -> &mut Self {
668
let new_height = usize::checked_add(self.height(), other.height()).unwrap();
669
670
unsafe { self.columns_mut_retain_schema() }
671
.iter_mut()
672
.zip(other.columns())
673
.for_each(|(left, right)| {
674
left.append(right)
675
.map_err(|e| {
676
e.context(format!("failed to vstack column '{}'", right.name()).into())
677
})
678
.expect("should not fail");
679
});
680
681
unsafe { self.set_height(new_height) };
682
683
self
684
}
685
686
/// Concatenate a [`DataFrame`] to this [`DataFrame`]
687
///
688
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
689
///
690
/// # Panics
691
/// Panics if the schemas don't match.
692
pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) -> &mut Self {
693
let new_height = usize::checked_add(self.height(), other.height()).unwrap();
694
695
unsafe { self.columns_mut_retain_schema() }
696
.iter_mut()
697
.zip(other.into_columns())
698
.for_each(|(left, right)| {
699
left.append_owned(right).expect("should not fail");
700
});
701
702
unsafe { self.set_height(new_height) };
703
704
self
705
}
706
707
/// Extend the memory backed by this [`DataFrame`] with the values from `other`.
708
///
709
/// Different from [`vstack`](Self::vstack), which adds the chunks from `other` to the chunks of this [`DataFrame`],
710
/// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
711
///
712
/// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
713
/// and thus will yield faster queries.
714
///
715
/// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
716
/// online operations where you add `n` rows and rerun a query.
717
///
718
/// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
719
/// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
720
/// of `append` operations with a [`rechunk`](Self::align_chunks_par).
721
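/// # Example
///
/// A minimal sketch of appending rows in place; widths must match, and the data
/// of `other` is copied into the existing buffers where possible:
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df: DataFrame = df!("a" => [1, 2, 3], "b" => ["x", "y", "z"])?;
/// let other: DataFrame = df!("a" => [4, 5], "b" => ["u", "v"])?;
/// // Unlike `vstack`, this typically does not add new chunks.
/// df.extend(&other)?;
/// assert_eq!(df.shape(), (5, 2));
/// # Ok::<(), PolarsError>(())
/// ```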
pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
722
polars_ensure!(
723
self.width() == other.width(),
724
ShapeMismatch:
725
"unable to extend a DataFrame of width {} with a DataFrame of width {}",
726
self.width(), other.width(),
727
);
728
729
let new_height = usize::checked_add(self.height(), other.height()).unwrap();
730
731
unsafe { self.columns_mut_retain_schema() }
732
.iter_mut()
733
.zip(other.columns())
734
.try_for_each::<_, PolarsResult<_>>(|(left, right)| {
735
ensure_can_extend(&*left, right)?;
736
left.extend(right).map_err(|e| {
737
e.context(format!("failed to extend column '{}'", right.name()).into())
738
})?;
739
Ok(())
740
})?;
741
742
unsafe { self.set_height(new_height) };
743
744
Ok(())
745
}
746
747
/// Remove a column by name and return the column removed.
748
///
749
/// # Example
750
///
751
/// ```rust
752
/// # use polars_core::prelude::*;
753
/// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
754
/// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
755
///
756
/// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
757
/// assert!(s1.is_err());
758
///
759
/// let s2: Column = df.drop_in_place("Animal")?;
760
/// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
761
/// # Ok::<(), PolarsError>(())
762
/// ```
763
pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
764
let idx = self.try_get_column_index(name)?;
765
Ok(unsafe { self.columns_mut() }.remove(idx))
766
}
767
768
/// Return a new [`DataFrame`] where all null values are dropped.
769
///
770
/// # Example
771
///
772
/// ```no_run
773
/// # use polars_core::prelude::*;
774
/// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
775
/// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
776
/// assert_eq!(df1.shape(), (3, 2));
777
///
778
/// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
779
/// assert_eq!(df2.shape(), (1, 2));
780
/// println!("{}", df2);
781
/// # Ok::<(), PolarsError>(())
782
/// ```
783
///
784
/// Output:
785
///
786
/// ```text
787
/// shape: (1, 2)
788
/// +---------+---------------------+
789
/// | Country | Tax revenue (% GDP) |
790
/// | --- | --- |
791
/// | str | f64 |
792
/// +=========+=====================+
793
/// | Malta | 32.7 |
794
/// +---------+---------------------+
795
/// ```
796
pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
797
where
798
for<'a> &'a S: AsRef<str>,
799
{
800
if let Some(v) = subset {
801
let v = self.select_to_vec(v)?;
802
self._drop_nulls_impl(v.as_slice())
803
} else {
804
self._drop_nulls_impl(self.columns())
805
}
806
}
807
808
fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
809
// fast path for no nulls in df
810
if subset.iter().all(|s| !s.has_nulls()) {
811
return Ok(self.clone());
812
}
813
814
let mut iter = subset.iter();
815
816
let mask = iter
817
.next()
818
.ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
819
let mut mask = mask.is_not_null();
820
821
for c in iter {
822
mask = mask & c.is_not_null();
823
}
824
self.filter(&mask)
825
}
826
827
/// Drop a column by name.
828
/// This is a pure method and will return a new [`DataFrame`] instead of modifying
829
/// the current one in place.
830
///
831
/// # Example
832
///
833
/// ```rust
834
/// # use polars_core::prelude::*;
835
/// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
836
/// let df2: DataFrame = df1.drop("Ray type")?;
837
///
838
/// assert_eq!(df2.width(), 0);
839
/// # Ok::<(), PolarsError>(())
840
/// ```
841
pub fn drop(&self, name: &str) -> PolarsResult<Self> {
842
let idx = self.try_get_column_index(name)?;
843
let mut new_cols = Vec::with_capacity(self.width() - 1);
844
845
self.columns().iter().enumerate().for_each(|(i, s)| {
846
if i != idx {
847
new_cols.push(s.clone())
848
}
849
});
850
851
Ok(unsafe { DataFrame::_new_unchecked_impl(self.height(), new_cols) })
852
}
853
854
/// Drop columns that are in `names`.
855
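/// # Example
///
/// A minimal sketch:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2], "b" => [3, 4], "c" => [5, 6])?;
/// let dropped = df.drop_many(["a", "c"]);
/// assert_eq!(dropped.get_column_names(), &["b"]);
/// # Ok::<(), PolarsError>(())
/// ```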
pub fn drop_many<I, S>(&self, names: I) -> Self
856
where
857
I: IntoIterator<Item = S>,
858
S: Into<PlSmallStr>,
859
{
860
let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
861
self.drop_many_amortized(&names)
862
}
863
864
/// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
865
pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
866
if names.is_empty() {
867
return self.clone();
868
}
869
let mut new_cols = Vec::with_capacity(self.width().saturating_sub(names.len()));
870
self.columns().iter().for_each(|s| {
871
if !names.contains(s.name()) {
872
new_cols.push(s.clone())
873
}
874
});
875
876
unsafe { DataFrame::new_unchecked(self.height(), new_cols) }
877
}
878
879
/// Insert a new column at a given index without checking for duplicates.
880
/// This can leave the [`DataFrame`] in an invalid state.
881
fn insert_column_no_namecheck(
882
&mut self,
883
index: usize,
884
column: Column,
885
) -> PolarsResult<&mut Self> {
886
if self.shape() == (0, 0) {
887
unsafe { self.set_height(column.len()) };
888
}
889
890
polars_ensure!(
891
column.len() == self.height(),
892
ShapeMismatch:
893
"unable to add a column of length {} to a DataFrame of height {}",
894
column.len(), self.height(),
895
);
896
897
unsafe { self.columns_mut() }.insert(index, column);
898
Ok(self)
899
}
900
901
/// Insert a new column at a given index.
902
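/// # Example
///
/// A minimal sketch showing the index semantics:
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df: DataFrame = df!("a" => [1, 2], "c" => [5, 6])?;
/// df.insert_column(1, Column::new("b".into(), [3, 4]))?;
/// assert_eq!(df.get_column_names(), &["a", "b", "c"]);
/// # Ok::<(), PolarsError>(())
/// ```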
pub fn insert_column(&mut self, index: usize, column: Column) -> PolarsResult<&mut Self> {
903
let name = column.name();
904
905
polars_ensure!(
906
self.get_column_index(name).is_none(),
907
Duplicate:
908
"column with name {:?} is already present in the DataFrame", name
909
);
910
911
self.insert_column_no_namecheck(index, column)
912
}
913
914
/// Add a new column to this [`DataFrame`] or replace an existing one. Broadcasts unit-length
915
/// columns.
916
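/// # Example
///
/// A minimal sketch showing the unit-length broadcast:
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df: DataFrame = df!("a" => [1, 2, 3])?;
/// // A length-1 column is broadcast to the height of the frame.
/// df.with_column(Column::new("b".into(), [0]))?;
/// assert_eq!(df.shape(), (3, 2));
/// assert_eq!(df.column("b")?.len(), 3);
/// # Ok::<(), PolarsError>(())
/// ```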
pub fn with_column(&mut self, mut column: Column) -> PolarsResult<&mut Self> {
917
if self.shape() == (0, 0) {
918
unsafe { self.set_height(column.len()) };
919
}
920
921
if column.len() != self.height() && column.len() == 1 {
922
column = column.new_from_index(0, self.height());
923
}
924
925
polars_ensure!(
926
column.len() == self.height(),
927
ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
928
column.len(), self.height(),
929
);
930
931
if let Some(i) = self.get_column_index(column.name()) {
932
*unsafe { self.columns_mut() }.get_mut(i).unwrap() = column
933
} else {
934
unsafe { self.columns_mut() }.push(column)
935
};
936
937
Ok(self)
938
}
939
940
/// Adds a column to the [`DataFrame`] without doing any checks
941
/// on length or duplicates.
942
///
943
/// # Safety
944
/// The caller must ensure `column.len() == self.height()` .
945
pub unsafe fn push_column_unchecked(&mut self, column: Column) -> &mut Self {
946
unsafe { self.columns_mut() }.push(column);
947
self
948
}
949
950
/// Add new columns to this [`DataFrame`] or replace existing ones.
951
/// Broadcasts unit-length columns, and uses an existing schema to amortize lookups.
952
pub fn with_columns_mut(
953
&mut self,
954
columns: impl IntoIterator<Item = Column>,
955
output_schema: &Schema,
956
) -> PolarsResult<()> {
957
let columns = columns.into_iter();
958
959
unsafe {
960
self.columns_mut_retain_schema()
961
.reserve(columns.size_hint().0)
962
}
963
964
for c in columns {
965
self.with_column_and_schema_mut(c, output_schema)?;
966
}
967
968
Ok(())
969
}
970
971
fn with_column_and_schema_mut(
972
&mut self,
973
mut column: Column,
974
output_schema: &Schema,
975
) -> PolarsResult<&mut Self> {
976
if self.shape() == (0, 0) {
977
unsafe { self.set_height(column.len()) };
978
}
979
980
if column.len() != self.height() && column.len() == 1 {
981
column = column.new_from_index(0, self.height());
982
}
983
984
polars_ensure!(
985
column.len() == self.height(),
986
ShapeMismatch:
987
"unable to add a column of length {} to a DataFrame of height {}",
988
column.len(), self.height(),
989
);
990
991
let i = output_schema
992
.index_of(column.name())
993
.or_else(|| self.get_column_index(column.name()))
994
.unwrap_or(self.width());
995
996
if i < self.width() {
997
*unsafe { self.columns_mut() }.get_mut(i).unwrap() = column
998
} else if i == self.width() {
999
unsafe { self.columns_mut() }.push(column)
1000
} else {
1001
// Unordered column insertion is not handled.
1002
panic!()
1003
}
1004
1005
Ok(self)
1006
}
1007
1008
/// Get a row in the [`DataFrame`]. Beware this is slow.
1009
///
1010
/// # Example
1011
///
1012
/// ```
1013
/// # use polars_core::prelude::*;
1014
/// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1015
/// df.get(idx)
1016
/// }
1017
/// ```
1018
pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1019
(idx < self.height()).then(|| self.columns().iter().map(|c| c.get(idx).unwrap()).collect())
1020
}
1021
1022
/// Select a [`Series`] by index.
1023
///
1024
/// # Example
1025
///
1026
/// ```rust
1027
/// # use polars_core::prelude::*;
1028
/// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1029
/// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1030
///
1031
/// let s1: Option<&Column> = df.select_at_idx(0);
1032
/// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1033
///
1034
/// assert_eq!(s1, Some(&s2));
1035
/// # Ok::<(), PolarsError>(())
1036
/// ```
1037
pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1038
self.columns().get(idx)
1039
}
1040
1041
/// Get column index of a [`Series`] by name.
1042
/// # Example
1043
///
1044
/// ```rust
1045
/// # use polars_core::prelude::*;
1046
/// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1047
/// "Health" => [100, 200, 500],
1048
/// "Mana" => [250, 100, 0],
1049
/// "Strength" => [30, 150, 300])?;
1050
///
1051
/// assert_eq!(df.get_column_index("Name"), Some(0));
1052
/// assert_eq!(df.get_column_index("Health"), Some(1));
1053
/// assert_eq!(df.get_column_index("Mana"), Some(2));
1054
/// assert_eq!(df.get_column_index("Strength"), Some(3));
1055
/// assert_eq!(df.get_column_index("Haste"), None);
1056
/// # Ok::<(), PolarsError>(())
1057
/// ```
1058
pub fn get_column_index(&self, name: &str) -> Option<usize> {
1059
if let Some(schema) = self.cached_schema() {
1060
schema.index_of(name)
1061
} else if self.width() <= LINEAR_SEARCH_LIMIT {
1062
self.columns().iter().position(|s| s.name() == name)
1063
} else {
1064
self.schema().index_of(name)
1065
}
1066
}
1067
1068
/// Get column index of a [`Series`] by name.
1069
pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1070
self.get_column_index(name)
1071
.ok_or_else(|| polars_err!(col_not_found = name))
1072
}
1073
1074
/// Select a single column by name.
1075
///
1076
/// # Example
1077
///
1078
/// ```rust
1079
/// # use polars_core::prelude::*;
1080
/// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1081
/// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1082
/// let df: DataFrame = DataFrame::new_infer_height(vec![s1.clone(), s2])?;
1083
///
1084
/// assert_eq!(df.column("Password")?, &s1);
1085
/// # Ok::<(), PolarsError>(())
1086
/// ```
1087
pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1088
let idx = self.try_get_column_index(name)?;
1089
Ok(self.select_at_idx(idx).unwrap())
1090
}
1091
1092
/// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1093
///
1094
/// # Examples
1095
///
1096
/// ```
1097
/// # use polars_core::prelude::*;
1098
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1099
/// df.select(["foo", "bar"])
1100
/// }
1101
/// ```
1102
pub fn select<I, S>(&self, names: I) -> PolarsResult<Self>
1103
where
1104
I: IntoIterator<Item = S>,
1105
S: AsRef<str>,
1106
{
1107
DataFrame::new(self.height(), self.select_to_vec(names)?)
1108
}
1109
1110
/// Does not check for duplicates.
1111
///
1112
/// # Safety
1113
/// `names` must not contain duplicates.
1114
pub unsafe fn select_unchecked<I, S>(&self, names: I) -> PolarsResult<Self>
1115
where
1116
I: IntoIterator<Item = S>,
1117
S: AsRef<str>,
1118
{
1119
Ok(unsafe { DataFrame::new_unchecked(self.height(), self.select_to_vec(names)?) })
1120
}
1121
1122
/// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1123
///
1124
/// This does not error on duplicate selections.
1125
///
1126
/// # Example
1127
///
1128
/// ```rust
1129
/// # use polars_core::prelude::*;
1130
/// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1131
/// "Carbon" => [1, 2, 3],
1132
/// "Hydrogen" => [4, 6, 8])?;
1133
/// let sv: Vec<Column> = df.select_to_vec(["Carbon", "Hydrogen"])?;
1134
///
1135
/// assert_eq!(df["Carbon"], sv[0]);
1136
/// assert_eq!(df["Hydrogen"], sv[1]);
1137
/// # Ok::<(), PolarsError>(())
1138
/// ```
1139
pub fn select_to_vec(
1140
&self,
1141
selection: impl IntoIterator<Item = impl AsRef<str>>,
1142
) -> PolarsResult<Vec<Column>> {
1143
AmortizedColumnSelector::new(self).select_multiple(selection)
1144
}
1145
1146
/// Take the [`DataFrame`] rows by a boolean mask.
1147
///
1148
/// # Example
1149
///
1150
/// ```
1151
/// # use polars_core::prelude::*;
1152
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1153
/// let mask = df.column("sepal_width")?.is_not_null();
1154
/// df.filter(&mask)
1155
/// }
1156
/// ```
1157
pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1158
if self.width() == 0 {
1159
filter_zero_width(self.height(), mask)
1160
} else {
1161
let new_columns: Vec<Column> = self.try_apply_columns_par(|s| s.filter(mask))?;
1162
let out = unsafe {
1163
DataFrame::new_unchecked(new_columns[0].len(), new_columns).with_schema_from(self)
1164
};
1165
1166
Ok(out)
1167
}
1168
}
1169
1170
/// Same as `filter` but does not parallelize.
1171
pub fn filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1172
if self.width() == 0 {
1173
filter_zero_width(self.height(), mask)
1174
} else {
1175
let new_columns: Vec<Column> = self.try_apply_columns(|s| s.filter(mask))?;
1176
let out = unsafe {
1177
DataFrame::new_unchecked(new_columns[0].len(), new_columns).with_schema_from(self)
1178
};
1179
1180
Ok(out)
1181
}
1182
}
1183
1184
/// Gather [`DataFrame`] rows by index values.
1185
///
1186
/// # Example
1187
///
1188
/// ```
1189
/// # use polars_core::prelude::*;
1190
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1191
/// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
1192
/// df.take(&idx)
1193
/// }
1194
/// ```
1195
pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
1196
check_bounds_ca(indices, self.height().try_into().unwrap_or(IdxSize::MAX))?;
1197
1198
let new_cols = self.apply_columns_par(|c| {
1199
assert_eq!(c.len(), self.height());
1200
unsafe { c.take_unchecked(indices) }
1201
});
1202
1203
Ok(unsafe { DataFrame::new_unchecked(indices.len(), new_cols).with_schema_from(self) })
1204
}
1205
1206
/// # Safety
1207
/// The indices must be in-bounds.
1208
pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
1209
self.take_unchecked_impl(idx, true)
1210
}
1211
1212
/// # Safety
1213
/// The indices must be in-bounds.
1214
#[cfg(feature = "algorithm_group_by")]
1215
pub unsafe fn gather_group_unchecked(&self, group: &GroupsIndicator) -> Self {
1216
match group {
1217
GroupsIndicator::Idx((_, indices)) => unsafe {
1218
self.take_slice_unchecked_impl(indices.as_slice(), false)
1219
},
1220
GroupsIndicator::Slice([offset, len]) => self.slice(*offset as i64, *len as usize),
1221
}
1222
}
1223
1224
/// # Safety
1225
/// The indices must be in-bounds.
1226
pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
1227
let cols = if allow_threads && POOL.current_num_threads() > 1 {
1228
POOL.install(|| {
1229
if POOL.current_num_threads() > self.width() {
1230
let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
1231
if self.height() / stride >= 2 {
1232
self.apply_columns_par(|c| {
1233
// Nested types initiate a rechunk in their take_unchecked implementation.
1234
// If we do not rechunk, it will result in rechunk storms downstream.
1235
let c = if c.dtype().is_nested() {
1236
&c.rechunk()
1237
} else {
1238
c
1239
};
1240
1241
(0..idx.len().div_ceil(stride))
1242
.into_par_iter()
1243
.map(|i| c.take_unchecked(&idx.slice((i * stride) as i64, stride)))
1244
.reduce(
1245
|| Column::new_empty(c.name().clone(), c.dtype()),
1246
|mut a, b| {
1247
a.append_owned(b).unwrap();
1248
a
1249
},
1250
)
1251
})
1252
} else {
1253
self.apply_columns_par(|c| c.take_unchecked(idx))
1254
}
1255
} else {
1256
self.apply_columns_par(|c| c.take_unchecked(idx))
1257
}
1258
})
1259
} else {
1260
self.apply_columns(|s| s.take_unchecked(idx))
1261
};
1262
1263
unsafe { DataFrame::new_unchecked(idx.len(), cols).with_schema_from(self) }
1264
}
1265
1266
/// # Safety
1267
/// The indices must be in-bounds.
1268
pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
1269
self.take_slice_unchecked_impl(idx, true)
1270
}
1271
1272
/// # Safety
1273
/// The indices must be in-bounds.
1274
pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
1275
let cols = if allow_threads && POOL.current_num_threads() > 1 {
1276
POOL.install(|| {
1277
if POOL.current_num_threads() > self.width() {
1278
let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
1279
if self.height() / stride >= 2 {
1280
self.apply_columns_par(|c| {
1281
// Nested types initiate a rechunk in their take_unchecked implementation.
1282
// If we do not rechunk, it will result in rechunk storms downstream.
1283
let c = if c.dtype().is_nested() {
1284
&c.rechunk()
1285
} else {
1286
c
1287
};
1288
1289
(0..idx.len().div_ceil(stride))
1290
.into_par_iter()
1291
.map(|i| {
1292
let idx = &idx[i * stride..];
1293
let idx = &idx[..idx.len().min(stride)];
1294
c.take_slice_unchecked(idx)
1295
})
1296
.reduce(
1297
|| Column::new_empty(c.name().clone(), c.dtype()),
1298
|mut a, b| {
1299
a.append_owned(b).unwrap();
1300
a
1301
},
1302
)
1303
})
1304
} else {
1305
self.apply_columns_par(|s| s.take_slice_unchecked(idx))
1306
}
1307
} else {
1308
self.apply_columns_par(|s| s.take_slice_unchecked(idx))
1309
}
1310
})
1311
} else {
1312
self.apply_columns(|s| s.take_slice_unchecked(idx))
1313
};
1314
unsafe { DataFrame::new_unchecked(idx.len(), cols).with_schema_from(self) }
1315
}
1316
1317
/// Rename a column in the [`DataFrame`].
1318
///
1319
/// Should not be called in a loop as that can lead to quadratic behavior.
1320
///
1321
/// # Example
1322
///
1323
/// ```
1324
/// # use polars_core::prelude::*;
1325
/// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
1326
/// let original_name = "foo";
1327
/// let new_name = "bar";
1328
/// df.rename(original_name, new_name.into())
1329
/// }
1330
/// ```
1331
pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
1332
if column == name.as_str() {
1333
return Ok(self);
1334
}
1335
polars_ensure!(
1336
!self.schema().contains(&name),
1337
Duplicate: "column rename attempted with already existing name \"{name}\""
1338
);
1339
1340
self.get_column_index(column)
1341
.and_then(|idx| unsafe { self.columns_mut() }.get_mut(idx))
1342
.ok_or_else(|| polars_err!(col_not_found = column))
1343
.map(|c| c.rename(name))?;
1344
1345
Ok(self)
1346
}
1347
1348
pub fn rename_many<'a>(
1349
&mut self,
1350
renames: impl Iterator<Item = (&'a str, PlSmallStr)>,
1351
) -> PolarsResult<&mut Self> {
1352
let mut schema_arc = self.schema().clone();
1353
let schema = Arc::make_mut(&mut schema_arc);
1354
1355
for (from, to) in renames {
1356
if from == to.as_str() {
1357
continue;
1358
}
1359
1360
polars_ensure!(
1361
!schema.contains(&to),
1362
Duplicate: "column rename attempted with already existing name \"{to}\""
1363
);
1364
1365
match schema.get_full(from) {
1366
None => polars_bail!(col_not_found = from),
1367
Some((idx, _, _)) => {
1368
let (n, _) = schema.get_at_index_mut(idx).unwrap();
1369
*n = to.clone();
1370
unsafe { self.columns_mut() }
1371
.get_mut(idx)
1372
.unwrap()
1373
.rename(to);
1374
},
1375
}
1376
}
1377
1378
unsafe { self.set_schema(schema_arc) };
1379
1380
Ok(self)
1381
}
1382
1383
/// Sort [`DataFrame`] in place.
1384
///
1385
/// See [`DataFrame::sort`] for more instruction.
1386
pub fn sort_in_place(
1387
&mut self,
1388
by: impl IntoIterator<Item = impl AsRef<str>>,
1389
sort_options: SortMultipleOptions,
1390
) -> PolarsResult<&mut Self> {
1391
let by_column = self.select_to_vec(by)?;
1392
1393
let mut out = self.sort_impl(by_column, sort_options, None)?;
1394
unsafe { out.set_schema_from(self) };
1395
1396
*self = out;
1397
1398
Ok(self)
1399
}
1400
1401
#[doc(hidden)]
1402
/// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
1403
pub fn sort_impl(
1404
&self,
1405
by_column: Vec<Column>,
1406
sort_options: SortMultipleOptions,
1407
slice: Option<(i64, usize)>,
1408
) -> PolarsResult<Self> {
1409
if by_column.is_empty() {
1410
// If no columns selected, any order (including original order) is correct.
1411
return if let Some((offset, len)) = slice {
1412
Ok(self.slice(offset, len))
1413
} else {
1414
Ok(self.clone())
1415
};
1416
}
1417
1418
// Note that the by_column argument can also contain evaluated expressions from
1419
// polars-lazy that may not even be present in this dataframe. Therefore,
1420
// when we try to set the first column as sorted, we ignore the error, as the
1421
// expressions may not be present (they are renamed to _POLARS_SORT_COLUMN_i).
1422
let first_descending = sort_options.descending[0];
1423
let first_by_column = by_column[0].name().to_string();
1424
1425
let set_sorted = |df: &mut DataFrame| {
1426
// Mark the first sort column as sorted; if the column does not exist it
1427
// is ok, because we sorted by an expression not present in the dataframe
1428
let _ = df.apply(&first_by_column, |s| {
1429
let mut s = s.clone();
1430
if first_descending {
1431
s.set_sorted_flag(IsSorted::Descending)
1432
} else {
1433
s.set_sorted_flag(IsSorted::Ascending)
1434
}
1435
s
1436
});
1437
};
1438
1439
if self.shape_has_zero() {
1440
let mut out = self.clone();
1441
set_sorted(&mut out);
1442
return Ok(out);
1443
}
1444
1445
if let Some((0, k)) = slice {
1446
if k < self.height() {
1447
return self.bottom_k_impl(k, by_column, sort_options);
1448
}
1449
}
1450
// Check if the required column is already sorted; if so we can exit early.
1451
// We only do this when there is a single column to sort by; for multiple columns
1452
// it would be complicated to do so.
1453
#[cfg(feature = "dtype-categorical")]
1454
let is_not_categorical_enum =
1455
!(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
1456
|| matches!(by_column[0].dtype(), DataType::Enum(_, _)));
1457
1458
#[cfg(not(feature = "dtype-categorical"))]
1459
#[allow(non_upper_case_globals)]
1460
const is_not_categorical_enum: bool = true;
1461
1462
if by_column.len() == 1 && is_not_categorical_enum {
1463
let required_sorting = if sort_options.descending[0] {
1464
IsSorted::Descending
1465
} else {
1466
IsSorted::Ascending
1467
};
1468
// If the null count is 0 then nulls_last doesn't matter.
1469
// Safe to get the value at the last position since the dataframe is not empty (taken care of above).
1470
let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
1471
&& ((by_column[0].null_count() == 0)
1472
|| by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
1473
== sort_options.nulls_last[0]);
1474
1475
if no_sorting_required {
1476
return if let Some((offset, len)) = slice {
1477
Ok(self.slice(offset, len))
1478
} else {
1479
Ok(self.clone())
1480
};
1481
}
1482
}
1483
1484
let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
1485
let allow_threads = sort_options.multithreaded;
1486
1487
// a lot of indirection in both sorting and take
1488
let mut df = self.clone();
1489
let df = df.rechunk_mut_par();
1490
let mut take = match (by_column.len(), has_nested) {
1491
(1, false) => {
1492
let s = &by_column[0];
1493
let options = SortOptions {
1494
descending: sort_options.descending[0],
1495
nulls_last: sort_options.nulls_last[0],
1496
multithreaded: sort_options.multithreaded,
1497
maintain_order: sort_options.maintain_order,
1498
limit: sort_options.limit,
1499
};
1500
// fast path for a frame with a single series
1501
// no need to compute the sort indices and then take by these indices
1502
// simply sort and return as frame
1503
if df.width() == 1 && df.try_get_column_index(s.name().as_str()).is_ok() {
1504
let mut out = s.sort_with(options)?;
1505
if let Some((offset, len)) = slice {
1506
out = out.slice(offset, len);
1507
}
1508
return Ok(out.into_frame());
1509
}
1510
s.arg_sort(options)
1511
},
1512
_ => arg_sort(&by_column, sort_options)?,
1513
};
1514
1515
if let Some((offset, len)) = slice {
1516
take = take.slice(offset, len);
1517
}
1518
1519
// SAFETY:
1520
// the created indices are in bounds
1521
let mut df = unsafe { df.take_unchecked_impl(&take, allow_threads) };
1522
set_sorted(&mut df);
1523
Ok(df)
1524
}
1525
1526
/// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
1527
///
1528
/// This dataframe does not necessarily have a specified schema and may be changed at any
1529
/// point. It is primarily used for debugging.
1530
pub fn _to_metadata(&self) -> DataFrame {
1531
let num_columns = self.width();
1532
1533
let mut column_names =
1534
StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
1535
let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
1536
let mut sorted_asc_ca =
1537
BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
1538
let mut sorted_dsc_ca =
1539
BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
1540
let mut fast_explode_list_ca =
1541
BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
1542
let mut materialized_at_ca =
1543
StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
1544
1545
for col in self.columns() {
1546
let flags = col.get_flags();
1547
1548
let (repr, materialized_at) = match col {
1549
Column::Series(s) => ("series", s.materialized_at()),
1550
Column::Scalar(_) => ("scalar", None),
1551
};
1552
let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
1553
let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
1554
let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
1555
1556
column_names.append_value(col.name().clone());
1557
repr_ca.append_value(repr);
1558
sorted_asc_ca.append_value(sorted_asc);
1559
sorted_dsc_ca.append_value(sorted_dsc);
1560
fast_explode_list_ca.append_value(fast_explode_list);
1561
materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
1562
}
1563
1564
unsafe {
1565
DataFrame::new_unchecked(
1566
self.width(),
1567
vec![
1568
column_names.finish().into_column(),
1569
repr_ca.finish().into_column(),
1570
sorted_asc_ca.finish().into_column(),
1571
sorted_dsc_ca.finish().into_column(),
1572
fast_explode_list_ca.finish().into_column(),
1573
materialized_at_ca.finish().into_column(),
1574
],
1575
)
1576
}
1577
}
1578
/// Return a sorted clone of this [`DataFrame`].
1579
///
1580
/// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
1581
/// # Example
1582
///
1583
/// Sort by a single column with default options:
1584
/// ```
1585
/// # use polars_core::prelude::*;
1586
/// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
1587
/// df.sort(["sepal_width"], Default::default())
1588
/// }
1589
/// ```
1590
/// Sort by a single column with specific order:
1591
/// ```
1592
/// # use polars_core::prelude::*;
1593
/// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
1594
/// df.sort(
1595
/// ["sepal_width"],
1596
/// SortMultipleOptions::new()
1597
/// .with_order_descending(descending)
1598
/// )
1599
/// }
1600
/// ```
1601
/// Sort by multiple columns, specifying the order for each column:
1602
/// ```
1603
/// # use polars_core::prelude::*;
1604
/// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
1605
/// df.sort(
1606
/// ["sepal_width", "sepal_length"],
1607
/// SortMultipleOptions::new()
1608
/// .with_order_descending_multi([false, true])
1609
/// )
1610
/// }
1611
/// ```
1612
/// See [`SortMultipleOptions`] for more options.
1613
///
1614
/// Also see [`DataFrame::sort_in_place`].
1615
pub fn sort(
1616
&self,
1617
by: impl IntoIterator<Item = impl AsRef<str>>,
1618
sort_options: SortMultipleOptions,
1619
) -> PolarsResult<Self> {
1620
let mut df = self.clone();
1621
df.sort_in_place(by, sort_options)?;
1622
Ok(df)
1623
}
1624
1625
/// Replace a column with a [`Column`].
1626
///
1627
/// # Example
1628
///
1629
/// ```rust
1630
/// # use polars_core::prelude::*;
1631
/// let mut df: DataFrame = df!("Country" => ["United States", "China"],
1632
/// "Area (km²)" => [9_833_520, 9_596_961])?;
1633
/// let s: Column = Column::new("Country".into(), ["USA", "PRC"]);
1634
///
1635
/// assert!(df.replace("Nation", s.clone()).is_err());
1636
/// assert!(df.replace("Country", s).is_ok());
1637
/// # Ok::<(), PolarsError>(())
1638
/// ```
1639
pub fn replace(&mut self, column: &str, new_col: Column) -> PolarsResult<&mut Self> {
1640
self.apply(column, |_| new_col)
1641
}
1642
1643
/// Replace column at index `idx` with a [`Series`].
1644
///
1645
/// # Example
1646
///
1647
/// ```ignore
1648
/// # use polars_core::prelude::*;
1649
/// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
1650
/// let s1 = Series::new("ascii".into(), [70, 79, 79]);
1651
/// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
1652
///
1653
/// // Add 32 to get lowercase ascii values
1654
/// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
1655
/// # Ok::<(), PolarsError>(())
1656
/// ```
1657
pub fn replace_column(&mut self, index: usize, new_column: Column) -> PolarsResult<&mut Self> {
1658
polars_ensure!(
1659
index < self.width(),
1660
ShapeMismatch:
1661
"unable to replace at index {}, the DataFrame has only {} columns",
1662
index, self.width(),
1663
);
1664
1665
polars_ensure!(
1666
new_column.len() == self.height(),
1667
ShapeMismatch:
1668
"unable to replace a column, series length {} doesn't match the DataFrame height {}",
1669
new_column.len(), self.height(),
1670
);
1671
1672
unsafe { *self.columns_mut().get_mut(index).unwrap() = new_column };
1673
1674
Ok(self)
1675
}
1676
1677
/// Apply a closure to a column. This is the recommended way to do in place modification.
1678
///
1679
/// # Example
1680
///
1681
/// ```rust
1682
/// # use polars_core::prelude::*;
1683
/// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
1684
/// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
1685
/// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
1686
///
1687
/// fn str_to_len(str_val: &Column) -> Column {
1688
/// str_val.str()
1689
/// .unwrap()
1690
/// .into_iter()
1691
/// .map(|opt_name: Option<&str>| {
1692
/// opt_name.map(|name: &str| name.len() as u32)
1693
/// })
1694
/// .collect::<UInt32Chunked>()
1695
/// .into_column()
1696
/// }
1697
///
1698
/// // Replace the names column by the length of the names.
1699
/// df.apply("names", str_to_len);
1700
/// # Ok::<(), PolarsError>(())
1701
/// ```
1702
/// Results in:
1703
///
1704
/// ```text
1705
/// +--------+-------+
1706
/// | foo | names |
1707
/// | --- | --- |
1708
/// | str | u32 |
1709
/// +========+=======+
1710
/// | "ham" | 4 |
1711
/// +--------+-------+
1712
/// | "spam" | 6 |
1713
/// +--------+-------+
1714
/// | "egg" | 3 |
1715
/// +--------+-------+
1716
/// ```
1717
pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
1718
where
1719
F: FnOnce(&Column) -> C,
1720
C: IntoColumn,
1721
{
1722
let idx = self.try_get_column_index(name)?;
1723
self.apply_at_idx(idx, f)?;
1724
Ok(self)
1725
}
1726
1727
    /// Apply a closure to a column at index `idx`. This is the recommended way to do in-place
    /// modification.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
    /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
    /// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
    ///
    /// // Add 32 to get lowercase ascii values
    /// df.apply_at_idx(1, |s| s + 32);
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Results in:
    ///
    /// ```text
    /// +--------+-------+
    /// | foo    | ascii |
    /// | ---    | ---   |
    /// | str    | i32   |
    /// +========+=======+
    /// | "ham"  | 102   |
    /// +--------+-------+
    /// | "spam" | 111   |
    /// +--------+-------+
    /// | "egg"  | 111   |
    /// +--------+-------+
    /// ```
    pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
    where
        F: FnOnce(&Column) -> C,
        C: IntoColumn,
    {
        let df_height = self.height();
        let width = self.width();

        let cached_schema = self.cached_schema().cloned();

        let col = unsafe { self.columns_mut() }.get_mut(idx).ok_or_else(|| {
            polars_err!(
                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
                idx, width
            )
        })?;

        let mut new_col = f(col).into_column();

        if new_col.len() != df_height && new_col.len() == 1 {
            new_col = new_col.new_from_index(0, df_height);
        }

        polars_ensure!(
            new_col.len() == df_height,
            ShapeMismatch:
            "apply_at_idx: resulting Series has length {} while the DataFrame has height {}",
            new_col.len(), df_height
        );

        new_col = new_col.with_name(col.name().clone());
        let col_before = std::mem::replace(col, new_col);

        if col.dtype() == col_before.dtype() {
            unsafe { self.set_opt_schema(cached_schema) };
        }

        Ok(self)
    }

    /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do
    /// in-place modification.
    ///
    /// # Example
    ///
    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
    /// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
    ///
    /// let idx = vec![0, 1, 4];
    ///
    /// df.try_apply("foo", |c| {
    ///     c.str()?
    ///         .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
    /// });
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Results in:
    ///
    /// ```text
    /// +---------------------+--------+
    /// | foo                 | values |
    /// | ---                 | ---    |
    /// | str                 | i32    |
    /// +=====================+========+
    /// | "ham-is-modified"   | 1      |
    /// +---------------------+--------+
    /// | "spam-is-modified"  | 2      |
    /// +---------------------+--------+
    /// | "egg"               | 3      |
    /// +---------------------+--------+
    /// | "bacon"             | 4      |
    /// +---------------------+--------+
    /// | "quack-is-modified" | 5      |
    /// +---------------------+--------+
    /// ```
    pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
    where
        F: FnOnce(&Column) -> PolarsResult<C>,
        C: IntoColumn,
    {
        let df_height = self.height();
        let width = self.width();

        let cached_schema = self.cached_schema().cloned();

        let col = unsafe { self.columns_mut() }.get_mut(idx).ok_or_else(|| {
            polars_err!(
                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
                idx, width
            )
        })?;

        let mut new_col = f(col).map(|c| c.into_column())?;

        polars_ensure!(
            new_col.len() == df_height,
            ShapeMismatch:
            "try_apply_at_idx: resulting Series has length {} while the DataFrame has height {}",
            new_col.len(), df_height
        );

        // make sure the name remains the same after applying the closure
        new_col = new_col.with_name(col.name().clone());
        let col_before = std::mem::replace(col, new_col);

        if col.dtype() == col_before.dtype() {
            unsafe { self.set_opt_schema(cached_schema) };
        }

        Ok(self)
    }

    /// Apply a closure that may fail to a column. This is the recommended way to do in-place
    /// modification.
    ///
    /// # Example
    ///
    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
    /// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
    ///
    /// // create a mask
    /// let values = df.column("values")?.as_materialized_series();
    /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
    ///
    /// df.try_apply("foo", |c| {
    ///     c.str()?
    ///         .set(&mask, Some("not_within_bounds"))
    /// });
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Results in:
    ///
    /// ```text
    /// +---------------------+--------+
    /// | foo                 | values |
    /// | ---                 | ---    |
    /// | str                 | i32    |
    /// +=====================+========+
    /// | "not_within_bounds" | 1      |
    /// +---------------------+--------+
    /// | "spam"              | 2      |
    /// +---------------------+--------+
    /// | "egg"               | 3      |
    /// +---------------------+--------+
    /// | "bacon"             | 4      |
    /// +---------------------+--------+
    /// | "not_within_bounds" | 5      |
    /// +---------------------+--------+
    /// ```
    pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
    where
        F: FnOnce(&Series) -> PolarsResult<C>,
        C: IntoColumn,
    {
        let idx = self.try_get_column_index(column)?;
        self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
    }

    /// Slice the [`DataFrame`] along the rows.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
    ///                         "Color" => ["Green", "Red", "White", "White", "Red"])?;
    /// let sl: DataFrame = df.slice(2, 3);
    ///
    /// assert_eq!(sl.shape(), (3, 2));
    /// println!("{}", sl);
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Output:
    /// ```text
    /// shape: (3, 2)
    /// +-------+-------+
    /// | Fruit | Color |
    /// | ---   | ---   |
    /// | str   | str   |
    /// +=======+=======+
    /// | Grape | White |
    /// +-------+-------+
    /// | Fig   | White |
    /// +-------+-------+
    /// | Fig   | Red   |
    /// +-------+-------+
    /// ```
    #[must_use]
    pub fn slice(&self, offset: i64, length: usize) -> Self {
        if offset == 0 && length == self.height() {
            return self.clone();
        }

        if length == 0 {
            return self.clear();
        }

        let cols = self.apply_columns(|s| s.slice(offset, length));

        let height = if let Some(fst) = cols.first() {
            fst.len()
        } else {
            let (_, length) = slice_offsets(offset, length, self.height());
            length
        };

        unsafe { DataFrame::_new_unchecked_impl(height, cols).with_schema_from(self) }
    }

    /// Split [`DataFrame`] at the given `offset`.
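    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); the column values are illustrative only.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("x" => [1, 2, 3, 4])?;
    /// let (front, back) = df.split_at(2);
    /// assert_eq!(front.height(), 2);
    /// assert_eq!(back.height(), 2);
    /// # Ok::<(), PolarsError>(())
    /// ```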
    pub fn split_at(&self, offset: i64) -> (Self, Self) {
        let (a, b) = self.columns().iter().map(|s| s.split_at(offset)).unzip();

        let (idx, _) = slice_offsets(offset, 0, self.height());

        let a = unsafe { DataFrame::new_unchecked(idx, a).with_schema_from(self) };
        let b = unsafe { DataFrame::new_unchecked(self.height() - idx, b).with_schema_from(self) };
        (a, b)
    }

    #[must_use]
    pub fn clear(&self) -> Self {
        let cols = self.columns().iter().map(|s| s.clear()).collect::<Vec<_>>();
        unsafe { DataFrame::_new_unchecked_impl(0, cols).with_schema_from(self) }
    }

    #[must_use]
    pub fn slice_par(&self, offset: i64, length: usize) -> Self {
        if offset == 0 && length == self.height() {
            return self.clone();
        }
        let columns = self.apply_columns_par(|s| s.slice(offset, length));
        unsafe { DataFrame::new_unchecked(length, columns).with_schema_from(self) }
    }

    #[must_use]
    pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
        if offset == 0 && length == self.height() {
            return self.clone();
        }
        // @scalar-opt
        let columns = self.apply_columns(|s| {
            let mut out = s.slice(offset, length);
            out.shrink_to_fit();
            out
        });
        unsafe { DataFrame::new_unchecked(length, columns).with_schema_from(self) }
    }

    /// Get the head of the [`DataFrame`].
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let countries: DataFrame =
    ///     df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
    ///         "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
    ///         "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
    ///         "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
    /// assert_eq!(countries.shape(), (5, 4));
    ///
    /// println!("{}", countries.head(Some(3)));
    /// # Ok::<(), PolarsError>(())
    /// ```
    ///
    /// Output:
    ///
    /// ```text
    /// shape: (3, 4)
    /// +--------------------+---------------+---------------+------------+
    /// | Rank by GDP (2021) | Continent     | Country       | Capital    |
    /// | ---                | ---           | ---           | ---        |
    /// | i32                | str           | str           | str        |
    /// +====================+===============+===============+============+
    /// | 1                  | North America | United States | Washington |
    /// +--------------------+---------------+---------------+------------+
    /// | 2                  | Asia          | China         | Beijing    |
    /// +--------------------+---------------+---------------+------------+
    /// | 3                  | Asia          | Japan         | Tokyo      |
    /// +--------------------+---------------+---------------+------------+
    /// ```
    #[must_use]
    pub fn head(&self, length: Option<usize>) -> Self {
        let new_height = usize::min(self.height(), length.unwrap_or(HEAD_DEFAULT_LENGTH));
        let new_cols = self.apply_columns(|c| c.head(Some(new_height)));

        unsafe { DataFrame::new_unchecked(new_height, new_cols).with_schema_from(self) }
    }

    /// Get the tail of the [`DataFrame`].
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let countries: DataFrame =
    ///     df!("Rank (2021)" => [105, 106, 107, 108, 109],
    ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
    ///         "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
    /// assert_eq!(countries.shape(), (5, 3));
    ///
    /// println!("{}", countries.tail(Some(2)));
    /// # Ok::<(), PolarsError>(())
    /// ```
    ///
    /// Output:
    ///
    /// ```text
    /// shape: (2, 3)
    /// +-------------+--------------------+---------+
    /// | Rank (2021) | Apple Price (€/kg) | Country |
    /// | ---         | ---                | ---     |
    /// | i32         | f64                | str     |
    /// +=============+====================+=========+
    /// | 108         | 0.65               | Syria   |
    /// +-------------+--------------------+---------+
    /// | 109         | 0.52               | Turkey  |
    /// +-------------+--------------------+---------+
    /// ```
    #[must_use]
    pub fn tail(&self, length: Option<usize>) -> Self {
        let new_height = usize::min(self.height(), length.unwrap_or(TAIL_DEFAULT_LENGTH));
        let new_cols = self.apply_columns(|c| c.tail(Some(new_height)));

        unsafe { DataFrame::new_unchecked(new_height, new_cols).with_schema_from(self) }
    }

    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
    ///
    /// # Panics
    ///
    /// Panics if the [`DataFrame`] that is passed is not rechunked.
    ///
    /// This responsibility is left to the caller as we don't want to take mutable references here,
    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
    /// as well.
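    ///
    /// # Example
    ///
    /// A minimal sketch of the intended call pattern (marked `ignore`, so it is not compiled as
    /// a doctest); it assumes the caller has rechunked the frame first, as described above.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("a" => [1, 2, 3])?;
    /// df.rechunk_mut();
    /// for batch in df.iter_chunks(CompatLevel::newest(), false) {
    ///     // each `batch` is an arrow RecordBatch
    ///     assert_eq!(batch.len(), 3);
    /// }
    /// # Ok::<(), PolarsError>(())
    /// ```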
    pub fn iter_chunks(
        &self,
        compat_level: CompatLevel,
        parallel: bool,
    ) -> impl Iterator<Item = RecordBatch> + '_ {
        debug_assert!(!self.should_rechunk(), "expected equal chunks");

        if self.width() == 0 {
            return RecordBatchIterWrap::new_zero_width(self.height());
        }

        // Only allow parallelism when we must convert binview columns (`compat_level` 0),
        // as that requires allocating arrow strings/binaries.
        let must_convert = compat_level.0 == 0;
        let parallel = parallel
            && must_convert
            && self.width() > 1
            && self
                .columns()
                .iter()
                .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));

        RecordBatchIterWrap::Batches(RecordBatchIter {
            df: self,
            schema: Arc::new(
                self.columns()
                    .iter()
                    .map(|c| c.field().to_arrow(compat_level))
                    .collect(),
            ),
            idx: 0,
            n_chunks: usize::max(1, self.first_col_n_chunks()),
            compat_level,
            parallel,
        })
    }

    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches of physical values.
    ///
    /// # Panics
    ///
    /// Panics if the [`DataFrame`] that is passed is not rechunked.
    ///
    /// This responsibility is left to the caller as we don't want to take mutable references here,
    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
    /// as well.
    pub fn iter_chunks_physical(&self) -> impl Iterator<Item = RecordBatch> + '_ {
        debug_assert!(!self.should_rechunk());

        if self.width() == 0 {
            return RecordBatchIterWrap::new_zero_width(self.height());
        }

        RecordBatchIterWrap::PhysicalBatches(PhysRecordBatchIter {
            schema: Arc::new(
                self.columns()
                    .iter()
                    .map(|c| c.field().to_arrow(CompatLevel::newest()))
                    .collect(),
            ),
            arr_iters: self
                .materialized_column_iter()
                .map(|s| s.chunks().iter())
                .collect(),
        })
    }

    /// Get a [`DataFrame`] with each of its columns reversed, i.e. the rows in reverse order.
    #[must_use]
    pub fn reverse(&self) -> Self {
        let new_cols = self.apply_columns(Column::reverse);
        unsafe { DataFrame::new_unchecked(self.height(), new_cols).with_schema_from(self) }
    }

    /// Shift the values by a given period and fill the parts that will be empty due to this operation
    /// with `Nones`.
    ///
    /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
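    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); the values are illustrative only.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3])?;
    /// // Shift all columns down by one row; the first row becomes null.
    /// let shifted = df.shift(1);
    /// assert_eq!(shifted.height(), df.height());
    /// # Ok::<(), PolarsError>(())
    /// ```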
    #[must_use]
    pub fn shift(&self, periods: i64) -> Self {
        let col = self.apply_columns_par(|s| s.shift(periods));
        unsafe { DataFrame::new_unchecked(self.height(), col).with_schema_from(self) }
    }

    /// Replace None values with one of the following strategies:
    /// * Forward fill (replace None with the previous value)
    /// * Backward fill (replace None with the next value)
    /// * Mean fill (replace None with the mean of the whole array)
    /// * Min fill (replace None with the minimum of the whole array)
    /// * Max fill (replace None with the maximum of the whole array)
    ///
    /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
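    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); it assumes the `Mean` variant of [`FillNullStrategy`] listed above.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [Some(1.0), None, Some(3.0)])?;
    /// let filled = df.fill_null(FillNullStrategy::Mean)?;
    /// assert_eq!(filled.column("a")?.null_count(), 0);
    /// # Ok::<(), PolarsError>(())
    /// ```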
    pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
        let col = self.try_apply_columns_par(|s| s.fill_null(strategy))?;

        Ok(unsafe { DataFrame::new_unchecked(self.height(), col) })
    }

    /// Pipe different functions/closure operations that work on a DataFrame together.
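    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); `frame_shape` is a hypothetical helper for illustration only.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// fn frame_shape(df: DataFrame) -> PolarsResult<(usize, usize)> {
    ///     Ok(df.shape())
    /// }
    ///
    /// let df: DataFrame = df!("a" => [1, 2, 3])?;
    /// assert_eq!(df.pipe(frame_shape)?, (3, 1));
    /// # Ok::<(), PolarsError>(())
    /// ```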
    pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
    where
        F: Fn(DataFrame) -> PolarsResult<B>,
    {
        f(self)
    }

    /// Pipe different functions/closure operations that work on a DataFrame together.
    pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
    where
        F: Fn(&mut DataFrame) -> PolarsResult<B>,
    {
        f(self)
    }

    /// Pipe different functions/closure operations that work on a DataFrame together.
    pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
    where
        F: Fn(DataFrame, Args) -> PolarsResult<B>,
    {
        f(self, args)
    }
    /// Drop duplicate rows from a [`DataFrame`].
    /// *This fails when there is a column of type List in the DataFrame.*
    ///
    /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df = df! {
    ///     "flt" => [1., 1., 2., 2., 3., 3.],
    ///     "int" => [1, 1, 2, 2, 3, 3, ],
    ///     "str" => ["a", "a", "b", "b", "c", "c"]
    /// }?;
    ///
    /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Returns
    ///
    /// ```text
    /// +-----+-----+-----+
    /// | flt | int | str |
    /// | --- | --- | --- |
    /// | f64 | i32 | str |
    /// +=====+=====+=====+
    /// | 1   | 1   | "a" |
    /// +-----+-----+-----+
    /// | 2   | 2   | "b" |
    /// +-----+-----+-----+
    /// | 3   | 3   | "c" |
    /// +-----+-----+-----+
    /// ```
    #[cfg(feature = "algorithm_group_by")]
    pub fn unique_stable(
        &self,
        subset: Option<&[String]>,
        keep: UniqueKeepStrategy,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<DataFrame> {
        self.unique_impl(
            true,
            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
            keep,
            slice,
        )
    }

    /// Unstable distinct. See [`DataFrame::unique_stable`].
    #[cfg(feature = "algorithm_group_by")]
    pub fn unique<I, S>(
        &self,
        subset: Option<&[String]>,
        keep: UniqueKeepStrategy,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<DataFrame> {
        self.unique_impl(
            false,
            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
            keep,
            slice,
        )
    }

    #[cfg(feature = "algorithm_group_by")]
    pub fn unique_impl(
        &self,
        maintain_order: bool,
        subset: Option<Vec<PlSmallStr>>,
        keep: UniqueKeepStrategy,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<Self> {
        if self.width() == 0 {
            let height = usize::min(self.height(), 1);
            return Ok(DataFrame::empty_with_height(height));
        }

        let names = subset.unwrap_or_else(|| self.get_column_names_owned());
        let mut df = self.clone();
        // take on multiple chunks is terrible
        df.rechunk_mut_par();

        let columns = match (keep, maintain_order) {
            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
                let gb = df.group_by_stable(names)?;
                let groups = gb.get_groups();
                let (offset, len) = slice.unwrap_or((0, groups.len()));
                let groups = groups.slice(offset, len);
                df.apply_columns_par(|s| unsafe { s.agg_first(&groups) })
            },
            (UniqueKeepStrategy::Last, true) => {
                // maintain order by last values, so the sorted groups are not correct as they
                // are sorted by the first value
                let gb = df.group_by_stable(names)?;
                let groups = gb.get_groups();

                let last_idx: NoNull<IdxCa> = groups
                    .iter()
                    .map(|g| match g {
                        GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
                        GroupsIndicator::Slice([first, len]) => first + len - 1,
                    })
                    .collect();

                let mut last_idx = last_idx.into_inner().sort(false);

                if let Some((offset, len)) = slice {
                    last_idx = last_idx.slice(offset, len);
                }

                let last_idx = NoNull::new(last_idx);
                let out = unsafe { df.take_unchecked(&last_idx) };
                return Ok(out);
            },
            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
                let gb = df.group_by(names)?;
                let groups = gb.get_groups();
                let (offset, len) = slice.unwrap_or((0, groups.len()));
                let groups = groups.slice(offset, len);
                df.apply_columns_par(|s| unsafe { s.agg_first(&groups) })
            },
            (UniqueKeepStrategy::Last, false) => {
                let gb = df.group_by(names)?;
                let groups = gb.get_groups();
                let (offset, len) = slice.unwrap_or((0, groups.len()));
                let groups = groups.slice(offset, len);
                df.apply_columns_par(|s| unsafe { s.agg_last(&groups) })
            },
            (UniqueKeepStrategy::None, _) => {
                let df_part = df.select(names)?;
                let mask = df_part.is_unique()?;
                let mut filtered = df.filter(&mask)?;

                if let Some((offset, len)) = slice {
                    filtered = filtered.slice(offset, len);
                }
                return Ok(filtered);
            },
        };
        Ok(unsafe { DataFrame::new_unchecked_infer_height(columns).with_schema_from(self) })
    }

    /// Get a mask of all the unique rows in the [`DataFrame`].
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
    ///                         "ISIN" => ["US0378331005", "US5949181045"])?;
    /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
    ///
    /// assert!(ca.all());
    /// # Ok::<(), PolarsError>(())
    /// ```
    #[cfg(feature = "algorithm_group_by")]
    pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
        let gb = self.group_by(self.get_column_names_owned())?;
        let groups = gb.get_groups();
        Ok(is_unique_helper(
            groups,
            self.height() as IdxSize,
            true,
            false,
        ))
    }

    /// Get a mask of all the duplicated rows in the [`DataFrame`].
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
    ///                         "ISIN" => ["US02079K3059", "US02079K1079"])?;
    /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
    ///
    /// assert!(!ca.all());
    /// # Ok::<(), PolarsError>(())
    /// ```
    #[cfg(feature = "algorithm_group_by")]
    pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
        let gb = self.group_by(self.get_column_names_owned())?;
        let groups = gb.get_groups();
        Ok(is_unique_helper(
            groups,
            self.height() as IdxSize,
            false,
            true,
        ))
    }

    /// Create a new [`DataFrame`] that shows the null counts per column.
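    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); the values are illustrative only.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [Some(1), None, Some(3)],
    ///                         "b" => [Some("x"), Some("y"), None])?;
    /// let counts = df.null_count();
    /// // One row, with one count per column.
    /// assert_eq!(counts.shape(), (1, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```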
    #[must_use]
    pub fn null_count(&self) -> Self {
        let cols =
            self.apply_columns(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]));
        unsafe { Self::new_unchecked(1, cols) }
    }

    /// Hash and combine the row values
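    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); passing `None` uses a default hasher builder.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("a" => [1, 2, 3], "b" => ["x", "y", "z"])?;
    /// let hashes = df.hash_rows(None)?;
    /// assert_eq!(hashes.len(), df.height());
    /// # Ok::<(), PolarsError>(())
    /// ```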
    #[cfg(feature = "row_hash")]
    pub fn hash_rows(
        &mut self,
        hasher_builder: Option<PlSeedableRandomStateQuality>,
    ) -> PolarsResult<UInt64Chunked> {
        let dfs = split_df(self, POOL.current_num_threads(), false);
        let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;

        let mut iter = cas.into_iter();
        let mut acc_ca = iter.next().unwrap();
        for ca in iter {
            acc_ca.append(&ca)?;
        }
        Ok(acc_ca.rechunk().into_owned())
    }

    /// Get the supertype of the columns in this DataFrame
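    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); the mixed integer/float frame is illustrative only.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("ints" => [1, 2, 3], "floats" => [1.0, 2.0, 3.0])?;
    /// let st = df.get_supertype().unwrap()?;
    /// assert_eq!(st, DataType::Float64);
    /// # Ok::<(), PolarsError>(())
    /// ```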
    pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
        self.columns()
            .iter()
            .map(|s| Ok(s.dtype().clone()))
            .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
    }

    /// Take by index values given by the slice `idx`.
    /// # Warning
    /// Be careful with allowing threads when calling this in a large hot loop:
    /// every thread split may end up on the rayon stack and lead to a stack overflow.
    #[doc(hidden)]
    pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
        self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
    }

    /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
    /// if the index values in `idx` are sorted. This will maintain sorted flags.
    ///
    /// # Warning
    /// Be careful with allowing threads when calling this in a large hot loop:
    /// every thread split may end up on the rayon stack and lead to a stack overflow.
    #[doc(hidden)]
    pub unsafe fn _take_unchecked_slice_sorted(
        &self,
        idx: &[IdxSize],
        allow_threads: bool,
        sorted: IsSorted,
    ) -> Self {
        #[cfg(debug_assertions)]
        {
            if idx.len() > 2 {
                use crate::series::IsSorted;

                match sorted {
                    IsSorted::Ascending => {
                        assert!(idx[0] <= idx[idx.len() - 1]);
                    },
                    IsSorted::Descending => {
                        assert!(idx[0] >= idx[idx.len() - 1]);
                    },
                    _ => {},
                }
            }
        }
        let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
        ca.set_sorted_flag(sorted);
        self.take_unchecked_impl(&ca, allow_threads)
    }
    #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
    #[doc(hidden)]
    pub fn _partition_by_impl(
        &self,
        cols: &[PlSmallStr],
        stable: bool,
        include_key: bool,
        parallel: bool,
    ) -> PolarsResult<Vec<DataFrame>> {
        let selected_keys = self.select_to_vec(cols.iter().cloned())?;
        let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
        let groups = groups.into_groups();

        // drop key columns prior to calculation if requested
        let df = if include_key {
            self.clone()
        } else {
            self.drop_many(cols.iter().cloned())
        };

        if parallel {
            // don't parallelize this
            // there is a lot of parallelization in take and this may easily SO
            POOL.install(|| {
                match groups.as_ref() {
                    GroupsType::Idx(idx) => {
                        // Rechunk as the gather may rechunk for every group #17562.
                        let mut df = df.clone();
                        df.rechunk_mut_par();
                        Ok(idx
                            .into_par_iter()
                            .map(|(_, group)| {
                                // groups are in bounds
                                unsafe {
                                    df._take_unchecked_slice_sorted(
                                        group,
                                        false,
                                        IsSorted::Ascending,
                                    )
                                }
                            })
                            .collect())
                    },
                    GroupsType::Slice { groups, .. } => Ok(groups
                        .into_par_iter()
                        .map(|[first, len]| df.slice(*first as i64, *len as usize))
                        .collect()),
                }
            })
        } else {
            match groups.as_ref() {
                GroupsType::Idx(idx) => {
                    // Rechunk as the gather may rechunk for every group #17562.
                    let mut df = df;
                    df.rechunk_mut();
                    Ok(idx
                        .into_iter()
                        .map(|(_, group)| {
                            // groups are in bounds
                            unsafe {
                                df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
                            }
                        })
                        .collect())
                },
                GroupsType::Slice { groups, .. } => Ok(groups
                    .iter()
                    .map(|[first, len]| df.slice(*first as i64, *len as usize))
                    .collect()),
            }
        }
    }

    /// Split into multiple DataFrames partitioned by groups
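    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); the values are illustrative only.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("group" => ["a", "a", "b"], "value" => [1, 2, 3])?;
    /// let parts = df.partition_by(["group"], true)?;
    /// // One DataFrame per distinct value in the key column.
    /// assert_eq!(parts.len(), 2);
    /// # Ok::<(), PolarsError>(())
    /// ```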
    #[cfg(feature = "partition_by")]
    pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
    where
        I: IntoIterator<Item = S>,
        S: Into<PlSmallStr>,
    {
        let cols: UnitVec<PlSmallStr> = cols.into_iter().map(Into::into).collect();
        self._partition_by_impl(cols.as_slice(), false, include_key, true)
    }

    /// Split into multiple DataFrames partitioned by groups.
    /// The order of the groups is maintained.
    #[cfg(feature = "partition_by")]
    pub fn partition_by_stable<I, S>(
        &self,
        cols: I,
        include_key: bool,
    ) -> PolarsResult<Vec<DataFrame>>
    where
        I: IntoIterator<Item = S>,
        S: Into<PlSmallStr>,
    {
        let cols: UnitVec<PlSmallStr> = cols.into_iter().map(Into::into).collect();
        self._partition_by_impl(cols.as_slice(), true, include_key, true)
    }

    /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
    /// inserted as columns.
    #[cfg(feature = "dtype-struct")]
    pub fn unnest(
        &self,
        cols: impl IntoIterator<Item = impl Into<PlSmallStr>>,
        separator: Option<&str>,
    ) -> PolarsResult<DataFrame> {
        self.unnest_impl(cols.into_iter().map(Into::into).collect(), separator)
    }

    #[cfg(feature = "dtype-struct")]
    fn unnest_impl(
        &self,
        cols: PlHashSet<PlSmallStr>,
        separator: Option<&str>,
    ) -> PolarsResult<DataFrame> {
        let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
        let mut count = 0;
        for s in self.columns() {
            if cols.contains(s.name()) {
                let ca = s.struct_()?.clone();
                new_cols.extend(ca.fields_as_series().into_iter().map(|mut f| {
                    if let Some(separator) = &separator {
                        f.rename(polars_utils::format_pl_smallstr!(
                            "{}{}{}",
                            s.name(),
                            separator,
                            f.name()
                        ));
                    }
                    Column::from(f)
                }));
                count += 1;
            } else {
                new_cols.push(s.clone())
            }
        }
        if count != cols.len() {
            // one or more columns not found
            // the code below will return an error with the missing name
            let schema = self.schema();
            for col in cols {
                let _ = schema
                    .get(col.as_str())
                    .ok_or_else(|| polars_err!(col_not_found = col))?;
            }
        }

        DataFrame::new_infer_height(new_cols)
    }

    pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
        // @Optimize: this does a lot of unnecessary allocations. We should probably have an
        // `append_chunk` or something like this. It is just quite difficult to make that safe.
        let df = DataFrame::from(rb);
        polars_ensure!(
            self.schema() == df.schema(),
            SchemaMismatch: "cannot append record batch with different schema\n\n
            Got {:?}\nexpected: {:?}", df.schema(), self.schema(),
        );
        self.vstack_mut_owned_unchecked(df);
        Ok(())
    }
}

pub struct RecordBatchIter<'a> {
    df: &'a DataFrame,
    schema: ArrowSchemaRef,
    idx: usize,
    n_chunks: usize,
    compat_level: CompatLevel,
    parallel: bool,
}

impl Iterator for RecordBatchIter<'_> {
    type Item = RecordBatch;

    fn next(&mut self) -> Option<Self::Item> {
        if self.idx >= self.n_chunks {
            return None;
        }

        // Create a batch of the columns with the same chunk no.
        let batch_cols: Vec<ArrayRef> = if self.parallel {
            let iter = self
                .df
                .columns()
                .par_iter()
                .map(Column::as_materialized_series)
                .map(|s| s.to_arrow(self.idx, self.compat_level));
            POOL.install(|| iter.collect())
        } else {
            self.df
                .columns()
                .iter()
                .map(Column::as_materialized_series)
                .map(|s| s.to_arrow(self.idx, self.compat_level))
                .collect()
        };

        let length = batch_cols.first().map_or(0, |arr| arr.len());

        self.idx += 1;

        Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let n = self.n_chunks - self.idx;
        (n, Some(n))
    }
}

pub struct PhysRecordBatchIter<'a> {
    schema: ArrowSchemaRef,
    arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
}

impl Iterator for PhysRecordBatchIter<'_> {
    type Item = RecordBatch;

    fn next(&mut self) -> Option<Self::Item> {
        let arrs = self
            .arr_iters
            .iter_mut()
            .map(|phys_iter| phys_iter.next().cloned())
            .collect::<Option<Vec<_>>>()?;

        let length = arrs.first().map_or(0, |arr| arr.len());
        Some(RecordBatch::new(length, self.schema.clone(), arrs))
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        if let Some(iter) = self.arr_iters.first() {
            iter.size_hint()
        } else {
            (0, None)
        }
    }
}

pub enum RecordBatchIterWrap<'a> {
    ZeroWidth {
        remaining_height: usize,
        chunk_size: usize,
    },
    Batches(RecordBatchIter<'a>),
    PhysicalBatches(PhysRecordBatchIter<'a>),
}

impl<'a> RecordBatchIterWrap<'a> {
    fn new_zero_width(height: usize) -> Self {
        Self::ZeroWidth {
            remaining_height: height,
            chunk_size: polars_config::config().ideal_morsel_size() as usize,
        }
    }
}

impl Iterator for RecordBatchIterWrap<'_> {
    type Item = RecordBatch;

    fn next(&mut self) -> Option<Self::Item> {
        match self {
            Self::ZeroWidth {
                remaining_height,
                chunk_size,
            } => {
                let n = usize::min(*remaining_height, *chunk_size);
                *remaining_height -= n;

                (n > 0).then(|| RecordBatch::new(n, ArrowSchemaRef::default(), vec![]))
            },
            Self::Batches(v) => v.next(),
            Self::PhysicalBatches(v) => v.next(),
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        match self {
            Self::ZeroWidth {
                remaining_height,
                chunk_size,
            } => {
                let n = remaining_height.div_ceil(*chunk_size);
                (n, Some(n))
            },
            Self::Batches(v) => v.size_hint(),
            Self::PhysicalBatches(v) => v.size_hint(),
        }
    }
}

// utility to test if we can vstack/extend the columns
fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
    polars_ensure!(
        left.name() == right.name(),
        ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
        left.name(), right.name(),
    );
    Ok(())
}

#[cfg(test)]
mod test {
    use super::*;

    fn create_frame() -> DataFrame {
        let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
        let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
        DataFrame::new_infer_height(vec![s0, s1]).unwrap()
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_recordbatch_iterator() {
        let df = df!(
            "foo" => [1, 2, 3, 4, 5]
        )
        .unwrap();
        let mut iter = df.iter_chunks(CompatLevel::newest(), false);
        assert_eq!(5, iter.next().unwrap().len());
        assert!(iter.next().is_none());
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_select() {
        let df = create_frame();
        assert_eq!(
            df.column("days")
                .unwrap()
                .as_series()
                .unwrap()
                .equal(1)
                .unwrap()
                .sum(),
            Some(1)
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_string_col() {
        let col_name = "some_col";
        let v = vec!["test".to_string()];
        let s0 = Column::new(PlSmallStr::from_str(col_name), v);
        let mut df = DataFrame::new_infer_height(vec![s0]).unwrap();

        df = df
            .filter(
                &df.column(col_name)
                    .unwrap()
                    .as_materialized_series()
                    .equal("")
                    .unwrap(),
            )
            .unwrap();
        assert_eq!(
            df.column(col_name)
                .unwrap()
                .as_materialized_series()
                .n_chunks(),
            1
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_list_col() {
        let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
        let ll: ListChunked = [&s1].iter().copied().collect();

        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
        let new = ll.filter(&mask).unwrap();

        assert_eq!(new.chunks.len(), 1);
        assert_eq!(new.len(), 0);
    }

    #[test]
    fn slice() {
        let df = create_frame();
        let sliced_df = df.slice(0, 2);
        assert_eq!(sliced_df.shape(), (2, 2));
    }

    #[test]
    fn rechunk_false() {
        let df = create_frame();
        assert!(!df.should_rechunk())
    }

    #[test]
    fn rechunk_true() -> PolarsResult<()> {
        let mut base = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // Create a series with multiple chunks
        let mut s = Series::new("foo".into(), 0..2);
        let s2 = Series::new("bar".into(), 0..1);
        s.append(&s2)?;

        // Append series to frame
        let out = base.with_column(s.into_column())?;

        // Now we should rechunk
        assert!(out.should_rechunk());
        Ok(())
    }

    #[test]
    fn test_duplicate_column() {
        let mut df = df! {
            "foo" => [1, 2, 3]
        }
        .unwrap();
        // check if column is replaced
        assert!(
            df.with_column(Column::new("foo".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(
            df.with_column(Column::new("bar".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(df.column("bar").is_ok())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn distinct() {
        let df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();
        let df = df
            .unique_stable(None, UniqueKeepStrategy::First, None)
            .unwrap()
            .sort(["flt"], SortMultipleOptions::default())
            .unwrap();
        let valid = df! {
            "flt" => [1., 2., 3.],
            "int" => [1, 2, 3],
            "str" => ["a", "b", "c"]
        }
        .unwrap();
        assert!(df.equals(&valid));
    }

    #[test]
    fn test_vstack() {
        // check that it does not accidentally rechunk
        let mut df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df.slice(0, 3)).unwrap();
        assert_eq!(df.first_col_n_chunks(), 2)
    }

    #[test]
    fn test_vstack_on_empty_dataframe() {
        let mut df = DataFrame::empty();

        let df_data = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df_data).unwrap();
        assert_eq!(df.height(), 6)
    }

    #[test]
    fn test_unique_keep_none_with_slice() {
        let df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();
        let out = df
            .unique_stable(
                Some(&["x".to_string()][..]),
                UniqueKeepStrategy::None,
                Some((0, 2)),
            )
            .unwrap();
        let expected = df! {
            "x" => [3]
        }
        .unwrap();
        assert!(out.equals(&expected));
    }

    #[test]
    #[cfg(feature = "dtype-i8")]
    fn test_apply_result_schema() {
        let mut df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();

        let schema_before = df.schema().clone();
        df.apply("x", |f| f.cast(&DataType::Int8).unwrap()).unwrap();
        assert_ne!(&schema_before, df.schema());
    }
}