GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-core/src/frame/mod.rs
1
#![allow(unsafe_op_in_unsafe_fn)]
2
//! DataFrame module.
3
use arrow::datatypes::ArrowSchemaRef;
4
use polars_row::ArrayRef;
5
use polars_utils::UnitVec;
6
use polars_utils::itertools::Itertools;
7
use rayon::prelude::*;
8
9
use crate::chunked_array::flags::StatisticsFlags;
10
#[cfg(feature = "algorithm_group_by")]
11
use crate::chunked_array::ops::unique::is_unique_helper;
12
use crate::prelude::gather::check_bounds_ca;
13
use crate::prelude::*;
14
#[cfg(feature = "row_hash")]
15
use crate::utils::split_df;
16
use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
17
use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
18
19
#[cfg(feature = "dataframe_arithmetic")]
20
mod arithmetic;
21
pub mod builder;
22
mod chunks;
23
pub use chunks::chunk_df_for_writing;
24
mod broadcast;
25
pub mod column;
26
mod dataframe;
27
mod filter;
28
mod projection;
29
pub use dataframe::DataFrame;
30
use filter::filter_zero_width;
31
use projection::{AmortizedColumnSelector, LINEAR_SEARCH_LIMIT};
32
33
pub mod explode;
34
mod from;
35
#[cfg(feature = "algorithm_group_by")]
36
pub mod group_by;
37
pub(crate) mod horizontal;
38
#[cfg(feature = "proptest")]
39
pub mod proptest;
40
#[cfg(any(feature = "rows", feature = "object"))]
41
pub mod row;
42
mod top_k;
43
mod upstream_traits;
44
mod validation;
45
46
use arrow::record_batch::{RecordBatch, RecordBatchT};
47
use polars_utils::pl_str::PlSmallStr;
48
#[cfg(feature = "serde")]
49
use serde::{Deserialize, Serialize};
50
use strum_macros::IntoStaticStr;
51
52
use crate::POOL;
53
#[cfg(feature = "row_hash")]
54
use crate::hashing::_df_rows_to_hashes_threaded_vertical;
55
use crate::prelude::sort::arg_sort;
56
use crate::series::IsSorted;
57
58
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
59
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
60
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
61
#[strum(serialize_all = "snake_case")]
62
pub enum UniqueKeepStrategy {
63
/// Keep the first unique row.
64
First,
65
/// Keep the last unique row.
66
Last,
67
/// Keep None of the unique rows.
68
None,
69
/// Keep any of the unique rows
70
/// This allows more optimizations
71
#[default]
72
Any,
73
}
74
75
impl DataFrame {
76
pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
77
self.columns().iter().map(Column::as_materialized_series)
78
}
79
80
/// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
81
///
82
/// # Implementation
83
/// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
84
/// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
85
/// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
86
///
87
/// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
88
/// However, this function will yield a smaller number. This is because this function returns
89
/// the visible size of the buffer, not its total capacity.
90
///
91
/// FFI buffers are included in this estimation.
92
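/// # Example
///
/// A minimal sketch of querying the estimate; the exact number of bytes depends
/// on the backing buffers, so this only checks that the estimate is non-zero:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("x" => [1, 2, 3], "y" => ["a", "b", "c"])?;
/// // Three integers plus three small strings must occupy at least a few bytes.
/// assert!(df.estimated_size() > 0);
/// # Ok::<(), PolarsError>(())
/// ```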
pub fn estimated_size(&self) -> usize {
93
self.columns().iter().map(Column::estimated_size).sum()
94
}
95
96
pub fn try_apply_columns(
97
&self,
98
func: impl Fn(&Column) -> PolarsResult<Column> + Send + Sync,
99
) -> PolarsResult<Vec<Column>> {
100
return inner(self, &func);
101
102
fn inner(
103
slf: &DataFrame,
104
func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
105
) -> PolarsResult<Vec<Column>> {
106
slf.columns().iter().map(func).collect()
107
}
108
}
109
110
pub fn apply_columns(&self, func: impl Fn(&Column) -> Column + Send + Sync) -> Vec<Column> {
111
return inner(self, &func);
112
113
fn inner(slf: &DataFrame, func: &(dyn Fn(&Column) -> Column + Send + Sync)) -> Vec<Column> {
114
slf.columns().iter().map(func).collect()
115
}
116
}
117
118
pub fn try_apply_columns_par(
119
&self,
120
func: impl Fn(&Column) -> PolarsResult<Column> + Send + Sync,
121
) -> PolarsResult<Vec<Column>> {
122
return inner(self, &func);
123
124
fn inner(
125
slf: &DataFrame,
126
func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
127
) -> PolarsResult<Vec<Column>> {
128
POOL.install(|| slf.columns().par_iter().map(func).collect())
129
}
130
}
131
132
pub fn apply_columns_par(&self, func: impl Fn(&Column) -> Column + Send + Sync) -> Vec<Column> {
133
return inner(self, &func);
134
135
fn inner(slf: &DataFrame, func: &(dyn Fn(&Column) -> Column + Send + Sync)) -> Vec<Column> {
136
POOL.install(|| slf.columns().par_iter().map(func).collect())
137
}
138
}
139
140
/// Reserve additional slots into the chunks of the series.
141
pub(crate) fn reserve_chunks(&mut self, additional: usize) {
142
for s in unsafe { self.columns_mut_retain_schema() } {
143
if let Column::Series(s) = s {
144
// SAFETY:
145
// do not modify the data, simply resize.
146
unsafe { s.chunks_mut().reserve(additional) }
147
}
148
}
149
}
150
pub fn new_from_index(&self, index: usize, height: usize) -> Self {
151
let new_cols = self.apply_columns(|c| c.new_from_index(index, height));
152
153
unsafe { Self::_new_unchecked_impl(height, new_cols).with_schema_from(self) }
154
}
155
156
/// Create a new `DataFrame` with the given schema, only containing nulls.
157
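/// # Example
///
/// A minimal sketch that borrows the schema from an existing frame (assuming the
/// schema can be passed by reference as shown):
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2], "b" => ["x", "y"])?;
/// let nulls = DataFrame::full_null(&df.schema(), 3);
/// assert_eq!(nulls.shape(), (3, 2));
/// assert_eq!(nulls.column("a")?.null_count(), 3);
/// # Ok::<(), PolarsError>(())
/// ```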
pub fn full_null(schema: &Schema, height: usize) -> Self {
158
let columns = schema
159
.iter_fields()
160
.map(|f| Column::full_null(f.name().clone(), height, f.dtype()))
161
.collect();
162
163
unsafe { DataFrame::_new_unchecked_impl(height, columns) }
164
}
165
166
/// Ensure this DataFrame matches the given schema. Casts null columns to
167
/// the expected schema if necessary (but nothing else).
168
pub fn ensure_matches_schema(&mut self, schema: &Schema) -> PolarsResult<()> {
169
let mut did_cast = false;
170
let cached_schema = self.cached_schema().cloned();
171
172
for (col, (name, dt)) in unsafe { self.columns_mut() }.iter_mut().zip(schema.iter()) {
173
polars_ensure!(
174
col.name() == name,
175
SchemaMismatch: "column name mismatch: expected {:?}, found {:?}",
176
name,
177
col.name()
178
);
179
180
let needs_cast = col.dtype().matches_schema_type(dt)?;
181
182
if needs_cast {
183
*col = col.cast(dt)?;
184
did_cast = true;
185
}
186
}
187
188
if !did_cast {
189
unsafe { self.set_opt_schema(cached_schema) };
190
}
191
192
Ok(())
193
}
194
195
/// Add a new column at index 0 that counts the rows.
196
///
197
/// # Example
198
///
199
/// ```
200
/// # use polars_core::prelude::*;
201
/// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
202
/// assert_eq!(df1.shape(), (4, 1));
203
///
204
/// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
205
/// assert_eq!(df2.shape(), (4, 2));
206
/// println!("{}", df2);
207
///
208
/// # Ok::<(), PolarsError>(())
209
/// ```
210
///
211
/// Output:
212
///
213
/// ```text
214
/// shape: (4, 2)
215
/// +-----+----------+
216
/// | Id | Name |
217
/// | --- | --- |
218
/// | u32 | str |
219
/// +=====+==========+
220
/// | 0 | James |
221
/// +-----+----------+
222
/// | 1 | Mary |
223
/// +-----+----------+
224
/// | 2 | John |
225
/// +-----+----------+
226
/// | 3 | Patricia |
227
/// +-----+----------+
228
/// ```
229
pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
230
let mut new_columns = Vec::with_capacity(self.width() + 1);
231
let offset = offset.unwrap_or(0);
232
233
if self.get_column_index(&name).is_some() {
234
polars_bail!(duplicate = name)
235
}
236
237
let col = Column::new_row_index(name, offset, self.height())?;
238
new_columns.push(col);
239
new_columns.extend_from_slice(self.columns());
240
241
Ok(unsafe { DataFrame::new_unchecked(self.height(), new_columns) })
242
}
243
244
/// Add a row index column in place.
245
///
246
/// # Safety
247
/// The caller should ensure the DataFrame does not already contain a column with the given name.
248
///
249
/// # Panics
250
/// Panics if the resulting column would reach or overflow IdxSize::MAX.
251
pub unsafe fn with_row_index_mut(
252
&mut self,
253
name: PlSmallStr,
254
offset: Option<IdxSize>,
255
) -> &mut Self {
256
debug_assert!(
257
self.get_column_index(&name).is_none(),
258
"with_row_index_mut(): column with name {} already exists",
259
&name
260
);
261
262
let offset = offset.unwrap_or(0);
263
let col = Column::new_row_index(name, offset, self.height()).unwrap();
264
265
unsafe { self.columns_mut() }.insert(0, col);
266
self
267
}
268
269
/// Shrink the capacity of this DataFrame to fit its length.
270
pub fn shrink_to_fit(&mut self) {
271
// Don't parallelize this. Memory overhead
272
for s in unsafe { self.columns_mut_retain_schema() } {
273
s.shrink_to_fit();
274
}
275
}
276
277
/// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
278
/// This may lead to more peak memory consumption.
279
pub fn rechunk_mut_par(&mut self) -> &mut Self {
280
if self.columns().iter().any(|c| c.n_chunks() > 1) {
281
POOL.install(|| {
282
unsafe { self.columns_mut_retain_schema() }
283
.par_iter_mut()
284
.for_each(|c| *c = c.rechunk());
285
})
286
}
287
288
self
289
}
290
291
/// Rechunks all columns to only have a single chunk.
292
pub fn rechunk_mut(&mut self) -> &mut Self {
293
// SAFETY: We never adjust the length or names of the columns.
294
let columns = unsafe { self.columns_mut() };
295
296
for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
297
*col = col.rechunk();
298
}
299
300
self
301
}
302
303
/// Returns true if the chunks of the columns do not align and re-chunking should be done.
304
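/// # Example
///
/// A minimal sketch: adding a single-chunk column to a frame whose other column
/// holds two chunks leaves the chunks misaligned, and aligning fixes it:
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df: DataFrame = df!("a" => [1, 2, 3])?;
/// df.vstack_mut(&df!("a" => [4, 5])?)?;
/// // "a" now has two chunks; the new column only has one.
/// df.with_column(Column::new("b".into(), [10, 20, 30, 40, 50]))?;
/// assert!(df.should_rechunk());
/// df.align_chunks();
/// assert!(!df.should_rechunk());
/// # Ok::<(), PolarsError>(())
/// ```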
pub fn should_rechunk(&self) -> bool {
305
// Fast check. It is also needed for correctness, as code below doesn't check if the number
306
// of chunks is equal.
307
if !self
308
.columns()
309
.iter()
310
.filter_map(|c| c.as_series().map(|s| s.n_chunks()))
311
.all_equal()
312
{
313
return true;
314
}
315
316
// From here we check chunk lengths.
317
let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
318
match chunk_lengths.next() {
319
None => false,
320
Some(first_column_chunk_lengths) => {
321
// Fast Path for single Chunk Series
322
if first_column_chunk_lengths.size_hint().0 == 1 {
323
return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
324
}
325
// Always rechunk if we have more chunks than rows.
326
// except when we have an empty df containing a single chunk
327
let height = self.height();
328
let n_chunks = first_column_chunk_lengths.size_hint().0;
329
if n_chunks > height && !(height == 0 && n_chunks == 1) {
330
return true;
331
}
332
// Slow Path for multi Chunk series
333
let v: Vec<_> = first_column_chunk_lengths.collect();
334
for cl in chunk_lengths {
335
if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
336
return true;
337
}
338
}
339
false
340
},
341
}
342
}
343
344
/// Ensure all the chunks in the [`DataFrame`] are aligned.
345
pub fn align_chunks_par(&mut self) -> &mut Self {
346
if self.should_rechunk() {
347
self.rechunk_mut_par()
348
} else {
349
self
350
}
351
}
352
353
/// Ensure all the chunks in the [`DataFrame`] are aligned.
354
pub fn align_chunks(&mut self) -> &mut Self {
355
if self.should_rechunk() {
356
self.rechunk_mut()
357
} else {
358
self
359
}
360
}
361
362
/// # Example
363
///
364
/// ```rust
365
/// # use polars_core::prelude::*;
366
/// let df: DataFrame = df!("Language" => ["Rust", "Python"],
367
/// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
368
///
369
/// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
370
/// # Ok::<(), PolarsError>(())
371
/// ```
372
pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
373
self.columns().iter().map(|s| s.name()).collect()
374
}
375
376
/// Get the [`Vec<PlSmallStr>`] representing the column names.
377
pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
378
self.columns().iter().map(|s| s.name().clone()).collect()
379
}
380
381
/// Set the column names.
382
/// # Example
383
///
384
/// ```rust
385
/// # use polars_core::prelude::*;
386
/// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
387
/// df.set_column_names(&["Set"])?;
388
///
389
/// assert_eq!(df.get_column_names(), &["Set"]);
390
/// # Ok::<(), PolarsError>(())
391
/// ```
392
pub fn set_column_names<T>(&mut self, new_names: &[T]) -> PolarsResult<()>
393
where
394
T: AsRef<str>,
395
{
396
polars_ensure!(
397
new_names.len() == self.width(),
398
ShapeMismatch: "{} column names provided for a DataFrame of width {}",
399
new_names.len(), self.width()
400
);
401
402
validation::ensure_names_unique(new_names)?;
403
404
*unsafe { self.columns_mut() } = std::mem::take(unsafe { self.columns_mut() })
405
.into_iter()
406
.zip(new_names)
407
.map(|(c, name)| c.with_name(PlSmallStr::from_str(name.as_ref())))
408
.collect();
409
410
Ok(())
411
}
412
413
/// Get the data types of the columns in the [`DataFrame`].
414
///
415
/// # Example
416
///
417
/// ```rust
418
/// # use polars_core::prelude::*;
419
/// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
420
/// "Fraction" => [0.965, 0.035])?;
421
///
422
/// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
423
/// # Ok::<(), PolarsError>(())
424
/// ```
425
pub fn dtypes(&self) -> Vec<DataType> {
426
self.columns().iter().map(|s| s.dtype().clone()).collect()
427
}
428
429
/// The number of chunks for the first column.
430
pub fn first_col_n_chunks(&self) -> usize {
431
match self.columns().iter().find_map(|col| col.as_series()) {
432
None if self.width() == 0 => 0,
433
None => 1,
434
Some(s) => s.n_chunks(),
435
}
436
}
437
438
/// The highest number of chunks for any column.
439
pub fn max_n_chunks(&self) -> usize {
440
self.columns()
441
.iter()
442
.map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
443
.max()
444
.unwrap_or(0)
445
}
446
447
/// Generate the schema fields of the [`DataFrame`].
448
///
449
/// # Example
450
///
451
/// ```rust
452
/// # use polars_core::prelude::*;
453
/// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
454
/// "Fraction" => [0.708, 0.292])?;
455
///
456
/// let f1: Field = Field::new("Surface type".into(), DataType::String);
457
/// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
458
///
459
/// assert_eq!(earth.fields(), &[f1, f2]);
460
/// # Ok::<(), PolarsError>(())
461
/// ```
462
pub fn fields(&self) -> Vec<Field> {
463
self.columns()
464
.iter()
465
.map(|s| s.field().into_owned())
466
.collect()
467
}
468
469
/// Add multiple [`Series`] to a [`DataFrame`].
470
/// The added `Series` are required to have the same length.
471
///
472
/// # Example
473
///
474
/// ```rust
475
/// # use polars_core::prelude::*;
476
/// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
477
/// let s1 = Column::new("Proton".into(), [29, 47, 79]);
478
/// let s2 = Column::new("Electron".into(), [29, 47, 79]);
479
///
480
/// let df2: DataFrame = df1.hstack(&[s1, s2])?;
481
/// assert_eq!(df2.shape(), (3, 3));
482
/// println!("{}", df2);
483
/// # Ok::<(), PolarsError>(())
484
/// ```
485
///
486
/// Output:
487
///
488
/// ```text
489
/// shape: (3, 3)
490
/// +---------+--------+----------+
491
/// | Element | Proton | Electron |
492
/// | --- | --- | --- |
493
/// | str | i32 | i32 |
494
/// +=========+========+==========+
495
/// | Copper | 29 | 29 |
496
/// +---------+--------+----------+
497
/// | Silver | 47 | 47 |
498
/// +---------+--------+----------+
499
/// | Gold | 79 | 79 |
500
/// +---------+--------+----------+
501
/// ```
502
pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
503
let mut new_cols = Vec::with_capacity(self.width() + columns.len());
504
505
new_cols.extend(self.columns().iter().cloned());
506
new_cols.extend_from_slice(columns);
507
508
DataFrame::new(self.height(), new_cols)
509
}
510
/// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
511
///
512
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
513
///
514
/// # Example
515
///
516
/// ```rust
517
/// # use polars_core::prelude::*;
518
/// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
519
/// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
520
/// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
521
/// "Melting Point (K)" => [2041.4, 1828.05])?;
522
///
523
/// let df3: DataFrame = df1.vstack(&df2)?;
524
///
525
/// assert_eq!(df3.shape(), (5, 2));
526
/// println!("{}", df3);
527
/// # Ok::<(), PolarsError>(())
528
/// ```
529
///
530
/// Output:
531
///
532
/// ```text
533
/// shape: (5, 2)
534
/// +-----------+-------------------+
535
/// | Element | Melting Point (K) |
536
/// | --- | --- |
537
/// | str | f64 |
538
/// +===========+===================+
539
/// | Copper | 1357.77 |
540
/// +-----------+-------------------+
541
/// | Silver | 1234.93 |
542
/// +-----------+-------------------+
543
/// | Gold | 1337.33 |
544
/// +-----------+-------------------+
545
/// | Platinum | 2041.4 |
546
/// +-----------+-------------------+
547
/// | Palladium | 1828.05 |
548
/// +-----------+-------------------+
549
/// ```
550
pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
551
let mut df = self.clone();
552
df.vstack_mut(other)?;
553
Ok(df)
554
}
555
556
/// Concatenate a [`DataFrame`] to this [`DataFrame`]
557
///
558
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
559
///
560
/// # Example
561
///
562
/// ```rust
563
/// # use polars_core::prelude::*;
564
/// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
565
/// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
566
/// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
567
/// "Melting Point (K)" => [2041.4, 1828.05])?;
568
///
569
/// df1.vstack_mut(&df2)?;
570
///
571
/// assert_eq!(df1.shape(), (5, 2));
572
/// println!("{}", df1);
573
/// # Ok::<(), PolarsError>(())
574
/// ```
575
///
576
/// Output:
577
///
578
/// ```text
579
/// shape: (5, 2)
580
/// +-----------+-------------------+
581
/// | Element | Melting Point (K) |
582
/// | --- | --- |
583
/// | str | f64 |
584
/// +===========+===================+
585
/// | Copper | 1357.77 |
586
/// +-----------+-------------------+
587
/// | Silver | 1234.93 |
588
/// +-----------+-------------------+
589
/// | Gold | 1337.33 |
590
/// +-----------+-------------------+
591
/// | Platinum | 2041.4 |
592
/// +-----------+-------------------+
593
/// | Palladium | 1828.05 |
594
/// +-----------+-------------------+
595
/// ```
596
pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
597
if self.width() != other.width() {
598
polars_ensure!(
599
self.shape() == (0, 0),
600
ShapeMismatch:
601
"unable to append to a DataFrame of shape {:?} with a DataFrame of width {}",
602
self.shape(), other.width(),
603
);
604
605
self.clone_from(other);
606
607
return Ok(self);
608
}
609
610
let new_height = usize::checked_add(self.height(), other.height()).unwrap();
611
612
unsafe { self.columns_mut_retain_schema() }
613
.iter_mut()
614
.zip(other.columns())
615
.try_for_each::<_, PolarsResult<_>>(|(left, right)| {
616
ensure_can_extend(&*left, right)?;
617
left.append(right).map_err(|e| {
618
e.context(format!("failed to vstack column '{}'", right.name()).into())
619
})?;
620
Ok(())
621
})?;
622
623
unsafe { self.set_height(new_height) };
624
625
Ok(self)
626
}
627
628
pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
629
if self.width() != other.width() {
630
polars_ensure!(
631
self.shape() == (0, 0),
632
ShapeMismatch:
633
"unable to append to a DataFrame of width {} with a DataFrame of width {}",
634
self.width(), other.width(),
635
);
636
637
*self = other;
638
639
return Ok(self);
640
}
641
642
let new_height = usize::checked_add(self.height(), other.height()).unwrap();
643
644
unsafe { self.columns_mut_retain_schema() }
645
.iter_mut()
646
.zip(other.into_columns())
647
.try_for_each::<_, PolarsResult<_>>(|(left, right)| {
648
ensure_can_extend(&*left, &right)?;
649
let right_name = right.name().clone();
650
left.append_owned(right).map_err(|e| {
651
e.context(format!("failed to vstack column '{right_name}'").into())
652
})?;
653
Ok(())
654
})?;
655
656
unsafe { self.set_height(new_height) };
657
658
Ok(self)
659
}
660
661
/// Concatenate a [`DataFrame`] to this [`DataFrame`]
662
///
663
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
664
///
665
/// # Panics
666
/// Panics if the schemas don't match.
667
pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) -> &mut Self {
668
let new_height = usize::checked_add(self.height(), other.height()).unwrap();
669
670
unsafe { self.columns_mut_retain_schema() }
671
.iter_mut()
672
.zip(other.columns())
673
.for_each(|(left, right)| {
674
left.append(right)
675
.map_err(|e| {
676
e.context(format!("failed to vstack column '{}'", right.name()).into())
677
})
678
.expect("should not fail");
679
});
680
681
unsafe { self.set_height(new_height) };
682
683
self
684
}
685
686
/// Concatenate a [`DataFrame`] to this [`DataFrame`]
687
///
688
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
689
///
690
/// # Panics
691
/// Panics if the schemas don't match.
692
pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) -> &mut Self {
693
let new_height = usize::checked_add(self.height(), other.height()).unwrap();
694
695
unsafe { self.columns_mut_retain_schema() }
696
.iter_mut()
697
.zip(other.into_columns())
698
.for_each(|(left, right)| {
699
left.append_owned(right).expect("should not fail");
700
});
701
702
unsafe { self.set_height(new_height) };
703
704
self
705
}
706
707
/// Extend the memory backed by this [`DataFrame`] with the values from `other`.
708
///
709
/// Different from [`vstack`](Self::vstack), which adds the chunks from `other` to the chunks of this [`DataFrame`],
710
/// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
711
///
712
/// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
713
/// and thus will yield faster queries.
714
///
715
/// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
716
/// online operations where you add `n` rows and rerun a query.
717
///
718
/// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
719
/// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
720
/// of `append` operations with a [`rechunk`](Self::align_chunks_par).
721
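/// # Example
///
/// A minimal sketch of appending rows in place; widths must match, and the data
/// of `other` is copied into the existing buffers where possible:
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df: DataFrame = df!("a" => [1, 2, 3], "b" => ["x", "y", "z"])?;
/// let other: DataFrame = df!("a" => [4, 5], "b" => ["u", "v"])?;
/// // Unlike `vstack`, this typically does not add new chunks.
/// df.extend(&other)?;
/// assert_eq!(df.shape(), (5, 2));
/// # Ok::<(), PolarsError>(())
/// ```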
pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
722
polars_ensure!(
723
self.width() == other.width(),
724
ShapeMismatch:
725
"unable to extend a DataFrame of width {} with a DataFrame of width {}",
726
self.width(), other.width(),
727
);
728
729
let new_height = usize::checked_add(self.height(), other.height()).unwrap();
730
731
unsafe { self.columns_mut_retain_schema() }
732
.iter_mut()
733
.zip(other.columns())
734
.try_for_each::<_, PolarsResult<_>>(|(left, right)| {
735
ensure_can_extend(&*left, right)?;
736
left.extend(right).map_err(|e| {
737
e.context(format!("failed to extend column '{}'", right.name()).into())
738
})?;
739
Ok(())
740
})?;
741
742
unsafe { self.set_height(new_height) };
743
744
Ok(())
745
}
746
747
/// Remove a column by name and return the column removed.
748
///
749
/// # Example
750
///
751
/// ```rust
752
/// # use polars_core::prelude::*;
753
/// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
754
/// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
755
///
756
/// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
757
/// assert!(s1.is_err());
758
///
759
/// let s2: Column = df.drop_in_place("Animal")?;
760
/// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
761
/// # Ok::<(), PolarsError>(())
762
/// ```
763
pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
764
let idx = self.try_get_column_index(name)?;
765
Ok(unsafe { self.columns_mut() }.remove(idx))
766
}
767
768
/// Return a new [`DataFrame`] where all null values are dropped.
769
///
770
/// # Example
771
///
772
/// ```no_run
773
/// # use polars_core::prelude::*;
774
/// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
775
/// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
776
/// assert_eq!(df1.shape(), (3, 2));
777
///
778
/// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
779
/// assert_eq!(df2.shape(), (1, 2));
780
/// println!("{}", df2);
781
/// # Ok::<(), PolarsError>(())
782
/// ```
783
///
784
/// Output:
785
///
786
/// ```text
787
/// shape: (1, 2)
788
/// +---------+---------------------+
789
/// | Country | Tax revenue (% GDP) |
790
/// | --- | --- |
791
/// | str | f64 |
792
/// +=========+=====================+
793
/// | Malta | 32.7 |
794
/// +---------+---------------------+
795
/// ```
796
pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
797
where
798
for<'a> &'a S: AsRef<str>,
799
{
800
if let Some(v) = subset {
801
let v = self.select_to_vec(v)?;
802
self._drop_nulls_impl(v.as_slice())
803
} else {
804
self._drop_nulls_impl(self.columns())
805
}
806
}
807
808
fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
809
// fast path for no nulls in df
810
if subset.iter().all(|s| !s.has_nulls()) {
811
return Ok(self.clone());
812
}
813
814
let mut iter = subset.iter();
815
816
let mask = iter
817
.next()
818
.ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
819
let mut mask = mask.is_not_null();
820
821
for c in iter {
822
mask = mask & c.is_not_null();
823
}
824
self.filter(&mask)
825
}
826
827
/// Drop a column by name.
828
/// This is a pure method and will return a new [`DataFrame`] instead of modifying
829
/// the current one in place.
830
///
831
/// # Example
832
///
833
/// ```rust
834
/// # use polars_core::prelude::*;
835
/// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
836
/// let df2: DataFrame = df1.drop("Ray type")?;
837
///
838
/// assert_eq!(df2.width(), 0);
839
/// # Ok::<(), PolarsError>(())
840
/// ```
841
pub fn drop(&self, name: &str) -> PolarsResult<Self> {
842
let idx = self.try_get_column_index(name)?;
843
let mut new_cols = Vec::with_capacity(self.width() - 1);
844
845
self.columns().iter().enumerate().for_each(|(i, s)| {
846
if i != idx {
847
new_cols.push(s.clone())
848
}
849
});
850
851
Ok(unsafe { DataFrame::_new_unchecked_impl(self.height(), new_cols) })
852
}
853
854
/// Drop columns that are in `names`.
855
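/// # Example
///
/// A minimal sketch:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2], "b" => [3, 4], "c" => [5, 6])?;
/// let dropped = df.drop_many(["a", "c"]);
/// assert_eq!(dropped.get_column_names(), &["b"]);
/// # Ok::<(), PolarsError>(())
/// ```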
pub fn drop_many<I, S>(&self, names: I) -> Self
856
where
857
I: IntoIterator<Item = S>,
858
S: Into<PlSmallStr>,
859
{
860
let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
861
self.drop_many_amortized(&names)
862
}
863
864
/// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
865
pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
866
if names.is_empty() {
867
return self.clone();
868
}
869
let mut new_cols = Vec::with_capacity(self.width().saturating_sub(names.len()));
870
self.columns().iter().for_each(|s| {
871
if !names.contains(s.name()) {
872
new_cols.push(s.clone())
873
}
874
});
875
876
unsafe { DataFrame::new_unchecked(self.height(), new_cols) }
877
}
878
879
/// Insert a new column at a given index without checking for duplicates.
880
/// This can leave the [`DataFrame`] in an invalid state.
881
fn insert_column_no_namecheck(
882
&mut self,
883
index: usize,
884
column: Column,
885
) -> PolarsResult<&mut Self> {
886
if self.shape() == (0, 0) {
887
unsafe { self.set_height(column.len()) };
888
}
889
890
polars_ensure!(
891
column.len() == self.height(),
892
ShapeMismatch:
893
"unable to add a column of length {} to a DataFrame of height {}",
894
column.len(), self.height(),
895
);
896
897
unsafe { self.columns_mut() }.insert(index, column);
898
Ok(self)
899
}
900
901
/// Insert a new column at a given index.
902
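/// # Example
///
/// A minimal sketch showing the index semantics:
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df: DataFrame = df!("a" => [1, 2], "c" => [5, 6])?;
/// df.insert_column(1, Column::new("b".into(), [3, 4]))?;
/// assert_eq!(df.get_column_names(), &["a", "b", "c"]);
/// # Ok::<(), PolarsError>(())
/// ```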
pub fn insert_column(&mut self, index: usize, column: Column) -> PolarsResult<&mut Self> {
903
let name = column.name();
904
905
polars_ensure!(
906
self.get_column_index(name).is_none(),
907
Duplicate:
908
"column with name {:?} is already present in the DataFrame", name
909
);
910
911
self.insert_column_no_namecheck(index, column)
912
}
913
914
/// Add a new column to this [`DataFrame`] or replace an existing one. Broadcasts unit-length
915
/// columns.
916
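/// # Example
///
/// A minimal sketch showing the unit-length broadcast:
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df: DataFrame = df!("a" => [1, 2, 3])?;
/// // A length-1 column is broadcast to the height of the frame.
/// df.with_column(Column::new("b".into(), [0]))?;
/// assert_eq!(df.shape(), (3, 2));
/// assert_eq!(df.column("b")?.len(), 3);
/// # Ok::<(), PolarsError>(())
/// ```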
pub fn with_column(&mut self, mut column: Column) -> PolarsResult<&mut Self> {
917
if self.shape() == (0, 0) {
918
unsafe { self.set_height(column.len()) };
919
}
920
921
if column.len() != self.height() && column.len() == 1 {
922
column = column.new_from_index(0, self.height());
923
}
924
925
polars_ensure!(
926
column.len() == self.height(),
927
ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
928
column.len(), self.height(),
929
);
930
931
if let Some(i) = self.get_column_index(column.name()) {
932
*unsafe { self.columns_mut() }.get_mut(i).unwrap() = column
933
} else {
934
unsafe { self.columns_mut() }.push(column)
935
};
936
937
Ok(self)
938
}
939
940
/// Adds a column to the [`DataFrame`] without doing any checks
941
/// on length or duplicates.
942
///
943
/// # Safety
944
/// The caller must ensure `column.len() == self.height()` .
945
pub unsafe fn push_column_unchecked(&mut self, column: Column) -> &mut Self {
946
unsafe { self.columns_mut() }.push(column);
947
self
948
}
949
950
/// Add new columns to this [`DataFrame`] or replace existing ones.
951
/// Broadcasts unit-length columns, and uses an existing schema to amortize lookups.
952
pub fn with_columns_mut(
953
&mut self,
954
columns: impl IntoIterator<Item = Column>,
955
output_schema: &Schema,
956
) -> PolarsResult<()> {
957
let columns = columns.into_iter();
958
959
unsafe {
960
self.columns_mut_retain_schema()
961
.reserve(columns.size_hint().0)
962
}
963
964
for c in columns {
965
self.with_column_and_schema_mut(c, output_schema)?;
966
}
967
968
Ok(())
969
}
970
971
fn with_column_and_schema_mut(
972
&mut self,
973
mut column: Column,
974
output_schema: &Schema,
975
) -> PolarsResult<&mut Self> {
976
if self.shape() == (0, 0) {
977
unsafe { self.set_height(column.len()) };
978
}
979
980
if column.len() != self.height() && column.len() == 1 {
981
column = column.new_from_index(0, self.height());
982
}
983
984
polars_ensure!(
985
column.len() == self.height(),
986
ShapeMismatch:
987
"unable to add a column of length {} to a DataFrame of height {}",
988
column.len(), self.height(),
989
);
990
991
let i = output_schema
992
.index_of(column.name())
993
.or_else(|| self.get_column_index(column.name()))
994
.unwrap_or(self.width());
995
996
if i < self.width() {
997
*unsafe { self.columns_mut() }.get_mut(i).unwrap() = column
998
} else if i == self.width() {
999
unsafe { self.columns_mut() }.push(column)
1000
} else {
1001
// Unordered column insertion is not handled.
1002
panic!()
1003
}
1004
1005
Ok(self)
1006
}
1007
1008
/// Get a row in the [`DataFrame`]. Beware this is slow.
1009
///
1010
/// # Example
1011
///
1012
/// ```
1013
/// # use polars_core::prelude::*;
1014
/// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1015
/// df.get(idx)
1016
/// }
1017
/// ```
1018
pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1019
(idx < self.height()).then(|| self.columns().iter().map(|c| c.get(idx).unwrap()).collect())
1020
}
1021
1022
/// Select a [`Series`] by index.
1023
///
1024
/// # Example
1025
///
1026
/// ```rust
1027
/// # use polars_core::prelude::*;
1028
/// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1029
/// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1030
///
1031
/// let s1: Option<&Column> = df.select_at_idx(0);
1032
/// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1033
///
1034
/// assert_eq!(s1, Some(&s2));
1035
/// # Ok::<(), PolarsError>(())
1036
/// ```
1037
pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1038
self.columns().get(idx)
1039
}
1040
1041
/// Get column index of a [`Series`] by name.
1042
/// # Example
1043
///
1044
/// ```rust
1045
/// # use polars_core::prelude::*;
1046
/// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1047
/// "Health" => [100, 200, 500],
1048
/// "Mana" => [250, 100, 0],
1049
/// "Strength" => [30, 150, 300])?;
1050
///
1051
/// assert_eq!(df.get_column_index("Name"), Some(0));
1052
/// assert_eq!(df.get_column_index("Health"), Some(1));
1053
/// assert_eq!(df.get_column_index("Mana"), Some(2));
1054
/// assert_eq!(df.get_column_index("Strength"), Some(3));
1055
/// assert_eq!(df.get_column_index("Haste"), None);
1056
/// # Ok::<(), PolarsError>(())
1057
/// ```
1058
pub fn get_column_index(&self, name: &str) -> Option<usize> {
1059
if let Some(schema) = self.cached_schema() {
1060
schema.index_of(name)
1061
} else if self.width() <= LINEAR_SEARCH_LIMIT {
1062
self.columns().iter().position(|s| s.name() == name)
1063
} else {
1064
self.schema().index_of(name)
1065
}
1066
}
1067
1068
/// Get column index of a [`Series`] by name.
1069
pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1070
self.get_column_index(name)
1071
.ok_or_else(|| polars_err!(col_not_found = name))
1072
}
1073
1074
/// Select a single column by name.
1075
///
1076
/// # Example
1077
///
1078
/// ```rust
1079
/// # use polars_core::prelude::*;
1080
/// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1081
/// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1082
/// let df: DataFrame = DataFrame::new_infer_height(vec![s1.clone(), s2])?;
1083
///
1084
/// assert_eq!(df.column("Password")?, &s1);
1085
/// # Ok::<(), PolarsError>(())
1086
/// ```
1087
pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1088
let idx = self.try_get_column_index(name)?;
1089
Ok(self.select_at_idx(idx).unwrap())
1090
}
1091
1092
/// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1093
///
1094
/// # Examples
1095
///
1096
/// ```
1097
/// # use polars_core::prelude::*;
1098
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1099
/// df.select(["foo", "bar"])
1100
/// }
1101
/// ```
1102
pub fn select<I, S>(&self, names: I) -> PolarsResult<Self>
1103
where
1104
I: IntoIterator<Item = S>,
1105
S: AsRef<str>,
1106
{
1107
DataFrame::new(self.height(), self.select_to_vec(names)?)
1108
}
1109
1110
/// Does not check for duplicates.
1111
///
1112
/// # Safety
1113
/// `names` must not contain duplicates.
1114
pub unsafe fn select_unchecked<I, S>(&self, names: I) -> PolarsResult<Self>
1115
where
1116
I: IntoIterator<Item = S>,
1117
S: AsRef<str>,
1118
{
1119
Ok(unsafe { DataFrame::new_unchecked(self.height(), self.select_to_vec(names)?) })
1120
}
1121
1122
/// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1123
///
1124
/// This does not error on duplicate selections.
1125
///
1126
/// # Example
1127
///
1128
/// ```rust
1129
/// # use polars_core::prelude::*;
1130
/// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1131
/// "Carbon" => [1, 2, 3],
1132
/// "Hydrogen" => [4, 6, 8])?;
1133
/// let sv: Vec<Column> = df.select_to_vec(["Carbon", "Hydrogen"])?;
1134
///
1135
/// assert_eq!(df["Carbon"], sv[0]);
1136
/// assert_eq!(df["Hydrogen"], sv[1]);
1137
/// # Ok::<(), PolarsError>(())
1138
/// ```
1139
pub fn select_to_vec(
1140
&self,
1141
selection: impl IntoIterator<Item = impl AsRef<str>>,
1142
) -> PolarsResult<Vec<Column>> {
1143
AmortizedColumnSelector::new(self).select_multiple(selection)
1144
}
1145
1146
/// Take the [`DataFrame`] rows by a boolean mask.
1147
///
1148
/// # Example
1149
///
1150
/// ```
1151
/// # use polars_core::prelude::*;
1152
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1153
/// let mask = df.column("sepal_width")?.is_not_null();
1154
/// df.filter(&mask)
1155
/// }
1156
/// ```
1157
pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1158
if self.width() == 0 {
1159
filter_zero_width(self.height(), mask)
1160
} else {
1161
let new_columns: Vec<Column> = self.try_apply_columns_par(|s| s.filter(mask))?;
1162
let out = unsafe {
1163
DataFrame::new_unchecked(new_columns[0].len(), new_columns).with_schema_from(self)
1164
};
1165
1166
Ok(out)
1167
}
1168
}
1169
1170
/// Same as `filter` but does not parallelize.
1171
pub fn filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1172
if self.width() == 0 {
1173
filter_zero_width(self.height(), mask)
1174
} else {
1175
let new_columns: Vec<Column> = self.try_apply_columns(|s| s.filter(mask))?;
1176
let out = unsafe {
1177
DataFrame::new_unchecked(new_columns[0].len(), new_columns).with_schema_from(self)
1178
};
1179
1180
Ok(out)
1181
}
1182
}
1183
1184
/// Gather [`DataFrame`] rows by index values.
1185
///
1186
/// # Example
1187
///
1188
/// ```
1189
/// # use polars_core::prelude::*;
1190
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1191
/// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
1192
/// df.take(&idx)
1193
/// }
1194
/// ```
1195
pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
1196
check_bounds_ca(indices, self.height().try_into().unwrap_or(IdxSize::MAX))?;
1197
1198
let new_cols = self.apply_columns_par(|c| {
1199
assert_eq!(c.len(), self.height());
1200
unsafe { c.take_unchecked(indices) }
1201
});
1202
1203
Ok(unsafe { DataFrame::new_unchecked(indices.len(), new_cols).with_schema_from(self) })
1204
}
1205
1206
/// # Safety
1207
/// The indices must be in-bounds.
1208
pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
1209
self.take_unchecked_impl(idx, true)
1210
}
1211
1212
/// # Safety
1213
/// The indices must be in-bounds.
1214
#[cfg(feature = "algorithm_group_by")]
1215
pub unsafe fn gather_group_unchecked(&self, group: &GroupsIndicator) -> Self {
1216
match group {
1217
GroupsIndicator::Idx((_, indices)) => unsafe {
1218
self.take_slice_unchecked_impl(indices.as_slice(), false)
1219
},
1220
GroupsIndicator::Slice([offset, len]) => self.slice(*offset as i64, *len as usize),
1221
}
1222
}
1223
1224
/// # Safety
1225
/// The indices must be in-bounds.
1226
pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
1227
let cols = if allow_threads && POOL.current_num_threads() > 1 {
1228
POOL.install(|| {
1229
if POOL.current_num_threads() > self.width() {
1230
let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
1231
if self.height() / stride >= 2 {
1232
self.apply_columns_par(|c| {
1233
// Nested types initiate a rechunk in their take_unchecked implementation.
1234
// If we do not rechunk, it will result in rechunk storms downstream.
1235
let c = if c.dtype().is_nested() {
1236
&c.rechunk()
1237
} else {
1238
c
1239
};
1240
1241
(0..idx.len().div_ceil(stride))
1242
.into_par_iter()
1243
.map(|i| c.take_unchecked(&idx.slice((i * stride) as i64, stride)))
1244
.reduce(
1245
|| Column::new_empty(c.name().clone(), c.dtype()),
1246
|mut a, b| {
1247
a.append_owned(b).unwrap();
1248
a
1249
},
1250
)
1251
})
1252
} else {
1253
self.apply_columns_par(|c| c.take_unchecked(idx))
1254
}
1255
} else {
1256
self.apply_columns_par(|c| c.take_unchecked(idx))
1257
}
1258
})
1259
} else {
1260
self.apply_columns(|s| s.take_unchecked(idx))
1261
};
1262
1263
unsafe { DataFrame::new_unchecked(idx.len(), cols).with_schema_from(self) }
1264
}
1265
1266
/// # Safety
1267
/// The indices must be in-bounds.
1268
pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
1269
self.take_slice_unchecked_impl(idx, true)
1270
}
1271
1272
/// # Safety
1273
/// The indices must be in-bounds.
1274
pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
1275
let cols = if allow_threads && POOL.current_num_threads() > 1 {
1276
POOL.install(|| {
1277
if POOL.current_num_threads() > self.width() {
1278
let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
1279
if self.height() / stride >= 2 {
1280
self.apply_columns_par(|c| {
1281
// Nested types initiate a rechunk in their take_unchecked implementation.
1282
// If we do not rechunk, it will result in rechunk storms downstream.
1283
let c = if c.dtype().is_nested() {
1284
&c.rechunk()
1285
} else {
1286
c
1287
};
1288
1289
(0..idx.len().div_ceil(stride))
1290
.into_par_iter()
1291
.map(|i| {
1292
let idx = &idx[i * stride..];
1293
let idx = &idx[..idx.len().min(stride)];
1294
c.take_slice_unchecked(idx)
1295
})
1296
.reduce(
1297
|| Column::new_empty(c.name().clone(), c.dtype()),
1298
|mut a, b| {
1299
a.append_owned(b).unwrap();
1300
a
1301
},
1302
)
1303
})
1304
} else {
1305
self.apply_columns_par(|s| s.take_slice_unchecked(idx))
1306
}
1307
} else {
1308
self.apply_columns_par(|s| s.take_slice_unchecked(idx))
1309
}
1310
})
1311
} else {
1312
self.apply_columns(|s| s.take_slice_unchecked(idx))
1313
};
1314
unsafe { DataFrame::new_unchecked(idx.len(), cols).with_schema_from(self) }
1315
}
1316
1317
/// Rename a column in the [`DataFrame`].
1318
///
1319
/// Should not be called in a loop as that can lead to quadratic behavior.
1320
///
1321
/// # Example
1322
///
1323
/// ```
1324
/// # use polars_core::prelude::*;
1325
/// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
1326
/// let original_name = "foo";
1327
/// let new_name = "bar";
1328
/// df.rename(original_name, new_name.into())
1329
/// }
1330
/// ```
1331
pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
1332
if column == name.as_str() {
1333
return Ok(self);
1334
}
1335
polars_ensure!(
1336
!self.schema().contains(&name),
1337
Duplicate: "column rename attempted with already existing name \"{name}\""
1338
);
1339
1340
self.get_column_index(column)
1341
.and_then(|idx| unsafe { self.columns_mut() }.get_mut(idx))
1342
.ok_or_else(|| polars_err!(col_not_found = column))
1343
.map(|c| c.rename(name))?;
1344
1345
Ok(self)
1346
}
1347
1348
pub fn rename_many<'a>(
1349
&mut self,
1350
renames: impl Iterator<Item = (&'a str, PlSmallStr)>,
1351
) -> PolarsResult<&mut Self> {
1352
let mut schema_arc = self.schema().clone();
1353
let schema = Arc::make_mut(&mut schema_arc);
1354
1355
for (from, to) in renames {
1356
if from == to.as_str() {
1357
continue;
1358
}
1359
1360
polars_ensure!(
1361
!schema.contains(&to),
1362
Duplicate: "column rename attempted with already existing name \"{to}\""
1363
);
1364
1365
match schema.get_full(from) {
1366
None => polars_bail!(col_not_found = from),
1367
Some((idx, _, _)) => {
1368
let (n, _) = schema.get_at_index_mut(idx).unwrap();
1369
*n = to.clone();
1370
unsafe { self.columns_mut() }
1371
.get_mut(idx)
1372
.unwrap()
1373
.rename(to);
1374
},
1375
}
1376
}
1377
1378
unsafe { self.set_schema(schema_arc) };
1379
1380
Ok(self)
1381
}
1382
1383
/// Sort [`DataFrame`] in place.
1384
///
1385
/// See [`DataFrame::sort`] for more instruction.
1386
pub fn sort_in_place(
1387
&mut self,
1388
by: impl IntoIterator<Item = impl AsRef<str>>,
1389
sort_options: SortMultipleOptions,
1390
) -> PolarsResult<&mut Self> {
1391
let by_column = self.select_to_vec(by)?;
1392
1393
let mut out = self.sort_impl(by_column, sort_options, None)?;
1394
unsafe { out.set_schema_from(self) };
1395
1396
*self = out;
1397
1398
Ok(self)
1399
}
1400
1401
#[doc(hidden)]
1402
/// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
1403
pub fn sort_impl(
1404
&self,
1405
by_column: Vec<Column>,
1406
sort_options: SortMultipleOptions,
1407
slice: Option<(i64, usize)>,
1408
) -> PolarsResult<Self> {
1409
if by_column.is_empty() {
1410
// If no columns selected, any order (including original order) is correct.
1411
return if let Some((offset, len)) = slice {
1412
Ok(self.slice(offset, len))
1413
} else {
1414
Ok(self.clone())
1415
};
1416
}
1417
1418
// Note that the by_column argument can also contain evaluated expressions from
1419
// polars-lazy that may not even be present in this dataframe. Therefore,
1420
// when we try to set the first column as sorted, we ignore the error, as the
1421
// expressions may not be present (they are renamed to _POLARS_SORT_COLUMN_i).
1422
let first_descending = sort_options.descending[0];
1423
let first_by_column = by_column[0].name().to_string();
1424
1425
let set_sorted = |df: &mut DataFrame| {
1426
// Mark the first sort column as sorted; if the column does not exist it
1427
// is ok, because we sorted by an expression not present in the dataframe
1428
let _ = df.apply(&first_by_column, |s| {
1429
let mut s = s.clone();
1430
if first_descending {
1431
s.set_sorted_flag(IsSorted::Descending)
1432
} else {
1433
s.set_sorted_flag(IsSorted::Ascending)
1434
}
1435
s
1436
});
1437
};
1438
1439
if self.shape_has_zero() {
1440
let mut out = self.clone();
1441
set_sorted(&mut out);
1442
return Ok(out);
1443
}
1444
1445
if let Some((0, k)) = slice {
1446
if k < self.height() {
1447
return self.bottom_k_impl(k, by_column, sort_options);
1448
}
1449
}
1450
// Check if the required column is already sorted; if so we can exit early.
1451
// We only do this when there is a single column to sort by; for multiple columns
1452
// it would be complicated to do so.
1453
#[cfg(feature = "dtype-categorical")]
1454
let is_not_categorical_enum =
1455
!(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
1456
|| matches!(by_column[0].dtype(), DataType::Enum(_, _)));
1457
1458
#[cfg(not(feature = "dtype-categorical"))]
1459
#[allow(non_upper_case_globals)]
1460
const is_not_categorical_enum: bool = true;
1461
1462
if by_column.len() == 1 && is_not_categorical_enum {
1463
let required_sorting = if sort_options.descending[0] {
1464
IsSorted::Descending
1465
} else {
1466
IsSorted::Ascending
1467
};
1468
// If the null count is 0 then nulls_last doesn't matter.
1469
// Safe to get the value at the last position since the dataframe is not empty (taken care of above).
1470
let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
1471
&& ((by_column[0].null_count() == 0)
1472
|| by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
1473
== sort_options.nulls_last[0]);
1474
1475
if no_sorting_required {
1476
return if let Some((offset, len)) = slice {
1477
Ok(self.slice(offset, len))
1478
} else {
1479
Ok(self.clone())
1480
};
1481
}
1482
}
1483
1484
let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
1485
let allow_threads = sort_options.multithreaded;
1486
1487
// a lot of indirection in both sorting and take
1488
let mut df = self.clone();
1489
let df = df.rechunk_mut_par();
1490
let mut take = match (by_column.len(), has_nested) {
1491
(1, false) => {
1492
let s = &by_column[0];
1493
let options = SortOptions {
1494
descending: sort_options.descending[0],
1495
nulls_last: sort_options.nulls_last[0],
1496
multithreaded: sort_options.multithreaded,
1497
maintain_order: sort_options.maintain_order,
1498
limit: sort_options.limit,
1499
};
1500
// fast path for a frame with a single series
1501
// no need to compute the sort indices and then take by these indices
1502
// simply sort and return as frame
1503
if df.width() == 1 && df.try_get_column_index(s.name().as_str()).is_ok() {
1504
let mut out = s.sort_with(options)?;
1505
if let Some((offset, len)) = slice {
1506
out = out.slice(offset, len);
1507
}
1508
return Ok(out.into_frame());
1509
}
1510
s.arg_sort(options)
1511
},
1512
_ => arg_sort(&by_column, sort_options)?,
1513
};
1514
1515
if let Some((offset, len)) = slice {
1516
take = take.slice(offset, len);
1517
}
1518
1519
// SAFETY:
1520
// the created indices are in bounds
1521
let mut df = unsafe { df.take_unchecked_impl(&take, allow_threads) };
1522
set_sorted(&mut df);
1523
Ok(df)
1524
}
1525
1526
/// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
1527
///
1528
/// This dataframe does not necessarily have a specified schema and may be changed at any
1529
/// point. It is primarily used for debugging.
1530
pub fn _to_metadata(&self) -> DataFrame {
1531
let num_columns = self.width();
1532
1533
let mut column_names =
1534
StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
1535
let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
1536
let mut sorted_asc_ca =
1537
BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
1538
let mut sorted_dsc_ca =
1539
BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
1540
let mut fast_explode_list_ca =
1541
BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
1542
let mut materialized_at_ca =
1543
StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
1544
1545
for col in self.columns() {
1546
let flags = col.get_flags();
1547
1548
let (repr, materialized_at) = match col {
1549
Column::Series(s) => ("series", s.materialized_at()),
1550
Column::Scalar(_) => ("scalar", None),
1551
};
1552
let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
1553
let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
1554
let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
1555
1556
column_names.append_value(col.name().clone());
1557
repr_ca.append_value(repr);
1558
sorted_asc_ca.append_value(sorted_asc);
1559
sorted_dsc_ca.append_value(sorted_dsc);
1560
fast_explode_list_ca.append_value(fast_explode_list);
1561
materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
1562
}
1563
1564
unsafe {
1565
DataFrame::new_unchecked(
1566
self.width(),
1567
vec![
1568
column_names.finish().into_column(),
1569
repr_ca.finish().into_column(),
1570
sorted_asc_ca.finish().into_column(),
1571
sorted_dsc_ca.finish().into_column(),
1572
fast_explode_list_ca.finish().into_column(),
1573
materialized_at_ca.finish().into_column(),
1574
],
1575
)
1576
}
1577
}
1578
/// Return a sorted clone of this [`DataFrame`].
1579
///
1580
/// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
1581
/// # Example
1582
///
1583
/// Sort by a single column with default options:
1584
/// ```
1585
/// # use polars_core::prelude::*;
1586
/// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
1587
/// df.sort(["sepal_width"], Default::default())
1588
/// }
1589
/// ```
1590
/// Sort by a single column with specific order:
1591
/// ```
1592
/// # use polars_core::prelude::*;
1593
/// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
1594
/// df.sort(
1595
/// ["sepal_width"],
1596
/// SortMultipleOptions::new()
1597
/// .with_order_descending(descending)
1598
/// )
1599
/// }
1600
/// ```
1601
/// Sort by multiple columns, specifying the order for each column:
1602
/// ```
1603
/// # use polars_core::prelude::*;
1604
/// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
1605
/// df.sort(
1606
/// ["sepal_width", "sepal_length"],
1607
/// SortMultipleOptions::new()
1608
/// .with_order_descending_multi([false, true])
1609
/// )
1610
/// }
1611
/// ```
1612
/// See [`SortMultipleOptions`] for more options.
1613
///
1614
/// Also see [`DataFrame::sort_in_place`].
1615
pub fn sort(
1616
&self,
1617
by: impl IntoIterator<Item = impl AsRef<str>>,
1618
sort_options: SortMultipleOptions,
1619
) -> PolarsResult<Self> {
1620
let mut df = self.clone();
1621
df.sort_in_place(by, sort_options)?;
1622
Ok(df)
1623
}
1624
1625
/// Replace a column with a [`Column`].
1626
///
1627
/// # Example
1628
///
1629
/// ```rust
1630
/// # use polars_core::prelude::*;
1631
/// let mut df: DataFrame = df!("Country" => ["United States", "China"],
1632
/// "Area (km²)" => [9_833_520, 9_596_961])?;
1633
/// let s: Column = Column::new("Country".into(), ["USA", "PRC"]);
1634
///
1635
/// assert!(df.replace("Nation", s.clone()).is_err());
1636
/// assert!(df.replace("Country", s).is_ok());
1637
/// # Ok::<(), PolarsError>(())
1638
/// ```
1639
pub fn replace(&mut self, column: &str, new_col: Column) -> PolarsResult<&mut Self> {
1640
self.apply(column, |_| new_col)
1641
}
1642
1643
/// Replace column at index `idx` with a [`Series`].
1644
///
1645
/// # Example
1646
///
1647
/// ```ignore
1648
/// # use polars_core::prelude::*;
1649
/// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
1650
/// let s1 = Series::new("ascii".into(), [70, 79, 79]);
1651
/// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
1652
///
1653
/// // Add 32 to get lowercase ascii values
1654
/// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
1655
/// # Ok::<(), PolarsError>(())
1656
/// ```
1657
pub fn replace_column(&mut self, index: usize, new_column: Column) -> PolarsResult<&mut Self> {
1658
polars_ensure!(
1659
index < self.width(),
1660
ShapeMismatch:
1661
"unable to replace at index {}, the DataFrame has only {} columns",
1662
index, self.width(),
1663
);
1664
1665
polars_ensure!(
1666
new_column.len() == self.height(),
1667
ShapeMismatch:
1668
"unable to replace a column, series length {} doesn't match the DataFrame height {}",
1669
new_column.len(), self.height(),
1670
);
1671
1672
unsafe { *self.columns_mut().get_mut(index).unwrap() = new_column };
1673
1674
Ok(self)
1675
}
1676
1677
/// Apply a closure to a column. This is the recommended way to do in place modification.
1678
///
1679
/// # Example
1680
///
1681
/// ```rust
1682
/// # use polars_core::prelude::*;
1683
/// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
1684
/// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
1685
/// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
1686
///
1687
/// fn str_to_len(str_val: &Column) -> Column {
1688
/// str_val.str()
1689
/// .unwrap()
1690
/// .into_iter()
1691
/// .map(|opt_name: Option<&str>| {
1692
/// opt_name.map(|name: &str| name.len() as u32)
1693
/// })
1694
/// .collect::<UInt32Chunked>()
1695
/// .into_column()
1696
/// }
1697
///
1698
/// // Replace the names column by the length of the names.
1699
/// df.apply("names", str_to_len);
1700
/// # Ok::<(), PolarsError>(())
1701
/// ```
1702
/// Results in:
1703
///
1704
/// ```text
1705
/// +--------+-------+
1706
/// | foo | names |
1707
/// | --- | --- |
1708
/// | str | u32 |
1709
/// +========+=======+
1710
/// | "ham" | 4 |
1711
/// +--------+-------+
1712
/// | "spam" | 6 |
1713
/// +--------+-------+
1714
/// | "egg" | 3 |
1715
/// +--------+-------+
1716
/// ```
1717
pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
1718
where
1719
F: FnOnce(&Column) -> C,
1720
C: IntoColumn,
1721
{
1722
let idx = self.try_get_column_index(name)?;
1723
self.apply_at_idx(idx, f)?;
1724
Ok(self)
1725
}
1726
1727
    /// Apply a closure to a column at index `idx`. This is the recommended way to do in-place
    /// modification.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
    /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
    /// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
    ///
    /// // Add 32 to get lowercase ascii values
    /// df.apply_at_idx(1, |s| s + 32);
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Results in:
    ///
    /// ```text
    /// +--------+-------+
    /// | foo    | ascii |
    /// | ---    | ---   |
    /// | str    | i32   |
    /// +========+=======+
    /// | "ham"  | 102   |
    /// +--------+-------+
    /// | "spam" | 111   |
    /// +--------+-------+
    /// | "egg"  | 111   |
    /// +--------+-------+
    /// ```
    pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
    where
        F: FnOnce(&Column) -> C,
        C: IntoColumn,
    {
        let df_height = self.height();
        let width = self.width();

        let cached_schema = self.cached_schema().cloned();

        let col = unsafe { self.columns_mut() }.get_mut(idx).ok_or_else(|| {
            polars_err!(
                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
                idx, width
            )
        })?;

        let mut new_col = f(col).into_column();

        if new_col.len() != df_height && new_col.len() == 1 {
            new_col = new_col.new_from_index(0, df_height);
        }

        polars_ensure!(
            new_col.len() == df_height,
            ShapeMismatch:
            "apply_at_idx: resulting Series has length {} while the DataFrame has height {}",
            new_col.len(), df_height
        );

        new_col = new_col.with_name(col.name().clone());
        let col_before = std::mem::replace(col, new_col);

        if col.dtype() == col_before.dtype() {
            unsafe { self.set_opt_schema(cached_schema) };
        }

        Ok(self)
    }

    /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do
    /// in-place modification.
    ///
    /// # Example
    ///
    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
    /// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
    ///
    /// let idx = vec![0, 1, 4];
    ///
    /// df.try_apply("foo", |c| {
    ///     c.str()?
    ///         .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
    /// });
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Results in:
    ///
    /// ```text
    /// +---------------------+--------+
    /// | foo                 | values |
    /// | ---                 | ---    |
    /// | str                 | i32    |
    /// +=====================+========+
    /// | "ham-is-modified"   | 1      |
    /// +---------------------+--------+
    /// | "spam-is-modified"  | 2      |
    /// +---------------------+--------+
    /// | "egg"               | 3      |
    /// +---------------------+--------+
    /// | "bacon"             | 4      |
    /// +---------------------+--------+
    /// | "quack-is-modified" | 5      |
    /// +---------------------+--------+
    /// ```
    pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
    where
        F: FnOnce(&Column) -> PolarsResult<C>,
        C: IntoColumn,
    {
        let df_height = self.height();
        let width = self.width();

        let cached_schema = self.cached_schema().cloned();

        let col = unsafe { self.columns_mut() }.get_mut(idx).ok_or_else(|| {
            polars_err!(
                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
                idx, width
            )
        })?;

        let mut new_col = f(col).map(|c| c.into_column())?;

        polars_ensure!(
            new_col.len() == df_height,
            ShapeMismatch:
            "try_apply_at_idx: resulting Series has length {} while the DataFrame has height {}",
            new_col.len(), df_height
        );

        // make sure the name remains the same after applying the closure
        new_col = new_col.with_name(col.name().clone());
        let col_before = std::mem::replace(col, new_col);

        if col.dtype() == col_before.dtype() {
            unsafe { self.set_opt_schema(cached_schema) };
        }

        Ok(self)
    }

    /// Apply a closure that may fail to a column. This is the recommended way to do in-place
    /// modification.
    ///
    /// # Example
    ///
    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
    /// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
    ///
    /// // create a mask
    /// let values = df.column("values")?.as_materialized_series();
    /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
    ///
    /// df.try_apply("foo", |c| {
    ///     c.str()?
    ///         .set(&mask, Some("not_within_bounds"))
    /// });
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Results in:
    ///
    /// ```text
    /// +---------------------+--------+
    /// | foo                 | values |
    /// | ---                 | ---    |
    /// | str                 | i32    |
    /// +=====================+========+
    /// | "not_within_bounds" | 1      |
    /// +---------------------+--------+
    /// | "spam"              | 2      |
    /// +---------------------+--------+
    /// | "egg"               | 3      |
    /// +---------------------+--------+
    /// | "bacon"             | 4      |
    /// +---------------------+--------+
    /// | "not_within_bounds" | 5      |
    /// +---------------------+--------+
    /// ```
    pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
    where
        F: FnOnce(&Series) -> PolarsResult<C>,
        C: IntoColumn,
    {
        let idx = self.try_get_column_index(column)?;
        self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
    }

    /// Slice the [`DataFrame`] along the rows.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
    ///                         "Color" => ["Green", "Red", "White", "White", "Red"])?;
    /// let sl: DataFrame = df.slice(2, 3);
    ///
    /// assert_eq!(sl.shape(), (3, 2));
    /// println!("{}", sl);
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Output:
    /// ```text
    /// shape: (3, 2)
    /// +-------+-------+
    /// | Fruit | Color |
    /// | ---   | ---   |
    /// | str   | str   |
    /// +=======+=======+
    /// | Grape | White |
    /// +-------+-------+
    /// | Fig   | White |
    /// +-------+-------+
    /// | Fig   | Red   |
    /// +-------+-------+
    /// ```
    #[must_use]
    pub fn slice(&self, offset: i64, length: usize) -> Self {
        if offset == 0 && length == self.height() {
            return self.clone();
        }

        if length == 0 {
            return self.clear();
        }

        let cols = self.apply_columns(|s| s.slice(offset, length));

        let height = if let Some(fst) = cols.first() {
            fst.len()
        } else {
            let (_, length) = slice_offsets(offset, length, self.height());
            length
        };

        unsafe { DataFrame::_new_unchecked_impl(height, cols).with_schema_from(self) }
    }

    /// Split [`DataFrame`] at the given `offset`.
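    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); the column values are illustrative only.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("x" => [1, 2, 3, 4])?;
    /// let (front, back) = df.split_at(2);
    /// assert_eq!(front.height(), 2);
    /// assert_eq!(back.height(), 2);
    /// # Ok::<(), PolarsError>(())
    /// ```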
    pub fn split_at(&self, offset: i64) -> (Self, Self) {
        let (a, b) = self.columns().iter().map(|s| s.split_at(offset)).unzip();

        let (idx, _) = slice_offsets(offset, 0, self.height());

        let a = unsafe { DataFrame::new_unchecked(idx, a).with_schema_from(self) };
        let b = unsafe { DataFrame::new_unchecked(self.height() - idx, b).with_schema_from(self) };
        (a, b)
    }

    #[must_use]
    pub fn clear(&self) -> Self {
        let cols = self.columns().iter().map(|s| s.clear()).collect::<Vec<_>>();
        unsafe { DataFrame::_new_unchecked_impl(0, cols).with_schema_from(self) }
    }

    #[must_use]
    pub fn slice_par(&self, offset: i64, length: usize) -> Self {
        if offset == 0 && length == self.height() {
            return self.clone();
        }
        let columns = self.apply_columns_par(|s| s.slice(offset, length));
        unsafe { DataFrame::new_unchecked(length, columns).with_schema_from(self) }
    }

    #[must_use]
    pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
        if offset == 0 && length == self.height() {
            return self.clone();
        }
        // @scalar-opt
        let columns = self.apply_columns(|s| {
            let mut out = s.slice(offset, length);
            out.shrink_to_fit();
            out
        });
        unsafe { DataFrame::new_unchecked(length, columns).with_schema_from(self) }
    }

    /// Get the head of the [`DataFrame`].
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let countries: DataFrame =
    ///     df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
    ///         "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
    ///         "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
    ///         "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
    /// assert_eq!(countries.shape(), (5, 4));
    ///
    /// println!("{}", countries.head(Some(3)));
    /// # Ok::<(), PolarsError>(())
    /// ```
    ///
    /// Output:
    ///
    /// ```text
    /// shape: (3, 4)
    /// +--------------------+---------------+---------------+------------+
    /// | Rank by GDP (2021) | Continent     | Country       | Capital    |
    /// | ---                | ---           | ---           | ---        |
    /// | i32                | str           | str           | str        |
    /// +====================+===============+===============+============+
    /// | 1                  | North America | United States | Washington |
    /// +--------------------+---------------+---------------+------------+
    /// | 2                  | Asia          | China         | Beijing    |
    /// +--------------------+---------------+---------------+------------+
    /// | 3                  | Asia          | Japan         | Tokyo      |
    /// +--------------------+---------------+---------------+------------+
    /// ```
    #[must_use]
    pub fn head(&self, length: Option<usize>) -> Self {
        let new_height = usize::min(self.height(), length.unwrap_or(HEAD_DEFAULT_LENGTH));
        let new_cols = self.apply_columns(|c| c.head(Some(new_height)));

        unsafe { DataFrame::new_unchecked(new_height, new_cols).with_schema_from(self) }
    }

    /// Get the tail of the [`DataFrame`].
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let countries: DataFrame =
    ///     df!("Rank (2021)" => [105, 106, 107, 108, 109],
    ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
    ///         "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
    /// assert_eq!(countries.shape(), (5, 3));
    ///
    /// println!("{}", countries.tail(Some(2)));
    /// # Ok::<(), PolarsError>(())
    /// ```
    ///
    /// Output:
    ///
    /// ```text
    /// shape: (2, 3)
    /// +-------------+--------------------+---------+
    /// | Rank (2021) | Apple Price (€/kg) | Country |
    /// | ---         | ---                | ---     |
    /// | i32         | f64                | str     |
    /// +=============+====================+=========+
    /// | 108         | 0.65               | Syria   |
    /// +-------------+--------------------+---------+
    /// | 109         | 0.52               | Turkey  |
    /// +-------------+--------------------+---------+
    /// ```
    #[must_use]
    pub fn tail(&self, length: Option<usize>) -> Self {
        let new_height = usize::min(self.height(), length.unwrap_or(TAIL_DEFAULT_LENGTH));
        let new_cols = self.apply_columns(|c| c.tail(Some(new_height)));

        unsafe { DataFrame::new_unchecked(new_height, new_cols).with_schema_from(self) }
    }

    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
    ///
    /// # Panics
    ///
    /// Panics if the [`DataFrame`] that is passed is not rechunked.
    ///
    /// This responsibility is left to the caller as we don't want to take mutable references here,
    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
    /// as well.
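    ///
    /// # Example
    ///
    /// A minimal sketch of the intended call pattern (marked `ignore`, so it is not compiled as
    /// a doctest); it assumes the caller has rechunked the frame first, as described above.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("a" => [1, 2, 3])?;
    /// df.rechunk_mut();
    /// for batch in df.iter_chunks(CompatLevel::newest(), false) {
    ///     // each `batch` is an arrow RecordBatch
    ///     assert_eq!(batch.len(), 3);
    /// }
    /// # Ok::<(), PolarsError>(())
    /// ```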
    pub fn iter_chunks(
        &self,
        compat_level: CompatLevel,
        parallel: bool,
    ) -> impl Iterator<Item = RecordBatch> + '_ {
        debug_assert!(!self.should_rechunk(), "expected equal chunks");

        if self.width() == 0 {
            return RecordBatchIterWrap::new_zero_width(self.height());
        }

        // Only allow parallelism when we must convert binview columns (`compat_level` 0),
        // as that requires allocating arrow strings/binaries.
        let must_convert = compat_level.0 == 0;
        let parallel = parallel
            && must_convert
            && self.width() > 1
            && self
                .columns()
                .iter()
                .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));

        RecordBatchIterWrap::Batches(RecordBatchIter {
            df: self,
            schema: Arc::new(
                self.columns()
                    .iter()
                    .map(|c| c.field().to_arrow(compat_level))
                    .collect(),
            ),
            idx: 0,
            n_chunks: usize::max(1, self.first_col_n_chunks()),
            compat_level,
            parallel,
        })
    }

    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches of physical values.
    ///
    /// # Panics
    ///
    /// Panics if the [`DataFrame`] that is passed is not rechunked.
    ///
    /// This responsibility is left to the caller as we don't want to take mutable references here,
    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
    /// as well.
    pub fn iter_chunks_physical(&self) -> impl Iterator<Item = RecordBatch> + '_ {
        debug_assert!(!self.should_rechunk());

        if self.width() == 0 {
            return RecordBatchIterWrap::new_zero_width(self.height());
        }

        RecordBatchIterWrap::PhysicalBatches(PhysRecordBatchIter {
            schema: Arc::new(
                self.columns()
                    .iter()
                    .map(|c| c.field().to_arrow(CompatLevel::newest()))
                    .collect(),
            ),
            arr_iters: self
                .materialized_column_iter()
                .map(|s| s.chunks().iter())
                .collect(),
        })
    }

    /// Get a [`DataFrame`] with each of its columns reversed, i.e. the rows in reverse order.
    #[must_use]
    pub fn reverse(&self) -> Self {
        let new_cols = self.apply_columns(Column::reverse);
        unsafe { DataFrame::new_unchecked(self.height(), new_cols).with_schema_from(self) }
    }

    /// Shift the values by a given period and fill the parts that will be empty due to this operation
    /// with `Nones`.
    ///
    /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
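    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); the values are illustrative only.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3])?;
    /// // Shift all columns down by one row; the first row becomes null.
    /// let shifted = df.shift(1);
    /// assert_eq!(shifted.height(), df.height());
    /// # Ok::<(), PolarsError>(())
    /// ```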
    #[must_use]
    pub fn shift(&self, periods: i64) -> Self {
        let col = self.apply_columns_par(|s| s.shift(periods));
        unsafe { DataFrame::new_unchecked(self.height(), col).with_schema_from(self) }
    }

    /// Replace None values with one of the following strategies:
    /// * Forward fill (replace None with the previous value)
    /// * Backward fill (replace None with the next value)
    /// * Mean fill (replace None with the mean of the whole array)
    /// * Min fill (replace None with the minimum of the whole array)
    /// * Max fill (replace None with the maximum of the whole array)
    ///
    /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
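    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); it assumes the `Mean` variant of [`FillNullStrategy`] listed above.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [Some(1.0), None, Some(3.0)])?;
    /// let filled = df.fill_null(FillNullStrategy::Mean)?;
    /// assert_eq!(filled.column("a")?.null_count(), 0);
    /// # Ok::<(), PolarsError>(())
    /// ```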
    pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
        let col = self.try_apply_columns_par(|s| s.fill_null(strategy))?;

        Ok(unsafe { DataFrame::new_unchecked(self.height(), col) })
    }

    /// Pipe different functions/closure operations that work on a DataFrame together.
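    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); `frame_shape` is a hypothetical helper for illustration only.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// fn frame_shape(df: DataFrame) -> PolarsResult<(usize, usize)> {
    ///     Ok(df.shape())
    /// }
    ///
    /// let df: DataFrame = df!("a" => [1, 2, 3])?;
    /// assert_eq!(df.pipe(frame_shape)?, (3, 1));
    /// # Ok::<(), PolarsError>(())
    /// ```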
    pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
    where
        F: Fn(DataFrame) -> PolarsResult<B>,
    {
        f(self)
    }

    /// Pipe different functions/closure operations that work on a DataFrame together.
    pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
    where
        F: Fn(&mut DataFrame) -> PolarsResult<B>,
    {
        f(self)
    }

    /// Pipe different functions/closure operations that work on a DataFrame together.
    pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
    where
        F: Fn(DataFrame, Args) -> PolarsResult<B>,
    {
        f(self, args)
    }
    /// Drop duplicate rows from a [`DataFrame`].
    /// *This fails when there is a column of type List in the DataFrame.*
    ///
    /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df = df! {
    ///     "flt" => [1., 1., 2., 2., 3., 3.],
    ///     "int" => [1, 1, 2, 2, 3, 3, ],
    ///     "str" => ["a", "a", "b", "b", "c", "c"]
    /// }?;
    ///
    /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Returns
    ///
    /// ```text
    /// +-----+-----+-----+
    /// | flt | int | str |
    /// | --- | --- | --- |
    /// | f64 | i32 | str |
    /// +=====+=====+=====+
    /// | 1   | 1   | "a" |
    /// +-----+-----+-----+
    /// | 2   | 2   | "b" |
    /// +-----+-----+-----+
    /// | 3   | 3   | "c" |
    /// +-----+-----+-----+
    /// ```
    #[cfg(feature = "algorithm_group_by")]
    pub fn unique_stable(
        &self,
        subset: Option<&[String]>,
        keep: UniqueKeepStrategy,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<DataFrame> {
        self.unique_impl(
            true,
            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
            keep,
            slice,
        )
    }

    /// Unstable distinct. See [`DataFrame::unique_stable`].
    #[cfg(feature = "algorithm_group_by")]
    pub fn unique<I, S>(
        &self,
        subset: Option<&[String]>,
        keep: UniqueKeepStrategy,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<DataFrame> {
        self.unique_impl(
            false,
            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
            keep,
            slice,
        )
    }

    #[cfg(feature = "algorithm_group_by")]
    pub fn unique_impl(
        &self,
        maintain_order: bool,
        subset: Option<Vec<PlSmallStr>>,
        keep: UniqueKeepStrategy,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<Self> {
        if self.width() == 0 {
            let height = usize::min(self.height(), 1);
            return Ok(DataFrame::empty_with_height(height));
        }

        let names = subset.unwrap_or_else(|| self.get_column_names_owned());
        let mut df = self.clone();
        // take on multiple chunks is terrible
        df.rechunk_mut_par();

        let columns = match (keep, maintain_order) {
            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
                let gb = df.group_by_stable(names)?;
                let groups = gb.get_groups();
                let (offset, len) = slice.unwrap_or((0, groups.len()));
                let groups = groups.slice(offset, len);
                df.apply_columns_par(|s| unsafe { s.agg_first(&groups) })
            },
            (UniqueKeepStrategy::Last, true) => {
                // maintain order by last values, so the sorted groups are not correct as they
                // are sorted by the first value
                let gb = df.group_by_stable(names)?;
                let groups = gb.get_groups();

                let last_idx: NoNull<IdxCa> = groups
                    .iter()
                    .map(|g| match g {
                        GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
                        GroupsIndicator::Slice([first, len]) => first + len - 1,
                    })
                    .collect();

                let mut last_idx = last_idx.into_inner().sort(false);

                if let Some((offset, len)) = slice {
                    last_idx = last_idx.slice(offset, len);
                }

                let last_idx = NoNull::new(last_idx);
                let out = unsafe { df.take_unchecked(&last_idx) };
                return Ok(out);
            },
            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
                let gb = df.group_by(names)?;
                let groups = gb.get_groups();
                let (offset, len) = slice.unwrap_or((0, groups.len()));
                let groups = groups.slice(offset, len);
                df.apply_columns_par(|s| unsafe { s.agg_first(&groups) })
            },
            (UniqueKeepStrategy::Last, false) => {
                let gb = df.group_by(names)?;
                let groups = gb.get_groups();
                let (offset, len) = slice.unwrap_or((0, groups.len()));
                let groups = groups.slice(offset, len);
                df.apply_columns_par(|s| unsafe { s.agg_last(&groups) })
            },
            (UniqueKeepStrategy::None, _) => {
                let df_part = df.select(names)?;
                let mask = df_part.is_unique()?;
                let mut filtered = df.filter(&mask)?;

                if let Some((offset, len)) = slice {
                    filtered = filtered.slice(offset, len);
                }
                return Ok(filtered);
            },
        };
        Ok(unsafe { DataFrame::new_unchecked_infer_height(columns).with_schema_from(self) })
    }

    /// Get a mask of all the unique rows in the [`DataFrame`].
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
    ///                         "ISIN" => ["US0378331005", "US5949181045"])?;
    /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
    ///
    /// assert!(ca.all());
    /// # Ok::<(), PolarsError>(())
    /// ```
    #[cfg(feature = "algorithm_group_by")]
    pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
        let gb = self.group_by(self.get_column_names_owned())?;
        let groups = gb.get_groups();
        Ok(is_unique_helper(
            groups,
            self.height() as IdxSize,
            true,
            false,
        ))
    }

    /// Get a mask of all the duplicated rows in the [`DataFrame`].
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
    ///                         "ISIN" => ["US02079K3059", "US02079K1079"])?;
    /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
    ///
    /// assert!(!ca.all());
    /// # Ok::<(), PolarsError>(())
    /// ```
    #[cfg(feature = "algorithm_group_by")]
    pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
        let gb = self.group_by(self.get_column_names_owned())?;
        let groups = gb.get_groups();
        Ok(is_unique_helper(
            groups,
            self.height() as IdxSize,
            false,
            true,
        ))
    }

    /// Create a new [`DataFrame`] that shows the null counts per column.
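    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); the values are illustrative only.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [Some(1), None, Some(3)],
    ///                         "b" => [Some("x"), Some("y"), None])?;
    /// let counts = df.null_count();
    /// // One row, with one count per column.
    /// assert_eq!(counts.shape(), (1, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```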
    #[must_use]
    pub fn null_count(&self) -> Self {
        let cols =
            self.apply_columns(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]));
        unsafe { Self::new_unchecked(1, cols) }
    }

    /// Hash and combine the row values
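    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); passing `None` uses a default hasher builder.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("a" => [1, 2, 3], "b" => ["x", "y", "z"])?;
    /// let hashes = df.hash_rows(None)?;
    /// assert_eq!(hashes.len(), df.height());
    /// # Ok::<(), PolarsError>(())
    /// ```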
    #[cfg(feature = "row_hash")]
    pub fn hash_rows(
        &mut self,
        hasher_builder: Option<PlSeedableRandomStateQuality>,
    ) -> PolarsResult<UInt64Chunked> {
        let dfs = split_df(self, POOL.current_num_threads(), false);
        let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;

        let mut iter = cas.into_iter();
        let mut acc_ca = iter.next().unwrap();
        for ca in iter {
            acc_ca.append(&ca)?;
        }
        Ok(acc_ca.rechunk().into_owned())
    }

    /// Get the supertype of the columns in this DataFrame
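    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); the mixed integer/float frame is illustrative only.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("ints" => [1, 2, 3], "floats" => [1.0, 2.0, 3.0])?;
    /// let st = df.get_supertype().unwrap()?;
    /// assert_eq!(st, DataType::Float64);
    /// # Ok::<(), PolarsError>(())
    /// ```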
    pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
        self.columns()
            .iter()
            .map(|s| Ok(s.dtype().clone()))
            .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
    }

    /// Take by index values given by the slice `idx`.
    /// # Warning
    /// Be careful with allowing threads when calling this in a large hot loop:
    /// every thread split may end up on the rayon stack and lead to a stack overflow.
    #[doc(hidden)]
    pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
        self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
    }

    /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
    /// if the index values in `idx` are sorted. This will maintain sorted flags.
    ///
    /// # Warning
    /// Be careful with allowing threads when calling this in a large hot loop:
    /// every thread split may end up on the rayon stack and lead to a stack overflow.
    #[doc(hidden)]
    pub unsafe fn _take_unchecked_slice_sorted(
        &self,
        idx: &[IdxSize],
        allow_threads: bool,
        sorted: IsSorted,
    ) -> Self {
        #[cfg(debug_assertions)]
        {
            if idx.len() > 2 {
                use crate::series::IsSorted;

                match sorted {
                    IsSorted::Ascending => {
                        assert!(idx[0] <= idx[idx.len() - 1]);
                    },
                    IsSorted::Descending => {
                        assert!(idx[0] >= idx[idx.len() - 1]);
                    },
                    _ => {},
                }
            }
        }
        let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
        ca.set_sorted_flag(sorted);
        self.take_unchecked_impl(&ca, allow_threads)
    }
    #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
    #[doc(hidden)]
    pub fn _partition_by_impl(
        &self,
        cols: &[PlSmallStr],
        stable: bool,
        include_key: bool,
        parallel: bool,
    ) -> PolarsResult<Vec<DataFrame>> {
        let selected_keys = self.select_to_vec(cols.iter().cloned())?;
        let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
        let groups = groups.into_groups();

        // drop key columns prior to calculation if requested
        let df = if include_key {
            self.clone()
        } else {
            self.drop_many(cols.iter().cloned())
        };

        if parallel {
            // don't parallelize this
            // there is a lot of parallelization in take and this may easily SO
            POOL.install(|| {
                match groups.as_ref() {
                    GroupsType::Idx(idx) => {
                        // Rechunk as the gather may rechunk for every group #17562.
                        let mut df = df.clone();
                        df.rechunk_mut_par();
                        Ok(idx
                            .into_par_iter()
                            .map(|(_, group)| {
                                // groups are in bounds
                                unsafe {
                                    df._take_unchecked_slice_sorted(
                                        group,
                                        false,
                                        IsSorted::Ascending,
                                    )
                                }
                            })
                            .collect())
                    },
                    GroupsType::Slice { groups, .. } => Ok(groups
                        .into_par_iter()
                        .map(|[first, len]| df.slice(*first as i64, *len as usize))
                        .collect()),
                }
            })
        } else {
            match groups.as_ref() {
                GroupsType::Idx(idx) => {
                    // Rechunk as the gather may rechunk for every group #17562.
                    let mut df = df;
                    df.rechunk_mut();
                    Ok(idx
                        .into_iter()
                        .map(|(_, group)| {
                            // groups are in bounds
                            unsafe {
                                df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
                            }
                        })
                        .collect())
                },
                GroupsType::Slice { groups, .. } => Ok(groups
                    .iter()
                    .map(|[first, len]| df.slice(*first as i64, *len as usize))
                    .collect()),
            }
        }
    }

    /// Split into multiple DataFrames partitioned by groups
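    ///
    /// # Example
    ///
    /// A minimal sketch of the intended usage (marked `ignore`, so it is not compiled as a
    /// doctest); the values are illustrative only.
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("group" => ["a", "a", "b"], "value" => [1, 2, 3])?;
    /// let parts = df.partition_by(["group"], true)?;
    /// // One DataFrame per distinct value in the key column.
    /// assert_eq!(parts.len(), 2);
    /// # Ok::<(), PolarsError>(())
    /// ```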
    #[cfg(feature = "partition_by")]
    pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
    where
        I: IntoIterator<Item = S>,
        S: Into<PlSmallStr>,
    {
        let cols: UnitVec<PlSmallStr> = cols.into_iter().map(Into::into).collect();
        self._partition_by_impl(cols.as_slice(), false, include_key, true)
    }

    /// Split into multiple DataFrames partitioned by groups.
    /// The order of the groups is maintained.
    #[cfg(feature = "partition_by")]
    pub fn partition_by_stable<I, S>(
        &self,
        cols: I,
        include_key: bool,
    ) -> PolarsResult<Vec<DataFrame>>
    where
        I: IntoIterator<Item = S>,
        S: Into<PlSmallStr>,
    {
        let cols: UnitVec<PlSmallStr> = cols.into_iter().map(Into::into).collect();
        self._partition_by_impl(cols.as_slice(), true, include_key, true)
    }

    /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
    /// inserted as columns.
    #[cfg(feature = "dtype-struct")]
    pub fn unnest(
        &self,
        cols: impl IntoIterator<Item = impl Into<PlSmallStr>>,
        separator: Option<&str>,
    ) -> PolarsResult<DataFrame> {
        self.unnest_impl(cols.into_iter().map(Into::into).collect(), separator)
    }

    #[cfg(feature = "dtype-struct")]
    fn unnest_impl(
        &self,
        cols: PlHashSet<PlSmallStr>,
        separator: Option<&str>,
    ) -> PolarsResult<DataFrame> {
        let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
        let mut count = 0;
        for s in self.columns() {
            if cols.contains(s.name()) {
                let ca = s.struct_()?.clone();
                new_cols.extend(ca.fields_as_series().into_iter().map(|mut f| {
                    if let Some(separator) = &separator {
                        f.rename(polars_utils::format_pl_smallstr!(
                            "{}{}{}",
                            s.name(),
                            separator,
                            f.name()
                        ));
                    }
                    Column::from(f)
                }));
                count += 1;
            } else {
                new_cols.push(s.clone())
            }
        }
        if count != cols.len() {
            // one or more columns not found
            // the code below will return an error with the missing name
            let schema = self.schema();
            for col in cols {
                let _ = schema
                    .get(col.as_str())
                    .ok_or_else(|| polars_err!(col_not_found = col))?;
            }
        }

        DataFrame::new_infer_height(new_cols)
    }

    pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
        // @Optimize: this does a lot of unnecessary allocations. We should probably have an
        // `append_chunk` or something like this. It is just quite difficult to make that safe.
        let df = DataFrame::from(rb);
        polars_ensure!(
            self.schema() == df.schema(),
            SchemaMismatch: "cannot append record batch with different schema\n\n
            Got {:?}\nexpected: {:?}", df.schema(), self.schema(),
        );
        self.vstack_mut_owned_unchecked(df);
        Ok(())
    }
}

pub struct RecordBatchIter<'a> {
    df: &'a DataFrame,
    schema: ArrowSchemaRef,
    idx: usize,
    n_chunks: usize,
    compat_level: CompatLevel,
    parallel: bool,
}

impl Iterator for RecordBatchIter<'_> {
    type Item = RecordBatch;

    fn next(&mut self) -> Option<Self::Item> {
        if self.idx >= self.n_chunks {
            return None;
        }

        // Create a batch of the columns with the same chunk no.
        let batch_cols: Vec<ArrayRef> = if self.parallel {
            let iter = self
                .df
                .columns()
                .par_iter()
                .map(Column::as_materialized_series)
                .map(|s| s.to_arrow(self.idx, self.compat_level));
            POOL.install(|| iter.collect())
        } else {
            self.df
                .columns()
                .iter()
                .map(Column::as_materialized_series)
                .map(|s| s.to_arrow(self.idx, self.compat_level))
                .collect()
        };

        let length = batch_cols.first().map_or(0, |arr| arr.len());

        self.idx += 1;

        Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let n = self.n_chunks - self.idx;
        (n, Some(n))
    }
}

pub struct PhysRecordBatchIter<'a> {
    schema: ArrowSchemaRef,
    arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
}

impl Iterator for PhysRecordBatchIter<'_> {
    type Item = RecordBatch;

    fn next(&mut self) -> Option<Self::Item> {
        let arrs = self
            .arr_iters
            .iter_mut()
            .map(|phys_iter| phys_iter.next().cloned())
            .collect::<Option<Vec<_>>>()?;

        let length = arrs.first().map_or(0, |arr| arr.len());
        Some(RecordBatch::new(length, self.schema.clone(), arrs))
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        if let Some(iter) = self.arr_iters.first() {
            iter.size_hint()
        } else {
            (0, None)
        }
    }
}

pub enum RecordBatchIterWrap<'a> {
    ZeroWidth {
        remaining_height: usize,
        chunk_size: usize,
    },
    Batches(RecordBatchIter<'a>),
    PhysicalBatches(PhysRecordBatchIter<'a>),
}

impl<'a> RecordBatchIterWrap<'a> {
    fn new_zero_width(height: usize) -> Self {
        Self::ZeroWidth {
            remaining_height: height,
            chunk_size: polars_config::config().ideal_morsel_size() as usize,
        }
    }
}

impl Iterator for RecordBatchIterWrap<'_> {
    type Item = RecordBatch;

    fn next(&mut self) -> Option<Self::Item> {
        match self {
            Self::ZeroWidth {
                remaining_height,
                chunk_size,
            } => {
                let n = usize::min(*remaining_height, *chunk_size);
                *remaining_height -= n;

                (n > 0).then(|| RecordBatch::new(n, ArrowSchemaRef::default(), vec![]))
            },
            Self::Batches(v) => v.next(),
            Self::PhysicalBatches(v) => v.next(),
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        match self {
            Self::ZeroWidth {
                remaining_height,
                chunk_size,
            } => {
                let n = remaining_height.div_ceil(*chunk_size);
                (n, Some(n))
            },
            Self::Batches(v) => v.size_hint(),
            Self::PhysicalBatches(v) => v.size_hint(),
        }
    }
}

// utility to test if we can vstack/extend the columns
fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
    polars_ensure!(
        left.name() == right.name(),
        ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
        left.name(), right.name(),
    );
    Ok(())
}

#[cfg(test)]
mod test {
    use super::*;

    fn create_frame() -> DataFrame {
        let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
        let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
        DataFrame::new_infer_height(vec![s0, s1]).unwrap()
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_recordbatch_iterator() {
        let df = df!(
            "foo" => [1, 2, 3, 4, 5]
        )
        .unwrap();
        let mut iter = df.iter_chunks(CompatLevel::newest(), false);
        assert_eq!(5, iter.next().unwrap().len());
        assert!(iter.next().is_none());
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_select() {
        let df = create_frame();
        assert_eq!(
            df.column("days")
                .unwrap()
                .as_series()
                .unwrap()
                .equal(1)
                .unwrap()
                .sum(),
            Some(1)
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_string_col() {
        let col_name = "some_col";
        let v = vec!["test".to_string()];
        let s0 = Column::new(PlSmallStr::from_str(col_name), v);
        let mut df = DataFrame::new_infer_height(vec![s0]).unwrap();

        df = df
            .filter(
                &df.column(col_name)
                    .unwrap()
                    .as_materialized_series()
                    .equal("")
                    .unwrap(),
            )
            .unwrap();
        assert_eq!(
            df.column(col_name)
                .unwrap()
                .as_materialized_series()
                .n_chunks(),
            1
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_list_col() {
        let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
        let ll: ListChunked = [&s1].iter().copied().collect();

        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
        let new = ll.filter(&mask).unwrap();

        assert_eq!(new.chunks.len(), 1);
        assert_eq!(new.len(), 0);
    }

    #[test]
    fn slice() {
        let df = create_frame();
        let sliced_df = df.slice(0, 2);
        assert_eq!(sliced_df.shape(), (2, 2));
    }

    #[test]
    fn rechunk_false() {
        let df = create_frame();
        assert!(!df.should_rechunk())
    }

    #[test]
    fn rechunk_true() -> PolarsResult<()> {
        let mut base = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // Create a series with multiple chunks
        let mut s = Series::new("foo".into(), 0..2);
        let s2 = Series::new("bar".into(), 0..1);
        s.append(&s2)?;

        // Append series to frame
        let out = base.with_column(s.into_column())?;

        // Now we should rechunk
        assert!(out.should_rechunk());
        Ok(())
    }

    #[test]
    fn test_duplicate_column() {
        let mut df = df! {
            "foo" => [1, 2, 3]
        }
        .unwrap();
        // check if column is replaced
        assert!(
            df.with_column(Column::new("foo".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(
            df.with_column(Column::new("bar".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(df.column("bar").is_ok())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn distinct() {
        let df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();
        let df = df
            .unique_stable(None, UniqueKeepStrategy::First, None)
            .unwrap()
            .sort(["flt"], SortMultipleOptions::default())
            .unwrap();
        let valid = df! {
            "flt" => [1., 2., 3.],
            "int" => [1, 2, 3],
            "str" => ["a", "b", "c"]
        }
        .unwrap();
        assert!(df.equals(&valid));
    }

    #[test]
    fn test_vstack() {
        // check that it does not accidentally rechunk
        let mut df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df.slice(0, 3)).unwrap();
        assert_eq!(df.first_col_n_chunks(), 2)
    }

    #[test]
    fn test_vstack_on_empty_dataframe() {
        let mut df = DataFrame::empty();

        let df_data = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df_data).unwrap();
        assert_eq!(df.height(), 6)
    }

    #[test]
    fn test_unique_keep_none_with_slice() {
        let df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();
        let out = df
            .unique_stable(
                Some(&["x".to_string()][..]),
                UniqueKeepStrategy::None,
                Some((0, 2)),
            )
            .unwrap();
        let expected = df! {
            "x" => [3]
        }
        .unwrap();
        assert!(out.equals(&expected));
    }

    #[test]
    #[cfg(feature = "dtype-i8")]
    fn test_apply_result_schema() {
        let mut df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();

        let schema_before = df.schema().clone();
        df.apply("x", |f| f.cast(&DataType::Int8).unwrap()).unwrap();
        assert_ne!(&schema_before, df.schema());
    }
}