GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-core/src/frame/mod.rs
1
#![allow(unsafe_op_in_unsafe_fn)]
2
//! DataFrame module.
3
use std::sync::OnceLock;
4
use std::{mem, ops};
5
6
use arrow::datatypes::ArrowSchemaRef;
7
use polars_row::ArrayRef;
8
use polars_schema::schema::ensure_matching_schema_names;
9
use polars_utils::itertools::Itertools;
10
use rayon::prelude::*;
11
12
use crate::chunked_array::flags::StatisticsFlags;
13
#[cfg(feature = "algorithm_group_by")]
14
use crate::chunked_array::ops::unique::is_unique_helper;
15
use crate::prelude::*;
16
#[cfg(feature = "row_hash")]
17
use crate::utils::split_df;
18
use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19
use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21
#[cfg(feature = "dataframe_arithmetic")]
22
mod arithmetic;
23
pub mod builder;
24
mod chunks;
25
pub use chunks::chunk_df_for_writing;
26
pub mod column;
27
pub mod explode;
28
mod from;
29
#[cfg(feature = "algorithm_group_by")]
30
pub mod group_by;
31
pub(crate) mod horizontal;
32
#[cfg(any(feature = "rows", feature = "object"))]
33
pub mod row;
34
mod top_k;
35
mod upstream_traits;
36
mod validation;
37
38
use arrow::record_batch::{RecordBatch, RecordBatchT};
39
use polars_utils::pl_str::PlSmallStr;
40
#[cfg(feature = "serde")]
41
use serde::{Deserialize, Serialize};
42
use strum_macros::IntoStaticStr;
43
44
use crate::POOL;
45
#[cfg(feature = "row_hash")]
46
use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47
use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48
use crate::series::IsSorted;
49
50
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
51
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
52
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
53
#[strum(serialize_all = "snake_case")]
54
pub enum UniqueKeepStrategy {
55
/// Keep the first unique row.
56
First,
57
/// Keep the last unique row.
58
Last,
59
/// Keep None of the unique rows.
60
None,
61
/// Keep any of the unique rows.
/// This allows more optimizations.
63
#[default]
64
Any,
65
}
66
67
fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
68
where
69
F: for<'a> FnMut(&'a T) -> &'a str,
70
{
71
// Always unique.
72
if items.len() <= 1 {
73
return Ok(());
74
}
75
76
if items.len() <= 4 {
77
// Too small to be worth spawning a hashmap for; this is at most 6 comparisons.
78
for i in 0..items.len() - 1 {
79
let name = get_name(&items[i]);
80
for other in items.iter().skip(i + 1) {
81
if name == get_name(other) {
82
polars_bail!(duplicate = name);
83
}
84
}
85
}
86
} else {
87
let mut names = PlHashSet::with_capacity(items.len());
88
for item in items {
89
let name = get_name(item);
90
if !names.insert(name) {
91
polars_bail!(duplicate = name);
92
}
93
}
94
}
95
Ok(())
96
}
97
98
/// A contiguous growable collection of `Series` that have the same length.
99
///
100
/// ## Use declarations
101
///
102
/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
103
///
104
/// ```rust
105
/// use polars_core::prelude::*; // if the crate polars-core is used directly
106
/// // use polars::prelude::*; if the crate polars is used
107
/// ```
108
///
109
/// # Initialization
110
/// ## Default
111
///
112
/// A `DataFrame` can be initialized empty:
113
///
114
/// ```rust
115
/// # use polars_core::prelude::*;
116
/// let df = DataFrame::default();
117
/// assert!(df.is_empty());
118
/// ```
119
///
120
/// ## Wrapping a `Vec<Series>`
121
///
122
/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.
123
///
124
/// ```rust
125
/// # use polars_core::prelude::*;
126
/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
127
/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
128
///
129
/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
130
/// ```
131
///
132
/// ## Using a macro
133
///
134
/// The [`df!`] macro is a convenient method:
135
///
136
/// ```rust
137
/// # use polars_core::prelude::*;
138
/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
139
/// "Color" => ["Red", "Yellow", "Green"]);
140
/// ```
141
///
142
/// ## Using a CSV file
143
///
144
/// See the `polars_io::csv::CsvReader`.
145
///
146
/// # Indexing
147
/// ## By a number
148
///
149
/// The `Index<usize>` is implemented for the `DataFrame`.
150
///
151
/// ```rust
152
/// # use polars_core::prelude::*;
153
/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
154
/// "Color" => ["Red", "Yellow", "Green"])?;
155
///
156
/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
157
/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
158
/// # Ok::<(), PolarsError>(())
159
/// ```
160
///
161
/// ## By a `Series` name
162
///
163
/// ```rust
164
/// # use polars_core::prelude::*;
165
/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
166
/// "Color" => ["Red", "Yellow", "Green"])?;
167
///
168
/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
169
/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
170
/// # Ok::<(), PolarsError>(())
171
/// ```
172
#[derive(Clone)]
173
pub struct DataFrame {
174
height: usize,
175
// invariant: columns[i].len() == height for each 0 <= i < columns.len()
176
pub(crate) columns: Vec<Column>,
177
178
/// A cached schema. This might not give correct results if the DataFrame was modified in place
/// between caching the schema and reading it.
180
cached_schema: OnceLock<SchemaRef>,
181
}
182
183
impl DataFrame {
184
pub fn clear_schema(&mut self) {
185
self.cached_schema = OnceLock::new();
186
}
187
188
#[inline]
189
pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
190
self.columns.iter()
191
}
192
193
#[inline]
194
pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
195
self.columns.iter().map(Column::as_materialized_series)
196
}
197
198
#[inline]
199
pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
200
self.columns.par_iter().map(Column::as_materialized_series)
201
}
202
203
/// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
204
///
205
/// # Implementation
206
/// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
207
/// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
208
/// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
209
///
210
/// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
211
/// However, this function will yield a smaller number. This is because this function returns
212
/// the visible size of the buffer, not its total capacity.
213
///
214
/// FFI buffers are included in this estimation.
215
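///
/// # Example
///
/// A minimal illustrative sketch (values are arbitrary; the exact byte count depends on the
/// backing buffers):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("x" => [1i64, 2, 3])?;
/// // Three i64 values occupy at least 24 bytes of buffer space.
/// assert!(df.estimated_size() >= 3 * std::mem::size_of::<i64>());
/// # Ok::<(), PolarsError>(())
/// ```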
pub fn estimated_size(&self) -> usize {
216
self.columns.iter().map(Column::estimated_size).sum()
217
}
218
219
// Reduce monomorphization.
220
fn try_apply_columns(
221
&self,
222
func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
223
) -> PolarsResult<Vec<Column>> {
224
self.columns.iter().map(func).collect()
225
}
226
// Reduce monomorphization.
227
pub fn _apply_columns(&self, func: &dyn Fn(&Column) -> Column) -> Vec<Column> {
228
self.columns.iter().map(func).collect()
229
}
230
// Reduce monomorphization.
231
fn try_apply_columns_par(
232
&self,
233
func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
234
) -> PolarsResult<Vec<Column>> {
235
POOL.install(|| self.columns.par_iter().map(func).collect())
236
}
237
// Reduce monomorphization.
238
pub fn _apply_columns_par(
239
&self,
240
func: &(dyn Fn(&Column) -> Column + Send + Sync),
241
) -> Vec<Column> {
242
POOL.install(|| self.columns.par_iter().map(func).collect())
243
}
244
245
/// Get the index of the column.
246
fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
247
self.get_column_index(name)
248
.ok_or_else(|| polars_err!(col_not_found = name))
249
}
250
251
fn check_already_present(&self, name: &str) -> PolarsResult<()> {
252
polars_ensure!(
253
self.columns.iter().all(|s| s.name().as_str() != name),
254
Duplicate: "column with name {:?} is already present in the DataFrame", name
255
);
256
Ok(())
257
}
258
259
/// Reserve additional slots into the chunks of the series.
260
pub(crate) fn reserve_chunks(&mut self, additional: usize) {
261
for s in &mut self.columns {
262
if let Column::Series(s) = s {
263
// SAFETY:
264
// do not modify the data, simply resize.
265
unsafe { s.chunks_mut().reserve(additional) }
266
}
267
}
268
}
269
270
/// Create a DataFrame from a Vector of Series.
271
///
272
/// Errors if the column names are not unique, or if the heights are not all equal.
273
///
274
/// # Example
275
///
276
/// ```
277
/// # use polars_core::prelude::*;
278
/// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
279
/// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
280
///
281
/// let df = DataFrame::new(vec![s0, s1])?;
282
/// # Ok::<(), PolarsError>(())
283
/// ```
284
pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
285
DataFrame::validate_columns_slice(&columns)
286
.map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
287
Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
288
}
289
290
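/// Create a DataFrame from columns that must all have the given `height`.
///
/// # Example
///
/// A brief illustrative sketch (values are arbitrary):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let cols = vec![Column::new("a".into(), [1, 2, 3])];
/// let df = DataFrame::new_with_height(3, cols)?;
/// assert_eq!(df.height(), 3);
/// # Ok::<(), PolarsError>(())
/// ```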
pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
291
for col in &columns {
292
polars_ensure!(
293
col.len() == height,
294
ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
295
columns[0].name(), height, col.name(), col.len()
296
);
297
}
298
299
Ok(DataFrame {
300
height,
301
columns,
302
cached_schema: OnceLock::new(),
303
})
304
}
305
306
/// Converts a sequence of columns into a DataFrame, broadcasting length-1
307
/// columns to match the other columns.
308
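///
/// # Example
///
/// A small illustrative sketch (values are arbitrary): the unit-length column is repeated to
/// match the longer column.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let a = Column::new("a".into(), [1, 2, 3]);
/// let b = Column::new("b".into(), [10]); // length 1, will be broadcast
/// let df = DataFrame::new_with_broadcast(vec![a, b])?;
/// assert_eq!(df.shape(), (3, 2));
/// # Ok::<(), PolarsError>(())
/// ```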
pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
309
// The length of the longest non-unit length column determines the
310
// broadcast length. If all columns are unit-length the broadcast length
311
// is one.
312
let broadcast_len = columns
313
.iter()
314
.map(|s| s.len())
315
.filter(|l| *l != 1)
316
.max()
317
.unwrap_or(1);
318
Self::new_with_broadcast_len(columns, broadcast_len)
319
}
320
321
/// Converts a sequence of columns into a DataFrame, broadcasting length-1
322
/// columns to broadcast_len.
323
pub fn new_with_broadcast_len(
324
columns: Vec<Column>,
325
broadcast_len: usize,
326
) -> PolarsResult<Self> {
327
ensure_names_unique(&columns, |s| s.name().as_str())?;
328
unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
329
}
330
331
/// Converts a sequence of columns into a DataFrame, broadcasting length-1
332
/// columns to match the other columns.
333
///
334
/// # Safety
335
/// Does not check that the column names are unique (which they must be).
336
pub unsafe fn new_with_broadcast_no_namecheck(
337
mut columns: Vec<Column>,
338
broadcast_len: usize,
339
) -> PolarsResult<Self> {
340
for col in &mut columns {
341
// Length not equal to the broadcast len, needs broadcast or is an error.
342
let len = col.len();
343
if len != broadcast_len {
344
if len != 1 {
345
let name = col.name().to_owned();
346
let extra_info =
347
if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
348
format!(" (matching column '{}')", c.name())
349
} else {
350
String::new()
351
};
352
polars_bail!(
353
ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
354
);
355
}
356
*col = col.new_from_index(0, broadcast_len);
357
}
358
}
359
360
let length = if columns.is_empty() { 0 } else { broadcast_len };
361
362
Ok(unsafe { DataFrame::new_no_checks(length, columns) })
363
}
364
365
pub fn new_from_index(&self, index: usize, height: usize) -> Self {
366
let cols = self.columns.iter().map(|c| c.new_from_index(index, height));
367
unsafe { Self::new_no_checks(height, cols.collect()) }
368
}
369
370
/// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
371
///
372
/// # Example
373
///
374
/// ```rust
375
/// use polars_core::prelude::DataFrame;
376
/// static EMPTY: DataFrame = DataFrame::empty();
377
/// ```
378
pub const fn empty() -> Self {
379
Self::empty_with_height(0)
380
}
381
382
/// Creates an empty `DataFrame` with a specific `height`.
383
pub const fn empty_with_height(height: usize) -> Self {
384
DataFrame {
385
height,
386
columns: vec![],
387
cached_schema: OnceLock::new(),
388
}
389
}
390
391
/// Create an empty `DataFrame` with empty columns as per the `schema`.
392
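///
/// # Example
///
/// A minimal illustrative sketch (assumes a schema built inline):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let schema = Schema::from_iter(vec![Field::new("a".into(), DataType::Int32)]);
/// let df = DataFrame::empty_with_schema(&schema);
/// assert_eq!(df.shape(), (0, 1));
/// ```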
pub fn empty_with_schema(schema: &Schema) -> Self {
393
let cols = schema
394
.iter()
395
.map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
396
.collect();
397
unsafe { DataFrame::new_no_checks(0, cols) }
398
}
399
400
/// Create an empty `DataFrame` with empty columns as per the `schema`.
401
pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
402
let cols = schema
403
.iter_values()
404
.map(|fld| {
405
Column::from(Series::new_empty(
406
fld.name.clone(),
407
&(DataType::from_arrow_field(fld)),
408
))
409
})
410
.collect();
411
unsafe { DataFrame::new_no_checks(0, cols) }
412
}
413
414
/// Create a new `DataFrame` with the given schema, only containing nulls.
415
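///
/// # Example
///
/// A minimal illustrative sketch (assumes a schema built inline):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let schema = Schema::from_iter(vec![
///     Field::new("a".into(), DataType::Int32),
///     Field::new("b".into(), DataType::String),
/// ]);
/// // Every value in the resulting frame is null.
/// let df = DataFrame::full_null(&schema, 3);
/// assert_eq!(df.shape(), (3, 2));
/// ```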
pub fn full_null(schema: &Schema, height: usize) -> Self {
416
let columns = schema
417
.iter_fields()
418
.map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
419
.collect();
420
unsafe { DataFrame::new_no_checks(height, columns) }
421
}
422
423
/// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
424
///
425
/// # Example
426
///
427
/// ```rust
428
/// # use polars_core::prelude::*;
429
/// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
430
/// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
431
/// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
432
///
433
/// assert_eq!(df.pop(), Some(s2));
434
/// assert_eq!(df.pop(), Some(s1));
435
/// assert_eq!(df.pop(), None);
436
/// assert!(df.is_empty());
437
/// # Ok::<(), PolarsError>(())
438
/// ```
439
pub fn pop(&mut self) -> Option<Column> {
440
self.clear_schema();
441
442
self.columns.pop()
443
}
444
445
/// Add a new column at index 0 that counts the rows.
446
///
447
/// # Example
448
///
449
/// ```
450
/// # use polars_core::prelude::*;
451
/// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
452
/// assert_eq!(df1.shape(), (4, 1));
453
///
454
/// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
455
/// assert_eq!(df2.shape(), (4, 2));
456
/// println!("{}", df2);
457
///
458
/// # Ok::<(), PolarsError>(())
459
/// ```
460
///
461
/// Output:
462
///
463
/// ```text
464
/// shape: (4, 2)
465
/// +-----+----------+
466
/// | Id | Name |
467
/// | --- | --- |
468
/// | u32 | str |
469
/// +=====+==========+
470
/// | 0 | James |
471
/// +-----+----------+
472
/// | 1 | Mary |
473
/// +-----+----------+
474
/// | 2 | John |
475
/// +-----+----------+
476
/// | 3 | Patricia |
477
/// +-----+----------+
478
/// ```
479
pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
480
let mut columns = Vec::with_capacity(self.columns.len() + 1);
481
let offset = offset.unwrap_or(0);
482
483
let col = Column::new_row_index(name, offset, self.height())?;
484
columns.push(col);
485
columns.extend_from_slice(&self.columns);
486
DataFrame::new(columns)
487
}
488
489
/// Add a row index column in place.
490
///
491
/// # Safety
492
/// The caller should ensure the DataFrame does not already contain a column with the given name.
493
///
494
/// # Panics
495
/// Panics if the resulting column would reach or overflow IdxSize::MAX.
496
pub unsafe fn with_row_index_mut(
497
&mut self,
498
name: PlSmallStr,
499
offset: Option<IdxSize>,
500
) -> &mut Self {
501
// TODO: Make this function unsafe
502
debug_assert!(
503
self.columns.iter().all(|c| c.name() != &name),
504
"with_row_index_mut(): column with name {} already exists",
505
&name
506
);
507
508
let offset = offset.unwrap_or(0);
509
let col = Column::new_row_index(name, offset, self.height()).unwrap();
510
511
self.clear_schema();
512
self.columns.insert(0, col);
513
self
514
}
515
516
/// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
517
/// `Series`.
518
///
519
/// Calculates the height from the first column or `0` if no columns are given.
520
///
521
/// # Safety
522
///
523
/// It is the caller's responsibility to uphold the contract that all `Series`
/// have an equal length and a unique name; if not, this may panic down the line.
525
pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
526
let height = columns.first().map_or(0, Column::len);
527
unsafe { Self::new_no_checks(height, columns) }
528
}
529
530
/// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
531
/// `Series`.
532
///
533
/// It is advised to use [DataFrame::new] in favor of this method.
534
///
535
/// # Safety
536
///
537
/// It is the caller's responsibility to uphold the contract that all `Series`
/// have an equal length and a unique name; if not, this may panic down the line.
539
pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
540
if cfg!(debug_assertions) {
541
DataFrame::validate_columns_slice(&columns).unwrap();
542
}
543
544
unsafe { Self::_new_no_checks_impl(height, columns) }
545
}
546
547
/// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
548
/// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
549
/// constructed with this method is generally highly unsafe and should not be long-lived.
550
#[allow(clippy::missing_safety_doc)]
551
pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
552
DataFrame {
553
height,
554
columns,
555
cached_schema: OnceLock::new(),
556
}
557
}
558
559
/// Shrink the capacity of this DataFrame to fit its length.
560
pub fn shrink_to_fit(&mut self) {
561
// Don't parallelize this. Memory overhead
562
for s in &mut self.columns {
563
s.shrink_to_fit();
564
}
565
}
566
567
/// Aggregate all the chunks in the DataFrame to a single chunk.
568
pub fn as_single_chunk(&mut self) -> &mut Self {
569
// Don't parallelize this. Memory overhead
570
for s in &mut self.columns {
571
*s = s.rechunk();
572
}
573
self
574
}
575
576
/// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
577
/// This may lead to higher peak memory consumption.
578
pub fn as_single_chunk_par(&mut self) -> &mut Self {
579
if self.columns.iter().any(|c| c.n_chunks() > 1) {
580
self.columns = self._apply_columns_par(&|s| s.rechunk());
581
}
582
self
583
}
584
585
/// Rechunks all columns to only have a single chunk.
586
pub fn rechunk_mut(&mut self) {
587
// SAFETY: We never adjust the length or names of the columns.
588
let columns = unsafe { self.get_columns_mut() };
589
590
for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
591
*col = col.rechunk();
592
}
593
}
594
595
pub fn _deshare_views_mut(&mut self) {
596
// SAFETY: We never adjust the length or names of the columns.
597
unsafe {
598
let columns = self.get_columns_mut();
599
for col in columns {
600
let Column::Series(s) = col else { continue };
601
602
if let Ok(ca) = s.binary() {
603
let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
604
*col = Column::from(gc_ca.into_series());
605
} else if let Ok(ca) = s.str() {
606
let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
607
*col = Column::from(gc_ca.into_series());
608
}
609
}
610
}
611
}
612
613
/// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
614
pub fn rechunk_to_record_batch(
615
self,
616
compat_level: CompatLevel,
617
) -> RecordBatchT<Box<dyn Array>> {
618
let height = self.height();
619
620
let (schema, arrays) = self
621
.columns
622
.into_iter()
623
.map(|col| {
624
let mut series = col.take_materialized_series();
625
// Rechunk to one chunk if necessary
626
if series.n_chunks() > 1 {
627
series = series.rechunk();
628
}
629
(
630
series.field().to_arrow(compat_level),
631
series.to_arrow(0, compat_level),
632
)
633
})
634
.collect();
635
636
RecordBatchT::new(height, Arc::new(schema), arrays)
637
}
638
639
/// Returns true if the chunks of the columns do not align and re-chunking should be done
640
pub fn should_rechunk(&self) -> bool {
641
// Fast check. It is also needed for correctness, as code below doesn't check if the number
642
// of chunks is equal.
643
if !self
644
.get_columns()
645
.iter()
646
.filter_map(|c| c.as_series().map(|s| s.n_chunks()))
647
.all_equal()
648
{
649
return true;
650
}
651
652
// From here we check chunk lengths.
653
let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
654
match chunk_lengths.next() {
655
None => false,
656
Some(first_column_chunk_lengths) => {
657
// Fast Path for single Chunk Series
658
if first_column_chunk_lengths.size_hint().0 == 1 {
659
return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
660
}
661
// Always rechunk if we have more chunks than rows,
// except when we have an empty df containing a single chunk.
663
let height = self.height();
664
let n_chunks = first_column_chunk_lengths.size_hint().0;
665
if n_chunks > height && !(height == 0 && n_chunks == 1) {
666
return true;
667
}
668
// Slow Path for multi Chunk series
669
let v: Vec<_> = first_column_chunk_lengths.collect();
670
for cl in chunk_lengths {
671
if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
672
return true;
673
}
674
}
675
false
676
},
677
}
678
}
679
680
/// Ensure all the chunks in the [`DataFrame`] are aligned.
681
pub fn align_chunks_par(&mut self) -> &mut Self {
682
if self.should_rechunk() {
683
self.as_single_chunk_par()
684
} else {
685
self
686
}
687
}
688
689
pub fn align_chunks(&mut self) -> &mut Self {
690
if self.should_rechunk() {
691
self.as_single_chunk()
692
} else {
693
self
694
}
695
}
696
697
/// Get the [`DataFrame`] schema.
698
///
699
/// # Example
700
///
701
/// ```rust
702
/// # use polars_core::prelude::*;
703
/// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
704
/// "Diameter (m)" => [8.8e26, f64::INFINITY])?;
705
///
706
/// let f1: Field = Field::new("Thing".into(), DataType::String);
707
/// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
708
/// let sc: Schema = Schema::from_iter(vec![f1, f2]);
709
///
710
/// assert_eq!(&**df.schema(), &sc);
711
/// # Ok::<(), PolarsError>(())
712
/// ```
713
pub fn schema(&self) -> &SchemaRef {
714
let out = self.cached_schema.get_or_init(|| {
715
Arc::new(
716
self.columns
717
.iter()
718
.map(|x| (x.name().clone(), x.dtype().clone()))
719
.collect(),
720
)
721
});
722
723
debug_assert_eq!(out.len(), self.width());
724
725
out
726
}
727
728
/// Get a reference to the [`DataFrame`] columns.
729
///
730
/// # Example
731
///
732
/// ```rust
733
/// # use polars_core::prelude::*;
734
/// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
735
/// "Symbol" => ["A", "C", "G", "T"])?;
736
/// let columns: &[Column] = df.get_columns();
737
///
738
/// assert_eq!(columns[0].name(), "Name");
739
/// assert_eq!(columns[1].name(), "Symbol");
740
/// # Ok::<(), PolarsError>(())
741
/// ```
742
#[inline]
743
pub fn get_columns(&self) -> &[Column] {
744
&self.columns
745
}
746
747
#[inline]
748
/// Get mutable access to the underlying columns.
749
///
750
/// # Safety
751
///
752
/// The caller must ensure the length of all [`Series`] remains equal to `height` or
753
/// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
754
/// The caller must ensure that, if the operation modifies the schema, the cached schema is
/// cleared by calling [`DataFrame::clear_schema`].
756
pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
757
&mut self.columns
758
}
759
760
#[inline]
761
/// Remove all the columns in the [`DataFrame`] but keep the `height`.
762
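///
/// # Example
///
/// A tiny illustrative sketch (values are arbitrary): the height is preserved while all
/// columns are removed.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2, 3])?;
/// df.clear_columns();
/// assert_eq!(df.shape(), (3, 0));
/// # Ok::<(), PolarsError>(())
/// ```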
pub fn clear_columns(&mut self) {
763
unsafe { self.get_columns_mut() }.clear();
764
self.clear_schema();
765
}
766
767
#[inline]
768
/// Extend the columns without checking for name collisions or height.
769
///
770
/// # Safety
771
///
772
/// The caller needs to ensure that:
773
/// - Column names are unique within the resulting [`DataFrame`].
774
/// - The length of each appended column matches the height of the [`DataFrame`]. For
/// [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
/// with [`DataFrame::set_height`].
777
pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
778
unsafe { self.get_columns_mut() }.extend(iter);
779
self.clear_schema();
780
}
781
782
/// Take ownership of the underlying columns vec.
783
pub fn take_columns(self) -> Vec<Column> {
784
self.columns
785
}
786
787
/// Iterator over the columns as [`Series`].
788
///
789
/// # Example
790
///
791
/// ```rust
792
/// # use polars_core::prelude::*;
793
/// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
794
/// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
795
/// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
796
///
797
/// let mut iterator = df.iter();
798
///
799
/// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
800
/// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
801
/// assert_eq!(iterator.next(), None);
802
/// # Ok::<(), PolarsError>(())
803
/// ```
804
pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
805
self.materialized_column_iter()
806
}
807
808
/// # Example
809
///
810
/// ```rust
811
/// # use polars_core::prelude::*;
812
/// let df: DataFrame = df!("Language" => ["Rust", "Python"],
813
/// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
814
///
815
/// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
816
/// # Ok::<(), PolarsError>(())
817
/// ```
818
pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
819
self.columns.iter().map(|s| s.name()).collect()
820
}
821
822
/// Get the [`Vec<PlSmallStr>`] representing the column names.
823
pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
824
self.columns.iter().map(|s| s.name().clone()).collect()
825
}
826
827
pub fn get_column_names_str(&self) -> Vec<&str> {
828
self.columns.iter().map(|s| s.name().as_str()).collect()
829
}
830
831
/// Set the column names.
///
/// # Example
833
///
834
/// ```rust
835
/// # use polars_core::prelude::*;
836
/// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
837
/// df.set_column_names(["Set"])?;
838
///
839
/// assert_eq!(df.get_column_names(), &["Set"]);
840
/// # Ok::<(), PolarsError>(())
841
/// ```
842
pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
843
where
844
I: IntoIterator<Item = S>,
845
S: Into<PlSmallStr>,
846
{
847
let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
848
self._set_column_names_impl(names.as_slice())
849
}
850
851
fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
852
polars_ensure!(
853
names.len() == self.width(),
854
ShapeMismatch: "{} column names provided for a DataFrame of width {}",
855
names.len(), self.width()
856
);
857
ensure_names_unique(names, |s| s.as_str())?;
858
859
let columns = mem::take(&mut self.columns);
860
self.columns = columns
861
.into_iter()
862
.zip(names)
863
.map(|(s, name)| {
864
let mut s = s;
865
s.rename(name.clone());
866
s
867
})
868
.collect();
869
self.clear_schema();
870
Ok(())
871
}
872
873
/// Get the data types of the columns in the [`DataFrame`].
874
///
875
/// # Example
876
///
877
/// ```rust
878
/// # use polars_core::prelude::*;
879
/// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
880
/// "Fraction" => [0.965, 0.035])?;
881
///
882
/// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
883
/// # Ok::<(), PolarsError>(())
884
/// ```
885
pub fn dtypes(&self) -> Vec<DataType> {
886
self.columns.iter().map(|s| s.dtype().clone()).collect()
887
}
888
889
pub(crate) fn first_series_column(&self) -> Option<&Series> {
890
self.columns.iter().find_map(|col| col.as_series())
891
}
892
893
/// The number of chunks for the first column.
894
pub fn first_col_n_chunks(&self) -> usize {
895
match self.first_series_column() {
896
None if self.columns.is_empty() => 0,
897
None => 1,
898
Some(s) => s.n_chunks(),
899
}
900
}
901
902
/// The highest number of chunks for any column.
903
pub fn max_n_chunks(&self) -> usize {
904
self.columns
905
.iter()
906
.map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
907
.max()
908
.unwrap_or(0)
909
}
910
911
/// Get a reference to the schema fields of the [`DataFrame`].
912
///
913
/// # Example
914
///
915
/// ```rust
916
/// # use polars_core::prelude::*;
917
/// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
918
/// "Fraction" => [0.708, 0.292])?;
919
///
920
/// let f1: Field = Field::new("Surface type".into(), DataType::String);
921
/// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
922
///
923
/// assert_eq!(earth.fields(), &[f1, f2]);
924
/// # Ok::<(), PolarsError>(())
925
/// ```
926
pub fn fields(&self) -> Vec<Field> {
927
self.columns
928
.iter()
929
.map(|s| s.field().into_owned())
930
.collect()
931
}
932
933
/// Get (height, width) of the [`DataFrame`].
934
///
935
/// # Example
936
///
937
/// ```rust
938
/// # use polars_core::prelude::*;
939
/// let df0: DataFrame = DataFrame::default();
940
/// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
941
/// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
942
/// "2" => [1, 2, 3, 4, 5])?;
943
///
944
/// assert_eq!(df0.shape(), (0 ,0));
945
/// assert_eq!(df1.shape(), (5, 1));
946
/// assert_eq!(df2.shape(), (5, 2));
947
/// # Ok::<(), PolarsError>(())
948
/// ```
949
pub fn shape(&self) -> (usize, usize) {
950
(self.height, self.columns.len())
951
}
952
953
/// Get the width of the [`DataFrame`] which is the number of columns.
954
///
955
/// # Example
956
///
957
/// ```rust
958
/// # use polars_core::prelude::*;
959
/// let df0: DataFrame = DataFrame::default();
960
/// let df1: DataFrame = df!("Series 1" => [0; 0])?;
961
/// let df2: DataFrame = df!("Series 1" => [0; 0],
962
/// "Series 2" => [0; 0])?;
963
///
964
/// assert_eq!(df0.width(), 0);
965
/// assert_eq!(df1.width(), 1);
966
/// assert_eq!(df2.width(), 2);
967
/// # Ok::<(), PolarsError>(())
968
/// ```
969
pub fn width(&self) -> usize {
970
self.columns.len()
971
}
972
973
/// Get the height of the [`DataFrame`] which is the number of rows.
974
///
975
/// # Example
976
///
977
/// ```rust
978
/// # use polars_core::prelude::*;
979
/// let df0: DataFrame = DataFrame::default();
980
/// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
981
/// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
982
///
983
/// assert_eq!(df0.height(), 0);
984
/// assert_eq!(df1.height(), 2);
985
/// assert_eq!(df2.height(), 5);
986
/// # Ok::<(), PolarsError>(())
987
/// ```
988
pub fn height(&self) -> usize {
989
self.height
990
}
991
992
/// Returns the size as number of rows * number of columns
993
pub fn size(&self) -> usize {
994
let s = self.shape();
995
s.0 * s.1
996
}
997
998
/// Returns `true` if the [`DataFrame`] contains no rows.
999
///
1000
/// # Example
1001
///
1002
/// ```rust
1003
/// # use polars_core::prelude::*;
1004
/// let df1: DataFrame = DataFrame::default();
1005
/// assert!(df1.is_empty());
1006
///
1007
/// let df2: DataFrame = df!("First name" => ["Forever"],
1008
/// "Last name" => ["Alone"])?;
1009
/// assert!(!df2.is_empty());
1010
/// # Ok::<(), PolarsError>(())
1011
/// ```
1012
pub fn is_empty(&self) -> bool {
1013
matches!(self.shape(), (0, _) | (_, 0))
1014
}
1015
1016
/// Set the height (i.e. number of rows) of this [`DataFrame`].
1017
///
1018
/// # Safety
1019
///
1020
/// This needs to be equal to the length of all the columns.
1021
pub unsafe fn set_height(&mut self, height: usize) {
1022
self.height = height;
1023
}
1024
1025
/// Add multiple [`Series`] to a [`DataFrame`].
1026
/// The added `Series` are required to have the same length.
1027
///
1028
/// # Example
1029
///
1030
/// ```rust
1031
/// # use polars_core::prelude::*;
1032
/// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1033
/// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1034
/// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1035
///
1036
/// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1037
/// assert_eq!(df2.shape(), (3, 3));
1038
/// println!("{}", df2);
1039
/// # Ok::<(), PolarsError>(())
1040
/// ```
1041
///
1042
/// Output:
1043
///
1044
/// ```text
1045
/// shape: (3, 3)
1046
/// +---------+--------+----------+
1047
/// | Element | Proton | Electron |
1048
/// | --- | --- | --- |
1049
/// | str | i32 | i32 |
1050
/// +=========+========+==========+
1051
/// | Copper | 29 | 29 |
1052
/// +---------+--------+----------+
1053
/// | Silver | 47 | 47 |
1054
/// +---------+--------+----------+
1055
/// | Gold | 79 | 79 |
1056
/// +---------+--------+----------+
1057
/// ```
1058
pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1059
let mut new_cols = self.columns.clone();
1060
new_cols.extend_from_slice(columns);
1061
DataFrame::new(new_cols)
1062
}
1063
1064
/// Concatenate a [`DataFrame`] to this [`DataFrame`] and return it as a newly allocated [`DataFrame`].
1065
///
1066
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1067
///
1068
/// # Example
1069
///
1070
/// ```rust
1071
/// # use polars_core::prelude::*;
1072
/// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1073
/// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1074
/// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1075
/// "Melting Point (K)" => [2041.4, 1828.05])?;
1076
///
1077
/// let df3: DataFrame = df1.vstack(&df2)?;
1078
///
1079
/// assert_eq!(df3.shape(), (5, 2));
1080
/// println!("{}", df3);
1081
/// # Ok::<(), PolarsError>(())
1082
/// ```
1083
///
1084
/// Output:
1085
///
1086
/// ```text
1087
/// shape: (5, 2)
1088
/// +-----------+-------------------+
1089
/// | Element | Melting Point (K) |
1090
/// | --- | --- |
1091
/// | str | f64 |
1092
/// +===========+===================+
1093
/// | Copper | 1357.77 |
1094
/// +-----------+-------------------+
1095
/// | Silver | 1234.93 |
1096
/// +-----------+-------------------+
1097
/// | Gold | 1337.33 |
1098
/// +-----------+-------------------+
1099
/// | Platinum | 2041.4 |
1100
/// +-----------+-------------------+
1101
/// | Palladium | 1828.05 |
1102
/// +-----------+-------------------+
1103
/// ```
1104
pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1105
let mut df = self.clone();
1106
df.vstack_mut(other)?;
1107
Ok(df)
1108
}
1109
1110
/// Concatenate a [`DataFrame`] to this [`DataFrame`]
1111
///
1112
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1113
///
1114
/// # Example
1115
///
1116
/// ```rust
1117
/// # use polars_core::prelude::*;
1118
/// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1119
/// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1120
/// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1121
/// "Melting Point (K)" => [2041.4, 1828.05])?;
1122
///
1123
/// df1.vstack_mut(&df2)?;
1124
///
1125
/// assert_eq!(df1.shape(), (5, 2));
1126
/// println!("{}", df1);
1127
/// # Ok::<(), PolarsError>(())
1128
/// ```
1129
///
1130
/// Output:
1131
///
1132
/// ```text
1133
/// shape: (5, 2)
1134
/// +-----------+-------------------+
1135
/// | Element | Melting Point (K) |
1136
/// | --- | --- |
1137
/// | str | f64 |
1138
/// +===========+===================+
1139
/// | Copper | 1357.77 |
1140
/// +-----------+-------------------+
1141
/// | Silver | 1234.93 |
1142
/// +-----------+-------------------+
1143
/// | Gold | 1337.33 |
1144
/// +-----------+-------------------+
1145
/// | Platinum | 2041.4 |
1146
/// +-----------+-------------------+
1147
/// | Palladium | 1828.05 |
1148
/// +-----------+-------------------+
1149
/// ```
1150
pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1151
if self.width() != other.width() {
1152
polars_ensure!(
1153
self.width() == 0,
1154
ShapeMismatch:
1155
"unable to append to a DataFrame of width {} with a DataFrame of width {}",
1156
self.width(), other.width(),
1157
);
1158
self.columns.clone_from(&other.columns);
1159
self.height = other.height;
1160
return Ok(self);
1161
}
1162
1163
self.columns
1164
.iter_mut()
1165
.zip(other.columns.iter())
1166
.try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1167
ensure_can_extend(&*left, right)?;
1168
left.append(right).map_err(|e| {
1169
e.context(format!("failed to vstack column '{}'", right.name()).into())
1170
})?;
1171
Ok(())
1172
})?;
1173
self.height += other.height;
1174
Ok(self)
1175
}
1176
1177
pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1178
if self.width() != other.width() {
1179
polars_ensure!(
1180
self.width() == 0,
1181
ShapeMismatch:
1182
"unable to append to a DataFrame of width {} with a DataFrame of width {}",
1183
self.width(), other.width(),
1184
);
1185
self.columns = other.columns;
1186
self.height = other.height;
1187
return Ok(self);
1188
}
1189
1190
self.columns
1191
.iter_mut()
1192
.zip(other.columns.into_iter())
1193
.try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1194
ensure_can_extend(&*left, &right)?;
1195
let right_name = right.name().clone();
1196
left.append_owned(right).map_err(|e| {
1197
e.context(format!("failed to vstack column '{right_name}'").into())
1198
})?;
1199
Ok(())
1200
})?;
1201
self.height += other.height;
1202
Ok(self)
1203
}
1204
1205
/// Concatenate a [`DataFrame`] to this [`DataFrame`]
1206
///
1207
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1208
///
1209
/// # Panics
1210
/// Panics if the schemas don't match.
1211
pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1212
self.columns
1213
.iter_mut()
1214
.zip(other.columns.iter())
1215
.for_each(|(left, right)| {
1216
left.append(right)
1217
.map_err(|e| {
1218
e.context(format!("failed to vstack column '{}'", right.name()).into())
1219
})
1220
.expect("should not fail");
1221
});
1222
self.height += other.height;
1223
}
1224
1225
/// Concatenate a [`DataFrame`] to this [`DataFrame`]
1226
///
1227
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1228
///
1229
/// # Panics
1230
/// Panics if the schemas don't match.
1231
pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1232
self.columns
1233
.iter_mut()
1234
.zip(other.columns)
1235
.for_each(|(left, right)| {
1236
left.append_owned(right).expect("should not fail");
1237
});
1238
self.height += other.height;
1239
}
1240
1241
/// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1242
///
1243
/// Different from [`vstack`](Self::vstack), which adds the chunks from `other` to the chunks of this [`DataFrame`],
/// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1245
///
1246
/// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1247
/// and thus will yield faster queries.
1248
///
1249
/// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1250
/// online operations where you add `n` rows and rerun a query.
1251
///
1252
/// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance,
/// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
/// of `append` operations with a [`rechunk`](Self::align_chunks_par).
1255
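///
/// # Example
///
/// A minimal illustrative sketch (values are arbitrary):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("x" => [1, 2, 3])?;
/// let other = df!("x" => [4, 5])?;
/// // The rows of `other` are appended to the existing memory of `df`.
/// df.extend(&other)?;
/// assert_eq!(df.height(), 5);
/// # Ok::<(), PolarsError>(())
/// ```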
pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1256
polars_ensure!(
1257
self.width() == other.width(),
1258
ShapeMismatch:
1259
"unable to extend a DataFrame of width {} with a DataFrame of width {}",
1260
self.width(), other.width(),
1261
);
1262
1263
self.columns
1264
.iter_mut()
1265
.zip(other.columns.iter())
1266
.try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1267
ensure_can_extend(&*left, right)?;
1268
left.extend(right).map_err(|e| {
1269
e.context(format!("failed to extend column '{}'", right.name()).into())
1270
})?;
1271
Ok(())
1272
})?;
1273
self.height += other.height;
1274
self.clear_schema();
1275
Ok(())
1276
}
1277
1278
/// Remove a column by name and return the column removed.
1279
///
1280
/// # Example
1281
///
1282
/// ```rust
1283
/// # use polars_core::prelude::*;
1284
/// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1285
/// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1286
///
1287
/// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1288
/// assert!(s1.is_err());
1289
///
1290
/// let s2: Column = df.drop_in_place("Animal")?;
1291
/// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1292
/// # Ok::<(), PolarsError>(())
1293
/// ```
1294
pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1295
let idx = self.check_name_to_idx(name)?;
1296
self.clear_schema();
1297
Ok(self.columns.remove(idx))
1298
}
1299
1300
/// Return a new [`DataFrame`] where all null values are dropped.
1301
///
1302
/// # Example
1303
///
1304
/// ```no_run
1305
/// # use polars_core::prelude::*;
1306
/// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1307
/// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1308
/// assert_eq!(df1.shape(), (3, 2));
1309
///
1310
/// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1311
/// assert_eq!(df2.shape(), (1, 2));
1312
/// println!("{}", df2);
1313
/// # Ok::<(), PolarsError>(())
1314
/// ```
1315
///
1316
/// Output:
1317
///
1318
/// ```text
1319
/// shape: (1, 2)
1320
/// +---------+---------------------+
1321
/// | Country | Tax revenue (% GDP) |
1322
/// | --- | --- |
1323
/// | str | f64 |
1324
/// +=========+=====================+
1325
/// | Malta | 32.7 |
1326
/// +---------+---------------------+
1327
/// ```
1328
pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1329
where
1330
for<'a> &'a S: Into<PlSmallStr>,
1331
{
1332
if let Some(v) = subset {
1333
let v = self.select_columns(v)?;
1334
self._drop_nulls_impl(v.as_slice())
1335
} else {
1336
self._drop_nulls_impl(self.columns.as_slice())
1337
}
1338
}
1339
1340
fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1341
// fast path for no nulls in df
1342
if subset.iter().all(|s| !s.has_nulls()) {
1343
return Ok(self.clone());
1344
}
1345
1346
let mut iter = subset.iter();
1347
1348
let mask = iter
1349
.next()
1350
.ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1351
let mut mask = mask.is_not_null();
1352
1353
for c in iter {
1354
mask = mask & c.is_not_null();
1355
}
1356
self.filter(&mask)
1357
}
1358
1359
/// Drop a column by name.
1360
/// This is a pure method and will return a new [`DataFrame`] instead of modifying
1361
/// the current one in place.
1362
///
1363
/// # Example
1364
///
1365
/// ```rust
1366
/// # use polars_core::prelude::*;
1367
/// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1368
/// let df2: DataFrame = df1.drop("Ray type")?;
1369
///
1370
/// assert!(df2.is_empty());
1371
/// # Ok::<(), PolarsError>(())
1372
/// ```
1373
pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1374
let idx = self.check_name_to_idx(name)?;
1375
let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1376
1377
self.columns.iter().enumerate().for_each(|(i, s)| {
1378
if i != idx {
1379
new_cols.push(s.clone())
1380
}
1381
});
1382
1383
Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1384
}
1385
1386
/// Drop columns that are in `names`.
1387
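///
/// # Example
///
/// A short illustrative sketch (values are arbitrary):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("a" => [1], "b" => [2], "c" => [3])?;
/// let dropped = df.drop_many(["a", "c"]);
/// assert_eq!(dropped.get_column_names(), &["b"]);
/// # Ok::<(), PolarsError>(())
/// ```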
pub fn drop_many<I, S>(&self, names: I) -> Self
1388
where
1389
I: IntoIterator<Item = S>,
1390
S: Into<PlSmallStr>,
1391
{
1392
let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1393
self.drop_many_amortized(&names)
1394
}
1395
1396
/// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1397
pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1398
if names.is_empty() {
1399
return self.clone();
1400
}
1401
let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1402
self.columns.iter().for_each(|s| {
1403
if !names.contains(s.name()) {
1404
new_cols.push(s.clone())
1405
}
1406
});
1407
1408
unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1409
}
1410
1411
/// Insert a new column at a given index without checking for duplicates.
/// This can leave the [`DataFrame`] in an invalid state.
1413
fn insert_column_no_name_check(
1414
&mut self,
1415
index: usize,
1416
column: Column,
1417
) -> PolarsResult<&mut Self> {
1418
polars_ensure!(
1419
self.width() == 0 || column.len() == self.height(),
1420
ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1421
column.len(), self.height(),
1422
);
1423
1424
if self.width() == 0 {
1425
self.height = column.len();
1426
}
1427
1428
self.columns.insert(index, column);
1429
self.clear_schema();
1430
Ok(self)
1431
}
1432
1433
/// Insert a new column at a given index.
1434
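///
/// # Example
///
/// A small illustrative sketch (values are arbitrary):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2], "c" => [5, 6])?;
/// df.insert_column(1, Column::new("b".into(), [3, 4]))?;
/// assert_eq!(df.get_column_names(), &["a", "b", "c"]);
/// # Ok::<(), PolarsError>(())
/// ```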
pub fn insert_column<S: IntoColumn>(
1435
&mut self,
1436
index: usize,
1437
column: S,
1438
) -> PolarsResult<&mut Self> {
1439
let column = column.into_column();
1440
self.check_already_present(column.name().as_str())?;
1441
self.insert_column_no_name_check(index, column)
1442
}
1443
1444
fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1445
if let Some(idx) = self.get_column_index(column.name().as_str()) {
1446
self.replace_column(idx, column)?;
1447
} else {
1448
if self.width() == 0 {
1449
self.height = column.len();
1450
}
1451
1452
self.columns.push(column);
1453
self.clear_schema();
1454
}
1455
Ok(())
1456
}
1457
1458
/// Add a new column to this [`DataFrame`] or replace an existing one.
1459
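///
/// # Example
///
/// A brief illustrative sketch (values are arbitrary): adding a fresh column and then
/// replacing it by name.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2, 3])?;
/// df.with_column(Column::new("b".into(), [10, 20, 30]))?;
/// df.with_column(Column::new("b".into(), [0, 0, 0]))?; // replaces the existing "b"
/// assert_eq!(df.shape(), (3, 2));
/// # Ok::<(), PolarsError>(())
/// ```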
pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1460
fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1461
let height = df.height();
1462
if column.len() == 1 && height > 1 {
1463
column = column.new_from_index(0, height);
1464
}
1465
1466
if column.len() == height || df.get_columns().is_empty() {
1467
df.add_column_by_search(column)?;
1468
Ok(df)
1469
}
1470
// special case for literals
1471
else if height == 0 && column.len() == 1 {
1472
let s = column.clear();
1473
df.add_column_by_search(s)?;
1474
Ok(df)
1475
} else {
1476
polars_bail!(
1477
ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1478
column.len(), height,
1479
);
1480
}
1481
}
1482
let column = column.into_column();
1483
inner(self, column)
1484
}
1485
1486
/// Adds a column to the [`DataFrame`] without doing any checks
1487
/// on length or duplicates.
1488
///
1489
/// # Safety
1490
/// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1491
pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1492
debug_assert!(self.width() == 0 || self.height() == column.len());
1493
debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1494
1495
// SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1496
// properly for `width` == 0.
1497
if self.width() == 0 {
1498
unsafe { self.set_height(column.len()) };
1499
}
1500
unsafe { self.get_columns_mut() }.push(column);
1501
self.clear_schema();
1502
1503
self
1504
}
1505
1506
// Note: Schema can be both input or output_schema
1507
fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1508
let name = c.name();
1509
if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1510
if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1511
// Given schema is output_schema and we can push.
1512
if idx == self.columns.len() {
1513
if self.width() == 0 {
1514
self.height = c.len();
1515
}
1516
1517
self.columns.push(c);
1518
self.clear_schema();
1519
}
1520
// Schema is incorrect; fall back to search.
1521
else {
1522
debug_assert!(false);
1523
self.add_column_by_search(c)?;
1524
}
1525
} else {
1526
self.replace_column(idx, c)?;
1527
}
1528
} else {
1529
if self.width() == 0 {
1530
self.height = c.len();
1531
}
1532
1533
self.columns.push(c);
1534
self.clear_schema();
1535
}
1536
1537
Ok(())
1538
}
1539
1540
// Note: Schema can be both input or output_schema
1541
pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1542
for (i, s) in series.into_iter().enumerate() {
1543
// we need to branch here
1544
// because users can add multiple columns with the same name
1545
if i == 0 || schema.get(s.name().as_str()).is_some() {
1546
self.with_column_and_schema(s.into_column(), schema)?;
1547
} else {
1548
self.with_column(s.clone().into_column())?;
1549
}
1550
}
1551
Ok(())
1552
}
1553
1554
pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1555
for (i, s) in columns.into_iter().enumerate() {
1556
// we need to branch here
1557
// because users can add multiple columns with the same name
1558
if i == 0 || schema.get(s.name().as_str()).is_some() {
1559
self.with_column_and_schema(s, schema)?;
1560
} else {
1561
self.with_column(s.clone())?;
1562
}
1563
}
1564
1565
Ok(())
1566
}
1567
1568
/// Add a new column to this [`DataFrame`] or replace an existing one.
1569
/// Uses an existing schema to amortize lookups.
1570
/// If the schema is incorrect, we will fall back to linear search.
1571
///
1572
/// Note: Schema can be both input or output_schema
1573
pub fn with_column_and_schema<C: IntoColumn>(
1574
&mut self,
1575
column: C,
1576
schema: &Schema,
1577
) -> PolarsResult<&mut Self> {
1578
let mut column = column.into_column();
1579
1580
let height = self.height();
1581
if column.len() == 1 && height > 1 {
1582
column = column.new_from_index(0, height);
1583
}
1584
1585
if column.len() == height || self.columns.is_empty() {
1586
self.add_column_by_schema(column, schema)?;
1587
Ok(self)
1588
}
1589
// special case for literals
1590
else if height == 0 && column.len() == 1 {
1591
let s = column.clear();
1592
self.add_column_by_schema(s, schema)?;
1593
Ok(self)
1594
} else {
1595
polars_bail!(
1596
ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1597
column.len(), height,
1598
);
1599
}
1600
}
1601
1602
/// Get a row in the [`DataFrame`]. Beware this is slow.
1603
///
1604
/// # Example
1605
///
1606
/// ```
1607
/// # use polars_core::prelude::*;
1608
/// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1609
/// df.get(idx)
1610
/// }
1611
/// ```
1612
pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1613
match self.columns.first() {
1614
Some(s) => {
1615
if s.len() <= idx {
1616
return None;
1617
}
1618
},
1619
None => return None,
1620
}
1621
// SAFETY: we just checked bounds
1622
unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1623
}
1624
1625
/// Select a [`Series`] by index.
1626
///
1627
/// # Example
1628
///
1629
/// ```rust
1630
/// # use polars_core::prelude::*;
1631
/// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1632
/// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1633
///
1634
/// let s1: Option<&Column> = df.select_at_idx(0);
1635
/// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1636
///
1637
/// assert_eq!(s1, Some(&s2));
1638
/// # Ok::<(), PolarsError>(())
1639
/// ```
1640
pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1641
self.columns.get(idx)
1642
}
1643
1644
/// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1645
///
1646
/// # Examples
1647
///
1648
/// ```rust
1649
/// # use polars_core::prelude::*;
1650
/// let df = df! {
1651
/// "0" => [0, 0, 0],
1652
/// "1" => [1, 1, 1],
1653
/// "2" => [2, 2, 2]
1654
/// }?;
1655
///
1656
/// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1657
/// assert!(df.equals(&df.select_by_range(..)?));
1658
/// # Ok::<(), PolarsError>(())
1659
/// ```
1660
pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1661
where
1662
R: ops::RangeBounds<usize>,
1663
{
1664
// This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
// because it is a nightly feature. We should switch to it once that function is stabilized.
1666
fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1667
where
1668
R: ops::RangeBounds<usize>,
1669
{
1670
let len = bounds.end;
1671
1672
let start: ops::Bound<&usize> = range.start_bound();
1673
let start = match start {
1674
ops::Bound::Included(&start) => start,
1675
ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1676
panic!("attempted to index slice from after maximum usize");
1677
}),
1678
ops::Bound::Unbounded => 0,
1679
};
1680
1681
let end: ops::Bound<&usize> = range.end_bound();
1682
let end = match end {
1683
ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1684
panic!("attempted to index slice up to maximum usize");
1685
}),
1686
ops::Bound::Excluded(&end) => end,
1687
ops::Bound::Unbounded => len,
1688
};
1689
1690
if start > end {
1691
panic!("slice index starts at {start} but ends at {end}");
1692
}
1693
if end > len {
1694
panic!("range end index {end} out of range for slice of length {len}",);
1695
}
1696
1697
ops::Range { start, end }
1698
}
1699
1700
let colnames = self.get_column_names_owned();
1701
let range = get_range(range, ..colnames.len());
1702
1703
self._select_impl(&colnames[range])
1704
}
1705
1706
/// Get column index of a [`Series`] by name.
1707
/// # Example
1708
///
1709
/// ```rust
1710
/// # use polars_core::prelude::*;
1711
/// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1712
/// "Health" => [100, 200, 500],
1713
/// "Mana" => [250, 100, 0],
1714
/// "Strength" => [30, 150, 300])?;
1715
///
1716
/// assert_eq!(df.get_column_index("Name"), Some(0));
1717
/// assert_eq!(df.get_column_index("Health"), Some(1));
1718
/// assert_eq!(df.get_column_index("Mana"), Some(2));
1719
/// assert_eq!(df.get_column_index("Strength"), Some(3));
1720
/// assert_eq!(df.get_column_index("Haste"), None);
1721
/// # Ok::<(), PolarsError>(())
1722
/// ```
1723
pub fn get_column_index(&self, name: &str) -> Option<usize> {
1724
let schema = self.schema();
1725
if let Some(idx) = schema.index_of(name) {
1726
if self
1727
.get_columns()
1728
.get(idx)
1729
.is_some_and(|c| c.name() == name)
1730
{
1731
return Some(idx);
1732
}
1733
}
1734
1735
self.columns.iter().position(|s| s.name().as_str() == name)
1736
}
1737
1738
/// Get column index of a [`Series`] by name.
1739
pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1740
self.get_column_index(name)
1741
.ok_or_else(|| polars_err!(col_not_found = name))
1742
}
1743
1744
/// Select a single column by name.
1745
///
1746
/// # Example
1747
///
1748
/// ```rust
1749
/// # use polars_core::prelude::*;
1750
/// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1751
/// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1752
/// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1753
///
1754
/// assert_eq!(df.column("Password")?, &s1);
1755
/// # Ok::<(), PolarsError>(())
1756
/// ```
1757
pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1758
let idx = self.try_get_column_index(name)?;
1759
Ok(self.select_at_idx(idx).unwrap())
1760
}
1761
1762
/// Select multiple columns by name.
1763
///
1764
/// # Example
1765
///
1766
/// ```rust
1767
/// # use polars_core::prelude::*;
1768
/// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1769
/// "Max weight (kg)" => [16.0, 35.89])?;
1770
/// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1771
///
1772
/// assert_eq!(&df[0], sv[0]);
1773
/// assert_eq!(&df[1], sv[1]);
1774
/// # Ok::<(), PolarsError>(())
1775
/// ```
1776
pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1777
where
1778
I: IntoIterator<Item = S>,
1779
S: AsRef<str>,
1780
{
1781
names
1782
.into_iter()
1783
.map(|name| self.column(name.as_ref()))
1784
.collect()
1785
}
1786
1787
/// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1788
///
1789
/// # Examples
1790
///
1791
/// ```
1792
/// # use polars_core::prelude::*;
1793
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1794
/// df.select(["foo", "bar"])
1795
/// }
1796
/// ```
1797
pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1798
where
1799
I: IntoIterator<Item = S>,
1800
S: Into<PlSmallStr>,
1801
{
1802
let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1803
self._select_impl(cols.as_slice())
1804
}
1805
1806
pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1807
ensure_names_unique(cols, |s| s.as_str())?;
1808
self._select_impl_unchecked(cols)
1809
}
1810
1811
pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1812
let selected = self.select_columns_impl(cols)?;
1813
Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1814
}
1815
1816
/// Select with a known schema. The schema names must match the column names of this DataFrame.
1817
pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1818
where
1819
I: IntoIterator<Item = S>,
1820
S: Into<PlSmallStr>,
1821
{
1822
let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1823
self._select_with_schema_impl(&cols, schema, true)
1824
}
1825
1826
/// Select with a known schema without checking for duplicates in `selection`.
1827
/// The schema names must match the column names of this DataFrame.
1828
pub fn select_with_schema_unchecked<I, S>(
1829
&self,
1830
selection: I,
1831
schema: &Schema,
1832
) -> PolarsResult<Self>
1833
where
1834
I: IntoIterator<Item = S>,
1835
S: Into<PlSmallStr>,
1836
{
1837
let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1838
self._select_with_schema_impl(&cols, schema, false)
1839
}
1840
1841
/// * The schema names must match the column names of this DataFrame.
1842
pub fn _select_with_schema_impl(
1843
&self,
1844
cols: &[PlSmallStr],
1845
schema: &Schema,
1846
check_duplicates: bool,
1847
) -> PolarsResult<Self> {
1848
if check_duplicates {
1849
ensure_names_unique(cols, |s| s.as_str())?;
1850
}
1851
1852
let selected = self.select_columns_impl_with_schema(cols, schema)?;
1853
Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1854
}
1855
1856
/// A non generic implementation to reduce compiler bloat.
1857
fn select_columns_impl_with_schema(
1858
&self,
1859
cols: &[PlSmallStr],
1860
schema: &Schema,
1861
) -> PolarsResult<Vec<Column>> {
1862
if cfg!(debug_assertions) {
1863
ensure_matching_schema_names(schema, self.schema())?;
1864
}
1865
1866
cols.iter()
1867
.map(|name| {
1868
let index = schema.try_get_full(name.as_str())?.0;
1869
Ok(self.columns[index].clone())
1870
})
1871
.collect()
1872
}
1873
1874
pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1875
where
1876
I: IntoIterator<Item = S>,
1877
S: Into<PlSmallStr>,
1878
{
1879
let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1880
self.select_physical_impl(&cols)
1881
}
1882
1883
fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1884
ensure_names_unique(cols, |s| s.as_str())?;
1885
let selected = self.select_columns_physical_impl(cols)?;
1886
Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1887
}
1888
1889
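/// Select the columns of this [`DataFrame`] in the order given by the schema `to` and cache `to`
/// as the schema of the result.
///
/// Every name in `to` must exist in this frame; the dtypes in `to` are not validated here.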
pub fn project(&self, to: SchemaRef) -> PolarsResult<Self> {
1890
let from = self.schema();
1891
let columns = to
1892
.iter_names()
1893
.map(|name| Ok(self.columns[from.try_index_of(name.as_str())?].clone()))
1894
.collect::<PolarsResult<Vec<_>>>()?;
1895
let mut df = unsafe { Self::new_no_checks(self.height(), columns) };
1896
df.cached_schema = to.into();
1897
Ok(df)
1898
}
1899
1900
/// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1901
///
1902
/// # Example
1903
///
1904
/// ```rust
1905
/// # use polars_core::prelude::*;
1906
/// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1907
/// "Carbon" => [1, 2, 3],
1908
/// "Hydrogen" => [4, 6, 8])?;
1909
/// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1910
///
1911
/// assert_eq!(df["Carbon"], sv[0]);
1912
/// assert_eq!(df["Hydrogen"], sv[1]);
1913
/// # Ok::<(), PolarsError>(())
1914
/// ```
1915
pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1916
let cols = selection.into_vec();
1917
self.select_columns_impl(&cols)
1918
}
1919
1920
fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1921
self.columns
1922
.iter()
1923
.enumerate()
1924
.map(|(i, s)| (s.name().as_str(), i))
1925
.collect()
1926
}
1927
1928
/// A non generic implementation to reduce compiler bloat.
1929
fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1930
let selected = if cols.len() > 1 && self.columns.len() > 10 {
1931
let name_to_idx = self._names_to_idx_map();
1932
cols.iter()
1933
.map(|name| {
1934
let idx = *name_to_idx
1935
.get(name.as_str())
1936
.ok_or_else(|| polars_err!(col_not_found = name))?;
1937
Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1938
})
1939
.collect::<PolarsResult<Vec<_>>>()?
1940
} else {
1941
cols.iter()
1942
.map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1943
.collect::<PolarsResult<Vec<_>>>()?
1944
};
1945
1946
Ok(selected)
1947
}
1948
1949
/// A non generic implementation to reduce compiler bloat.
1950
fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1951
let selected = if cols.len() > 1 && self.columns.len() > 10 {
1952
// We hash because there are users with millions of columns.
1953
// # https://github.com/pola-rs/polars/issues/1023
1954
let name_to_idx = self._names_to_idx_map();
1955
1956
cols.iter()
1957
.map(|name| {
1958
let idx = *name_to_idx
1959
.get(name.as_str())
1960
.ok_or_else(|| polars_err!(col_not_found = name))?;
1961
Ok(self.select_at_idx(idx).unwrap().clone())
1962
})
1963
.collect::<PolarsResult<Vec<_>>>()?
1964
} else {
1965
cols.iter()
1966
.map(|c| self.column(c.as_str()).cloned())
1967
.collect::<PolarsResult<Vec<_>>>()?
1968
};
1969
1970
Ok(selected)
1971
}
1972
1973
fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1974
// If there is a filtered column, its length gives the number of rows left.
1975
if let Some(fst) = filtered.first() {
1976
return fst.len();
1977
}
1978
1979
// Otherwise, count the number of rows that pass the mask and return that as the height.
1980
let num_trues = mask.num_trues();
1981
if mask.len() == self.height() {
1982
num_trues
1983
} else {
1984
// This is for broadcasting masks
1985
debug_assert!(num_trues == 0 || num_trues == 1);
1986
self.height() * num_trues
1987
}
1988
}
1989
1990
/// Take the [`DataFrame`] rows by a boolean mask.
1991
///
1992
/// # Example
1993
///
1994
/// ```
1995
/// # use polars_core::prelude::*;
1996
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1997
/// let mask = df.column("sepal_width")?.is_not_null();
1998
/// df.filter(&mask)
1999
/// }
2000
/// ```
2001
pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2002
let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
2003
let height = self.filter_height(&new_col, mask);
2004
2005
Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2006
}
2007
2008
/// Same as `filter` but does not parallelize.
2009
pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2010
let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
2011
let height = self.filter_height(&new_col, mask);
2012
2013
Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2014
}
2015
2016
/// Take [`DataFrame`] rows by index values.
2017
///
2018
/// # Example
2019
///
2020
/// ```
2021
/// # use polars_core::prelude::*;
2022
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2023
/// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2024
/// df.take(&idx)
2025
/// }
2026
/// ```
2027
pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2028
let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2029
2030
Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2031
}
2032
2033
/// # Safety
2034
/// The indices must be in-bounds.
2035
pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2036
self.take_unchecked_impl(idx, true)
2037
}
2038
2039
/// # Safety
2040
/// The indices must be in-bounds.
2041
pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
2042
let cols = if allow_threads {
2043
POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx)))
2044
} else {
2045
self._apply_columns(&|s| s.take_unchecked(idx))
2046
};
2047
unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2048
}
2049
2050
/// # Safety
2051
/// The indices must be in-bounds.
2052
pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2053
self.take_slice_unchecked_impl(idx, true)
2054
}
2055
2056
/// # Safety
2057
/// The indices must be in-bounds.
2058
pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2059
let cols = if allow_threads {
2060
POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx)))
2061
} else {
2062
self._apply_columns(&|s| s.take_slice_unchecked(idx))
2063
};
2064
unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2065
}
2066
2067
/// Rename a column in the [`DataFrame`].
2068
///
2069
/// # Example
2070
///
2071
/// ```
2072
/// # use polars_core::prelude::*;
2073
/// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2074
/// let original_name = "foo";
2075
/// let new_name = "bar";
2076
/// df.rename(original_name, new_name.into())
2077
/// }
2078
/// ```
2079
pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2080
if column == name.as_str() {
2081
return Ok(self);
2082
}
2083
polars_ensure!(
2084
!self.schema().contains(&name),
2085
Duplicate: "column rename attempted with already existing name \"{name}\""
2086
);
2087
2088
self.get_column_index(column)
2089
.and_then(|idx| self.columns.get_mut(idx))
2090
.ok_or_else(|| polars_err!(col_not_found = column))
2091
.map(|c| c.rename(name))?;
2092
self.clear_schema();
2093
2094
Ok(self)
2095
}
2096
2097
/// Sort [`DataFrame`] in place.
2098
///
2099
/// See [`DataFrame::sort`] for more details.
2100
pub fn sort_in_place(
2101
&mut self,
2102
by: impl IntoVec<PlSmallStr>,
2103
sort_options: SortMultipleOptions,
2104
) -> PolarsResult<&mut Self> {
2105
let by_column = self.select_columns(by)?;
2106
self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2107
Ok(self)
2108
}
2109
2110
#[doc(hidden)]
2111
/// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
2112
pub fn sort_impl(
2113
&self,
2114
by_column: Vec<Column>,
2115
mut sort_options: SortMultipleOptions,
2116
slice: Option<(i64, usize)>,
2117
) -> PolarsResult<Self> {
2118
if by_column.is_empty() {
2119
// If no columns selected, any order (including original order) is correct.
2120
return if let Some((offset, len)) = slice {
2121
Ok(self.slice(offset, len))
2122
} else {
2123
Ok(self.clone())
2124
};
2125
}
2126
2127
// Note that the by_column argument can also contain expressions evaluated by
2128
// polars-lazy that may not even be present in this dataframe. Therefore,
2129
// when we try to set the first column as sorted, we ignore the error, as
2130
// such expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i).
2131
let first_descending = sort_options.descending[0];
2132
let first_by_column = by_column[0].name().to_string();
2133
2134
let set_sorted = |df: &mut DataFrame| {
2135
// Mark the first sort column as sorted; if the column does not exist it
2136
// is ok, because we sorted by an expression not present in the dataframe
2137
let _ = df.apply(&first_by_column, |s| {
2138
let mut s = s.clone();
2139
if first_descending {
2140
s.set_sorted_flag(IsSorted::Descending)
2141
} else {
2142
s.set_sorted_flag(IsSorted::Ascending)
2143
}
2144
s
2145
});
2146
};
2147
if self.is_empty() {
2148
let mut out = self.clone();
2149
set_sorted(&mut out);
2150
return Ok(out);
2151
}
2152
2153
if let Some((0, k)) = slice {
2154
if k < self.len() {
2155
return self.bottom_k_impl(k, by_column, sort_options);
2156
}
2157
}
2158
// Check if the required column is already sorted; if so we can exit early
2159
// We can do so when there is only one column to sort by, for multiple columns
2160
// it would be more complicated to do so.
2161
#[cfg(feature = "dtype-categorical")]
2162
let is_not_categorical_enum =
2163
!(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2164
|| matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2165
2166
#[cfg(not(feature = "dtype-categorical"))]
2167
#[allow(non_upper_case_globals)]
2168
const is_not_categorical_enum: bool = true;
2169
2170
if by_column.len() == 1 && is_not_categorical_enum {
2171
let required_sorting = if sort_options.descending[0] {
2172
IsSorted::Descending
2173
} else {
2174
IsSorted::Ascending
2175
};
2176
// If the null count is 0 then nulls_last doesn't matter.
2177
// Safe to get the value at the last position since the dataframe is not empty (handled above).
2178
let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2179
&& ((by_column[0].null_count() == 0)
2180
|| by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2181
== sort_options.nulls_last[0]);
2182
2183
if no_sorting_required {
2184
return if let Some((offset, len)) = slice {
2185
Ok(self.slice(offset, len))
2186
} else {
2187
Ok(self.clone())
2188
};
2189
}
2190
}
2191
2192
let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2193
2194
// a lot of indirection in both sorting and take
2195
let mut df = self.clone();
2196
let df = df.as_single_chunk_par();
2197
let mut take = match (by_column.len(), has_nested) {
2198
(1, false) => {
2199
let s = &by_column[0];
2200
let options = SortOptions {
2201
descending: sort_options.descending[0],
2202
nulls_last: sort_options.nulls_last[0],
2203
multithreaded: sort_options.multithreaded,
2204
maintain_order: sort_options.maintain_order,
2205
limit: sort_options.limit,
2206
};
2207
// fast path for a frame with a single series
2208
// no need to compute the sort indices and then take by these indices
2209
// simply sort and return as frame
2210
if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2211
let mut out = s.sort_with(options)?;
2212
if let Some((offset, len)) = slice {
2213
out = out.slice(offset, len);
2214
}
2215
return Ok(out.into_frame());
2216
}
2217
s.arg_sort(options)
2218
},
2219
_ => {
2220
if sort_options.nulls_last.iter().all(|&x| x)
2221
|| has_nested
2222
|| std::env::var("POLARS_ROW_FMT_SORT").is_ok()
2223
{
2224
argsort_multiple_row_fmt(
2225
&by_column,
2226
sort_options.descending,
2227
sort_options.nulls_last,
2228
sort_options.multithreaded,
2229
)?
2230
} else {
2231
let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
2232
first
2233
.as_materialized_series()
2234
.arg_sort_multiple(&other, &sort_options)?
2235
}
2236
},
2237
};
2238
2239
if let Some((offset, len)) = slice {
2240
take = take.slice(offset, len);
2241
}
2242
2243
// SAFETY:
2244
// the created indices are in bounds
2245
let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
2246
set_sorted(&mut df);
2247
Ok(df)
2248
}
2249
2250
/// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2251
///
2252
/// This dataframe does not necessarily have a specified schema and may be changed at any
2253
/// point. It is primarily used for debugging.
2254
pub fn _to_metadata(&self) -> DataFrame {
2255
let num_columns = self.columns.len();
2256
2257
let mut column_names =
2258
StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2259
let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2260
let mut sorted_asc_ca =
2261
BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2262
let mut sorted_dsc_ca =
2263
BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2264
let mut fast_explode_list_ca =
2265
BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2266
let mut materialized_at_ca =
2267
StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2268
2269
for col in &self.columns {
2270
let flags = col.get_flags();
2271
2272
let (repr, materialized_at) = match col {
2273
Column::Series(s) => ("series", s.materialized_at()),
2274
Column::Partitioned(_) => ("partitioned", None),
2275
Column::Scalar(_) => ("scalar", None),
2276
};
2277
let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2278
let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2279
let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2280
2281
column_names.append_value(col.name().clone());
2282
repr_ca.append_value(repr);
2283
sorted_asc_ca.append_value(sorted_asc);
2284
sorted_dsc_ca.append_value(sorted_dsc);
2285
fast_explode_list_ca.append_value(fast_explode_list);
2286
materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2287
}
2288
2289
unsafe {
2290
DataFrame::new_no_checks(
2291
self.width(),
2292
vec![
2293
column_names.finish().into_column(),
2294
repr_ca.finish().into_column(),
2295
sorted_asc_ca.finish().into_column(),
2296
sorted_dsc_ca.finish().into_column(),
2297
fast_explode_list_ca.finish().into_column(),
2298
materialized_at_ca.finish().into_column(),
2299
],
2300
)
2301
}
2302
}
2303
2304
/// Return a sorted clone of this [`DataFrame`].
2305
///
2306
/// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
2307
/// # Example
2308
///
2309
/// Sort by a single column with default options:
2310
/// ```
2311
/// # use polars_core::prelude::*;
2312
/// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2313
/// df.sort(["sepal_width"], Default::default())
2314
/// }
2315
/// ```
2316
/// Sort by a single column with specific order:
2317
/// ```
2318
/// # use polars_core::prelude::*;
2319
/// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2320
/// df.sort(
2321
/// ["sepal_width"],
2322
/// SortMultipleOptions::new()
2323
/// .with_order_descending(descending)
2324
/// )
2325
/// }
2326
/// ```
2327
/// Sort by multiple columns with specifying order for each column:
2328
/// ```
2329
/// # use polars_core::prelude::*;
2330
/// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2331
/// df.sort(
2332
/// ["sepal_width", "sepal_length"],
2333
/// SortMultipleOptions::new()
2334
/// .with_order_descending_multi([false, true])
2335
/// )
2336
/// }
2337
/// ```
2338
/// See [`SortMultipleOptions`] for more options.
2339
///
2340
/// Also see [`DataFrame::sort_in_place`].
2341
pub fn sort(
2342
&self,
2343
by: impl IntoVec<PlSmallStr>,
2344
sort_options: SortMultipleOptions,
2345
) -> PolarsResult<Self> {
2346
let mut df = self.clone();
2347
df.sort_in_place(by, sort_options)?;
2348
Ok(df)
2349
}
2350
2351
/// Replace a column with a [`Series`].
2352
///
2353
/// # Example
2354
///
2355
/// ```rust
2356
/// # use polars_core::prelude::*;
2357
/// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2358
/// "Area (km²)" => [9_833_520, 9_596_961])?;
2359
/// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2360
///
2361
/// assert!(df.replace("Nation", s.clone()).is_err());
2362
/// assert!(df.replace("Country", s).is_ok());
2363
/// # Ok::<(), PolarsError>(())
2364
/// ```
2365
pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2366
self.apply(column, |_| new_col.into_series())
2367
}
2368
2369
/// Replace or update a column. The difference between this method and [DataFrame::with_column]
2370
/// is that here the `column` argument determines the name of the column and not the name
2371
/// of the `Series` passed to this method.
2372
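///
/// # Example
///
/// A minimal sketch (mirrors the `test_replace_or_add` unit test at the bottom of this module):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2, 3], "b" => [1, 2, 3])?;
///
/// // The column is added under the name "c", not under the Series' own name "bar".
/// df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;
/// assert_eq!(df.get_column_names(), &["a", "b", "c"]);
/// # Ok::<(), PolarsError>(())
/// ```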
pub fn replace_or_add<S: IntoSeries>(
2373
&mut self,
2374
column: PlSmallStr,
2375
new_col: S,
2376
) -> PolarsResult<&mut Self> {
2377
let mut new_col = new_col.into_series();
2378
new_col.rename(column);
2379
self.with_column(new_col)
2380
}
2381
2382
/// Replace column at index `idx` with a [`Series`].
2383
///
2384
/// # Example
2385
///
2386
/// ```ignore
2387
/// # use polars_core::prelude::*;
2388
/// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2389
/// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2390
/// let mut df = DataFrame::new(vec![s0, s1])?;
2391
///
2392
/// // Add 32 to get lowercase ascii values
2393
/// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2394
/// # Ok::<(), PolarsError>(())
2395
/// ```
2396
pub fn replace_column<C: IntoColumn>(
2397
&mut self,
2398
index: usize,
2399
new_column: C,
2400
) -> PolarsResult<&mut Self> {
2401
polars_ensure!(
2402
index < self.width(),
2403
ShapeMismatch:
2404
"unable to replace at index {}, the DataFrame has only {} columns",
2405
index, self.width(),
2406
);
2407
let mut new_column = new_column.into_column();
2408
polars_ensure!(
2409
new_column.len() == self.height(),
2410
ShapeMismatch:
2411
"unable to replace a column, series length {} doesn't match the DataFrame height {}",
2412
new_column.len(), self.height(),
2413
);
2414
let old_col = &mut self.columns[index];
2415
mem::swap(old_col, &mut new_column);
2416
self.clear_schema();
2417
Ok(self)
2418
}
2419
2420
/// Apply a closure to a column. This is the recommended way to do in place modification.
2421
///
2422
/// # Example
2423
///
2424
/// ```rust
2425
/// # use polars_core::prelude::*;
2426
/// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2427
/// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2428
/// let mut df = DataFrame::new(vec![s0, s1])?;
2429
///
2430
/// fn str_to_len(str_val: &Column) -> Column {
2431
/// str_val.str()
2432
/// .unwrap()
2433
/// .into_iter()
2434
/// .map(|opt_name: Option<&str>| {
2435
/// opt_name.map(|name: &str| name.len() as u32)
2436
/// })
2437
/// .collect::<UInt32Chunked>()
2438
/// .into_column()
2439
/// }
2440
///
2441
/// // Replace the names column by the length of the names.
2442
/// df.apply("names", str_to_len);
2443
/// # Ok::<(), PolarsError>(())
2444
/// ```
2445
/// Results in:
2446
///
2447
/// ```text
2448
/// +--------+-------+
2449
/// | foo | names |
2450
/// | --- | --- |
2451
/// | str | u32 |
2452
/// +========+=======+
2453
/// | "ham" | 4 |
2454
/// +--------+-------+
2455
/// | "spam" | 6 |
2456
/// +--------+-------+
2457
/// | "egg" | 3 |
2458
/// +--------+-------+
2459
/// ```
2460
pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2461
where
2462
F: FnOnce(&Column) -> C,
2463
C: IntoColumn,
2464
{
2465
let idx = self.check_name_to_idx(name)?;
2466
self.apply_at_idx(idx, f)?;
2467
Ok(self)
2468
}
2469
2470
/// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2471
/// modification.
2472
///
2473
/// # Example
2474
///
2475
/// ```rust
2476
/// # use polars_core::prelude::*;
2477
/// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2478
/// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2479
/// let mut df = DataFrame::new(vec![s0, s1])?;
2480
///
2481
/// // Add 32 to get lowercase ascii values
2482
/// df.apply_at_idx(1, |s| s + 32);
2483
/// # Ok::<(), PolarsError>(())
2484
/// ```
2485
/// Results in:
2486
///
2487
/// ```text
2488
/// +--------+-------+
2489
/// | foo | ascii |
2490
/// | --- | --- |
2491
/// | str | i32 |
2492
/// +========+=======+
2493
/// | "ham" | 102 |
2494
/// +--------+-------+
2495
/// | "spam" | 111 |
2496
/// +--------+-------+
2497
/// | "egg" | 111 |
2498
/// +--------+-------+
2499
/// ```
2500
pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2501
where
2502
F: FnOnce(&Column) -> C,
2503
C: IntoColumn,
2504
{
2505
let df_height = self.height();
2506
let width = self.width();
2507
let col = self.columns.get_mut(idx).ok_or_else(|| {
2508
polars_err!(
2509
ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2510
idx, width
2511
)
2512
})?;
2513
let name = col.name().clone();
2514
let dtype_before = col.dtype().clone();
2515
let new_col = f(col).into_column();
2516
match new_col.len() {
2517
1 => {
2518
let new_col = new_col.new_from_index(0, df_height);
2519
let _ = mem::replace(col, new_col);
2520
},
2521
len if (len == df_height) => {
2522
let _ = mem::replace(col, new_col);
2523
},
2524
len => polars_bail!(
2525
ShapeMismatch:
2526
"resulting Series has length {} while the DataFrame has height {}",
2527
len, df_height
2528
),
2529
}
2530
2531
// make sure the name remains the same after applying the closure
2532
unsafe {
2533
let col = self.columns.get_unchecked_mut(idx);
2534
col.rename(name);
2535
2536
if col.dtype() != &dtype_before {
2537
self.clear_schema();
2538
}
2539
}
2540
Ok(self)
2541
}
2542
2543
/// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2544
/// modification.
2545
///
2546
/// # Example
2547
///
2548
/// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2549
///
2550
/// ```rust
2551
/// # use polars_core::prelude::*;
2552
/// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2553
/// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2554
/// let mut df = DataFrame::new(vec![s0, s1])?;
2555
///
2556
/// let idx = vec![0, 1, 4];
2557
///
2558
/// df.try_apply("foo", |c| {
2559
/// c.str()?
2560
/// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2561
/// });
2562
/// # Ok::<(), PolarsError>(())
2563
/// ```
2564
/// Results in:
2565
///
2566
/// ```text
2567
/// +---------------------+--------+
2568
/// | foo | values |
2569
/// | --- | --- |
2570
/// | str | i32 |
2571
/// +=====================+========+
2572
/// | "ham-is-modified" | 1 |
2573
/// +---------------------+--------+
2574
/// | "spam-is-modified" | 2 |
2575
/// +---------------------+--------+
2576
/// | "egg" | 3 |
2577
/// +---------------------+--------+
2578
/// | "bacon" | 4 |
2579
/// +---------------------+--------+
2580
/// | "quack-is-modified" | 5 |
2581
/// +---------------------+--------+
2582
/// ```
2583
pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2584
where
2585
F: FnOnce(&Column) -> PolarsResult<C>,
2586
C: IntoColumn,
2587
{
2588
let width = self.width();
2589
let col = self.columns.get_mut(idx).ok_or_else(|| {
2590
polars_err!(
2591
ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2592
idx, width
2593
)
2594
})?;
2595
let name = col.name().clone();
2596
2597
let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2598
2599
// make sure the name remains the same after applying the closure
2600
unsafe {
2601
let col = self.columns.get_unchecked_mut(idx);
2602
col.rename(name);
2603
}
2604
Ok(self)
2605
}
2606
2607
/// Apply a closure that may fail to a column. This is the recommended way to do in place
2608
/// modification.
2609
///
2610
/// # Example
2611
///
2612
/// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2613
///
2614
/// ```rust
2615
/// # use polars_core::prelude::*;
2616
/// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2617
/// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2618
/// let mut df = DataFrame::new(vec![s0, s1])?;
2619
///
2620
/// // create a mask
2621
/// let values = df.column("values")?.as_materialized_series();
2622
/// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2623
///
2624
/// df.try_apply("foo", |c| {
2625
/// c.str()?
2626
/// .set(&mask, Some("not_within_bounds"))
2627
/// });
2628
/// # Ok::<(), PolarsError>(())
2629
/// ```
2630
/// Results in:
2631
///
2632
/// ```text
2633
/// +---------------------+--------+
2634
/// | foo | values |
2635
/// | --- | --- |
2636
/// | str | i32 |
2637
/// +=====================+========+
2638
/// | "not_within_bounds" | 1 |
2639
/// +---------------------+--------+
2640
/// | "spam" | 2 |
2641
/// +---------------------+--------+
2642
/// | "egg" | 3 |
2643
/// +---------------------+--------+
2644
/// | "bacon" | 4 |
2645
/// +---------------------+--------+
2646
/// | "not_within_bounds" | 5 |
2647
/// +---------------------+--------+
2648
/// ```
2649
pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2650
where
2651
F: FnOnce(&Series) -> PolarsResult<C>,
2652
C: IntoColumn,
2653
{
2654
let idx = self.try_get_column_index(column)?;
2655
self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2656
}
2657
2658
/// Slice the [`DataFrame`] along the rows.
2659
///
2660
/// # Example
2661
///
2662
/// ```rust
2663
/// # use polars_core::prelude::*;
2664
/// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2665
/// "Color" => ["Green", "Red", "White", "White", "Red"])?;
2666
/// let sl: DataFrame = df.slice(2, 3);
2667
///
2668
/// assert_eq!(sl.shape(), (3, 2));
2669
/// println!("{}", sl);
2670
/// # Ok::<(), PolarsError>(())
2671
/// ```
2672
/// Output:
2673
/// ```text
2674
/// shape: (3, 2)
2675
/// +-------+-------+
2676
/// | Fruit | Color |
2677
/// | --- | --- |
2678
/// | str | str |
2679
/// +=======+=======+
2680
/// | Grape | White |
2681
/// +-------+-------+
2682
/// | Fig | White |
2683
/// +-------+-------+
2684
/// | Fig | Red |
2685
/// +-------+-------+
2686
/// ```
2687
#[must_use]
2688
pub fn slice(&self, offset: i64, length: usize) -> Self {
2689
if offset == 0 && length == self.height() {
2690
return self.clone();
2691
}
2692
if length == 0 {
2693
return self.clear();
2694
}
2695
let col = self
2696
.columns
2697
.iter()
2698
.map(|s| s.slice(offset, length))
2699
.collect::<Vec<_>>();
2700
2701
let height = if let Some(fst) = col.first() {
2702
fst.len()
2703
} else {
2704
let (_, length) = slice_offsets(offset, length, self.height());
2705
length
2706
};
2707
2708
unsafe { DataFrame::new_no_checks(height, col) }
2709
}
2710
2711
/// Split [`DataFrame`] at the given `offset`.
2712
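///
/// # Example
///
/// A minimal sketch (illustrative only):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("x" => [1, 2, 3, 4, 5])?;
/// let (left, right) = df.split_at(2);
///
/// assert_eq!(left.shape(), (2, 1));
/// assert_eq!(right.shape(), (3, 1));
/// # Ok::<(), PolarsError>(())
/// ```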
pub fn split_at(&self, offset: i64) -> (Self, Self) {
2713
let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2714
2715
let (idx, _) = slice_offsets(offset, 0, self.height());
2716
2717
let a = unsafe { DataFrame::new_no_checks(idx, a) };
2718
let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2719
(a, b)
2720
}
2721
2722
pub fn clear(&self) -> Self {
2723
let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2724
unsafe { DataFrame::new_no_checks(0, col) }
2725
}
2726
2727
#[must_use]
2728
pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2729
if offset == 0 && length == self.height() {
2730
return self.clone();
2731
}
2732
let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2733
unsafe { DataFrame::new_no_checks(length, columns) }
2734
}
2735
2736
#[must_use]
2737
pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2738
if offset == 0 && length == self.height() {
2739
return self.clone();
2740
}
2741
// @scalar-opt
2742
let columns = self._apply_columns(&|s| {
2743
let mut out = s.slice(offset, length);
2744
out.shrink_to_fit();
2745
out
2746
});
2747
unsafe { DataFrame::new_no_checks(length, columns) }
2748
}
2749
2750
/// Get the head of the [`DataFrame`].
2751
///
2752
/// # Example
2753
///
2754
/// ```rust
2755
/// # use polars_core::prelude::*;
2756
/// let countries: DataFrame =
2757
/// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2758
/// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2759
/// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2760
/// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2761
/// assert_eq!(countries.shape(), (5, 4));
2762
///
2763
/// println!("{}", countries.head(Some(3)));
2764
/// # Ok::<(), PolarsError>(())
2765
/// ```
2766
///
2767
/// Output:
2768
///
2769
/// ```text
2770
/// shape: (3, 4)
2771
/// +--------------------+---------------+---------------+------------+
2772
/// | Rank by GDP (2021) | Continent | Country | Capital |
2773
/// | --- | --- | --- | --- |
2774
/// | i32 | str | str | str |
2775
/// +====================+===============+===============+============+
2776
/// | 1 | North America | United States | Washington |
2777
/// +--------------------+---------------+---------------+------------+
2778
/// | 2 | Asia | China | Beijing |
2779
/// +--------------------+---------------+---------------+------------+
2780
/// | 3 | Asia | Japan | Tokyo |
2781
/// +--------------------+---------------+---------------+------------+
2782
/// ```
2783
#[must_use]
2784
pub fn head(&self, length: Option<usize>) -> Self {
2785
let col = self
2786
.columns
2787
.iter()
2788
.map(|c| c.head(length))
2789
.collect::<Vec<_>>();
2790
2791
let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2792
let height = usize::min(height, self.height());
2793
unsafe { DataFrame::new_no_checks(height, col) }
2794
}
2795
2796
/// Get the tail of the [`DataFrame`].
2797
///
2798
/// # Example
2799
///
2800
/// ```rust
2801
/// # use polars_core::prelude::*;
2802
/// let countries: DataFrame =
2803
/// df!("Rank (2021)" => [105, 106, 107, 108, 109],
2804
/// "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2805
/// "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2806
/// assert_eq!(countries.shape(), (5, 3));
2807
///
2808
/// println!("{}", countries.tail(Some(2)));
2809
/// # Ok::<(), PolarsError>(())
2810
/// ```
2811
///
2812
/// Output:
2813
///
2814
/// ```text
2815
/// shape: (2, 3)
2816
/// +-------------+--------------------+---------+
2817
/// | Rank (2021) | Apple Price (€/kg) | Country |
2818
/// | --- | --- | --- |
2819
/// | i32 | f64 | str |
2820
/// +=============+====================+=========+
2821
/// | 108 | 0.63 | Syria |
2822
/// +-------------+--------------------+---------+
2823
/// | 109 | 0.63 | Turkey |
2824
/// +-------------+--------------------+---------+
2825
/// ```
2826
#[must_use]
2827
pub fn tail(&self, length: Option<usize>) -> Self {
2828
let col = self
2829
.columns
2830
.iter()
2831
.map(|c| c.tail(length))
2832
.collect::<Vec<_>>();
2833
2834
let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2835
let height = usize::min(height, self.height());
2836
unsafe { DataFrame::new_no_checks(height, col) }
2837
}
2838
2839
/// Iterate over the chunks of this [`DataFrame`] as Arrow RecordBatches.
2840
///
2841
/// # Panics
2842
///
2843
/// Panics if the [`DataFrame`] that is passed is not rechunked.
2844
///
2845
/// This responsibility is left to the caller as we don't want to take mutable references here,
2846
/// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2847
/// as well.
2848
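///
/// # Example
///
/// A minimal sketch (mirrors the `test_recordbatch_iterator` unit test at the bottom of this module):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("foo" => [1, 2, 3, 4, 5])?;
///
/// let mut chunks = df.iter_chunks(CompatLevel::newest(), false);
/// assert_eq!(chunks.next().unwrap().len(), 5);
/// assert!(chunks.next().is_none());
/// # Ok::<(), PolarsError>(())
/// ```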
pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter<'_> {
2849
debug_assert!(!self.should_rechunk(), "expected equal chunks");
2850
// Only keep parallelism when we must convert to an older `compat_level` and there are
2851
// string/binary columns, as we then must allocate (non-view) arrow strings/binaries.
2852
let must_convert = compat_level.0 == 0;
2853
let parallel = parallel
2854
&& must_convert
2855
&& self.columns.len() > 1
2856
&& self
2857
.columns
2858
.iter()
2859
.any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2860
2861
RecordBatchIter {
2862
columns: &self.columns,
2863
schema: Arc::new(
2864
self.columns
2865
.iter()
2866
.map(|c| c.field().to_arrow(compat_level))
2867
.collect(),
2868
),
2869
idx: 0,
2870
n_chunks: self.first_col_n_chunks(),
2871
compat_level,
2872
parallel,
2873
}
2874
}
2875
2876
/// Iterate over the chunks of this [`DataFrame`] as Arrow RecordBatches of physical values.
2877
///
2878
/// # Panics
2879
///
2880
/// Panics if the [`DataFrame`] that is passed is not rechunked.
2881
///
2882
/// This responsibility is left to the caller as we don't want to take mutable references here,
2883
/// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2884
/// as well.
2885
pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2886
debug_assert!(!self.should_rechunk());
2887
PhysRecordBatchIter {
2888
schema: Arc::new(
2889
self.get_columns()
2890
.iter()
2891
.map(|c| c.field().to_arrow(CompatLevel::newest()))
2892
.collect(),
2893
),
2894
arr_iters: self
2895
.materialized_column_iter()
2896
.map(|s| s.chunks().iter())
2897
.collect(),
2898
}
2899
}
2900
2901
/// Get a [`DataFrame`] where every column has its values in reversed order (i.e. the rows are reversed).
2902
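///
/// # Example
///
/// A minimal sketch (illustrative only):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("x" => [1, 2, 3])?;
/// let reversed = df.reverse();
///
/// let expected = df!("x" => [3, 2, 1])?;
/// assert!(reversed.equals(&expected));
/// # Ok::<(), PolarsError>(())
/// ```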
#[must_use]
2903
pub fn reverse(&self) -> Self {
2904
let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2905
unsafe { DataFrame::new_no_checks(self.height(), col) }
2906
}
2907
2908
/// Shift the values by a given period and fill the parts that will be empty due to this operation
2909
/// with `None` values.
2910
///
2911
/// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
2912
#[must_use]
2913
pub fn shift(&self, periods: i64) -> Self {
2914
let col = self._apply_columns_par(&|s| s.shift(periods));
2915
unsafe { DataFrame::new_no_checks(self.height(), col) }
2916
}
2917
2918
/// Replace None values with one of the following strategies:
2919
/// * Forward fill (replace None with the previous value)
2920
/// * Backward fill (replace None with the next value)
2921
/// * Mean fill (replace None with the mean of the whole array)
2922
/// * Min fill (replace None with the minimum of the whole array)
2923
/// * Max fill (replace None with the maximum of the whole array)
2924
///
2925
/// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
2926
pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2927
let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2928
2929
Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2930
}
2931
2932
/// Pipe different functions/closure operations that work on a DataFrame together.
2933
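///
/// # Example
///
/// A minimal sketch (the `keep_first_two` helper is made up for illustration):
///
/// ```rust
/// # use polars_core::prelude::*;
/// fn keep_first_two(df: DataFrame) -> PolarsResult<DataFrame> {
///     Ok(df.head(Some(2)))
/// }
///
/// let df: DataFrame = df!("x" => [1, 2, 3])?;
/// let out = df.pipe(keep_first_two)?;
/// assert_eq!(out.height(), 2);
/// # Ok::<(), PolarsError>(())
/// ```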
pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
2934
where
2935
F: Fn(DataFrame) -> PolarsResult<B>,
2936
{
2937
f(self)
2938
}
2939
2940
/// Pipe different functions/closure operations that work on a DataFrame together.
2941
pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
2942
where
2943
F: Fn(&mut DataFrame) -> PolarsResult<B>,
2944
{
2945
f(self)
2946
}
2947
2948
/// Pipe different functions/closure operations that work on a DataFrame together.
2949
pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
2950
where
2951
F: Fn(DataFrame, Args) -> PolarsResult<B>,
2952
{
2953
f(self, args)
2954
}
2955
2956
/// Drop duplicate rows from a [`DataFrame`].
2957
/// *This fails when there is a column of type `List` in the DataFrame.*
2958
///
2959
/// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
2960
///
2961
/// # Example
2962
///
2963
/// ```no_run
2964
/// # use polars_core::prelude::*;
2965
/// let df = df! {
2966
/// "flt" => [1., 1., 2., 2., 3., 3.],
2967
/// "int" => [1, 1, 2, 2, 3, 3, ],
2968
/// "str" => ["a", "a", "b", "b", "c", "c"]
2969
/// }?;
2970
///
2971
/// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2972
/// # Ok::<(), PolarsError>(())
2973
/// ```
2974
/// Returns
2975
///
2976
/// ```text
2977
/// +-----+-----+-----+
2978
/// | flt | int | str |
2979
/// | --- | --- | --- |
2980
/// | f64 | i32 | str |
2981
/// +=====+=====+=====+
2982
/// | 1 | 1 | "a" |
2983
/// +-----+-----+-----+
2984
/// | 2 | 2 | "b" |
2985
/// +-----+-----+-----+
2986
/// | 3 | 3 | "c" |
2987
/// +-----+-----+-----+
2988
/// ```
2989
#[cfg(feature = "algorithm_group_by")]
2990
pub fn unique_stable(
2991
&self,
2992
subset: Option<&[String]>,
2993
keep: UniqueKeepStrategy,
2994
slice: Option<(i64, usize)>,
2995
) -> PolarsResult<DataFrame> {
2996
self.unique_impl(
2997
true,
2998
subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2999
keep,
3000
slice,
3001
)
3002
}
3003
3004
/// Unstable distinct. See [`DataFrame::unique_stable`].
3005
#[cfg(feature = "algorithm_group_by")]
3006
pub fn unique<I, S>(
3007
&self,
3008
subset: Option<&[String]>,
3009
keep: UniqueKeepStrategy,
3010
slice: Option<(i64, usize)>,
3011
) -> PolarsResult<DataFrame> {
3012
self.unique_impl(
3013
false,
3014
subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3015
keep,
3016
slice,
3017
)
3018
}
3019
3020
#[cfg(feature = "algorithm_group_by")]
3021
pub fn unique_impl(
3022
&self,
3023
maintain_order: bool,
3024
subset: Option<Vec<PlSmallStr>>,
3025
keep: UniqueKeepStrategy,
3026
slice: Option<(i64, usize)>,
3027
) -> PolarsResult<Self> {
3028
let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3029
let mut df = self.clone();
3030
// `take` on multiple chunks performs poorly, so rechunk to a single chunk first.
3031
df.as_single_chunk_par();
3032
3033
let columns = match (keep, maintain_order) {
3034
(UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3035
let gb = df.group_by_stable(names)?;
3036
let groups = gb.get_groups();
3037
let (offset, len) = slice.unwrap_or((0, groups.len()));
3038
let groups = groups.slice(offset, len);
3039
df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3040
},
3041
(UniqueKeepStrategy::Last, true) => {
3042
// To maintain order by the last values, the stable groups cannot be used directly, as they
3043
// are ordered by the first value of each group.
3044
let gb = df.group_by_stable(names)?;
3045
let groups = gb.get_groups();
3046
3047
let last_idx: NoNull<IdxCa> = groups
3048
.iter()
3049
.map(|g| match g {
3050
GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3051
GroupsIndicator::Slice([first, len]) => first + len - 1,
3052
})
3053
.collect();
3054
3055
let mut last_idx = last_idx.into_inner().sort(false);
3056
3057
if let Some((offset, len)) = slice {
3058
last_idx = last_idx.slice(offset, len);
3059
}
3060
3061
let last_idx = NoNull::new(last_idx);
3062
let out = unsafe { df.take_unchecked(&last_idx) };
3063
return Ok(out);
3064
},
3065
(UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3066
let gb = df.group_by(names)?;
3067
let groups = gb.get_groups();
3068
let (offset, len) = slice.unwrap_or((0, groups.len()));
3069
let groups = groups.slice(offset, len);
3070
df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3071
},
3072
(UniqueKeepStrategy::Last, false) => {
3073
let gb = df.group_by(names)?;
3074
let groups = gb.get_groups();
3075
let (offset, len) = slice.unwrap_or((0, groups.len()));
3076
let groups = groups.slice(offset, len);
3077
df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3078
},
3079
(UniqueKeepStrategy::None, _) => {
3080
let df_part = df.select(names)?;
3081
let mask = df_part.is_unique()?;
3082
let mut filtered = df.filter(&mask)?;
3083
3084
if let Some((offset, len)) = slice {
3085
filtered = filtered.slice(offset, len);
3086
}
3087
return Ok(filtered);
3088
},
3089
};
3090
let height = Self::infer_height(&columns);
3091
Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3092
}
3093
3094
/// Get a mask of all the unique rows in the [`DataFrame`].
3095
///
3096
/// # Example
3097
///
3098
/// ```no_run
3099
/// # use polars_core::prelude::*;
3100
/// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3101
/// "ISIN" => ["US0378331005", "US5949181045"])?;
3102
/// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3103
///
3104
/// assert!(ca.all());
3105
/// # Ok::<(), PolarsError>(())
3106
/// ```
3107
#[cfg(feature = "algorithm_group_by")]
3108
pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3109
let gb = self.group_by(self.get_column_names_owned())?;
3110
let groups = gb.get_groups();
3111
Ok(is_unique_helper(
3112
groups,
3113
self.height() as IdxSize,
3114
true,
3115
false,
3116
))
3117
}
3118
3119
/// Get a mask of all the duplicated rows in the [`DataFrame`].
3120
///
3121
/// # Example
3122
///
3123
/// ```no_run
3124
/// # use polars_core::prelude::*;
3125
/// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3126
/// "ISIN" => ["US02079K3059", "US02079K1079"])?;
3127
/// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3128
///
3129
/// assert!(!ca.all());
3130
/// # Ok::<(), PolarsError>(())
3131
/// ```
3132
#[cfg(feature = "algorithm_group_by")]
3133
pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3134
let gb = self.group_by(self.get_column_names_owned())?;
3135
let groups = gb.get_groups();
3136
Ok(is_unique_helper(
3137
groups,
3138
self.height() as IdxSize,
3139
false,
3140
true,
3141
))
3142
}
3143
3144
/// Create a new [`DataFrame`] that shows the null counts per column.
3145
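///
/// # Example
///
/// A small illustrative sketch (hypothetical data; `df!` accepts `Option` values for nullable columns):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [Some(1), None, Some(3)],
///                         "b" => [None, None, Some(3)])?;
/// let counts = df.null_count();
///
/// // One row, one count per column.
/// assert_eq!(counts.shape(), (1, 2));
/// # Ok::<(), PolarsError>(())
/// ```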
#[must_use]
3146
pub fn null_count(&self) -> Self {
3147
let cols = self
3148
.columns
3149
.iter()
3150
.map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3151
.collect();
3152
unsafe { Self::new_no_checks(1, cols) }
3153
}
3154
3155
/// Hash and combine the row values
3156
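///
/// # Example
///
/// A minimal sketch (illustrative; passing `None` uses a default hasher builder):
///
/// ```no_run
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2, 3])?;
///
/// // One u64 hash per row.
/// let hashes = df.hash_rows(None)?;
/// assert_eq!(hashes.len(), df.height());
/// # Ok::<(), PolarsError>(())
/// ```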
#[cfg(feature = "row_hash")]
3157
pub fn hash_rows(
3158
&mut self,
3159
hasher_builder: Option<PlSeedableRandomStateQuality>,
3160
) -> PolarsResult<UInt64Chunked> {
3161
let dfs = split_df(self, POOL.current_num_threads(), false);
3162
let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3163
3164
let mut iter = cas.into_iter();
3165
let mut acc_ca = iter.next().unwrap();
3166
for ca in iter {
3167
acc_ca.append(&ca)?;
3168
}
3169
Ok(acc_ca.rechunk().into_owned())
3170
}
3171
3172
/// Get the supertype of the columns in this DataFrame
3173
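///
/// # Example
///
/// An illustrative sketch (assuming the usual numeric supertype rules, e.g. `Int32` + `Float64` -> `Float64`):
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("ints" => [1i32, 2, 3],
///                         "floats" => [1.0f64, 2.5, 3.0])?;
///
/// let supertype = df.get_supertype().unwrap()?;
/// assert_eq!(supertype, DataType::Float64);
/// # Ok::<(), PolarsError>(())
/// ```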
pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3174
self.columns
3175
.iter()
3176
.map(|s| Ok(s.dtype().clone()))
3177
.reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3178
}
3179
3180
/// Take by index values given by the slice `idx`.
3181
/// # Warning
3182
/// Be careful with allowing threads when calling this in a large hot loop;
3183
/// every thread split may land on the rayon stack and lead to a stack overflow (SO).
3184
#[doc(hidden)]
3185
pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3186
self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3187
}
3188
3189
/// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3190
/// if the index value in `idx` are sorted. This will maintain sorted flags.
3191
///
3192
/// # Warning
3193
/// Be careful with allowing threads when calling this in a large hot loop
3194
/// every thread split may be on rayon stack and lead to SO
3195
#[doc(hidden)]
3196
pub unsafe fn _take_unchecked_slice_sorted(
3197
&self,
3198
idx: &[IdxSize],
3199
allow_threads: bool,
3200
sorted: IsSorted,
3201
) -> Self {
3202
#[cfg(debug_assertions)]
3203
{
3204
if idx.len() > 2 {
3205
match sorted {
3206
IsSorted::Ascending => {
3207
assert!(idx[0] <= idx[idx.len() - 1]);
3208
},
3209
IsSorted::Descending => {
3210
assert!(idx[0] >= idx[idx.len() - 1]);
3211
},
3212
_ => {},
3213
}
3214
}
3215
}
3216
let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3217
ca.set_sorted_flag(sorted);
3218
self.take_unchecked_impl(&ca, allow_threads)
3219
}
3220
3221
#[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3222
#[doc(hidden)]
3223
pub fn _partition_by_impl(
3224
&self,
3225
cols: &[PlSmallStr],
3226
stable: bool,
3227
include_key: bool,
3228
parallel: bool,
3229
) -> PolarsResult<Vec<DataFrame>> {
3230
let selected_keys = self.select_columns(cols.iter().cloned())?;
3231
let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3232
let groups = groups.take_groups();
3233
3234
// drop key columns prior to calculation if requested
3235
let df = if include_key {
3236
self.clone()
3237
} else {
3238
self.drop_many(cols.iter().cloned())
3239
};
3240
3241
if parallel {
3242
// Don't allow threads in the inner takes:
3243
// take already parallelizes a lot internally and nesting it may easily cause a stack overflow.
3244
POOL.install(|| {
3245
match groups.as_ref() {
3246
GroupsType::Idx(idx) => {
3247
// Rechunk as the gather may rechunk for every group #17562.
3248
let mut df = df.clone();
3249
df.as_single_chunk_par();
3250
Ok(idx
3251
.into_par_iter()
3252
.map(|(_, group)| {
3253
// groups are in bounds
3254
unsafe {
3255
df._take_unchecked_slice_sorted(
3256
group,
3257
false,
3258
IsSorted::Ascending,
3259
)
3260
}
3261
})
3262
.collect())
3263
},
3264
GroupsType::Slice { groups, .. } => Ok(groups
3265
.into_par_iter()
3266
.map(|[first, len]| df.slice(*first as i64, *len as usize))
3267
.collect()),
3268
}
3269
})
3270
} else {
3271
match groups.as_ref() {
3272
GroupsType::Idx(idx) => {
3273
// Rechunk as the gather may rechunk for every group #17562.
3274
let mut df = df;
3275
df.as_single_chunk();
3276
Ok(idx
3277
.into_iter()
3278
.map(|(_, group)| {
3279
// groups are in bounds
3280
unsafe {
3281
df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3282
}
3283
})
3284
.collect())
3285
},
3286
GroupsType::Slice { groups, .. } => Ok(groups
3287
.iter()
3288
.map(|[first, len]| df.slice(*first as i64, *len as usize))
3289
.collect()),
3290
}
3291
}
3292
}
3293
3294
/// Split into multiple DataFrames partitioned by groups
3295
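///
/// # Example
///
/// A minimal sketch (illustrative column names; the order of the returned frames is not
/// guaranteed here, use [`DataFrame::partition_by_stable`] if the group order matters):
///
/// ```no_run
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("group" => ["a", "a", "b"],
///                         "value" => [1, 2, 3])?;
///
/// // One frame per distinct key in "group"; keys are kept because `include_key` is true.
/// let parts: Vec<DataFrame> = df.partition_by(["group"], true)?;
/// assert_eq!(parts.len(), 2);
/// # Ok::<(), PolarsError>(())
/// ```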
#[cfg(feature = "partition_by")]
3296
pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3297
where
3298
I: IntoIterator<Item = S>,
3299
S: Into<PlSmallStr>,
3300
{
3301
let cols = cols
3302
.into_iter()
3303
.map(Into::into)
3304
.collect::<Vec<PlSmallStr>>();
3305
self._partition_by_impl(cols.as_slice(), false, include_key, true)
3306
}
3307
3308
/// Split into multiple DataFrames partitioned by groups
3309
/// The order of the groups is maintained.
3310
#[cfg(feature = "partition_by")]
3311
pub fn partition_by_stable<I, S>(
3312
&self,
3313
cols: I,
3314
include_key: bool,
3315
) -> PolarsResult<Vec<DataFrame>>
3316
where
3317
I: IntoIterator<Item = S>,
3318
S: Into<PlSmallStr>,
3319
{
3320
let cols = cols
3321
.into_iter()
3322
.map(Into::into)
3323
.collect::<Vec<PlSmallStr>>();
3324
self._partition_by_impl(cols.as_slice(), true, include_key, true)
3325
}
3326
3327
/// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3328
/// inserted as columns.
3329
#[cfg(feature = "dtype-struct")]
3330
pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {
3331
let cols = cols.into_vec();
3332
self.unnest_impl(cols.into_iter().collect())
3333
}
3334
3335
#[cfg(feature = "dtype-struct")]
3336
fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {
3337
let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3338
let mut count = 0;
3339
for s in &self.columns {
3340
if cols.contains(s.name()) {
3341
let ca = s.struct_()?.clone();
3342
new_cols.extend(ca.fields_as_series().into_iter().map(Column::from));
3343
count += 1;
3344
} else {
3345
new_cols.push(s.clone())
3346
}
3347
}
3348
if count != cols.len() {
3349
// one or more columns not found
3350
// the code below will return an error with the missing name
3351
let schema = self.schema();
3352
for col in cols {
3353
let _ = schema
3354
.get(col.as_str())
3355
.ok_or_else(|| polars_err!(col_not_found = col))?;
3356
}
3357
}
3358
DataFrame::new(new_cols)
3359
}
3360
3361
pub(crate) fn infer_height(cols: &[Column]) -> usize {
3362
cols.first().map_or(0, Column::len)
3363
}
3364
3365
pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3366
// @Optimize: this does a lot of unnecessary allocations. We should probably have an
3367
// `append_chunk` or something similar. It is just quite difficult to make that safe.
3368
let df = DataFrame::from(rb);
3369
polars_ensure!(
3370
self.schema() == df.schema(),
3371
SchemaMismatch: "cannot append record batch with different schema\n\n
3372
Got {:?}\nexpected: {:?}", df.schema(), self.schema(),
3373
);
3374
self.vstack_mut_owned_unchecked(df);
3375
Ok(())
3376
}
3377
}
3378
3379
pub struct RecordBatchIter<'a> {
3380
columns: &'a Vec<Column>,
3381
schema: ArrowSchemaRef,
3382
idx: usize,
3383
n_chunks: usize,
3384
compat_level: CompatLevel,
3385
parallel: bool,
3386
}
3387
3388
impl Iterator for RecordBatchIter<'_> {
3389
type Item = RecordBatch;
3390
3391
fn next(&mut self) -> Option<Self::Item> {
3392
if self.idx >= self.n_chunks {
3393
return None;
3394
}
3395
3396
// Create a batch of the columns with the same chunk no.
3397
let batch_cols: Vec<ArrayRef> = if self.parallel {
3398
let iter = self
3399
.columns
3400
.par_iter()
3401
.map(Column::as_materialized_series)
3402
.map(|s| s.to_arrow(self.idx, self.compat_level));
3403
POOL.install(|| iter.collect())
3404
} else {
3405
self.columns
3406
.iter()
3407
.map(Column::as_materialized_series)
3408
.map(|s| s.to_arrow(self.idx, self.compat_level))
3409
.collect()
3410
};
3411
self.idx += 1;
3412
3413
let length = batch_cols.first().map_or(0, |arr| arr.len());
3414
Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3415
}
3416
3417
fn size_hint(&self) -> (usize, Option<usize>) {
3418
let n = self.n_chunks - self.idx;
3419
(n, Some(n))
3420
}
3421
}
3422
3423
pub struct PhysRecordBatchIter<'a> {
3424
schema: ArrowSchemaRef,
3425
arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3426
}
3427
3428
impl Iterator for PhysRecordBatchIter<'_> {
3429
type Item = RecordBatch;
3430
3431
fn next(&mut self) -> Option<Self::Item> {
3432
let arrs = self
3433
.arr_iters
3434
.iter_mut()
3435
.map(|phys_iter| phys_iter.next().cloned())
3436
.collect::<Option<Vec<_>>>()?;
3437
3438
let length = arrs.first().map_or(0, |arr| arr.len());
3439
Some(RecordBatch::new(length, self.schema.clone(), arrs))
3440
}
3441
3442
fn size_hint(&self) -> (usize, Option<usize>) {
3443
if let Some(iter) = self.arr_iters.first() {
3444
iter.size_hint()
3445
} else {
3446
(0, None)
3447
}
3448
}
3449
}
3450
3451
impl Default for DataFrame {
3452
fn default() -> Self {
3453
DataFrame::empty()
3454
}
3455
}
3456
3457
impl From<DataFrame> for Vec<Column> {
3458
fn from(df: DataFrame) -> Self {
3459
df.columns
3460
}
3461
}
3462
3463
// utility to test if we can vstack/extend the columns
3464
fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3465
polars_ensure!(
3466
left.name() == right.name(),
3467
ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3468
left.name(), right.name(),
3469
);
3470
Ok(())
3471
}
3472
3473
#[cfg(test)]
3474
mod test {
3475
use super::*;
3476
3477
fn create_frame() -> DataFrame {
3478
let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3479
let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3480
DataFrame::new(vec![s0, s1]).unwrap()
3481
}
3482
3483
#[test]
3484
#[cfg_attr(miri, ignore)]
3485
fn test_recordbatch_iterator() {
3486
let df = df!(
3487
"foo" => [1, 2, 3, 4, 5]
3488
)
3489
.unwrap();
3490
let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3491
assert_eq!(5, iter.next().unwrap().len());
3492
assert!(iter.next().is_none());
3493
}

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_select() {
        let df = create_frame();
        assert_eq!(
            df.column("days")
                .unwrap()
                .as_series()
                .unwrap()
                .equal(1)
                .unwrap()
                .sum(),
            Some(1)
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_string_col() {
        let col_name = "some_col";
        let v = vec!["test".to_string()];
        let s0 = Column::new(PlSmallStr::from_str(col_name), v);
        let mut df = DataFrame::new(vec![s0]).unwrap();

        df = df
            .filter(
                &df.column(col_name)
                    .unwrap()
                    .as_materialized_series()
                    .equal("")
                    .unwrap(),
            )
            .unwrap();
        assert_eq!(
            df.column(col_name)
                .unwrap()
                .as_materialized_series()
                .n_chunks(),
            1
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_list_col() {
        let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
        let ll: ListChunked = [&s1].iter().copied().collect();

        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
        let new = ll.filter(&mask).unwrap();

        assert_eq!(new.chunks.len(), 1);
        assert_eq!(new.len(), 0);
    }

    #[test]
    fn slice() {
        let df = create_frame();
        let sliced_df = df.slice(0, 2);
        assert_eq!(sliced_df.shape(), (2, 2));
    }

    #[test]
    fn rechunk_false() {
        let df = create_frame();
        assert!(!df.should_rechunk())
    }

    #[test]
    fn rechunk_true() -> PolarsResult<()> {
        let mut base = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // Create a series with multiple chunks
        let mut s = Series::new("foo".into(), 0..2);
        let s2 = Series::new("bar".into(), 0..1);
        s.append(&s2)?;

        // Append series to frame
        let out = base.with_column(s)?;

        // Now we should rechunk
        assert!(out.should_rechunk());
        Ok(())
    }

    #[test]
    fn test_duplicate_column() {
        let mut df = df! {
            "foo" => [1, 2, 3]
        }
        .unwrap();
        // Check that adding a column with an existing name replaces it.
        assert!(
            df.with_column(Series::new("foo".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(
            df.with_column(Series::new("bar".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(df.column("bar").is_ok())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn distinct() {
        let df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();
        let df = df
            .unique_stable(None, UniqueKeepStrategy::First, None)
            .unwrap()
            .sort(["flt"], SortMultipleOptions::default())
            .unwrap();
        let valid = df! {
            "flt" => [1., 2., 3.],
            "int" => [1, 2, 3],
            "str" => ["a", "b", "c"]
        }
        .unwrap();
        assert!(df.equals(&valid));
    }

    #[test]
    fn test_vstack() {
        // Check that vstack does not accidentally rechunk.
        let mut df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df.slice(0, 3)).unwrap();
        assert_eq!(df.first_col_n_chunks(), 2)
    }

    #[test]
    fn test_vstack_on_empty_dataframe() {
        let mut df = DataFrame::empty();

        let df_data = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df_data).unwrap();
        assert_eq!(df.height, 6)
    }

    #[test]
    fn test_replace_or_add() -> PolarsResult<()> {
        let mut df = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // check that the new column is "c" and not "bar".
        df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;

        assert_eq!(df.get_column_names(), &["a", "b", "c"]);
        Ok(())
    }

    #[test]
    fn test_unique_keep_none_with_slice() {
        let df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();
        let out = df
            .unique_stable(
                Some(&["x".to_string()][..]),
                UniqueKeepStrategy::None,
                Some((0, 2)),
            )
            .unwrap();
        let expected = df! {
            "x" => [3]
        }
        .unwrap();
        assert!(out.equals(&expected));
    }

    #[test]
    #[cfg(feature = "dtype-i8")]
    fn test_apply_result_schema() {
        let mut df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();

        let schema_before = df.schema().clone();
        df.apply("x", |f| f.cast(&DataType::Int8).unwrap()).unwrap();
        assert_ne!(&schema_before, df.schema());
    }
}