Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-plan/src/dsl/mod.rs
8475 views
1
#![allow(ambiguous_glob_reexports)]
2
//! Domain specific language for the Lazy API.
3
#[cfg(feature = "dtype-categorical")]
4
pub mod cat;
5
#[cfg(feature = "dtype-categorical")]
6
pub use cat::*;
7
#[cfg(feature = "rolling_window_by")]
8
pub(crate) use polars_time::prelude::*;
9
10
mod arithmetic;
11
mod arity;
12
#[cfg(feature = "dtype-array")]
13
mod array;
14
pub mod binary;
15
#[cfg(feature = "bitwise")]
16
mod bitwise;
17
mod builder_dsl;
18
pub use builder_dsl::*;
19
mod datatype_expr;
20
#[cfg(feature = "temporal")]
21
pub mod dt;
22
mod expr;
23
#[cfg(feature = "dtype-extension")]
24
mod extension;
25
mod format;
26
mod from;
27
pub mod function_expr;
28
pub mod functions;
29
mod list;
30
mod match_to_schema;
31
#[cfg(feature = "meta")]
32
mod meta;
33
mod name;
34
mod options;
35
#[cfg(feature = "python")]
36
pub mod python_dsl;
37
#[cfg(feature = "random")]
38
mod random;
39
mod scan_sources;
40
mod selector;
41
#[cfg(feature = "serde")]
42
mod serializable_plan;
43
mod statistics;
44
#[cfg(feature = "strings")]
45
pub mod string;
46
#[cfg(feature = "dtype-struct")]
47
mod struct_;
48
pub mod udf;
49
50
use std::fmt::Debug;
51
use std::sync::Arc;
52
53
mod iter;
54
mod plan;
55
pub use arity::*;
56
#[cfg(feature = "dtype-array")]
57
pub use array::*;
58
pub use datatype_expr::DataTypeExpr;
59
pub use expr::*;
60
#[cfg(feature = "dtype-extension")]
61
pub use extension::*;
62
pub use function_expr::*;
63
pub use list::*;
64
pub use match_to_schema::*;
65
#[cfg(feature = "meta")]
66
pub use meta::*;
67
pub use name::*;
68
pub use options::*;
69
pub use plan::*;
70
use polars_compute::rolling::QuantileMethod;
71
use polars_core::chunked_array::cast::CastOptions;
72
use polars_core::error::feature_gated;
73
use polars_core::prelude::*;
74
use polars_core::series::IsSorted;
75
#[cfg(feature = "diff")]
76
use polars_core::series::ops::NullBehavior;
77
#[cfg(feature = "is_close")]
78
use polars_utils::total_ord::TotalOrdWrap;
79
pub use selector::{DataTypeSelector, Selector, TimeUnitSet, TimeZoneSet};
80
#[cfg(feature = "dtype-struct")]
81
pub use struct_::*;
82
pub use udf::UserDefinedFunction;
83
mod file_scan;
84
pub use file_scan::*;
85
use functions::lit;
86
pub use scan_sources::{ScanSource, ScanSourceIter, ScanSourceRef, ScanSources};
87
88
use crate::prelude::*;
89
90
impl Expr {
91
/// Compare `Expr` with other `Expr` on equality.
92
pub fn eq<E: Into<Expr>>(self, other: E) -> Expr {
93
binary_expr(self, Operator::Eq, other.into())
94
}
95
96
/// Compare `Expr` with other `Expr` on equality where `None == None`.
97
pub fn eq_missing<E: Into<Expr>>(self, other: E) -> Expr {
98
binary_expr(self, Operator::EqValidity, other.into())
99
}
100
101
/// Compare `Expr` with other `Expr` on non-equality.
102
pub fn neq<E: Into<Expr>>(self, other: E) -> Expr {
103
binary_expr(self, Operator::NotEq, other.into())
104
}
105
106
/// Compare `Expr` with other `Expr` on non-equality where `None == None`.
107
pub fn neq_missing<E: Into<Expr>>(self, other: E) -> Expr {
108
binary_expr(self, Operator::NotEqValidity, other.into())
109
}
110
111
/// Check if `Expr` < `Expr`.
112
pub fn lt<E: Into<Expr>>(self, other: E) -> Expr {
113
binary_expr(self, Operator::Lt, other.into())
114
}
115
116
/// Check if `Expr` > `Expr`.
117
pub fn gt<E: Into<Expr>>(self, other: E) -> Expr {
118
binary_expr(self, Operator::Gt, other.into())
119
}
120
121
/// Check if `Expr` >= `Expr`.
122
pub fn gt_eq<E: Into<Expr>>(self, other: E) -> Expr {
123
binary_expr(self, Operator::GtEq, other.into())
124
}
125
126
/// Check if `Expr` <= `Expr`.
127
pub fn lt_eq<E: Into<Expr>>(self, other: E) -> Expr {
128
binary_expr(self, Operator::LtEq, other.into())
129
}
130
131
/// Negate `Expr`.
132
#[allow(clippy::should_implement_trait)]
133
pub fn not(self) -> Expr {
134
self.map_unary(BooleanFunction::Not)
135
}
136
137
/// Rename Column.
138
pub fn alias<S>(self, name: S) -> Expr
139
where
140
S: Into<PlSmallStr>,
141
{
142
Expr::Alias(Arc::new(self), name.into())
143
}
144
145
/// Run is_null operation on `Expr`.
146
#[allow(clippy::wrong_self_convention)]
147
pub fn is_null(self) -> Self {
148
self.map_unary(BooleanFunction::IsNull)
149
}
150
151
/// Run is_not_null operation on `Expr`.
152
#[allow(clippy::wrong_self_convention)]
153
pub fn is_not_null(self) -> Self {
154
self.map_unary(BooleanFunction::IsNotNull)
155
}
156
157
/// Drop null values.
158
pub fn drop_nulls(self) -> Self {
159
self.map_unary(FunctionExpr::DropNulls)
160
}
161
162
/// Drop NaN values.
163
pub fn drop_nans(self) -> Self {
164
self.map_unary(FunctionExpr::DropNans)
165
}
166
167
/// Get the number of unique values in the groups.
168
pub fn n_unique(self) -> Self {
169
AggExpr::NUnique(Arc::new(self)).into()
170
}
171
172
/// Get the first value in the group.
173
pub fn first(self) -> Self {
174
AggExpr::First(Arc::new(self)).into()
175
}
176
177
/// Get the first non-nullvalue in the group.
178
pub fn first_non_null(self) -> Self {
179
AggExpr::FirstNonNull(Arc::new(self)).into()
180
}
181
182
/// Get the last value in the group.
183
pub fn last(self) -> Self {
184
AggExpr::Last(Arc::new(self)).into()
185
}
186
187
/// Get the last non-null value in the group.
188
pub fn last_non_null(self) -> Self {
189
AggExpr::LastNonNull(Arc::new(self)).into()
190
}
191
192
/// Get the single value in the group. If there are multiple values, an error is returned.
193
pub fn item(self, allow_empty: bool) -> Self {
194
AggExpr::Item {
195
input: Arc::new(self),
196
allow_empty,
197
}
198
.into()
199
}
200
201
/// Implode into a list scalar.
202
pub fn implode(self) -> Self {
203
AggExpr::Implode(Arc::new(self)).into()
204
}
205
206
/// Compute the quantile per group.
207
pub fn quantile(self, quantile: Expr, method: QuantileMethod) -> Self {
208
AggExpr::Quantile {
209
expr: Arc::new(self),
210
quantile: Arc::new(quantile),
211
method,
212
}
213
.into()
214
}
215
216
/// Get the group indexes of the group by operation.
217
pub fn agg_groups(self) -> Self {
218
AggExpr::AggGroups(Arc::new(self)).into()
219
}
220
221
/// Alias for `explode`.
222
#[deprecated(
223
since = "0.53.0",
224
note = "Use `explode()` with `ExplodeOptions { empty_as_null: false, keep_nulls: false }` instead. Will be removed in version 2.0."
225
)]
226
pub fn flatten(self) -> Self {
227
self.explode(ExplodeOptions {
228
empty_as_null: true,
229
keep_nulls: true,
230
})
231
}
232
233
/// Explode the String/List column.
234
pub fn explode(self, options: ExplodeOptions) -> Self {
235
Expr::Explode {
236
input: Arc::new(self),
237
options,
238
}
239
}
240
241
/// Slice the Series.
242
/// `offset` may be negative.
243
pub fn slice<E: Into<Expr>, F: Into<Expr>>(self, offset: E, length: F) -> Self {
244
Expr::Slice {
245
input: Arc::new(self),
246
offset: Arc::new(offset.into()),
247
length: Arc::new(length.into()),
248
}
249
}
250
251
/// Append expressions. This is done by adding the chunks of `other` to this [`Series`].
252
pub fn append<E: Into<Expr>>(self, other: E, upcast: bool) -> Self {
253
self.map_binary(FunctionExpr::Append { upcast }, other.into())
254
}
255
256
/// Collect all chunks into a single chunk before continuing.
257
pub fn rechunk(self) -> Self {
258
self.map_unary(FunctionExpr::Rechunk)
259
}
260
261
/// Get the first `n` elements of the Expr result.
262
pub fn head(self, length: Option<usize>) -> Self {
263
self.slice(lit(0), lit(length.unwrap_or(10) as u64))
264
}
265
266
/// Get the last `n` elements of the Expr result.
267
pub fn tail(self, length: Option<usize>) -> Self {
268
let len = length.unwrap_or(10);
269
self.slice(lit(-(len as i64)), lit(len as u64))
270
}
271
272
/// Get unique values of this expression.
273
pub fn unique(self) -> Self {
274
self.map_unary(FunctionExpr::Unique(false))
275
}
276
277
/// Get unique values of this expression, while maintaining order.
278
/// This requires more work than [`Expr::unique`].
279
pub fn unique_stable(self) -> Self {
280
self.map_unary(FunctionExpr::Unique(true))
281
}
282
283
/// Get the first index of unique values of this expression.
284
pub fn arg_unique(self) -> Self {
285
self.map_unary(FunctionExpr::ArgUnique)
286
}
287
288
/// Get the index value that has the minimum value.
289
pub fn arg_min(self) -> Self {
290
self.map_unary(FunctionExpr::ArgMin)
291
}
292
293
/// Get the index value that has the maximum value.
294
pub fn arg_max(self) -> Self {
295
self.map_unary(FunctionExpr::ArgMax)
296
}
297
/// Get the index values that would sort this expression.
298
pub fn arg_sort(self, descending: bool, nulls_last: bool) -> Self {
299
self.map_unary(FunctionExpr::ArgSort {
300
descending,
301
nulls_last,
302
})
303
}
304
305
#[cfg(feature = "index_of")]
306
/// Find the index of a value.
307
pub fn index_of<E: Into<Expr>>(self, element: E) -> Expr {
308
self.map_binary(FunctionExpr::IndexOf, element.into())
309
}
310
311
#[cfg(feature = "search_sorted")]
312
/// Find indices where elements should be inserted to maintain order.
313
pub fn search_sorted<E: Into<Expr>>(
314
self,
315
element: E,
316
side: SearchSortedSide,
317
descending: bool,
318
) -> Expr {
319
self.map_binary(
320
FunctionExpr::SearchSorted { side, descending },
321
element.into(),
322
)
323
}
324
325
/// Cast expression to another data type.
326
/// Throws an error if conversion had overflows.
327
/// Returns an Error if cast is invalid on rows after predicates are pushed down.
328
pub fn strict_cast(self, dtype: impl Into<DataTypeExpr>) -> Self {
329
Expr::Cast {
330
expr: Arc::new(self),
331
dtype: dtype.into(),
332
options: CastOptions::Strict,
333
}
334
}
335
336
/// Cast expression to another data type.
337
pub fn cast(self, dtype: impl Into<DataTypeExpr>) -> Self {
338
Expr::Cast {
339
expr: Arc::new(self),
340
dtype: dtype.into(),
341
options: CastOptions::NonStrict,
342
}
343
}
344
345
/// Cast expression to another data type.
346
pub fn cast_with_options(
347
self,
348
dtype: impl Into<DataTypeExpr>,
349
cast_options: CastOptions,
350
) -> Self {
351
Expr::Cast {
352
expr: Arc::new(self),
353
dtype: dtype.into(),
354
options: cast_options,
355
}
356
}
357
358
/// Take the values by idx.
359
pub fn gather<E: Into<Expr>>(self, idx: E) -> Self {
360
Expr::Gather {
361
expr: Arc::new(self),
362
idx: Arc::new(idx.into()),
363
returns_scalar: false,
364
null_on_oob: false,
365
}
366
}
367
368
/// Take the values by a single index.
369
pub fn get<E: Into<Expr>>(self, idx: E, null_on_oob: bool) -> Self {
370
Expr::Gather {
371
expr: Arc::new(self),
372
idx: Arc::new(idx.into()),
373
returns_scalar: true,
374
null_on_oob,
375
}
376
}
377
378
/// Sort with given options.
379
///
380
/// # Example
381
///
382
/// ```rust
383
/// # use polars_core::prelude::*;
384
/// # use polars_lazy::prelude::*;
385
/// # fn main() -> PolarsResult<()> {
386
/// let lf = df! {
387
/// "a" => [Some(5), Some(4), Some(3), Some(2), None]
388
/// }?
389
/// .lazy();
390
///
391
/// let sorted = lf
392
/// .select(
393
/// vec![col("a").sort(SortOptions::default())],
394
/// )
395
/// .collect()?;
396
///
397
/// assert_eq!(
398
/// sorted,
399
/// df! {
400
/// "a" => [None, Some(2), Some(3), Some(4), Some(5)]
401
/// }?
402
/// );
403
/// # Ok(())
404
/// # }
405
/// ```
406
/// See [`SortOptions`] for more options.
407
pub fn sort(self, options: SortOptions) -> Self {
408
Expr::Sort {
409
expr: Arc::new(self),
410
options,
411
}
412
}
413
414
/// Returns the `k` largest elements.
415
///
416
/// This has time complexity `O(n + k log(n))`.
417
#[cfg(feature = "top_k")]
418
pub fn top_k(self, k: Expr) -> Self {
419
self.map_binary(FunctionExpr::TopK { descending: false }, k)
420
}
421
422
/// Returns the `k` largest rows by given column.
423
///
424
/// For single column, use [`Expr::top_k`].
425
#[cfg(feature = "top_k")]
426
pub fn top_k_by<K: Into<Expr>, E: AsRef<[IE]>, IE: Into<Expr> + Clone>(
427
self,
428
k: K,
429
by: E,
430
descending: Vec<bool>,
431
) -> Self {
432
self.map_n_ary(
433
FunctionExpr::TopKBy { descending },
434
[k.into()]
435
.into_iter()
436
.chain(by.as_ref().iter().map(|e| -> Expr { e.clone().into() })),
437
)
438
}
439
440
/// Returns the `k` smallest elements.
441
///
442
/// This has time complexity `O(n + k log(n))`.
443
#[cfg(feature = "top_k")]
444
pub fn bottom_k(self, k: Expr) -> Self {
445
self.map_binary(FunctionExpr::TopK { descending: true }, k)
446
}
447
448
/// Returns the `k` smallest rows by given column.
449
///
450
/// For single column, use [`Expr::bottom_k`].
451
#[cfg(feature = "top_k")]
452
pub fn bottom_k_by<K: Into<Expr>, E: AsRef<[IE]>, IE: Into<Expr> + Clone>(
453
self,
454
k: K,
455
by: E,
456
descending: Vec<bool>,
457
) -> Self {
458
let descending = descending.into_iter().map(|x| !x).collect();
459
self.map_n_ary(
460
FunctionExpr::TopKBy { descending },
461
[k.into()]
462
.into_iter()
463
.chain(by.as_ref().iter().map(|e| -> Expr { e.clone().into() })),
464
)
465
}
466
467
/// Reverse column
468
pub fn reverse(self) -> Self {
469
self.map_unary(FunctionExpr::Reverse)
470
}
471
472
/// Apply a function/closure once the logical plan get executed.
473
///
474
/// This function is very similar to [`Expr::apply`], but differs in how it handles aggregations.
475
///
476
/// * `map` should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power`
477
/// * `apply` should be used for operations that work on a group of data. e.g. `sum`, `count`, etc.
478
///
479
/// It is the responsibility of the caller that the schema is correct by giving
480
/// the correct output_type. If None given the output type of the input expr is used.
481
pub fn map<F, DT>(self, function: F, output_type: DT) -> Self
482
where
483
F: Fn(Column) -> PolarsResult<Column> + 'static + Send + Sync,
484
DT: Fn(&Schema, &Field) -> PolarsResult<Field> + 'static + Send + Sync,
485
{
486
self.map_with_fmt_str(function, output_type, "map")
487
}
488
489
pub fn map_with_fmt_str<F, DT>(
490
self,
491
function: F,
492
output_type: DT,
493
fmt_str: impl Into<PlSmallStr>,
494
) -> Self
495
where
496
F: Fn(Column) -> PolarsResult<Column> + 'static + Send + Sync,
497
DT: Fn(&Schema, &Field) -> PolarsResult<Field> + 'static + Send + Sync,
498
{
499
let f = BaseColumnUdf::new(
500
move |c: &mut [Column]| function(std::mem::take(&mut c[0])),
501
move |schema: &Schema, fields: &[Field]| output_type(schema, &fields[0]),
502
);
503
504
let options =
505
FunctionOptions::elementwise().with_flags(|f| f | FunctionFlags::OPTIONAL_RE_ENTRANT);
506
let fmt_str = Box::new(fmt_str.into());
507
Expr::AnonymousFunction {
508
input: vec![self],
509
function: new_column_udf(f),
510
options,
511
fmt_str,
512
}
513
}
514
515
pub fn agg_with_fmt_str<F, DT>(
516
self,
517
function: F,
518
output_type: DT,
519
fmt_str: impl Into<PlSmallStr>,
520
) -> Self
521
where
522
F: Fn(Column) -> PolarsResult<Column> + 'static + Send + Sync,
523
DT: Fn(&Schema, &Field) -> PolarsResult<Field> + 'static + Send + Sync,
524
{
525
let f = BaseColumnUdf::new(
526
move |c: &mut [Column]| function(std::mem::take(&mut c[0])),
527
move |schema: &Schema, fields: &[Field]| output_type(schema, &fields[0]),
528
);
529
530
let options = FunctionOptions::aggregation();
531
let fmt_str = Box::new(fmt_str.into());
532
Expr::AnonymousFunction {
533
input: vec![self],
534
function: new_column_udf(f),
535
options,
536
fmt_str,
537
}
538
}
539
540
pub fn apply_with_fmt_str<F, DT>(
541
self,
542
function: F,
543
output_type: DT,
544
fmt_str: impl Into<PlSmallStr>,
545
) -> Self
546
where
547
F: Fn(Column) -> PolarsResult<Column> + 'static + Send + Sync,
548
DT: Fn(&Schema, &Field) -> PolarsResult<Field> + 'static + Send + Sync,
549
{
550
let f = BaseColumnUdf::new(
551
move |c: &mut [Column]| function(std::mem::take(&mut c[0])),
552
move |schema: &Schema, fields: &[Field]| output_type(schema, &fields[0]),
553
);
554
555
let options = FunctionOptions::groupwise();
556
let fmt_str = Box::new(fmt_str.into());
557
Expr::AnonymousFunction {
558
input: vec![self],
559
function: new_column_udf(f),
560
options,
561
fmt_str,
562
}
563
}
564
565
/// Apply a function/closure once the logical plan get executed with many arguments.
566
///
567
/// See the [`Expr::map`] function for the differences between [`map`](Expr::map) and [`apply`](Expr::apply).
568
pub fn map_many<F, DT>(self, function: F, arguments: &[Expr], output_type: DT) -> Self
569
where
570
F: Fn(&mut [Column]) -> PolarsResult<Column> + 'static + Send + Sync,
571
DT: Fn(&Schema, &[Field]) -> PolarsResult<Field> + 'static + Send + Sync,
572
{
573
let mut input = vec![self];
574
input.extend_from_slice(arguments);
575
576
let function = BaseColumnUdf::new(function, output_type);
577
578
let options = FunctionOptions::elementwise();
579
Expr::AnonymousFunction {
580
input,
581
function: new_column_udf(function),
582
options,
583
fmt_str: Box::new(PlSmallStr::EMPTY),
584
}
585
}
586
587
/// Apply a function/closure over the groups. This should only be used in a group_by aggregation.
588
///
589
/// It is the responsibility of the caller that the schema is correct by giving
590
/// the correct output_type. If None given the output type of the input expr is used.
591
///
592
/// This difference with [map](Self::map) is that `apply` will create a separate `Series` per group.
593
///
594
/// * `map` should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power`
595
/// * `apply` should be used for operations that work on a group of data. e.g. `sum`, `count`, etc.
596
pub fn apply<F, DT>(self, function: F, output_type: DT) -> Self
597
where
598
F: Fn(Column) -> PolarsResult<Column> + 'static + Send + Sync,
599
DT: Fn(&Schema, &Field) -> PolarsResult<Field> + 'static + Send + Sync,
600
{
601
self.apply_with_fmt_str(function, output_type, PlSmallStr::EMPTY)
602
}
603
604
/// Apply a function/closure over the groups with many arguments. This should only be used in a group_by aggregation.
605
///
606
/// See the [`Expr::apply`] function for the differences between [`map`](Expr::map) and [`apply`](Expr::apply).
607
pub fn apply_many<F, DT>(self, function: F, arguments: &[Expr], output_type: DT) -> Self
608
where
609
F: Fn(&mut [Column]) -> PolarsResult<Column> + 'static + Send + Sync,
610
DT: Fn(&Schema, &[Field]) -> PolarsResult<Field> + 'static + Send + Sync,
611
{
612
let mut input = vec![self];
613
input.extend_from_slice(arguments);
614
615
let function = BaseColumnUdf::new(function, output_type);
616
617
let options = FunctionOptions::groupwise();
618
Expr::AnonymousFunction {
619
input,
620
function: new_column_udf(function),
621
options,
622
fmt_str: Box::new(PlSmallStr::EMPTY),
623
}
624
}
625
626
/// Get mask of finite values if dtype is Float.
627
#[allow(clippy::wrong_self_convention)]
628
pub fn is_finite(self) -> Self {
629
self.map_unary(BooleanFunction::IsFinite)
630
}
631
632
/// Get mask of infinite values if dtype is Float.
633
#[allow(clippy::wrong_self_convention)]
634
pub fn is_infinite(self) -> Self {
635
self.map_unary(BooleanFunction::IsInfinite)
636
}
637
638
/// Get mask of NaN values if dtype is Float.
639
pub fn is_nan(self) -> Self {
640
self.map_unary(BooleanFunction::IsNan)
641
}
642
643
/// Get inverse mask of NaN values if dtype is Float.
644
pub fn is_not_nan(self) -> Self {
645
self.map_unary(BooleanFunction::IsNotNan)
646
}
647
648
/// Shift the values in the array by some period. See [the eager implementation](polars_core::series::SeriesTrait::shift).
649
pub fn shift(self, n: Expr) -> Self {
650
self.map_binary(FunctionExpr::Shift, n)
651
}
652
653
/// Shift the values in the array by some period and fill the resulting empty values.
654
pub fn shift_and_fill<E: Into<Expr>, IE: Into<Expr>>(self, n: E, fill_value: IE) -> Self {
655
self.map_ternary(FunctionExpr::ShiftAndFill, n.into(), fill_value.into())
656
}
657
/// Apply `evaluation` over an expanding (cumulative) window of this expression.
///
/// `min_samples` is the minimum window length forwarded to
/// [`EvalVariant::Cumulative`], which defines the exact window semantics.
#[cfg(feature = "cum_agg")]
pub fn cumulative_eval(self, evaluation: Expr, min_samples: usize) -> Self {
    Expr::Eval {
        expr: Arc::new(self),
        evaluation: Arc::new(evaluation),
        variant: EvalVariant::Cumulative { min_samples },
    }
}
667
668
/// Cumulatively count values from 0 to len.
669
#[cfg(feature = "cum_agg")]
670
pub fn cum_count(self, reverse: bool) -> Self {
671
self.map_unary(FunctionExpr::CumCount { reverse })
672
}
673
674
/// Get an array with the cumulative sum computed at every element.
675
#[cfg(feature = "cum_agg")]
676
pub fn cum_sum(self, reverse: bool) -> Self {
677
self.map_unary(FunctionExpr::CumSum { reverse })
678
}
679
680
/// Get an array with the cumulative product computed at every element.
681
#[cfg(feature = "cum_agg")]
682
pub fn cum_prod(self, reverse: bool) -> Self {
683
self.map_unary(FunctionExpr::CumProd { reverse })
684
}
685
686
/// Get an array with the cumulative min computed at every element.
687
#[cfg(feature = "cum_agg")]
688
pub fn cum_min(self, reverse: bool) -> Self {
689
self.map_unary(FunctionExpr::CumMin { reverse })
690
}
691
692
/// Get an array with the cumulative max computed at every element.
693
#[cfg(feature = "cum_agg")]
694
pub fn cum_max(self, reverse: bool) -> Self {
695
self.map_unary(FunctionExpr::CumMax { reverse })
696
}
697
698
/// Get the product aggregation of an expression.
699
pub fn product(self) -> Self {
700
self.map_unary(FunctionExpr::Product)
701
}
702
703
/// Round underlying floating point array to given decimal numbers.
704
#[cfg(feature = "round_series")]
705
pub fn round(self, decimals: u32, mode: RoundMode) -> Self {
706
self.map_unary(FunctionExpr::Round { decimals, mode })
707
}
708
709
/// Round to a number of significant figures.
710
#[cfg(feature = "round_series")]
711
pub fn round_sig_figs(self, digits: i32) -> Self {
712
self.map_unary(FunctionExpr::RoundSF { digits })
713
}
714
715
/// Truncate underlying floating point array toward zero to given decimal.
716
#[cfg(feature = "round_series")]
717
pub fn truncate(self, decimals: u32) -> Self {
718
self.map_unary(FunctionExpr::Truncate { decimals })
719
}
720
/// Floor underlying floating point array to the largest integer smaller than or equal to the float value.
#[cfg(feature = "round_series")]
pub fn floor(self) -> Self {
    self.map_unary(FunctionExpr::Floor)
}
726
727
/// Constant Pi
728
#[cfg(feature = "round_series")]
729
pub fn pi() -> Self {
730
lit(std::f64::consts::PI)
731
}
732
/// Ceil underlying floating point array to the smallest integer greater than or equal to the float value.
#[cfg(feature = "round_series")]
pub fn ceil(self) -> Self {
    self.map_unary(FunctionExpr::Ceil)
}
738
739
/// Clip underlying values to a set boundary.
740
#[cfg(feature = "round_series")]
741
pub fn clip(self, min: Expr, max: Expr) -> Self {
742
self.map_ternary(
743
FunctionExpr::Clip {
744
has_min: true,
745
has_max: true,
746
},
747
min,
748
max,
749
)
750
}
751
752
/// Clip underlying values to a set boundary.
753
#[cfg(feature = "round_series")]
754
pub fn clip_max(self, max: Expr) -> Self {
755
self.map_binary(
756
FunctionExpr::Clip {
757
has_min: false,
758
has_max: true,
759
},
760
max,
761
)
762
}
763
764
/// Clip underlying values to a set boundary.
765
#[cfg(feature = "round_series")]
766
pub fn clip_min(self, min: Expr) -> Self {
767
self.map_binary(
768
FunctionExpr::Clip {
769
has_min: true,
770
has_max: false,
771
},
772
min,
773
)
774
}
775
776
/// Convert all values to their absolute/positive value.
777
#[cfg(feature = "abs")]
778
pub fn abs(self) -> Self {
779
self.map_unary(FunctionExpr::Abs)
780
}
781
782
/// Apply window function over a subgroup.
783
/// This is similar to a group_by + aggregation + self join.
784
/// Or similar to [window functions in Postgres](https://www.postgresql.org/docs/9.1/tutorial-window.html).
785
///
786
/// # Example
787
///
788
/// ``` rust
789
/// #[macro_use] extern crate polars_core;
790
/// use polars_core::prelude::*;
791
/// use polars_lazy::prelude::*;
792
///
793
/// fn example() -> PolarsResult<()> {
794
/// let df = df! {
795
/// "groups" => &[1, 1, 2, 2, 1, 2, 3, 3, 1],
796
/// "values" => &[1, 2, 3, 4, 5, 6, 7, 8, 8]
797
/// }?;
798
///
799
/// let out = df
800
/// .lazy()
801
/// .select(&[
802
/// col("groups"),
803
/// sum("values").over([col("groups")]),
804
/// ])
805
/// .collect()?;
806
/// println!("{}", &out);
807
/// Ok(())
808
/// }
809
///
810
/// ```
811
///
812
/// Outputs:
813
///
814
/// ``` text
815
/// ╭────────┬────────╮
816
/// │ groups ┆ values │
817
/// │ --- ┆ --- │
818
/// │ i32 ┆ i32 │
819
/// ╞════════╪════════╡
820
/// │ 1 ┆ 16 │
821
/// │ 1 ┆ 16 │
822
/// │ 2 ┆ 13 │
823
/// │ 2 ┆ 13 │
824
/// │ … ┆ … │
825
/// │ 1 ┆ 16 │
826
/// │ 2 ┆ 13 │
827
/// │ 3 ┆ 15 │
828
/// │ 3 ┆ 15 │
829
/// │ 1 ┆ 16 │
830
/// ╰────────┴────────╯
831
/// ```
832
pub fn over<E: AsRef<[IE]>, IE: Into<Expr> + Clone>(self, partition_by: E) -> Self {
833
self.over_with_options(Some(partition_by), None, Default::default())
834
.expect("We explicitly passed `partition_by`")
835
}
836
837
pub fn over_with_options<E: AsRef<[IE]>, IE: Into<Expr> + Clone>(
838
self,
839
partition_by: Option<E>,
840
order_by: Option<(E, SortOptions)>,
841
mapping: WindowMapping,
842
) -> PolarsResult<Self> {
843
polars_ensure!(partition_by.is_some() || order_by.is_some(), InvalidOperation: "At least one of `partition_by` and `order_by` must be specified in `over`");
844
let partition_by = if let Some(partition_by) = partition_by {
845
partition_by
846
.as_ref()
847
.iter()
848
.map(|e| e.clone().into())
849
.collect()
850
} else {
851
vec![lit(1)]
852
};
853
854
let order_by = order_by.map(|(e, options)| {
855
let e = e.as_ref();
856
let e = if e.len() == 1 {
857
Arc::new(e[0].clone().into())
858
} else {
859
feature_gated!["dtype-struct", {
860
let e = e.iter().map(|e| e.clone().into()).collect::<Vec<_>>();
861
Arc::new(functions::as_struct(e))
862
}]
863
};
864
(e, options)
865
});
866
867
Ok(Expr::Over {
868
function: Arc::new(self),
869
partition_by,
870
order_by,
871
mapping,
872
})
873
}
874
875
#[cfg(feature = "dynamic_group_by")]
876
pub fn rolling(
877
self,
878
index_column: impl Into<Expr>,
879
period: Duration,
880
offset: Duration,
881
closed_window: ClosedWindow,
882
) -> Self {
883
Expr::Rolling {
884
function: Arc::new(self),
885
index_column: Arc::new(index_column.into()),
886
period,
887
offset,
888
closed_window,
889
}
890
}
891
892
fn fill_null_impl(self, fill_value: Expr) -> Self {
893
self.map_binary(FunctionExpr::FillNull, fill_value)
894
}
895
896
/// Replace the null values by a value.
897
pub fn fill_null<E: Into<Expr>>(self, fill_value: E) -> Self {
898
self.fill_null_impl(fill_value.into())
899
}
900
901
pub fn fill_null_with_strategy(self, strategy: FillNullStrategy) -> Self {
902
self.map_unary(FunctionExpr::FillNullWithStrategy(strategy))
903
}
904
905
/// Replace the floating point `NaN` values by a value.
906
pub fn fill_nan<E: Into<Expr>>(self, fill_value: E) -> Self {
907
// we take the not branch so that self is truthy value of `when -> then -> otherwise`
908
// and that ensure we keep the name of `self`
909
910
when(self.clone().is_not_nan().or(self.clone().is_null()))
911
.then(self)
912
.otherwise(fill_value.into())
913
}
914
/// Count the values of the Series
915
/// or
916
/// Get counts of the group by operation.
917
pub fn count(self) -> Self {
918
AggExpr::Count {
919
input: Arc::new(self),
920
include_nulls: false,
921
}
922
.into()
923
}
924
925
pub fn len(self) -> Self {
926
AggExpr::Count {
927
input: Arc::new(self),
928
include_nulls: true,
929
}
930
.into()
931
}
932
933
/// Get a mask of duplicated values.
934
#[allow(clippy::wrong_self_convention)]
935
#[cfg(feature = "is_unique")]
936
pub fn is_duplicated(self) -> Self {
937
self.map_unary(BooleanFunction::IsDuplicated)
938
}
939
940
#[allow(clippy::wrong_self_convention)]
941
#[cfg(feature = "is_between")]
942
pub fn is_between<E: Into<Expr>>(self, lower: E, upper: E, closed: ClosedInterval) -> Self {
943
self.map_ternary(
944
BooleanFunction::IsBetween { closed },
945
lower.into(),
946
upper.into(),
947
)
948
}
949
950
/// Get a mask of unique values.
951
#[allow(clippy::wrong_self_convention)]
952
#[cfg(feature = "is_unique")]
953
pub fn is_unique(self) -> Self {
954
self.map_unary(BooleanFunction::IsUnique)
955
}
956
957
/// Check whether floating point values are close to each other.
958
#[allow(clippy::wrong_self_convention)]
959
#[cfg(feature = "is_close")]
960
pub fn is_close<E: Into<Expr>>(
961
self,
962
expr: E,
963
abs_tol: f64,
964
rel_tol: f64,
965
nans_equal: bool,
966
) -> Self {
967
self.map_binary(
968
BooleanFunction::IsClose {
969
abs_tol: TotalOrdWrap(abs_tol),
970
rel_tol: TotalOrdWrap(rel_tol),
971
nans_equal,
972
},
973
expr.into(),
974
)
975
}
976
977
/// Get the approximate count of unique values.
978
#[cfg(feature = "approx_unique")]
979
pub fn approx_n_unique(self) -> Self {
980
self.map_unary(FunctionExpr::ApproxNUnique)
981
}
982
983
/// Bitwise "and" operation.
984
pub fn and<E: Into<Expr>>(self, expr: E) -> Self {
985
binary_expr(self, Operator::And, expr.into())
986
}
987
988
/// Bitwise "xor" operation.
989
pub fn xor<E: Into<Expr>>(self, expr: E) -> Self {
990
binary_expr(self, Operator::Xor, expr.into())
991
}
992
993
/// Bitwise "or" operation.
994
pub fn or<E: Into<Expr>>(self, expr: E) -> Self {
995
binary_expr(self, Operator::Or, expr.into())
996
}
997
998
/// Logical "or" operation.
999
pub fn logical_or<E: Into<Expr>>(self, expr: E) -> Self {
1000
binary_expr(self, Operator::LogicalOr, expr.into())
1001
}
1002
1003
/// Logical "and" operation.
1004
pub fn logical_and<E: Into<Expr>>(self, expr: E) -> Self {
1005
binary_expr(self, Operator::LogicalAnd, expr.into())
1006
}
1007
1008
/// Filter a single column.
1009
///
1010
/// Should be used in aggregation context. If you want to filter on a
1011
/// DataFrame level, use `LazyFrame::filter`.
1012
pub fn filter<E: Into<Expr>>(self, predicate: E) -> Self {
1013
Expr::Filter {
1014
input: Arc::new(self),
1015
by: Arc::new(predicate.into()),
1016
}
1017
}
1018
1019
/// Check if the values of the left expression are in the lists of the right expr.
1020
#[allow(clippy::wrong_self_convention)]
1021
#[cfg(feature = "is_in")]
1022
pub fn is_in<E: Into<Expr>>(self, other: E, nulls_equal: bool) -> Self {
1023
let other = other.into();
1024
let function = BooleanFunction::IsIn { nulls_equal };
1025
let function = function.into();
1026
Expr::Function {
1027
input: vec![self, other],
1028
function,
1029
}
1030
}
1031
1032
/// Sort this column by the ordering of another column evaluated from given expr.
1033
/// Can also be used in a group_by context to sort the groups.
1034
///
1035
/// # Example
1036
///
1037
/// ```rust
1038
/// # use polars_core::prelude::*;
1039
/// # use polars_lazy::prelude::*;
1040
/// # fn main() -> PolarsResult<()> {
1041
/// let lf = df! {
1042
/// "a" => [1, 2, 3, 4, 5],
1043
/// "b" => [5, 4, 3, 2, 1]
1044
/// }?.lazy();
1045
///
1046
/// let sorted = lf
1047
/// .select(
1048
/// vec![col("a").sort_by(col("b"), SortOptions::default())],
1049
/// )
1050
/// .collect()?;
1051
///
1052
/// assert_eq!(
1053
/// sorted,
1054
/// df! { "a" => [5, 4, 3, 2, 1] }?
1055
/// );
1056
/// # Ok(())
1057
/// # }
1058
pub fn sort_by<E: AsRef<[IE]>, IE: Into<Expr> + Clone>(
1059
self,
1060
by: E,
1061
sort_options: SortMultipleOptions,
1062
) -> Expr {
1063
let by = by.as_ref().iter().map(|e| e.clone().into()).collect();
1064
Expr::SortBy {
1065
expr: Arc::new(self),
1066
by,
1067
sort_options,
1068
}
1069
}
1070
1071
#[cfg(feature = "repeat_by")]
/// Repeat the column `n` times, where `n` is determined by the values in `by`.
/// This yields an `Expr` of dtype `List`.
pub fn repeat_by<E: Into<Expr>>(self, by: E) -> Expr {
    self.map_binary(FunctionExpr::RepeatBy, by.into())
}

#[cfg(feature = "is_first_distinct")]
#[allow(clippy::wrong_self_convention)]
/// Get a mask of the first unique value.
pub fn is_first_distinct(self) -> Expr {
    self.map_unary(BooleanFunction::IsFirstDistinct)
}

#[cfg(feature = "is_last_distinct")]
#[allow(clippy::wrong_self_convention)]
/// Get a mask of the last unique value.
pub fn is_last_distinct(self) -> Expr {
    self.map_unary(BooleanFunction::IsLastDistinct)
}
1091
1092
// Internal helper: the dot product is elementwise multiplication followed by a sum.
fn dot_impl(self, other: Expr) -> Expr {
    (self * other).sum()
}

/// Compute the dot/inner product between two expressions.
pub fn dot<E: Into<Expr>>(self, other: E) -> Expr {
    self.dot_impl(other.into())
}
1100
1101
#[cfg(feature = "mode")]
/// Compute the mode(s) of this column. This is the most occurring value.
///
/// If `maintain_order` is true it is forwarded to [`FunctionExpr::Mode`];
/// presumably the modes are returned in order of first appearance — confirm.
pub fn mode(self, maintain_order: bool) -> Expr {
    self.map_unary(FunctionExpr::Mode { maintain_order })
}

#[cfg(feature = "interpolate")]
/// Interpolate intermediate values.
/// Nulls at the beginning and end of the series remain null.
pub fn interpolate(self, method: InterpolationMethod) -> Expr {
    self.map_unary(FunctionExpr::Interpolate(method))
}
1113
1114
#[cfg(feature = "rolling_window_by")]
#[allow(clippy::type_complexity)]
// Internal helper shared by the `rolling_*_by` methods: wrap `self` and the
// `by` column in a binary `RollingExprBy` function node.
fn finish_rolling_by(
    self,
    by: Expr,
    options: RollingOptionsDynamicWindow,
    rolling_function_by: RollingFunctionBy,
) -> Expr {
    let function = FunctionExpr::RollingExprBy {
        function_by: rolling_function_by,
        options,
    };
    self.map_binary(function, by)
}
1130
1131
#[cfg(feature = "interpolate_by")]
/// Interpolate intermediate values.
/// Nulls at the beginning and end of the series remain null.
/// The `by` column provides the x-coordinates for interpolation and must not contain nulls.
pub fn interpolate_by(self, by: Expr) -> Expr {
    self.map_binary(FunctionExpr::InterpolateBy, by)
}
1138
1139
#[cfg(feature = "rolling_window")]
#[allow(clippy::type_complexity)]
// Internal helper shared by the fixed-window `rolling_*` methods: wrap `self`
// in a unary `RollingExpr` function node.
fn finish_rolling(
    self,
    options: RollingOptionsFixedWindow,
    rolling_function: RollingFunction,
) -> Expr {
    let function = FunctionExpr::RollingExpr {
        function: rolling_function,
        options,
    };
    self.map_unary(function)
}
1151
1152
/// Apply a rolling minimum based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_min_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::MinBy)
}

/// Apply a rolling maximum based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_max_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::MaxBy)
}

/// Apply a rolling mean based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_mean_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::MeanBy)
}

/// Apply a rolling sum based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_sum_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::SumBy)
}
1175
1176
/// Apply a rolling quantile based on another column.
///
/// `quantile` is the probability in `[0, 1]` and `method` selects how the
/// quantile is interpolated; both are attached to `options.fn_params`.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_quantile_by(
    self,
    by: Expr,
    method: QuantileMethod,
    quantile: f64,
    mut options: RollingOptionsDynamicWindow,
) -> Expr {
    use polars_compute::rolling::{RollingFnParams, RollingQuantileParams};

    // Stash the quantile parameters on the options before dispatching.
    let params = RollingQuantileParams {
        prob: quantile,
        method,
    };
    options.fn_params = Some(RollingFnParams::Quantile(params));
    self.finish_rolling_by(by, options, RollingFunctionBy::QuantileBy)
}
1193
1194
/// Apply a rolling variance based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_var_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::VarBy)
}

/// Apply a rolling std-dev based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_std_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::StdBy)
}

/// Apply a rolling median based on another column.
///
/// Implemented as the 0.5 quantile with linear interpolation.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_median_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.rolling_quantile_by(by, QuantileMethod::Linear, 0.5, options)
}

/// Apply a rolling rank based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_rank_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::RankBy)
}
1216
1217
/// Apply a rolling minimum.
///
/// See: [`RollingAgg::rolling_min`]
#[cfg(feature = "rolling_window")]
pub fn rolling_min(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Min)
}

/// Apply a rolling maximum.
///
/// See: [`RollingAgg::rolling_max`]
#[cfg(feature = "rolling_window")]
pub fn rolling_max(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Max)
}

/// Apply a rolling mean.
///
/// See: [`RollingAgg::rolling_mean`]
#[cfg(feature = "rolling_window")]
pub fn rolling_mean(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Mean)
}

/// Apply a rolling sum.
///
/// See: [`RollingAgg::rolling_sum`]
#[cfg(feature = "rolling_window")]
pub fn rolling_sum(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Sum)
}
1248
1249
/// Apply a rolling median.
///
/// See: [`RollingAgg::rolling_median`]
#[cfg(feature = "rolling_window")]
pub fn rolling_median(self, options: RollingOptionsFixedWindow) -> Expr {
    // A median is the 0.5 quantile with linear interpolation.
    self.rolling_quantile(QuantileMethod::Linear, 0.5, options)
}

/// Apply a rolling quantile.
///
/// See: [`RollingAgg::rolling_quantile`]
#[cfg(feature = "rolling_window")]
pub fn rolling_quantile(
    self,
    method: QuantileMethod,
    quantile: f64,
    mut options: RollingOptionsFixedWindow,
) -> Expr {
    use polars_compute::rolling::{RollingFnParams, RollingQuantileParams};

    // Attach the quantile parameters to the window options before dispatch.
    let params = RollingQuantileParams {
        prob: quantile,
        method,
    };
    options.fn_params = Some(RollingFnParams::Quantile(params));
    self.finish_rolling(options, RollingFunction::Quantile)
}
1276
1277
/// Apply a rolling variance.
#[cfg(feature = "rolling_window")]
pub fn rolling_var(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Var)
}

/// Apply a rolling std-dev.
#[cfg(feature = "rolling_window")]
pub fn rolling_std(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Std)
}

/// Apply a rolling rank.
#[cfg(feature = "rolling_window")]
pub fn rolling_rank(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Rank)
}
1293
1294
/// Apply a rolling skew.
#[cfg(feature = "rolling_window")]
#[cfg(feature = "moment")]
pub fn rolling_skew(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Skew)
}

/// Apply a rolling kurtosis.
#[cfg(feature = "rolling_window")]
#[cfg(feature = "moment")]
pub fn rolling_kurtosis(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Kurtosis)
}
1307
1308
#[cfg(feature = "rolling_window")]
/// Apply a custom function over a rolling/ moving window of the array.
/// This has quite some dynamic dispatch, so prefer rolling_min, max, mean, sum over this.
pub fn rolling_map(
    self,
    f: PlanCallback<Series, Series>,
    options: RollingOptionsFixedWindow,
) -> Expr {
    self.finish_rolling(options, RollingFunction::Map(f))
}
1318
1319
#[cfg(feature = "peaks")]
/// Get a mask of local minima (see [`FunctionExpr::PeakMin`]).
pub fn peak_min(self) -> Expr {
    self.map_unary(FunctionExpr::PeakMin)
}

#[cfg(feature = "peaks")]
/// Get a mask of local maxima (see [`FunctionExpr::PeakMax`]).
pub fn peak_max(self) -> Expr {
    self.map_unary(FunctionExpr::PeakMax)
}
1328
1329
#[cfg(feature = "rank")]
/// Assign ranks to data, dealing with ties appropriately.
///
/// `seed` is forwarded to [`FunctionExpr::Rank`]; presumably it is used for
/// the random tie-breaking method — confirm against the rank implementation.
pub fn rank(self, options: RankOptions, seed: Option<u64>) -> Expr {
    self.map_unary(FunctionExpr::Rank { options, seed })
}
1334
1335
#[cfg(feature = "replace")]
/// Replace the given values with other values.
///
/// `old` and `new` become the extra inputs of the n-ary `Replace` function node.
pub fn replace<E: Into<Expr>>(self, old: E, new: E) -> Expr {
    self.map_n_ary(FunctionExpr::Replace, [old.into(), new.into()])
}
1342
1343
#[cfg(feature = "replace")]
/// Replace the given values with other values.
///
/// Unlike [`Expr::replace`], a `default` may be supplied for values that match
/// no entry in `old`, and `return_dtype` may pin the output dtype.
pub fn replace_strict<E: Into<Expr>>(
    self,
    old: E,
    new: E,
    default: Option<E>,
    return_dtype: Option<impl Into<DataTypeExpr>>,
) -> Expr {
    // Extra inputs: old values, new values, and (optionally) the default.
    let mut args = vec![old.into(), new.into()];
    if let Some(default) = default {
        args.push(default.into());
    }
    let function = FunctionExpr::ReplaceStrict {
        return_dtype: return_dtype.map(Into::into),
    };
    self.map_n_ary(function, args)
}
1363
1364
#[cfg(feature = "cutqcut")]
/// Bin continuous values into discrete categories.
///
/// `breaks` are the bin edges; optional `labels` name the bins. `left_closed`
/// and `include_breaks` are forwarded to [`FunctionExpr::Cut`].
pub fn cut(
    self,
    breaks: Vec<f64>,
    labels: Option<impl IntoVec<PlSmallStr>>,
    left_closed: bool,
    include_breaks: bool,
) -> Expr {
    self.map_unary(FunctionExpr::Cut {
        breaks,
        labels: labels.map(|x| x.into_vec()),
        left_closed,
        include_breaks,
    })
}
1380
1381
#[cfg(feature = "cutqcut")]
/// Bin continuous values into discrete categories based on their quantiles.
///
/// `probs` are the quantile probabilities that define the bin edges; the
/// remaining flags are forwarded to [`FunctionExpr::QCut`].
pub fn qcut(
    self,
    probs: Vec<f64>,
    labels: Option<impl IntoVec<PlSmallStr>>,
    left_closed: bool,
    allow_duplicates: bool,
    include_breaks: bool,
) -> Expr {
    self.map_unary(FunctionExpr::QCut {
        probs,
        labels: labels.map(|x| x.into_vec()),
        left_closed,
        allow_duplicates,
        include_breaks,
    })
}
1399
1400
#[cfg(feature = "cutqcut")]
/// Bin continuous values into discrete categories using uniform quantile probabilities.
///
/// Equivalent to [`Expr::qcut`] with probabilities `1/n, 2/n, ..., (n-1)/n`
/// for `n = n_bins`.
pub fn qcut_uniform(
    self,
    n_bins: usize,
    labels: Option<impl IntoVec<PlSmallStr>>,
    left_closed: bool,
    allow_duplicates: bool,
    include_breaks: bool,
) -> Expr {
    // Evenly spaced interior probabilities; empty when `n_bins <= 1`.
    let mut probs = Vec::with_capacity(n_bins.saturating_sub(1));
    for b in 1..n_bins {
        probs.push(b as f64 / n_bins as f64);
    }
    self.map_unary(FunctionExpr::QCut {
        probs,
        labels: labels.map(|x| x.into_vec()),
        left_closed,
        allow_duplicates,
        include_breaks,
    })
}
1419
1420
#[cfg(feature = "rle")]
/// Get the lengths of runs of identical values.
pub fn rle(self) -> Expr {
    self.map_unary(FunctionExpr::RLE)
}

#[cfg(feature = "rle")]
/// Similar to `rle`, but maps values to run IDs.
pub fn rle_id(self) -> Expr {
    self.map_unary(FunctionExpr::RLEID)
}
1431
1432
#[cfg(feature = "diff")]
/// Calculate the n-th discrete difference between values.
///
/// `n` is an expression giving the shift; `null_behavior` controls how nulls
/// produced at the edges are handled (forwarded to [`FunctionExpr::Diff`]).
pub fn diff(self, n: Expr, null_behavior: NullBehavior) -> Expr {
    self.map_binary(FunctionExpr::Diff(null_behavior), n)
}

#[cfg(feature = "pct_change")]
/// Computes percentage change between values.
pub fn pct_change(self, n: Expr) -> Expr {
    self.map_binary(FunctionExpr::PctChange, n)
}
1443
1444
#[cfg(feature = "moment")]
/// Compute the sample skewness of a data set.
///
/// For normally distributed data, the skewness should be about zero. For
/// uni-modal continuous distributions, a skewness value greater than zero means
/// that there is more weight in the right tail of the distribution. The
/// function `skewtest` can be used to determine if the skewness value
/// is close enough to zero, statistically speaking.
///
/// see: [scipy](https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L1024)
pub fn skew(self, bias: bool) -> Expr {
    self.map_unary(FunctionExpr::Skew(bias))
}
1457
1458
#[cfg(feature = "moment")]
/// Compute the kurtosis (Fisher or Pearson).
///
/// Kurtosis is the fourth central moment divided by the square of the
/// variance. If Fisher's definition is used, then 3.0 is subtracted from
/// the result to give 0.0 for a normal distribution.
/// If bias is False then the kurtosis is calculated using k statistics to
/// eliminate bias coming from biased moment estimators.
pub fn kurtosis(self, fisher: bool, bias: bool) -> Expr {
    self.map_unary(FunctionExpr::Kurtosis(fisher, bias))
}
1469
1470
/// Get the maximal value that could be held by this dtype.
pub fn upper_bound(self) -> Expr {
    self.map_unary(FunctionExpr::UpperBound)
}

/// Get the minimal value that could be held by this dtype.
pub fn lower_bound(self) -> Expr {
    self.map_unary(FunctionExpr::LowerBound)
}
1479
1480
#[cfg(feature = "dtype-array")]
/// Reshape this expression to the given `dimensions`.
///
/// Each raw extent is converted through [`ReshapeDimension::new`]; see that
/// type for how individual values are interpreted.
pub fn reshape(self, dimensions: &[i64]) -> Self {
    self.map_unary(FunctionExpr::Reshape(
        dimensions.iter().copied().map(ReshapeDimension::new).collect(),
    ))
}
1488
1489
#[cfg(feature = "ewma")]
/// Calculate the exponentially-weighted moving average.
pub fn ewm_mean(self, options: EWMOptions) -> Self {
    self.map_unary(FunctionExpr::EwmMean { options })
}

#[cfg(feature = "ewma_by")]
/// Calculate the exponentially-weighted moving average by a time column.
///
/// `times` supplies the time coordinates and `half_life` the decay rate.
pub fn ewm_mean_by(self, times: Expr, half_life: Duration) -> Self {
    self.map_binary(FunctionExpr::EwmMeanBy { half_life }, times)
}

#[cfg(feature = "ewma")]
/// Calculate the exponentially-weighted moving standard deviation.
pub fn ewm_std(self, options: EWMOptions) -> Self {
    self.map_unary(FunctionExpr::EwmStd { options })
}

#[cfg(feature = "ewma")]
/// Calculate the exponentially-weighted moving variance.
pub fn ewm_var(self, options: EWMOptions) -> Self {
    self.map_unary(FunctionExpr::EwmVar { options })
}
1512
1513
/// Returns whether any of the values in the column are `true`.
///
/// If `ignore_nulls` is `False`, [Kleene logic] is used to deal with nulls:
/// if the column contains any null values and no `true` values, the output
/// is null.
///
/// [Kleene logic]: https://en.wikipedia.org/wiki/Three-valued_logic
pub fn any(self, ignore_nulls: bool) -> Self {
    self.map_unary(BooleanFunction::Any { ignore_nulls })
}

/// Returns whether all values in the column are `true`.
///
/// If `ignore_nulls` is `False`, [Kleene logic] is used to deal with nulls:
/// if the column contains any null values and no `false` values, the output
/// is null.
///
/// [Kleene logic]: https://en.wikipedia.org/wiki/Three-valued_logic
pub fn all(self, ignore_nulls: bool) -> Self {
    self.map_unary(BooleanFunction::All { ignore_nulls })
}
1534
1535
#[cfg(feature = "dtype-struct")]
/// Count all unique values and create a struct mapping value to count.
/// (Note that it is better to turn parallel off in the aggregation context).
/// The name of the struct field with the counts is given by the parameter `name`.
///
/// `sort`, `parallel` and `normalize` are forwarded to
/// [`FunctionExpr::ValueCounts`].
pub fn value_counts(self, sort: bool, parallel: bool, name: &str, normalize: bool) -> Self {
    self.map_unary(FunctionExpr::ValueCounts {
        sort,
        parallel,
        name: name.into(),
        normalize,
    })
}
1547
1548
#[cfg(feature = "unique_counts")]
/// Returns a count of the unique values in the order of appearance.
/// This method differs from [`Expr::value_counts`] in that it does not return the
/// values, only the counts and might be faster.
pub fn unique_counts(self) -> Self {
    self.map_unary(FunctionExpr::UniqueCounts)
}
1555
1556
#[cfg(feature = "log")]
/// Compute the logarithm to a given base.
///
/// `base` is itself an expression, evaluated alongside `self`.
pub fn log(self, base: Expr) -> Self {
    self.map_binary(FunctionExpr::Log, base)
}

#[cfg(feature = "log")]
/// Compute the natural logarithm of all elements plus one in the input array.
pub fn log1p(self) -> Self {
    self.map_unary(FunctionExpr::Log1p)
}

#[cfg(feature = "log")]
/// Calculate the exponential of all elements in the input array.
pub fn exp(self) -> Self {
    self.map_unary(FunctionExpr::Exp)
}

#[cfg(feature = "log")]
/// Compute the entropy as `-sum(pk * log(pk))`.
/// where `pk` are discrete probabilities.
pub fn entropy(self, base: f64, normalize: bool) -> Self {
    self.map_unary(FunctionExpr::Entropy { base, normalize })
}

/// Get the null count of the column/group.
pub fn null_count(self) -> Expr {
    self.map_unary(FunctionExpr::NullCount)
}
1584
1585
/// Set this `Series` as `sorted` so that downstream code can use
/// fast paths for sorted arrays.
/// # Warning
/// This can lead to incorrect results if this `Series` is not sorted!!
/// Use with care!
pub fn set_sorted_flag(self, sorted: IsSorted) -> Expr {
    // This is `map`. If a column is sorted. Chunks of that column are also sorted.
    self.map_unary(FunctionExpr::SetSortedFlag(sorted))
}
1594
1595
#[cfg(feature = "row_hash")]
/// Compute the hash of every element.
///
/// `k0`..`k3` are seed values forwarded to [`FunctionExpr::Hash`].
pub fn hash(self, k0: u64, k1: u64, k2: u64, k3: u64) -> Expr {
    self.map_unary(FunctionExpr::Hash(k0, k1, k2, k3))
}
1600
1601
/// Map this expression to its physical representation (see [`FunctionExpr::ToPhysical`]).
pub fn to_physical(self) -> Expr {
    self.map_unary(FunctionExpr::ToPhysical)
}

/// Gather every `n`-th value, starting from `offset`.
pub fn gather_every(self, n: usize, offset: usize) -> Expr {
    self.map_unary(FunctionExpr::GatherEvery { n, offset })
}

#[cfg(feature = "reinterpret")]
/// Reinterpret this expression's values; `signed` is forwarded to
/// [`FunctionExpr::Reinterpret`] — presumably it selects a signed vs.
/// unsigned integer target, confirm against that function's implementation.
pub fn reinterpret(self, signed: bool) -> Expr {
    self.map_unary(FunctionExpr::Reinterpret(signed))
}

/// Extend this expression with `n` copies of `value`.
pub fn extend_constant(self, value: Expr, n: Expr) -> Expr {
    self.map_ternary(FunctionExpr::ExtendConstant, value, n)
}
1617
1618
#[cfg(feature = "strings")]
/// Get the [`string::StringNameSpace`]
pub fn str(self) -> string::StringNameSpace {
    string::StringNameSpace(self)
}

/// Get the [`binary::BinaryNameSpace`]
pub fn binary(self) -> binary::BinaryNameSpace {
    binary::BinaryNameSpace(self)
}

#[cfg(feature = "temporal")]
/// Get the [`dt::DateLikeNameSpace`]
pub fn dt(self) -> dt::DateLikeNameSpace {
    dt::DateLikeNameSpace(self)
}

/// Get the [`list::ListNameSpace`]
pub fn list(self) -> list::ListNameSpace {
    list::ListNameSpace(self)
}

/// Get the [`name::ExprNameNameSpace`]
pub fn name(self) -> name::ExprNameNameSpace {
    name::ExprNameNameSpace(self)
}

/// Get the [`array::ArrayNameSpace`].
#[cfg(feature = "dtype-array")]
pub fn arr(self) -> array::ArrayNameSpace {
    array::ArrayNameSpace(self)
}

/// Get the [`CategoricalNameSpace`].
#[cfg(feature = "dtype-categorical")]
pub fn cat(self) -> cat::CategoricalNameSpace {
    cat::CategoricalNameSpace(self)
}

/// Get the [`extension::ExtensionNameSpace`].
#[cfg(feature = "dtype-extension")]
pub fn ext(self) -> extension::ExtensionNameSpace {
    extension::ExtensionNameSpace(self)
}

/// Get the [`struct_::StructNameSpace`].
#[cfg(feature = "dtype-struct")]
pub fn struct_(self) -> struct_::StructNameSpace {
    struct_::StructNameSpace(self)
}

/// Get the [`meta::MetaNameSpace`]
#[cfg(feature = "meta")]
pub fn meta(self) -> meta::MetaNameSpace {
    meta::MetaNameSpace(self)
}
1674
}
1675
1676
/// Apply a function/closure over multiple columns once the logical plan get executed.
1677
///
1678
/// This function is very similar to [`apply_multiple`], but differs in how it handles aggregations.
1679
///
1680
/// * [`map_multiple`] should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power`
1681
/// * [`apply_multiple`] should be used for operations that work on a group of data. e.g. `sum`, `count`, etc.
1682
///
1683
/// It is the responsibility of the caller that the schema is correct by giving
1684
/// the correct output_type. If None given the output type of the input expr is used.
1685
pub fn map_multiple<F, DT, E>(function: F, expr: E, output_type: DT) -> Expr
1686
where
1687
F: Fn(&mut [Column]) -> PolarsResult<Column> + 'static + Send + Sync,
1688
DT: Fn(&Schema, &[Field]) -> PolarsResult<Field> + 'static + Send + Sync,
1689
E: AsRef<[Expr]>,
1690
{
1691
let input = expr.as_ref().to_vec();
1692
1693
let function = BaseColumnUdf::new(function, output_type);
1694
1695
let options = FunctionOptions::elementwise();
1696
Expr::AnonymousFunction {
1697
input,
1698
function: new_column_udf(function),
1699
options,
1700
fmt_str: Box::new(PlSmallStr::EMPTY),
1701
}
1702
}
1703
1704
/// Apply a function/closure over the groups of multiple columns. This should only be used in a group_by aggregation.
1705
///
1706
/// It is the responsibility of the caller that the schema is correct by giving
1707
/// the correct output_type. If None given the output type of the input expr is used.
1708
///
1709
/// This difference with [`map_multiple`] is that [`apply_multiple`] will create a separate [`Series`] per group.
1710
///
1711
/// * [`map_multiple`] should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power`
1712
/// * [`apply_multiple`] should be used for operations that work on a group of data. e.g. `sum`, `count`, etc.
1713
pub fn apply_multiple<F, DT, E>(function: F, expr: E, output_type: DT, returns_scalar: bool) -> Expr
1714
where
1715
F: Fn(&mut [Column]) -> PolarsResult<Column> + 'static + Send + Sync,
1716
DT: Fn(&Schema, &[Field]) -> PolarsResult<Field> + 'static + Send + Sync,
1717
E: AsRef<[Expr]>,
1718
{
1719
let input = expr.as_ref().to_vec();
1720
let options = FunctionOptions::groupwise().with_flags(|mut f| {
1721
f.set(FunctionFlags::RETURNS_SCALAR, returns_scalar);
1722
f
1723
});
1724
let function = BaseColumnUdf::new(function, output_type);
1725
Expr::AnonymousFunction {
1726
input,
1727
function: new_column_udf(function),
1728
options,
1729
fmt_str: Box::new(PlSmallStr::EMPTY),
1730
}
1731
}
1732
1733