Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-plan/src/dsl/mod.rs
6939 views
1
#![allow(ambiguous_glob_reexports)]
2
//! Domain specific language for the Lazy API.
3
#[cfg(feature = "dtype-categorical")]
4
pub mod cat;
5
6
#[cfg(feature = "dtype-categorical")]
7
pub use cat::*;
8
#[cfg(feature = "rolling_window_by")]
9
pub(crate) use polars_time::prelude::*;
10
11
mod arithmetic;
12
mod arity;
13
#[cfg(feature = "dtype-array")]
14
mod array;
15
pub mod binary;
16
#[cfg(feature = "bitwise")]
17
mod bitwise;
18
mod builder_dsl;
19
pub use builder_dsl::*;
20
mod datatype_expr;
21
#[cfg(feature = "temporal")]
22
pub mod dt;
23
mod expr;
24
mod format;
25
mod from;
26
pub mod function_expr;
27
pub mod functions;
28
mod list;
29
mod match_to_schema;
30
#[cfg(feature = "meta")]
31
mod meta;
32
mod name;
33
mod options;
34
#[cfg(feature = "python")]
35
pub mod python_dsl;
36
#[cfg(feature = "random")]
37
mod random;
38
mod scan_sources;
39
mod selector;
40
mod statistics;
41
#[cfg(feature = "strings")]
42
pub mod string;
43
#[cfg(feature = "dtype-struct")]
44
mod struct_;
45
pub mod udf;
46
47
use std::fmt::Debug;
48
use std::sync::Arc;
49
50
mod plan;
51
pub use arity::*;
52
#[cfg(feature = "dtype-array")]
53
pub use array::*;
54
pub use datatype_expr::DataTypeExpr;
55
pub use expr::*;
56
pub use function_expr::*;
57
pub use functions::*;
58
pub use list::*;
59
pub use match_to_schema::*;
60
#[cfg(feature = "meta")]
61
pub use meta::*;
62
pub use name::*;
63
pub use options::*;
64
pub use plan::*;
65
use polars_compute::rolling::QuantileMethod;
66
use polars_core::chunked_array::cast::CastOptions;
67
use polars_core::error::feature_gated;
68
use polars_core::prelude::*;
69
use polars_core::series::IsSorted;
70
#[cfg(feature = "diff")]
71
use polars_core::series::ops::NullBehavior;
72
#[cfg(feature = "is_close")]
73
use polars_utils::total_ord::TotalOrdWrap;
74
pub use selector::{DataTypeSelector, Selector, TimeUnitSet, TimeZoneSet};
75
#[cfg(feature = "dtype-struct")]
76
pub use struct_::*;
77
pub use udf::UserDefinedFunction;
78
mod file_scan;
79
pub use file_scan::*;
80
pub use scan_sources::{ScanSource, ScanSourceIter, ScanSourceRef, ScanSources};
81
82
pub use crate::plans::lit;
83
use crate::prelude::*;
84
85
impl Expr {
86
/// Compare `Expr` with other `Expr` on equality.
pub fn eq<E: Into<Expr>>(self, other: E) -> Expr {
    binary_expr(self, Operator::Eq, other.into())
}

/// Compare `Expr` with other `Expr` on equality where `None == None`.
pub fn eq_missing<E: Into<Expr>>(self, other: E) -> Expr {
    binary_expr(self, Operator::EqValidity, other.into())
}

/// Compare `Expr` with other `Expr` on non-equality.
pub fn neq<E: Into<Expr>>(self, other: E) -> Expr {
    binary_expr(self, Operator::NotEq, other.into())
}

/// Compare `Expr` with other `Expr` on non-equality where `None == None`.
pub fn neq_missing<E: Into<Expr>>(self, other: E) -> Expr {
    binary_expr(self, Operator::NotEqValidity, other.into())
}

/// Check if `Expr` < `Expr`.
pub fn lt<E: Into<Expr>>(self, other: E) -> Expr {
    binary_expr(self, Operator::Lt, other.into())
}

/// Check if `Expr` > `Expr`.
pub fn gt<E: Into<Expr>>(self, other: E) -> Expr {
    binary_expr(self, Operator::Gt, other.into())
}

/// Check if `Expr` >= `Expr`.
pub fn gt_eq<E: Into<Expr>>(self, other: E) -> Expr {
    binary_expr(self, Operator::GtEq, other.into())
}

/// Check if `Expr` <= `Expr`.
pub fn lt_eq<E: Into<Expr>>(self, other: E) -> Expr {
    binary_expr(self, Operator::LtEq, other.into())
}
125
126
/// Negate `Expr` (boolean NOT).
#[allow(clippy::should_implement_trait)]
pub fn not(self) -> Expr {
    self.map_unary(BooleanFunction::Not)
}

/// Rename Column.
pub fn alias<S>(self, name: S) -> Expr
where
    S: Into<PlSmallStr>,
{
    Expr::Alias(Arc::new(self), name.into())
}

/// Run is_null operation on `Expr`: a boolean mask that is `true` for null values.
#[allow(clippy::wrong_self_convention)]
pub fn is_null(self) -> Self {
    self.map_unary(BooleanFunction::IsNull)
}

/// Run is_not_null operation on `Expr`: a boolean mask that is `true` for non-null values.
#[allow(clippy::wrong_self_convention)]
pub fn is_not_null(self) -> Self {
    self.map_unary(BooleanFunction::IsNotNull)
}

/// Drop null values.
pub fn drop_nulls(self) -> Self {
    self.map_unary(FunctionExpr::DropNulls)
}

/// Drop NaN values.
pub fn drop_nans(self) -> Self {
    self.map_unary(FunctionExpr::DropNans)
}
161
162
/// Get the number of unique values in the groups.
pub fn n_unique(self) -> Self {
    AggExpr::NUnique(Arc::new(self)).into()
}

/// Get the first value in the group.
pub fn first(self) -> Self {
    AggExpr::First(Arc::new(self)).into()
}

/// Get the last value in the group.
pub fn last(self) -> Self {
    AggExpr::Last(Arc::new(self)).into()
}

/// GroupBy the group to a Series.
pub fn implode(self) -> Self {
    AggExpr::Implode(Arc::new(self)).into()
}

/// Compute the quantile per group.
///
/// `quantile` is itself an expression so it can differ per evaluation context;
/// `method` selects the interpolation strategy.
pub fn quantile(self, quantile: Expr, method: QuantileMethod) -> Self {
    AggExpr::Quantile {
        expr: Arc::new(self),
        quantile: Arc::new(quantile),
        method,
    }
    .into()
}

/// Get the group indexes of the group by operation.
pub fn agg_groups(self) -> Self {
    AggExpr::AggGroups(Arc::new(self)).into()
}
196
197
/// Alias for `explode`.
pub fn flatten(self) -> Self {
    self.explode()
}

/// Explode the String/List column.
pub fn explode(self) -> Self {
    Expr::Explode {
        input: Arc::new(self),
        // empty lists still produce a (null) row when not skipped
        skip_empty: false,
    }
}

/// Slice the Series.
/// `offset` may be negative.
pub fn slice<E: Into<Expr>, F: Into<Expr>>(self, offset: E, length: F) -> Self {
    Expr::Slice {
        input: Arc::new(self),
        offset: Arc::new(offset.into()),
        length: Arc::new(length.into()),
    }
}

/// Append expressions. This is done by adding the chunks of `other` to this [`Series`].
pub fn append<E: Into<Expr>>(self, other: E, upcast: bool) -> Self {
    self.map_binary(FunctionExpr::Append { upcast }, other.into())
}

/// Collect all chunks into a single chunk before continuing.
pub fn rechunk(self) -> Self {
    self.map_unary(FunctionExpr::Rechunk)
}
229
230
/// Get the first `n` elements of the Expr result.
231
pub fn head(self, length: Option<usize>) -> Self {
232
self.slice(lit(0), lit(length.unwrap_or(10) as u64))
233
}
234
235
/// Get the last `n` elements of the Expr result.
236
pub fn tail(self, length: Option<usize>) -> Self {
237
let len = length.unwrap_or(10);
238
self.slice(lit(-(len as i64)), lit(len as u64))
239
}
240
241
/// Get unique values of this expression (order not guaranteed).
pub fn unique(self) -> Self {
    self.map_unary(FunctionExpr::Unique(false))
}

/// Get unique values of this expression, while maintaining order.
/// This requires more work than [`Expr::unique`].
pub fn unique_stable(self) -> Self {
    self.map_unary(FunctionExpr::Unique(true))
}

/// Get the first index of unique values of this expression.
pub fn arg_unique(self) -> Self {
    self.map_unary(FunctionExpr::ArgUnique)
}

/// Get the index value that has the minimum value.
pub fn arg_min(self) -> Self {
    self.map_unary(FunctionExpr::ArgMin)
}

/// Get the index value that has the maximum value.
pub fn arg_max(self) -> Self {
    self.map_unary(FunctionExpr::ArgMax)
}

/// Get the index values that would sort this expression.
pub fn arg_sort(self, descending: bool, nulls_last: bool) -> Self {
    self.map_unary(FunctionExpr::ArgSort {
        descending,
        nulls_last,
    })
}

#[cfg(feature = "index_of")]
/// Find the index of a value.
pub fn index_of<E: Into<Expr>>(self, element: E) -> Expr {
    self.map_binary(FunctionExpr::IndexOf, element.into())
}

#[cfg(feature = "search_sorted")]
/// Find indices where elements should be inserted to maintain order.
pub fn search_sorted<E: Into<Expr>>(
    self,
    element: E,
    side: SearchSortedSide,
    descending: bool,
) -> Expr {
    self.map_binary(
        FunctionExpr::SearchSorted { side, descending },
        element.into(),
    )
}
294
295
/// Cast expression to another data type.
/// Throws an error if conversion had overflows.
/// Returns an Error if cast is invalid on rows after predicates are pushed down.
pub fn strict_cast(self, dtype: impl Into<DataTypeExpr>) -> Self {
    Expr::Cast {
        expr: Arc::new(self),
        dtype: dtype.into(),
        options: CastOptions::Strict,
    }
}

/// Cast expression to another data type (non-strict: invalid values become null).
pub fn cast(self, dtype: impl Into<DataTypeExpr>) -> Self {
    Expr::Cast {
        expr: Arc::new(self),
        dtype: dtype.into(),
        options: CastOptions::NonStrict,
    }
}

/// Cast expression to another data type with explicit [`CastOptions`].
pub fn cast_with_options(
    self,
    dtype: impl Into<DataTypeExpr>,
    cast_options: CastOptions,
) -> Self {
    Expr::Cast {
        expr: Arc::new(self),
        dtype: dtype.into(),
        options: cast_options,
    }
}
327
328
/// Take the values by idx.
pub fn gather<E: Into<Expr>>(self, idx: E) -> Self {
    Expr::Gather {
        expr: Arc::new(self),
        idx: Arc::new(idx.into()),
        returns_scalar: false,
    }
}

/// Take the values by a single index; the result is a scalar.
pub fn get<E: Into<Expr>>(self, idx: E) -> Self {
    Expr::Gather {
        expr: Arc::new(self),
        idx: Arc::new(idx.into()),
        returns_scalar: true,
    }
}

/// Sort with given options.
///
/// # Example
///
/// ```rust
/// # use polars_core::prelude::*;
/// # use polars_lazy::prelude::*;
/// # fn main() -> PolarsResult<()> {
/// let lf = df! {
///     "a" => [Some(5), Some(4), Some(3), Some(2), None]
/// }?
/// .lazy();
///
/// let sorted = lf
///     .select(
///         vec![col("a").sort(SortOptions::default())],
///     )
///     .collect()?;
///
/// assert_eq!(
///     sorted,
///     df! {
///         "a" => [None, Some(2), Some(3), Some(4), Some(5)]
///     }?
/// );
/// # Ok(())
/// # }
/// ```
/// See [`SortOptions`] for more options.
pub fn sort(self, options: SortOptions) -> Self {
    Expr::Sort {
        expr: Arc::new(self),
        options,
    }
}
381
382
/// Returns the `k` largest elements.
///
/// This has time complexity `O(n + k log(n))`.
#[cfg(feature = "top_k")]
pub fn top_k(self, k: Expr) -> Self {
    self.map_binary(FunctionExpr::TopK { descending: false }, k)
}

/// Returns the `k` largest rows by given column.
///
/// For single column, use [`Expr::top_k`].
#[cfg(feature = "top_k")]
pub fn top_k_by<K: Into<Expr>, E: AsRef<[IE]>, IE: Into<Expr> + Clone>(
    self,
    k: K,
    by: E,
    descending: Vec<bool>,
) -> Self {
    self.map_n_ary(
        FunctionExpr::TopKBy { descending },
        [k.into()]
            .into_iter()
            .chain(by.as_ref().iter().map(|e| -> Expr { e.clone().into() })),
    )
}

/// Returns the `k` smallest elements.
///
/// This has time complexity `O(n + k log(n))`.
#[cfg(feature = "top_k")]
pub fn bottom_k(self, k: Expr) -> Self {
    self.map_binary(FunctionExpr::TopK { descending: true }, k)
}

/// Returns the `k` smallest rows by given column.
///
/// For single column, use [`Expr::bottom_k`].
#[cfg(feature = "top_k")]
pub fn bottom_k_by<K: Into<Expr>, E: AsRef<[IE]>, IE: Into<Expr> + Clone>(
    self,
    k: K,
    by: E,
    descending: Vec<bool>,
) -> Self {
    // bottom-k is implemented as top-k with every sort direction inverted
    let descending = descending.into_iter().map(|x| !x).collect();
    self.map_n_ary(
        FunctionExpr::TopKBy { descending },
        [k.into()]
            .into_iter()
            .chain(by.as_ref().iter().map(|e| -> Expr { e.clone().into() })),
    )
}

/// Reverse column
pub fn reverse(self) -> Self {
    self.map_unary(FunctionExpr::Reverse)
}
439
440
/// Apply a function/closure once the logical plan get executed.
///
/// This function is very similar to [`Expr::apply`], but differs in how it handles aggregations.
///
/// * `map` should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power`
/// * `apply` should be used for operations that work on a group of data. e.g. `sum`, `count`, etc.
///
/// It is the responsibility of the caller that the schema is correct by giving
/// the correct output_type. If None given the output type of the input expr is used.
pub fn map<F, DT>(self, function: F, output_type: DT) -> Self
where
    F: Fn(Column) -> PolarsResult<Column> + 'static + Send + Sync,
    DT: Fn(&Schema, &Field) -> PolarsResult<Field> + 'static + Send + Sync,
{
    self.map_with_fmt_str(function, output_type, "map")
}

/// Like [`Expr::map`], but with a custom display string used when formatting the expression.
pub fn map_with_fmt_str<F, DT>(
    self,
    function: F,
    output_type: DT,
    fmt_str: impl Into<PlSmallStr>,
) -> Self
where
    F: Fn(Column) -> PolarsResult<Column> + 'static + Send + Sync,
    DT: Fn(&Schema, &Field) -> PolarsResult<Field> + 'static + Send + Sync,
{
    // adapt the single-column closures to the multi-column UDF interface
    let f = BaseColumnUdf::new(
        move |c: &mut [Column]| function(std::mem::take(&mut c[0])),
        move |schema: &Schema, fields: &[Field]| output_type(schema, &fields[0]),
    );

    let options =
        FunctionOptions::elementwise().with_flags(|f| f | FunctionFlags::OPTIONAL_RE_ENTRANT);
    let fmt_str = Box::new(fmt_str.into());
    Expr::AnonymousFunction {
        input: vec![self],
        function: new_column_udf(f),
        options,
        fmt_str,
    }
}

/// Build an anonymous aggregation UDF with a custom display string.
pub fn agg_with_fmt_str<F, DT>(
    self,
    function: F,
    output_type: DT,
    fmt_str: impl Into<PlSmallStr>,
) -> Self
where
    F: Fn(Column) -> PolarsResult<Column> + 'static + Send + Sync,
    DT: Fn(&Schema, &Field) -> PolarsResult<Field> + 'static + Send + Sync,
{
    let f = BaseColumnUdf::new(
        move |c: &mut [Column]| function(std::mem::take(&mut c[0])),
        move |schema: &Schema, fields: &[Field]| output_type(schema, &fields[0]),
    );

    let options = FunctionOptions::aggregation();
    let fmt_str = Box::new(fmt_str.into());
    Expr::AnonymousFunction {
        input: vec![self],
        function: new_column_udf(f),
        options,
        fmt_str,
    }
}

/// Like [`Expr::apply`], but with a custom display string used when formatting the expression.
pub fn apply_with_fmt_str<F, DT>(
    self,
    function: F,
    output_type: DT,
    fmt_str: impl Into<PlSmallStr>,
) -> Self
where
    F: Fn(Column) -> PolarsResult<Column> + 'static + Send + Sync,
    DT: Fn(&Schema, &Field) -> PolarsResult<Field> + 'static + Send + Sync,
{
    let f = BaseColumnUdf::new(
        move |c: &mut [Column]| function(std::mem::take(&mut c[0])),
        move |schema: &Schema, fields: &[Field]| output_type(schema, &fields[0]),
    );

    let options = FunctionOptions::groupwise();
    let fmt_str = Box::new(fmt_str.into());
    Expr::AnonymousFunction {
        input: vec![self],
        function: new_column_udf(f),
        options,
        fmt_str,
    }
}

/// Apply a function/closure once the logical plan get executed with many arguments.
///
/// See the [`Expr::map`] function for the differences between [`map`](Expr::map) and [`apply`](Expr::apply).
pub fn map_many<F, DT>(self, function: F, arguments: &[Expr], output_type: DT) -> Self
where
    F: Fn(&mut [Column]) -> PolarsResult<Column> + 'static + Send + Sync,
    DT: Fn(&Schema, &[Field]) -> PolarsResult<Field> + 'static + Send + Sync,
{
    // `self` is the first input column, followed by `arguments`
    let mut input = vec![self];
    input.extend_from_slice(arguments);

    let function = BaseColumnUdf::new(function, output_type);

    let options = FunctionOptions::elementwise();
    Expr::AnonymousFunction {
        input,
        function: new_column_udf(function),
        options,
        fmt_str: Box::new(PlSmallStr::EMPTY),
    }
}

/// Apply a function/closure over the groups. This should only be used in a group_by aggregation.
///
/// It is the responsibility of the caller that the schema is correct by giving
/// the correct output_type. If None given the output type of the input expr is used.
///
/// This difference with [map](Self::map) is that `apply` will create a separate `Series` per group.
///
/// * `map` should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power`
/// * `apply` should be used for operations that work on a group of data. e.g. `sum`, `count`, etc.
pub fn apply<F, DT>(self, function: F, output_type: DT) -> Self
where
    F: Fn(Column) -> PolarsResult<Column> + 'static + Send + Sync,
    DT: Fn(&Schema, &Field) -> PolarsResult<Field> + 'static + Send + Sync,
{
    self.apply_with_fmt_str(function, output_type, PlSmallStr::EMPTY)
}

/// Apply a function/closure over the groups with many arguments. This should only be used in a group_by aggregation.
///
/// See the [`Expr::apply`] function for the differences between [`map`](Expr::map) and [`apply`](Expr::apply).
pub fn apply_many<F, DT>(self, function: F, arguments: &[Expr], output_type: DT) -> Self
where
    F: Fn(&mut [Column]) -> PolarsResult<Column> + 'static + Send + Sync,
    DT: Fn(&Schema, &[Field]) -> PolarsResult<Field> + 'static + Send + Sync,
{
    let mut input = vec![self];
    input.extend_from_slice(arguments);

    let function = BaseColumnUdf::new(function, output_type);

    let options = FunctionOptions::groupwise();
    Expr::AnonymousFunction {
        input,
        function: new_column_udf(function),
        options,
        fmt_str: Box::new(PlSmallStr::EMPTY),
    }
}
593
594
/// Get mask of finite values if dtype is Float.
#[allow(clippy::wrong_self_convention)]
pub fn is_finite(self) -> Self {
    self.map_unary(BooleanFunction::IsFinite)
}

/// Get mask of infinite values if dtype is Float.
#[allow(clippy::wrong_self_convention)]
pub fn is_infinite(self) -> Self {
    self.map_unary(BooleanFunction::IsInfinite)
}

/// Get mask of NaN values if dtype is Float.
pub fn is_nan(self) -> Self {
    self.map_unary(BooleanFunction::IsNan)
}

/// Get inverse mask of NaN values if dtype is Float.
pub fn is_not_nan(self) -> Self {
    self.map_unary(BooleanFunction::IsNotNan)
}

/// Shift the values in the array by some period. See [the eager implementation](polars_core::series::SeriesTrait::shift).
pub fn shift(self, n: Expr) -> Self {
    self.map_binary(FunctionExpr::Shift, n)
}

/// Shift the values in the array by some period and fill the resulting empty values.
pub fn shift_and_fill<E: Into<Expr>, IE: Into<Expr>>(self, n: E, fill_value: IE) -> Self {
    self.map_ternary(FunctionExpr::ShiftAndFill, n.into(), fill_value.into())
}
625
626
/// Run `evaluation` over cumulatively growing windows of this expression.
///
/// `min_samples` is the minimum number of values a window must contain before
/// evaluation (see [`EvalVariant::Cumulative`]).
/// NOTE(review): the previous doc comment ("Cumulatively count values from 0 to len")
/// was copy-pasted from `cum_count` and did not describe this method.
#[cfg(feature = "cum_agg")]
pub fn cumulative_eval(self, evaluation: Expr, min_samples: usize) -> Self {
    Expr::Eval {
        expr: Arc::new(self),
        evaluation: Arc::new(evaluation),
        variant: EvalVariant::Cumulative { min_samples },
    }
}

/// Cumulatively count values from 0 to len.
#[cfg(feature = "cum_agg")]
pub fn cum_count(self, reverse: bool) -> Self {
    self.map_unary(FunctionExpr::CumCount { reverse })
}

/// Get an array with the cumulative sum computed at every element.
#[cfg(feature = "cum_agg")]
pub fn cum_sum(self, reverse: bool) -> Self {
    self.map_unary(FunctionExpr::CumSum { reverse })
}

/// Get an array with the cumulative product computed at every element.
#[cfg(feature = "cum_agg")]
pub fn cum_prod(self, reverse: bool) -> Self {
    self.map_unary(FunctionExpr::CumProd { reverse })
}

/// Get an array with the cumulative min computed at every element.
#[cfg(feature = "cum_agg")]
pub fn cum_min(self, reverse: bool) -> Self {
    self.map_unary(FunctionExpr::CumMin { reverse })
}

/// Get an array with the cumulative max computed at every element.
#[cfg(feature = "cum_agg")]
pub fn cum_max(self, reverse: bool) -> Self {
    self.map_unary(FunctionExpr::CumMax { reverse })
}

/// Get the product aggregation of an expression.
pub fn product(self) -> Self {
    self.map_unary(FunctionExpr::Product)
}
670
671
/// Round underlying floating point array to given decimal numbers.
#[cfg(feature = "round_series")]
pub fn round(self, decimals: u32, mode: RoundMode) -> Self {
    self.map_unary(FunctionExpr::Round { decimals, mode })
}

/// Round to a number of significant figures.
#[cfg(feature = "round_series")]
pub fn round_sig_figs(self, digits: i32) -> Self {
    self.map_unary(FunctionExpr::RoundSF { digits })
}

/// Floor underlying floating point array to the largest integer smaller or equal to the float value.
#[cfg(feature = "round_series")]
pub fn floor(self) -> Self {
    self.map_unary(FunctionExpr::Floor)
}

/// Constant Pi
#[cfg(feature = "round_series")]
pub fn pi() -> Self {
    lit(std::f64::consts::PI)
}

/// Ceil underlying floating point array to the smallest integer larger or equal to the float value.
#[cfg(feature = "round_series")]
pub fn ceil(self) -> Self {
    self.map_unary(FunctionExpr::Ceil)
}

/// Clip underlying values to the `[min, max]` boundary.
#[cfg(feature = "round_series")]
pub fn clip(self, min: Expr, max: Expr) -> Self {
    self.map_ternary(
        FunctionExpr::Clip {
            has_min: true,
            has_max: true,
        },
        min,
        max,
    )
}

/// Clip underlying values to an upper boundary only.
#[cfg(feature = "round_series")]
pub fn clip_max(self, max: Expr) -> Self {
    self.map_binary(
        FunctionExpr::Clip {
            has_min: false,
            has_max: true,
        },
        max,
    )
}

/// Clip underlying values to a lower boundary only.
#[cfg(feature = "round_series")]
pub fn clip_min(self, min: Expr) -> Self {
    self.map_binary(
        FunctionExpr::Clip {
            has_min: true,
            has_max: false,
        },
        min,
    )
}

/// Convert all values to their absolute/positive value.
#[cfg(feature = "abs")]
pub fn abs(self) -> Self {
    self.map_unary(FunctionExpr::Abs)
}
743
744
/// Apply window function over a subgroup.
/// This is similar to a group_by + aggregation + self join.
/// Or similar to [window functions in Postgres](https://www.postgresql.org/docs/9.1/tutorial-window.html).
///
/// # Example
///
/// ``` rust
/// #[macro_use] extern crate polars_core;
/// use polars_core::prelude::*;
/// use polars_lazy::prelude::*;
///
/// fn example() -> PolarsResult<()> {
///     let df = df! {
///             "groups" => &[1, 1, 2, 2, 1, 2, 3, 3, 1],
///             "values" => &[1, 2, 3, 4, 5, 6, 7, 8, 8]
///         }?;
///
///     let out = df
///          .lazy()
///          .select(&[
///              col("groups"),
///              sum("values").over([col("groups")]),
///          ])
///          .collect()?;
///     println!("{}", &out);
///     Ok(())
/// }
///
/// ```
///
/// Outputs:
///
/// ``` text
/// ╭────────┬────────╮
/// │ groups ┆ values │
/// │ ---    ┆ ---    │
/// │ i32    ┆ i32    │
/// ╞════════╪════════╡
/// │ 1      ┆ 16     │
/// │ 1      ┆ 16     │
/// │ 2      ┆ 13     │
/// │ 2      ┆ 13     │
/// │ …      ┆ …      │
/// │ 1      ┆ 16     │
/// │ 2      ┆ 13     │
/// │ 3      ┆ 15     │
/// │ 3      ┆ 15     │
/// │ 1      ┆ 16     │
/// ╰────────┴────────╯
/// ```
pub fn over<E: AsRef<[IE]>, IE: Into<Expr> + Clone>(self, partition_by: E) -> Self {
    self.over_with_options(Some(partition_by), None, Default::default())
        .expect("We explicitly passed `partition_by`")
}

/// Like [`Expr::over`], but with an optional `order_by` and explicit [`WindowMapping`] options.
///
/// # Errors
/// Returns an error when both `partition_by` and `order_by` are `None`.
pub fn over_with_options<E: AsRef<[IE]>, IE: Into<Expr> + Clone>(
    self,
    partition_by: Option<E>,
    order_by: Option<(E, SortOptions)>,
    options: WindowMapping,
) -> PolarsResult<Self> {
    polars_ensure!(partition_by.is_some() || order_by.is_some(), InvalidOperation: "At least one of `partition_by` and `order_by` must be specified in `over`");
    let partition_by = if let Some(partition_by) = partition_by {
        partition_by
            .as_ref()
            .iter()
            .map(|e| e.clone().into())
            .collect()
    } else {
        // no partitioning requested: a constant key puts everything in one partition
        vec![lit(1)]
    };

    let order_by = order_by.map(|(e, options)| {
        let e = e.as_ref();
        let e = if e.len() == 1 {
            Arc::new(e[0].clone().into())
        } else {
            // multiple order-by keys are packed into a single struct expression
            feature_gated!["dtype-struct", {
                let e = e.iter().map(|e| e.clone().into()).collect::<Vec<_>>();
                Arc::new(as_struct(e))
            }]
        };
        (e, options)
    });

    Ok(Expr::Window {
        function: Arc::new(self),
        partition_by,
        order_by,
        options: options.into(),
    })
}

#[cfg(feature = "dynamic_group_by")]
pub fn rolling(self, options: RollingGroupOptions) -> Self {
    // We add the index column as `partition expr` so that the optimizer will
    // not ignore it.
    let index_col = col(options.index_column.clone());
    Expr::Window {
        function: Arc::new(self),
        partition_by: vec![index_col],
        order_by: None,
        options: WindowType::Rolling(options),
    }
}
849
850
// Shared implementation for `fill_null`.
fn fill_null_impl(self, fill_value: Expr) -> Self {
    self.map_binary(FunctionExpr::FillNull, fill_value)
}

/// Replace the null values by a value.
pub fn fill_null<E: Into<Expr>>(self, fill_value: E) -> Self {
    self.fill_null_impl(fill_value.into())
}

/// Replace the null values by a fill strategy (e.g. forward/backward fill).
pub fn fill_null_with_strategy(self, strategy: FillNullStrategy) -> Self {
    self.map_unary(FunctionExpr::FillNullWithStrategy(strategy))
}

/// Replace the floating point `NaN` values by a value.
pub fn fill_nan<E: Into<Expr>>(self, fill_value: E) -> Self {
    // we take the not branch so that self is truthy value of `when -> then -> otherwise`
    // and that ensure we keep the name of `self`
    when(self.clone().is_not_nan().or(self.clone().is_null()))
        .then(self)
        .otherwise(fill_value.into())
}

/// Count the values of the Series
/// or
/// Get counts of the group by operation.
///
/// Nulls are excluded; see [`Expr::len`] to include them.
pub fn count(self) -> Self {
    AggExpr::Count {
        input: Arc::new(self),
        include_nulls: false,
    }
    .into()
}

/// Get the length (number of rows, nulls included).
pub fn len(self) -> Self {
    AggExpr::Count {
        input: Arc::new(self),
        include_nulls: true,
    }
    .into()
}
890
891
/// Get a mask of duplicated values.
#[allow(clippy::wrong_self_convention)]
#[cfg(feature = "is_unique")]
pub fn is_duplicated(self) -> Self {
    self.map_unary(BooleanFunction::IsDuplicated)
}

/// Get a mask of values between `lower` and `upper`; `closed` controls bound inclusion.
#[allow(clippy::wrong_self_convention)]
#[cfg(feature = "is_between")]
pub fn is_between<E: Into<Expr>>(self, lower: E, upper: E, closed: ClosedInterval) -> Self {
    self.map_ternary(
        BooleanFunction::IsBetween { closed },
        lower.into(),
        upper.into(),
    )
}

/// Get a mask of unique values.
#[allow(clippy::wrong_self_convention)]
#[cfg(feature = "is_unique")]
pub fn is_unique(self) -> Self {
    self.map_unary(BooleanFunction::IsUnique)
}

/// Check whether floating point values are close to each other.
///
/// `abs_tol`/`rel_tol` are the absolute and relative tolerances; `nans_equal`
/// makes NaN compare equal to NaN.
#[allow(clippy::wrong_self_convention)]
#[cfg(feature = "is_close")]
pub fn is_close<E: Into<Expr>>(
    self,
    expr: E,
    abs_tol: f64,
    rel_tol: f64,
    nans_equal: bool,
) -> Self {
    self.map_binary(
        BooleanFunction::IsClose {
            // TotalOrdWrap gives the f64 tolerances Eq/Hash semantics
            abs_tol: TotalOrdWrap(abs_tol),
            rel_tol: TotalOrdWrap(rel_tol),
            nans_equal,
        },
        expr.into(),
    )
}

/// Get the approximate count of unique values.
#[cfg(feature = "approx_unique")]
pub fn approx_n_unique(self) -> Self {
    self.map_unary(FunctionExpr::ApproxNUnique)
}
940
941
/// Bitwise "and" operation.
pub fn and<E: Into<Expr>>(self, expr: E) -> Self {
    binary_expr(self, Operator::And, expr.into())
}

/// Bitwise "xor" operation.
pub fn xor<E: Into<Expr>>(self, expr: E) -> Self {
    binary_expr(self, Operator::Xor, expr.into())
}

/// Bitwise "or" operation.
pub fn or<E: Into<Expr>>(self, expr: E) -> Self {
    binary_expr(self, Operator::Or, expr.into())
}

/// Logical "or" operation.
pub fn logical_or<E: Into<Expr>>(self, expr: E) -> Self {
    binary_expr(self, Operator::LogicalOr, expr.into())
}

/// Logical "and" operation.
pub fn logical_and<E: Into<Expr>>(self, expr: E) -> Self {
    binary_expr(self, Operator::LogicalAnd, expr.into())
}

/// Filter a single column.
///
/// Should be used in aggregation context. If you want to filter on a
/// DataFrame level, use `LazyFrame::filter`.
pub fn filter<E: Into<Expr>>(self, predicate: E) -> Self {
    Expr::Filter {
        input: Arc::new(self),
        by: Arc::new(predicate.into()),
    }
}

/// Check if the values of the left expression are in the lists of the right expr.
///
/// `nulls_equal` controls whether null compares equal to null for membership.
#[allow(clippy::wrong_self_convention)]
#[cfg(feature = "is_in")]
pub fn is_in<E: Into<Expr>>(self, other: E, nulls_equal: bool) -> Self {
    let other = other.into();
    let function = BooleanFunction::IsIn { nulls_equal };
    let function = function.into();
    Expr::Function {
        input: vec![self, other],
        function,
    }
}
989
990
/// Sort this column by the ordering of another column evaluated from given expr.
/// Can also be used in a group_by context to sort the groups.
///
/// # Example
///
/// ```rust
/// # use polars_core::prelude::*;
/// # use polars_lazy::prelude::*;
/// # fn main() -> PolarsResult<()> {
/// let lf = df! {
///     "a" => [1, 2, 3, 4, 5],
///     "b" => [5, 4, 3, 2, 1]
/// }?.lazy();
///
/// let sorted = lf
///     .select(
///         vec![col("a").sort_by(col("b"), SortOptions::default())],
///     )
///     .collect()?;
///
/// assert_eq!(
///     sorted,
///     df! { "a" => [5, 4, 3, 2, 1] }?
/// );
/// # Ok(())
/// # }
/// ```
pub fn sort_by<E: AsRef<[IE]>, IE: Into<Expr> + Clone>(
    self,
    by: E,
    sort_options: SortMultipleOptions,
) -> Expr {
    let by = by.as_ref().iter().map(|e| e.clone().into()).collect();
    Expr::SortBy {
        expr: Arc::new(self),
        by,
        sort_options,
    }
}

#[cfg(feature = "repeat_by")]
/// Repeat the column `n` times, where `n` is determined by the values in `by`.
/// This yields an `Expr` of dtype `List`.
pub fn repeat_by<E: Into<Expr>>(self, by: E) -> Expr {
    self.map_binary(FunctionExpr::RepeatBy, by.into())
}

#[cfg(feature = "is_first_distinct")]
#[allow(clippy::wrong_self_convention)]
/// Get a mask of the first unique value.
pub fn is_first_distinct(self) -> Expr {
    self.map_unary(BooleanFunction::IsFirstDistinct)
}

#[cfg(feature = "is_last_distinct")]
#[allow(clippy::wrong_self_convention)]
/// Get a mask of the last unique value.
pub fn is_last_distinct(self) -> Expr {
    self.map_unary(BooleanFunction::IsLastDistinct)
}

// Dot product as elementwise-multiply followed by sum.
fn dot_impl(self, other: Expr) -> Expr {
    (self * other).sum()
}

/// Compute the dot/inner product between two expressions.
pub fn dot<E: Into<Expr>>(self, other: E) -> Expr {
    self.dot_impl(other.into())
}
1058
1059
#[cfg(feature = "mode")]
1060
/// Compute the mode(s) of this column. This is the most occurring value.
1061
pub fn mode(self) -> Expr {
1062
self.map_unary(FunctionExpr::Mode)
1063
}
1064
1065
#[cfg(feature = "interpolate")]
1066
/// Interpolate intermediate values.
1067
/// Nulls at the beginning and end of the series remain null.
1068
pub fn interpolate(self, method: InterpolationMethod) -> Expr {
1069
self.map_unary(FunctionExpr::Interpolate(method))
1070
}
1071
1072
#[cfg(feature = "rolling_window_by")]
#[allow(clippy::type_complexity)]
/// Shared plumbing for the `rolling_*_by` methods: wraps `self` and `by`
/// into a `RollingExprBy` function expression.
fn finish_rolling_by(
    self,
    by: Expr,
    options: RollingOptionsDynamicWindow,
    rolling_function_by: RollingFunctionBy,
) -> Expr {
    self.map_binary(
        FunctionExpr::RollingExprBy {
            function_by: rolling_function_by,
            options,
        },
        by,
    )
}
#[cfg(feature = "interpolate_by")]
/// Interpolate intermediate values.
/// Nulls at the beginning and end of the series remain null.
/// The `by` column provides the x-coordinates for interpolation and must not contain nulls.
pub fn interpolate_by(self, by: Expr) -> Expr {
    self.map_binary(FunctionExpr::InterpolateBy, by)
}
#[cfg(feature = "rolling_window")]
#[allow(clippy::type_complexity)]
/// Shared plumbing for the fixed-window `rolling_*` methods: wraps `self`
/// into a `RollingExpr` function expression.
fn finish_rolling(
    self,
    options: RollingOptionsFixedWindow,
    rolling_function: RollingFunction,
) -> Expr {
    self.map_unary(FunctionExpr::RollingExpr {
        function: rolling_function,
        options,
    })
}
/// Apply a rolling minimum based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_min_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::MinBy)
}

/// Apply a rolling maximum based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_max_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::MaxBy)
}

/// Apply a rolling mean based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_mean_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::MeanBy)
}

/// Apply a rolling sum based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_sum_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::SumBy)
}

/// Apply a rolling quantile based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_quantile_by(
    self,
    by: Expr,
    method: QuantileMethod,
    quantile: f64,
    mut options: RollingOptionsDynamicWindow,
) -> Expr {
    use polars_compute::rolling::{RollingFnParams, RollingQuantileParams};
    // The quantile probability and method travel inside the rolling options.
    options.fn_params = Some(RollingFnParams::Quantile(RollingQuantileParams {
        prob: quantile,
        method,
    }));

    self.finish_rolling_by(by, options, RollingFunctionBy::QuantileBy)
}

/// Apply a rolling variance based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_var_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::VarBy)
}

/// Apply a rolling std-dev based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_std_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    self.finish_rolling_by(by, options, RollingFunctionBy::StdBy)
}

/// Apply a rolling median based on another column.
#[cfg(feature = "rolling_window_by")]
pub fn rolling_median_by(self, by: Expr, options: RollingOptionsDynamicWindow) -> Expr {
    // The median is the 0.5 quantile with linear interpolation.
    self.rolling_quantile_by(by, QuantileMethod::Linear, 0.5, options)
}
/// Apply a rolling minimum.
///
/// See: [`RollingAgg::rolling_min`]
#[cfg(feature = "rolling_window")]
pub fn rolling_min(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Min)
}

/// Apply a rolling maximum.
///
/// See: [`RollingAgg::rolling_max`]
#[cfg(feature = "rolling_window")]
pub fn rolling_max(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Max)
}

/// Apply a rolling mean.
///
/// See: [`RollingAgg::rolling_mean`]
#[cfg(feature = "rolling_window")]
pub fn rolling_mean(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Mean)
}

/// Apply a rolling sum.
///
/// See: [`RollingAgg::rolling_sum`]
#[cfg(feature = "rolling_window")]
pub fn rolling_sum(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Sum)
}

/// Apply a rolling median.
///
/// See: [`RollingAgg::rolling_median`]
#[cfg(feature = "rolling_window")]
pub fn rolling_median(self, options: RollingOptionsFixedWindow) -> Expr {
    // The median is the 0.5 quantile with linear interpolation.
    self.rolling_quantile(QuantileMethod::Linear, 0.5, options)
}

/// Apply a rolling quantile.
///
/// See: [`RollingAgg::rolling_quantile`]
#[cfg(feature = "rolling_window")]
pub fn rolling_quantile(
    self,
    method: QuantileMethod,
    quantile: f64,
    mut options: RollingOptionsFixedWindow,
) -> Expr {
    use polars_compute::rolling::{RollingFnParams, RollingQuantileParams};

    // The quantile probability and method travel inside the rolling options.
    options.fn_params = Some(RollingFnParams::Quantile(RollingQuantileParams {
        prob: quantile,
        method,
    }));

    self.finish_rolling(options, RollingFunction::Quantile)
}
/// Apply a rolling variance.
#[cfg(feature = "rolling_window")]
pub fn rolling_var(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Var)
}

/// Apply a rolling std-dev.
#[cfg(feature = "rolling_window")]
pub fn rolling_std(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Std)
}
/// Apply a rolling skew.
#[cfg(feature = "rolling_window")]
#[cfg(feature = "moment")]
pub fn rolling_skew(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Skew)
}

/// Apply a rolling kurtosis.
#[cfg(feature = "rolling_window")]
#[cfg(feature = "moment")]
pub fn rolling_kurtosis(self, options: RollingOptionsFixedWindow) -> Expr {
    self.finish_rolling(options, RollingFunction::Kurtosis)
}
#[cfg(feature = "rolling_window")]
/// Apply a custom function over a rolling/ moving window of the array.
/// This has quite some dynamic dispatch, so prefer rolling_min, max, mean, sum over this.
pub fn rolling_map(
    self,
    f: PlanCallback<Series, Series>,
    options: RollingOptionsFixedWindow,
) -> Expr {
    self.finish_rolling(options, RollingFunction::Map(f))
}
#[cfg(feature = "peaks")]
/// Get a boolean mask of the local minima. (Semantics delegated to
/// `FunctionExpr::PeakMin` — see its implementation for edge handling.)
pub fn peak_min(self) -> Expr {
    self.map_unary(FunctionExpr::PeakMin)
}

#[cfg(feature = "peaks")]
/// Get a boolean mask of the local maxima. (Semantics delegated to
/// `FunctionExpr::PeakMax` — see its implementation for edge handling.)
pub fn peak_max(self) -> Expr {
    self.map_unary(FunctionExpr::PeakMax)
}
#[cfg(feature = "rank")]
/// Assign ranks to data, dealing with ties appropriately.
/// `seed` is used for tie-breaking when the rank method is randomized.
pub fn rank(self, options: RankOptions, seed: Option<u64>) -> Expr {
    self.map_unary(FunctionExpr::Rank { options, seed })
}
#[cfg(feature = "replace")]
/// Replace the given values with other values.
pub fn replace<E: Into<Expr>>(self, old: E, new: E) -> Expr {
    let old = old.into();
    let new = new.into();
    self.map_n_ary(FunctionExpr::Replace, [old, new])
}
#[cfg(feature = "replace")]
/// Replace the given values with other values.
///
/// `default` is an optional fallback expression; `return_dtype` optionally
/// forces the output data type.
pub fn replace_strict<E: Into<Expr>>(
    self,
    old: E,
    new: E,
    default: Option<E>,
    return_dtype: Option<impl Into<DataTypeExpr>>,
) -> Expr {
    let old = old.into();
    let new = new.into();
    // The optional default is passed as a trailing extra argument.
    let mut args = vec![old, new];
    args.extend(default.map(Into::into));
    self.map_n_ary(
        FunctionExpr::ReplaceStrict {
            return_dtype: return_dtype.map(Into::into),
        },
        args,
    )
}
#[cfg(feature = "cutqcut")]
/// Bin continuous values into discrete categories.
pub fn cut(
    self,
    breaks: Vec<f64>,
    labels: Option<impl IntoVec<PlSmallStr>>,
    left_closed: bool,
    include_breaks: bool,
) -> Expr {
    self.map_unary(FunctionExpr::Cut {
        breaks,
        labels: labels.map(|x| x.into_vec()),
        left_closed,
        include_breaks,
    })
}

#[cfg(feature = "cutqcut")]
/// Bin continuous values into discrete categories based on their quantiles.
pub fn qcut(
    self,
    probs: Vec<f64>,
    labels: Option<impl IntoVec<PlSmallStr>>,
    left_closed: bool,
    allow_duplicates: bool,
    include_breaks: bool,
) -> Expr {
    self.map_unary(FunctionExpr::QCut {
        probs,
        labels: labels.map(|x| x.into_vec()),
        left_closed,
        allow_duplicates,
        include_breaks,
    })
}

#[cfg(feature = "cutqcut")]
/// Bin continuous values into discrete categories using uniform quantile probabilities.
pub fn qcut_uniform(
    self,
    n_bins: usize,
    labels: Option<impl IntoVec<PlSmallStr>>,
    left_closed: bool,
    allow_duplicates: bool,
    include_breaks: bool,
) -> Expr {
    // Evenly spaced interior probabilities: 1/n, 2/n, ..., (n-1)/n.
    let probs = (1..n_bins).map(|b| b as f64 / n_bins as f64).collect();
    self.map_unary(FunctionExpr::QCut {
        probs,
        labels: labels.map(|x| x.into_vec()),
        left_closed,
        allow_duplicates,
        include_breaks,
    })
}
#[cfg(feature = "rle")]
/// Get the lengths of runs of identical values.
pub fn rle(self) -> Expr {
    self.map_unary(FunctionExpr::RLE)
}

#[cfg(feature = "rle")]
/// Similar to `rle`, but maps values to run IDs.
pub fn rle_id(self) -> Expr {
    self.map_unary(FunctionExpr::RLEID)
}
#[cfg(feature = "diff")]
/// Calculate the n-th discrete difference between values.
pub fn diff(self, n: Expr, null_behavior: NullBehavior) -> Expr {
    self.map_binary(FunctionExpr::Diff(null_behavior), n)
}

#[cfg(feature = "pct_change")]
/// Computes percentage change between values.
pub fn pct_change(self, n: Expr) -> Expr {
    self.map_binary(FunctionExpr::PctChange, n)
}
#[cfg(feature = "moment")]
/// Compute the sample skewness of a data set.
///
/// For normally distributed data, the skewness should be about zero. For
/// uni-modal continuous distributions, a skewness value greater than zero means
/// that there is more weight in the right tail of the distribution. The
/// function `skewtest` can be used to determine if the skewness value
/// is close enough to zero, statistically speaking.
///
/// see: [scipy](https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L1024)
pub fn skew(self, bias: bool) -> Expr {
    self.map_unary(FunctionExpr::Skew(bias))
}

#[cfg(feature = "moment")]
/// Compute the kurtosis (Fisher or Pearson).
///
/// Kurtosis is the fourth central moment divided by the square of the
/// variance. If Fisher's definition is used, then 3.0 is subtracted from
/// the result to give 0.0 for a normal distribution.
/// If bias is False then the kurtosis is calculated using k statistics to
/// eliminate bias coming from biased moment estimators.
pub fn kurtosis(self, fisher: bool, bias: bool) -> Expr {
    self.map_unary(FunctionExpr::Kurtosis(fisher, bias))
}
/// Get maximal value that could be hold by this dtype.
1419
pub fn upper_bound(self) -> Expr {
1420
self.map_unary(FunctionExpr::UpperBound)
1421
}
1422
1423
/// Get minimal value that could be hold by this dtype.
1424
pub fn lower_bound(self) -> Expr {
1425
self.map_unary(FunctionExpr::LowerBound)
1426
}
1427
1428
#[cfg(feature = "dtype-array")]
/// Reshape this expression to an array with the given `dimensions`.
pub fn reshape(self, dimensions: &[i64]) -> Self {
    let dimensions = dimensions
        .iter()
        .map(|&v| ReshapeDimension::new(v))
        .collect();
    self.map_unary(FunctionExpr::Reshape(dimensions))
}
#[cfg(feature = "ewma")]
/// Calculate the exponentially-weighted moving average.
pub fn ewm_mean(self, options: EWMOptions) -> Self {
    self.map_unary(FunctionExpr::EwmMean { options })
}

#[cfg(feature = "ewma_by")]
/// Calculate the exponentially-weighted moving average by a time column.
pub fn ewm_mean_by(self, times: Expr, half_life: Duration) -> Self {
    self.map_binary(FunctionExpr::EwmMeanBy { half_life }, times)
}

#[cfg(feature = "ewma")]
/// Calculate the exponentially-weighted moving standard deviation.
pub fn ewm_std(self, options: EWMOptions) -> Self {
    self.map_unary(FunctionExpr::EwmStd { options })
}

#[cfg(feature = "ewma")]
/// Calculate the exponentially-weighted moving variance.
pub fn ewm_var(self, options: EWMOptions) -> Self {
    self.map_unary(FunctionExpr::EwmVar { options })
}
/// Returns whether any of the values in the column are `true`.
1462
///
1463
/// If `ignore_nulls` is `False`, [Kleene logic] is used to deal with nulls:
1464
/// if the column contains any null values and no `true` values, the output
1465
/// is null.
1466
///
1467
/// [Kleene logic]: https://en.wikipedia.org/wiki/Three-valued_logic
1468
pub fn any(self, ignore_nulls: bool) -> Self {
1469
self.map_unary(BooleanFunction::Any { ignore_nulls })
1470
}
1471
1472
/// Returns whether all values in the column are `true`.
1473
///
1474
/// If `ignore_nulls` is `False`, [Kleene logic] is used to deal with nulls:
1475
/// if the column contains any null values and no `false` values, the output
1476
/// is null.
1477
///
1478
/// [Kleene logic]: https://en.wikipedia.org/wiki/Three-valued_logic
1479
pub fn all(self, ignore_nulls: bool) -> Self {
1480
self.map_unary(BooleanFunction::All { ignore_nulls })
1481
}
1482
1483
#[cfg(feature = "dtype-struct")]
/// Count all unique values and create a struct mapping value to count.
/// (Note that it is better to turn parallel off in the aggregation context).
/// The name of the struct field with the counts is given by the parameter `name`.
pub fn value_counts(self, sort: bool, parallel: bool, name: &str, normalize: bool) -> Self {
    self.map_unary(FunctionExpr::ValueCounts {
        sort,
        parallel,
        name: name.into(),
        normalize,
    })
}

#[cfg(feature = "unique_counts")]
/// Returns a count of the unique values in the order of appearance.
/// This method differs from [`Expr::value_counts`] in that it does not return the
/// values, only the counts and might be faster.
pub fn unique_counts(self) -> Self {
    self.map_unary(FunctionExpr::UniqueCounts)
}
#[cfg(feature = "log")]
/// Compute the logarithm to a given base.
pub fn log(self, base: Expr) -> Self {
    self.map_binary(FunctionExpr::Log, base)
}

#[cfg(feature = "log")]
/// Compute the natural logarithm of all elements plus one in the input array.
pub fn log1p(self) -> Self {
    self.map_unary(FunctionExpr::Log1p)
}

#[cfg(feature = "log")]
/// Calculate the exponential of all elements in the input array.
pub fn exp(self) -> Self {
    self.map_unary(FunctionExpr::Exp)
}

#[cfg(feature = "log")]
/// Compute the entropy as `-sum(pk * log(pk))`.
/// where `pk` are discrete probabilities.
pub fn entropy(self, base: f64, normalize: bool) -> Self {
    self.map_unary(FunctionExpr::Entropy { base, normalize })
}
/// Get the null count of the column/group.
1529
pub fn null_count(self) -> Expr {
1530
self.map_unary(FunctionExpr::NullCount)
1531
}
1532
1533
/// Set this `Series` as `sorted` so that downstream code can use
1534
/// fast paths for sorted arrays.
1535
/// # Warning
1536
/// This can lead to incorrect results if this `Series` is not sorted!!
1537
/// Use with care!
1538
pub fn set_sorted_flag(self, sorted: IsSorted) -> Expr {
1539
// This is `map`. If a column is sorted. Chunks of that column are also sorted.
1540
self.map_unary(FunctionExpr::SetSortedFlag(sorted))
1541
}
1542
1543
#[cfg(feature = "row_hash")]
/// Compute the hash of every element, seeded by `k0`..`k3`.
pub fn hash(self, k0: u64, k1: u64, k2: u64, k3: u64) -> Expr {
    self.map_unary(FunctionExpr::Hash(k0, k1, k2, k3))
}
pub fn to_physical(self) -> Expr {
1550
self.map_unary(FunctionExpr::ToPhysical)
1551
}
1552
1553
pub fn gather_every(self, n: usize, offset: usize) -> Expr {
1554
self.map_unary(FunctionExpr::GatherEvery { n, offset })
1555
}
1556
1557
#[cfg(feature = "reinterpret")]
1558
pub fn reinterpret(self, signed: bool) -> Expr {
1559
self.map_unary(FunctionExpr::Reinterpret(signed))
1560
}
1561
1562
pub fn extend_constant(self, value: Expr, n: Expr) -> Expr {
1563
self.map_ternary(FunctionExpr::ExtendConstant, value, n)
1564
}
1565
1566
#[cfg(feature = "strings")]
/// Get the [`string::StringNameSpace`]
pub fn str(self) -> string::StringNameSpace {
    string::StringNameSpace(self)
}

/// Get the [`binary::BinaryNameSpace`]
pub fn binary(self) -> binary::BinaryNameSpace {
    binary::BinaryNameSpace(self)
}

#[cfg(feature = "temporal")]
/// Get the [`dt::DateLikeNameSpace`]
pub fn dt(self) -> dt::DateLikeNameSpace {
    dt::DateLikeNameSpace(self)
}

/// Get the [`list::ListNameSpace`]
pub fn list(self) -> list::ListNameSpace {
    list::ListNameSpace(self)
}

/// Get the [`name::ExprNameNameSpace`]
pub fn name(self) -> name::ExprNameNameSpace {
    name::ExprNameNameSpace(self)
}

/// Get the [`array::ArrayNameSpace`].
#[cfg(feature = "dtype-array")]
pub fn arr(self) -> array::ArrayNameSpace {
    array::ArrayNameSpace(self)
}

/// Get the [`CategoricalNameSpace`].
#[cfg(feature = "dtype-categorical")]
pub fn cat(self) -> cat::CategoricalNameSpace {
    cat::CategoricalNameSpace(self)
}

/// Get the [`struct_::StructNameSpace`].
#[cfg(feature = "dtype-struct")]
pub fn struct_(self) -> struct_::StructNameSpace {
    struct_::StructNameSpace(self)
}

/// Get the [`meta::MetaNameSpace`]
#[cfg(feature = "meta")]
pub fn meta(self) -> meta::MetaNameSpace {
    meta::MetaNameSpace(self)
}
}
/// Apply a function/closure over multiple columns once the logical plan get executed.
1619
///
1620
/// This function is very similar to [`apply_multiple`], but differs in how it handles aggregations.
1621
///
1622
/// * [`map_multiple`] should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power`
1623
/// * [`apply_multiple`] should be used for operations that work on a group of data. e.g. `sum`, `count`, etc.
1624
///
1625
/// It is the responsibility of the caller that the schema is correct by giving
1626
/// the correct output_type. If None given the output type of the input expr is used.
1627
pub fn map_multiple<F, DT, E>(function: F, expr: E, output_type: DT) -> Expr
1628
where
1629
F: Fn(&mut [Column]) -> PolarsResult<Column> + 'static + Send + Sync,
1630
DT: Fn(&Schema, &[Field]) -> PolarsResult<Field> + 'static + Send + Sync,
1631
E: AsRef<[Expr]>,
1632
{
1633
let input = expr.as_ref().to_vec();
1634
1635
let function = BaseColumnUdf::new(function, output_type);
1636
1637
let options = FunctionOptions::elementwise();
1638
Expr::AnonymousFunction {
1639
input,
1640
function: new_column_udf(function),
1641
options,
1642
fmt_str: Box::new(PlSmallStr::EMPTY),
1643
}
1644
}
1645
1646
/// Apply a function/closure over the groups of multiple columns. This should only be used in a group_by aggregation.
1647
///
1648
/// It is the responsibility of the caller that the schema is correct by giving
1649
/// the correct output_type. If None given the output type of the input expr is used.
1650
///
1651
/// This difference with [`map_multiple`] is that [`apply_multiple`] will create a separate [`Series`] per group.
1652
///
1653
/// * [`map_multiple`] should be used for operations that are independent of groups, e.g. `multiply * 2`, or `raise to the power`
1654
/// * [`apply_multiple`] should be used for operations that work on a group of data. e.g. `sum`, `count`, etc.
1655
pub fn apply_multiple<F, DT, E>(function: F, expr: E, output_type: DT, returns_scalar: bool) -> Expr
1656
where
1657
F: Fn(&mut [Column]) -> PolarsResult<Column> + 'static + Send + Sync,
1658
DT: Fn(&Schema, &[Field]) -> PolarsResult<Field> + 'static + Send + Sync,
1659
E: AsRef<[Expr]>,
1660
{
1661
let input = expr.as_ref().to_vec();
1662
let options = FunctionOptions::groupwise().with_flags(|mut f| {
1663
f.set(FunctionFlags::RETURNS_SCALAR, returns_scalar);
1664
f
1665
});
1666
let function = BaseColumnUdf::new(function, output_type);
1667
Expr::AnonymousFunction {
1668
input,
1669
function: new_column_udf(function),
1670
options,
1671
fmt_str: Box::new(PlSmallStr::EMPTY),
1672
}
1673
}
1674
1675
/// Return the number of rows in the context.
1676
pub fn len() -> Expr {
1677
Expr::Len
1678
}
1679
1680
/// First column in a DataFrame.
1681
pub fn first() -> Selector {
1682
nth(0)
1683
}
1684
1685
/// Last column in a DataFrame.
1686
pub fn last() -> Selector {
1687
nth(-1)
1688
}
1689
1690
/// Nth column in a DataFrame.
1691
pub fn nth(n: i64) -> Selector {
1692
Selector::ByIndex {
1693
indices: [n].into(),
1694
strict: true,
1695
}
1696
}
1697
1698