CoCalc -- queries.rs

GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-lazy/src/tests/queries.rs
⁶⁹³⁹ views
1
#[cfg(feature = "diff")]
2
use polars_core::series::ops::NullBehavior;
3

4
use super::*;
5

6
/// Adds a literal column named `foo` via `with_column` and checks that the
/// collected frame contains it and has the expected width.
#[test]
fn test_lazy_with_column() {
    let with_foo = get_df().lazy().with_column(lit(10).alias("foo"));
    let result = with_foo.collect().unwrap();

    assert_eq!(result.width(), 6);
    assert!(result.column("foo").is_ok());
}
16

17
/// Executes a select + sort query and a negated-filter query on the shared
/// test frame.
#[test]
fn test_lazy_exec() {
    let df = get_df();

    // The output is not inspected, but the query itself must execute
    // successfully; unwrap so that a failure is not silently discarded.
    let _new = df
        .clone()
        .lazy()
        .select([col("sepal_width"), col("variety")])
        .sort(["sepal_width"], Default::default())
        .collect()
        .unwrap();

    // `not(x < 3.5)` must keep only rows where `sepal_width` exceeds 3.4.
    let new = df
        .lazy()
        .filter(not(col("sepal_width").lt(lit(3.5))))
        .collect()
        .unwrap();

    let check = new.column("sepal_width").unwrap().f64().unwrap().gt(3.4);
    assert!(check.all());
}
36

37
/// Selecting the same column twice, once aliased, yields both names in order.
#[test]
fn test_lazy_alias() {
    let projection = [col("sepal_width").alias("petals"), col("sepal_width")];
    let result = get_df().lazy().select(projection).collect().unwrap();

    assert_eq!(result.get_column_names(), &["petals", "sepal_width"]);
}
47

48
/// Unpivots the two sepal columns, keeps only the `sepal_length` rows and
/// checks the shape of the projected result.
#[test]
#[cfg(feature = "pivot")]
fn test_lazy_unpivot() {
    let frame = get_df();

    let unpivot_args = UnpivotArgsDSL {
        on: by_name(["sepal_length", "sepal_width"], true),
        index: by_name(["petal_width", "petal_length"], true),
        variable_name: None,
        value_name: None,
    };

    let only_sepal_length = col("variable").eq(lit("sepal_length"));
    let projection = [col("variable"), col("petal_width"), col("value")];

    let result = frame
        .lazy()
        .unpivot(unpivot_args)
        .filter(only_sepal_length)
        .select(projection)
        .collect()
        .unwrap();

    assert_eq!(result.shape(), (7, 3));
}
69

70
/// `drop_nulls(None)` must remove every row that has a null in any column.
#[test]
fn test_lazy_drop_nulls() {
    let input = df! {
        "foo" => &[Some(1), None, Some(3)],
        "bar" => &[Some(1), Some(2), None]
    }
    .unwrap();

    // Only the first row has no null in either column.
    let expected = df! {
        "foo" => &[Some(1)],
        "bar" => &[Some(1)]
    }
    .unwrap();

    let cleaned = input.lazy().drop_nulls(None).collect().unwrap();
    assert!(cleaned.equals(&expected));
}
86

87
/// Applies a user-defined `map` that multiplies `sepal_width` by 200 and
/// checks the first resulting value.
#[test]
fn test_lazy_udf() {
    // The output field is declared unchanged from the input field.
    let scaled = col("sepal_width").map(|s| Ok(s * 200.0), |_, f| Ok(f.clone()));

    let result = get_df().lazy().select([scaled]).collect().unwrap();
    let first = result.column("sepal_width").unwrap().f64().unwrap().get(0);

    assert_eq!(first, Some(700.0));
}
100

101
/// Exercises `is_null` / `is_not_null` filters and a grouped `min` on the
/// shared test frame.
#[test]
fn test_lazy_is_null() {
    let df = get_df();

    // No row is expected to have a null `sepal_width`.
    let null_rows = df
        .clone()
        .lazy()
        .filter(col("sepal_width").is_null())
        .collect()
        .unwrap();
    assert_eq!(null_rows.height(), 0);

    // Consequently the `is_not_null` filter must keep every row.
    let non_null_rows = df
        .clone()
        .lazy()
        .filter(col("sepal_width").is_not_null())
        .collect()
        .unwrap();
    assert_eq!(non_null_rows.height(), df.height());

    let grouped_min = df
        .lazy()
        .group_by([col("variety")])
        .agg([col("sepal_width").min()])
        .collect()
        .unwrap();
    assert_eq!(grouped_min.shape(), (1, 2));
}
130

131
/// An aggregation changes the schema names; verifies that pushdown still
/// succeeds when selecting an aliased aggregate afterwards.
#[test]
fn test_lazy_pushdown_through_agg() {
    let aggregated = get_df().lazy().group_by([col("variety")]).agg([
        col("sepal_length").min(),
        col("petal_length").min().alias("foo"),
    ]);

    let result = aggregated
        .select([col("foo")])
        // A second selection checks that the optimizer can handle stacked projections.
        .select([col("foo").alias("bar")])
        .collect()
        .unwrap();

    assert_eq!(result.shape(), (1, 1));
    let first_bar = result.column("bar").unwrap().get(0).unwrap();
    assert_eq!(first_bar, AnyValue::Float64(1.3));
}
152

153
/// Shifting an aliased column by two leaves a null in the first slot.
#[test]
fn test_lazy_shift() {
    let shifted = col("sepal_width").alias("foo").shift(lit(2));
    let result = get_df().lazy().select([shifted]).collect().unwrap();

    let first = result.column("foo").unwrap().f64().unwrap().get(0);
    assert_eq!(first, None);
}
163

164
/// Shifting by -1 with a fill value of 5 places the fill value at the end.
#[test]
fn test_shift_and_fill() -> PolarsResult<()> {
    let input = df![
        "a" => [1, 2, 3]
    ]?;

    let result = input
        .lazy()
        .select([col("a").shift_and_fill(lit(-1), lit(5))])
        .collect()?;

    let shifted = result.column("a")?;
    assert_eq!(Vec::from(shifted.i32()?), &[Some(2), Some(3), Some(5)]);
    Ok(())
}
177

178
/// `shift_and_fill` on a boolean column: shifting by 1 with `true` as the
/// fill value puts `true` in the first slot.
#[test]
fn test_shift_and_fill_non_numeric() -> PolarsResult<()> {
    let input = df![
        "bool" => [true, false, true],
    ]?;

    let result = input
        .lazy()
        .select([col("bool").shift_and_fill(1, true)])
        .collect()?;

    let shifted = result.column("bool")?;
    let values = Vec::from(shifted.bool()?);
    assert_eq!(values, &[Some(true), Some(true), Some(false)]);
    Ok(())
}
194

195
/// Combines `when/then/otherwise` expressions with filters on the derived
/// column.
#[test]
fn test_lazy_ternary_and_predicates() {
    let df = get_df();

    // Only checks that this runs. It used to fail because `is_not_null`
    // changes the schema name, so the root column has to be checked instead.
    let not_null_query = df
        .clone()
        .lazy()
        .with_column(lit(3).alias("foo"))
        .filter(col("foo").is_not_null());
    let _new = not_null_query.collect().unwrap();

    // The `then` branch is an integer on purpose, to exercise type coercion
    // against the float `otherwise` branch.
    let ternary = when(col("sepal_length").lt(lit(5.0)))
        .then(lit(3))
        .otherwise(col("sepal_width"))
        .alias("foo");

    let result = df
        .lazy()
        .with_column(ternary)
        .filter(col("foo").gt(lit(3.0)))
        .collect()
        .unwrap();

    let expected_length = Column::new("sepal_length".into(), &[5.1f64, 5.0, 5.4]);
    assert_eq!(result.column("sepal_length").unwrap(), &expected_length);
    assert_eq!(result.shape(), (3, 6));
}
227

228
/// An equality comparison against a literal yields a boolean column whose
/// sum counts the matching rows.
#[test]
fn test_lazy_binary_ops() {
    let input = df!("a" => &[1, 2, 3, 4, 5]).unwrap();

    let result = input
        .lazy()
        .select([col("a").eq(lit(2)).alias("foo")])
        .collect()
        .unwrap();

    let hit_count = result
        .column("foo")
        .unwrap()
        .as_materialized_series()
        .sum::<i32>()
        .unwrap();
    assert_eq!(hit_count, 1);
}
245

246
/// Doubles column `a` through a UDF, then filters on the doubled values and
/// reorders the columns; the filter is expected to remove every row.
#[test]
fn test_lazy_query_2() {
    let doubled_a = col("a").map(|s| Ok(s * 2), |_, f| Ok(f.clone()));

    let query = load_df()
        .lazy()
        .with_column(doubled_a)
        .filter(col("a").lt(lit(2)))
        .select([col("b"), col("a")]);

    let result = query.collect().unwrap();
    assert_eq!(result.shape(), (0, 2));
}
258

259
/// Checks that an aggregation does not change the schema of the scan: the
/// query only has to execute without error.
#[test]
#[cfg(feature = "csv")]
fn test_lazy_query_3() {
    let query = scan_foods_csv()
        .group_by([col("calories")])
        .agg([col("fats_g").max()]);

    let _ = query.collect().unwrap();
}
269

270
/// Computes per-`uid` differences of `cumcases` inside a group-by, explodes
/// the lists and joins the result back onto the original frame.
#[test]
fn test_lazy_query_4() -> PolarsResult<()> {
    let input = df! {
        "uid" => [0, 0, 0, 1, 1, 1],
        "day" => [1, 2, 3, 1, 2, 3],
        "cumcases" => [10, 12, 15, 25, 30, 41]
    }
    .unwrap();

    let base_df = input.lazy();

    // Difference between each value and its predecessor within the group.
    let diff_cases = col("cumcases")
        .apply(|s: Column| &s - &(s.shift(1)), |_, f| Ok(f.clone()))
        .alias("diff_cases");

    let per_uid = base_df
        .clone()
        .group_by([col("uid")])
        .agg([col("day").alias("day"), diff_cases])
        .explode(by_name(["day", "diff_cases"], true));

    let joined = per_uid
        .join(
            base_df,
            [col("uid"), col("day")],
            [col("uid"), col("day")],
            JoinType::Inner.into(),
        )
        .collect()
        .unwrap();

    let diffs = Vec::from(joined.column("diff_cases").unwrap().i32().unwrap());
    assert_eq!(diffs, &[None, Some(2), Some(3), None, Some(5), Some(11)]);

    Ok(())
}
306

307
/// Takes the first two `day` values per `uid` group and checks the length of
/// every resulting list.
///
/// If this one fails, the list builder probably does not handle offsets.
#[test]
fn test_lazy_query_5() {
    let df = df! {
        "uid" => [0, 0, 0, 1, 1, 1],
        "day" => [1, 2, 4, 1, 2, 3],
        "cumcases" => [10, 12, 15, 25, 30, 41]
    }
    .unwrap();

    let out = df
        .lazy()
        .group_by([col("uid")])
        .agg([col("day").head(Some(2))])
        .collect()
        .unwrap();

    // One output row per distinct `uid`.
    assert_eq!(out.height(), 2);

    // Inspect every group's list rather than the first one twice: the list
    // that does not start at offset zero is the one that exercises offset
    // handling. Both groups have three rows, so `head(2)` yields two values
    // regardless of the order in which the groups are emitted.
    let day_lists = out.select_at_idx(1).unwrap().list().unwrap();
    for idx in 0..out.height() {
        let s = day_lists.get_as_series(idx).unwrap();
        assert_eq!(s.len(), 2);
    }
}
340

341
/// Regression test for https://github.com/pola-rs/polars/issues/842:
/// builds one `when/then/otherwise` expression per column and filters on one
/// of the results.
#[test]
#[cfg(feature = "is_in")]
fn test_lazy_query_8() -> PolarsResult<()> {
    let input = df![
        "A" => [1, 2, 3],
        "B" => [1, 2, 3],
        "C" => [1, 2, 3],
        "D" => [1, 2, 3],
        "E" => [1, 2, 3]
    ]?;

    // Each column is replaced by `A` where its value occurs in `E`, else null.
    let selection = ["A", "B", "C", "D", "E"]
        .iter()
        .map(|&name| {
            when(col(name).is_in(col("E"), false))
                .then(col("A"))
                .otherwise(Null {}.lit())
                .alias(name)
        })
        .collect::<Vec<_>>();

    let result = input
        .lazy()
        .select(selection)
        .filter(col("D").gt(lit(1)))
        .collect()?;

    assert_eq!(result.shape(), (2, 5));
    Ok(())
}
371

372
/// Regression test for https://github.com/pola-rs/polars/issues/958:
/// joins sales onto cities, sums the amounts per country and sorts the sums.
#[test]
fn test_lazy_query_9() -> PolarsResult<()> {
    let cities = df![
        "Cities.City" => ["Moscow", "Berlin", "Paris", "Hamburg", "Lyon", "Novosibirsk"],
        "Cities.Population" => [11.92, 3.645, 2.161, 1.841, 0.513, 1.511],
        "Cities.Country" => ["Russia", "Germany", "France", "Germany", "France", "Russia"]
    ]?;

    let sales = df![
        "Sales.City" => [
            "Moscow", "Berlin", "Paris",
            "Moscow", "Berlin", "Paris",
            "Moscow", "Berlin", "Paris",
        ],
        "Sales.Item" => [
            "Item A", "Item A", "Item A",
            "Item B", "Item B", "Item B",
            "Item C", "Item C", "Item C",
        ],
        "Sales.Amount" => [
            200, 180, 100,
            3, 30, 20,
            90, 130, 125,
        ]
    ]?;

    let joined = sales.lazy().join(
        cities.lazy(),
        [col("Sales.City")],
        [col("Cities.City")],
        JoinType::Inner.into(),
    );

    let totals = joined
        .group_by([col("Cities.Country")])
        .agg([col("Sales.Amount").sum().alias("sum")])
        .sort(["sum"], Default::default())
        .collect()?;

    let sums: Vec<_> = totals.column("sum")?.i32()?.into_no_null_iter().collect();
    assert_eq!(sums, &[245, 293, 340]);
    Ok(())
}
411

412
/// Subtracting two datetime columns yields a duration column, both when the
/// operands share a time unit and when they differ.
#[test]
#[cfg(all(
    feature = "temporal",
    feature = "dtype-datetime",
    feature = "dtype-date",
    feature = "dtype-duration"
))]
fn test_lazy_query_10() {
    use chrono::Duration as ChronoDuration;

    let date = NaiveDate::from_ymd_opt(2021, 3, 5).unwrap();
    // Builds a timestamp on `date` at the given full hour.
    let at_hour =
        |hour: u32| NaiveDateTime::new(date, NaiveTime::from_hms_opt(hour, 0, 0).unwrap());

    // Case 1: both operands are in nanoseconds.
    let x = DatetimeChunked::from_naive_datetime(
        "x".into(),
        [at_hour(12), at_hour(13), at_hour(14)],
        TimeUnit::Nanoseconds,
    )
    .into_column();
    let y = DatetimeChunked::from_naive_datetime(
        "y".into(),
        [at_hour(11), at_hour(11), at_hour(11)],
        TimeUnit::Nanoseconds,
    )
    .into_column();

    let same_unit = DataFrame::new(vec![x, y])
        .unwrap()
        .lazy()
        .select(&[(col("x") - col("y")).alias("z")])
        .collect()
        .unwrap();

    let z = DurationChunked::from_duration(
        "z".into(),
        [
            ChronoDuration::try_hours(1).unwrap(),
            ChronoDuration::try_hours(2).unwrap(),
            ChronoDuration::try_hours(3).unwrap(),
        ],
        TimeUnit::Nanoseconds,
    )
    .into_column();
    assert!(same_unit.column("z").unwrap().equals(&z));

    // Case 2: `x` is in milliseconds, `y` in nanoseconds.
    let x = DatetimeChunked::from_naive_datetime(
        "x".into(),
        [at_hour(2), at_hour(3), at_hour(4)],
        TimeUnit::Milliseconds,
    )
    .into_column();
    let y = DatetimeChunked::from_naive_datetime(
        "y".into(),
        [at_hour(1), at_hour(1), at_hour(1)],
        TimeUnit::Nanoseconds,
    )
    .into_column();

    let mixed_unit = DataFrame::new(vec![x, y])
        .unwrap()
        .lazy()
        .select(&[(col("x") - col("y")).alias("z")])
        .collect()
        .unwrap();

    // The result is compared against the same durations cast to milliseconds.
    let z_millis = z
        .cast(&DataType::Duration(TimeUnit::Milliseconds))
        .unwrap();
    assert!(mixed_unit.column("z").unwrap().equals(&z_millis));
}
491

492
/// Checks that predicate pushdown does not interfere with shifted data: the
/// shifts are defined before the filter and their values must be preserved.
#[test]
#[cfg(all(
    feature = "temporal",
    feature = "dtype-date",
    feature = "dtype-datetime"
))]
fn test_lazy_query_7() {
    let date = NaiveDate::from_ymd_opt(2021, 3, 5).unwrap();
    // Six timestamps, one minute apart, starting at 12:00.
    let dates = [0, 1, 2, 3, 4, 5]
        .map(|minute| NaiveDateTime::new(date, NaiveTime::from_hms_opt(12, minute, 0).unwrap()));
    let data = vec![Some(1.), Some(2.), Some(3.), Some(4.), None, None];

    let date_column =
        DatetimeChunked::from_naive_datetime("date".into(), dates, TimeUnit::Nanoseconds)
            .into_column();
    let df = DataFrame::new(vec![date_column, Column::new("data".into(), data)]).unwrap();

    let cutoff = NaiveDateTime::new(date, NaiveTime::from_hms_opt(12, 2, 0).unwrap());

    let out = df
        .lazy()
        .with_column(col("data").shift(lit(-1)).alias("output"))
        .with_column(col("output").shift(lit(2)).alias("shifted"))
        .filter(col("date").gt(lit(cutoff)))
        .collect()
        .unwrap();

    let shifted_sum = out
        .column("shifted")
        .unwrap()
        .as_materialized_series()
        .sum::<f64>()
        .unwrap();

    // The sum of the surviving shifted values must be 7 within a small tolerance.
    let deviation = shifted_sum - 7.0;
    assert!(deviation > -0.01 && deviation < 0.01);
}
535

536
/// A shift followed by `fill_null` replaces the null introduced by the shift.
#[test]
fn test_lazy_shift_and_fill_all() {
    let values = &[1, 2, 3];
    let input = DataFrame::new(vec![Column::new("data".into(), values)]).unwrap();

    let shifted_filled = col("data").shift(lit(1)).fill_null(lit(0)).alias("output");
    let result = input.lazy().with_column(shifted_filled).collect().unwrap();

    let output = Vec::from(result.column("output").unwrap().i32().unwrap());
    assert_eq!(output, vec![Some(0), Some(1), Some(2)]);
}
550

551
/// Checks that the predicate pushdown optimization does not fail on a query
/// that shifts a column without any filter.
#[test]
fn test_lazy_shift_operation_no_filter() {
    let input = df! {
        "a" => &[1, 2, 3],
        "b" => &[1, 2, 3]
    }
    .unwrap();

    let query = input
        .lazy()
        .with_column(col("b").shift(lit(1)).alias("output"));

    // Only successful execution matters here.
    query.collect().unwrap();
}
564

565
/// Tests that an expression containing literals is simplified:
/// `1.0 + 1.0 + col` must have the literal `2.0` as the left operand.
#[test]
fn test_simplify_expr() {
    let plan = get_df()
        .lazy()
        .select(&[lit(1.0) + lit(1.0) + col("sepal_width")])
        .logical_plan;

    let mut lp_arena = Arena::new();
    let mut expr_arena = Arena::new();

    #[allow(const_item_mutation)]
    let lp_top = to_alp(
        plan,
        &mut expr_arena,
        &mut lp_arena,
        &mut OptFlags::SIMPLIFY_EXPR,
    )
    .unwrap();

    // Walk root -> first projected expression -> left operand.
    let left_is_folded_literal = match lp_arena.get(lp_top) {
        IR::Select { expr, .. } => match expr_arena.get(expr[0].node()) {
            AExpr::BinaryExpr { left, .. } => matches!(
                expr_arena.get(*left),
                &AExpr::Literal(LiteralValue::Dyn(DynLiteralValue::Float(2.0)))
            ),
            _ => false,
        },
        _ => false,
    };
    assert!(left_is_folded_literal);
}
592

593
/// Wildcard selection in a plain projection and inside group-by aggregations.
#[test]
fn test_lazy_wildcard() {
    let df = load_df();

    let everything = df.clone().lazy().select([col("*")]).collect().unwrap();
    assert_eq!(everything.shape(), (5, 3));

    let aggregations = [
        col("*").sum().name().suffix(""),
        col("*").first().name().suffix("_first"),
    ];
    let grouped = df
        .lazy()
        .group_by([col("b")])
        .agg(aggregations)
        .collect()
        .unwrap();

    // The group key `b` should be excluded from the wildcard aggregations.
    assert_eq!(grouped.shape(), (3, 5));
}
610

611
/// The lazy `reverse` must agree with the eager `DataFrame::reverse`.
#[test]
fn test_lazy_reverse() {
    let df = load_df();

    let lazily_reversed = df.clone().lazy().reverse().collect().unwrap();
    let eagerly_reversed = df.reverse();

    assert!(lazily_reversed.equals_missing(&eagerly_reversed));
}
623

624
/// Frame-level `fill_null` replaces nulls in every column and keeps the
/// column names.
#[test]
fn test_lazy_fill_null() {
    let input = df! {
        "a" => &[None, Some(2.0)],
        "b" => &[Some(1.0), None]
    }
    .unwrap();
    let expected = df! {
        "a" => &[Some(10.0), Some(2.0)],
        "b" => &[Some(1.0), Some(10.0)]
    }
    .unwrap();

    let filled = input.lazy().fill_null(lit(10.0)).collect().unwrap();

    assert!(filled.equals(&expected));
    assert_eq!(filled.get_column_names(), vec!["a", "b"]);
}
640

641
/// Two stacked projections, the second referring to the alias introduced by
/// the first, must execute without error.
#[test]
fn test_lazy_double_projection() {
    let input = df! {
        "foo" => &[1, 2, 3]
    }
    .unwrap();

    let query = input
        .lazy()
        .select([col("foo").alias("bar")])
        .select([col("bar")]);

    query.collect().unwrap();
}
653

654
/// Multiplying an integer column by a float column must insert a cast on the
/// integer side only.
///
/// The test fails if the converted plan does not have the expected shape
/// (a `Select` root whose first expression is a binary expression), so that
/// the cast assertions cannot be skipped silently.
#[test]
fn test_type_coercion() {
    let df = df! {
        "foo" => &[1, 2, 3],
        "bar" => &[1.0, 2.0, 3.0]
    }
    .unwrap();

    let lp = df.lazy().select([col("foo") * col("bar")]).logical_plan;

    let mut expr_arena = Arena::new();
    let mut lp_arena = Arena::new();
    let lp_top = to_alp(lp, &mut expr_arena, &mut lp_arena, &mut OptFlags::default()).unwrap();

    match lp_arena.get(lp_top) {
        IR::Select { expr, .. } => match expr_arena.get(expr[0].node()) {
            AExpr::BinaryExpr { left, right, .. } => {
                // foo is an integer and has to be cast
                assert!(matches!(expr_arena.get(*left), AExpr::Cast { .. }));
                // bar is already float, does not have to be coerced
                assert!(matches!(expr_arena.get(*right), AExpr::Column { .. }));
            },
            _ => panic!("expected the projected expression to be a binary expression"),
        },
        // Previously a non-`Select` root made the test pass without checking anything.
        _ => panic!("expected a `Select` node at the root of the plan"),
    }
}
678

679
/// Grouped mean on an in-memory frame, followed by a grouped list
/// aggregation on the foods CSV scan.
#[test]
#[cfg(feature = "csv")]
fn test_lazy_partition_agg() {
    let input = df! {
        "foo" => &[1, 1, 2, 2, 3],
        "bar" => &[1.0, 1.0, 2.0, 2.0, 3.0]
    }
    .unwrap();

    let means = input
        .lazy()
        .group_by([col("foo")])
        .agg([col("bar").mean()])
        .sort(["foo"], Default::default())
        .collect()
        .unwrap();

    let bar_means = Vec::from(means.column("bar").unwrap().f64().unwrap());
    assert_eq!(bar_means, &[Some(1.0), Some(2.0), Some(3.0)]);

    let per_category = scan_foods_csv()
        .group_by([col("category")])
        .agg([col("calories")])
        .sort(["category"], Default::default())
        .collect()
        .unwrap();

    // Calories list of the first category after sorting by category.
    let calorie_lists = per_category.select_at_idx(1).unwrap();
    let first_category = calorie_lists.list().unwrap().get_as_series(0).unwrap();
    let first_calories = Vec::from(first_category.i64().unwrap());

    assert_eq!(
        first_calories,
        &[
            Some(60),
            Some(30),
            Some(50),
            Some(30),
            Some(60),
            Some(130),
            Some(50),
        ]
    );
}
723

724
/// A user-defined `apply` inside a group-by aggregation, returning the group
/// length as a `UInt32` column, must execute without error.
#[test]
fn test_lazy_group_by_apply() {
    let group_len = col("cars").apply(
        |s: Column| Ok(Column::new("".into(), &[s.len() as u32])),
        |_, f| Ok(Field::new(f.name().clone(), DataType::UInt32)),
    );

    let query = fruits_cars()
        .lazy()
        .group_by([col("fruits")])
        .agg([group_len]);

    query.collect().unwrap();
}
737

738
/// `shift_and_fill` with an aggregated fill expression must leave no nulls,
/// for both shift directions and for the frame-level variant.
#[test]
fn test_lazy_shift_and_fill() {
    let df = df! {
        "A" => &[1, 2, 3, 4, 5],
        "B" => &[5, 4, 3, 2, 1]
    }
    .unwrap();

    // Positive shift.
    let forward = df
        .clone()
        .lazy()
        .with_column(col("A").shift_and_fill(lit(2), col("B").mean()))
        .collect()
        .unwrap();
    assert_eq!(forward.column("A").unwrap().null_count(), 0);

    // Shift from the other side.
    let backward = df
        .clone()
        .lazy()
        .with_column(col("A").shift_and_fill(lit(-2), col("B").mean()))
        .collect()
        .unwrap();
    assert_eq!(backward.column("A").unwrap().null_count(), 0);

    // Frame-level shift, filled with the standard deviation of `B`.
    let whole_frame = df
        .lazy()
        .shift_and_fill(lit(-1), col("B").std(1))
        .collect()
        .unwrap();
    assert_eq!(whole_frame.column("A").unwrap().null_count(), 0);
}
769

770
/// Grouped mean over a column containing a null, sorted by the aggregate.
#[test]
fn test_lazy_group_by() {
    let input = df! {
        "a" => &[Some(1.0), None, Some(3.0), Some(4.0), Some(5.0)],
        "groups" => &["a", "a", "b", "c", "c"]
    }
    .unwrap();

    let means = input
        .lazy()
        .group_by([col("groups")])
        .agg([col("a").mean()])
        .sort(["a"], Default::default())
        .collect()
        .unwrap();

    let smallest_mean = means.column("a").unwrap().f64().unwrap().get(0);
    assert_eq!(smallest_mean, Some(1.0));
}
788

789
/// `tail` on a lazy frame must execute without error.
#[test]
fn test_lazy_tail() {
    let input = df! {
        "A" => &[1, 2, 3, 4, 5],
        "B" => &[5, 4, 3, 2, 1]
    }
    .unwrap();

    let last_three = input.lazy().tail(3);
    let _out = last_three.collect().unwrap();
}
799

800
/// Sorting inside a group-by aggregation: the first and last element of each
/// sorted group.
#[test]
fn test_lazy_group_by_sort() {
    let df = df! {
        "a" => ["a", "b", "a", "b", "b", "c"],
        "b" => [1, 2, 3, 4, 5, 6]
    }
    .unwrap();

    // Smallest value per group.
    let firsts = df
        .clone()
        .lazy()
        .group_by([col("a")])
        .agg([col("b").sort(Default::default()).first()])
        .collect()
        .unwrap()
        .sort(["a"], Default::default())
        .unwrap();
    let first_values = Vec::from(firsts.column("b").unwrap().i32().unwrap());
    assert_eq!(first_values, [Some(1), Some(2), Some(6)]);

    // Largest value per group.
    let lasts = df
        .lazy()
        .group_by([col("a")])
        .agg([col("b").sort(Default::default()).last()])
        .collect()
        .unwrap()
        .sort(["a"], Default::default())
        .unwrap();
    let last_values = Vec::from(lasts.column("b").unwrap().i32().unwrap());
    assert_eq!(last_values, [Some(3), Some(5), Some(6)]);
}
837

838
/// `sort_by` another column (descending) inside a group-by, then take the
/// first element of each group.
#[test]
fn test_lazy_group_by_sort_by() {
    let input = df! {
        "a" => ["a", "a", "a", "b", "b", "c"],
        "b" => [1, 2, 3, 4, 5, 6],
        "c" => [6, 1, 4, 3, 2, 1]
    }
    .unwrap();

    // Per group: the `b` value belonging to the largest `c`.
    let b_at_max_c = col("b")
        .sort_by(
            [col("c")],
            SortMultipleOptions::default().with_order_descending(true),
        )
        .first();

    let result = input
        .lazy()
        .group_by([col("a")])
        .agg([b_at_max_c])
        .collect()
        .unwrap()
        .sort(["a"], Default::default())
        .unwrap();

    let values = Vec::from(result.column("b").unwrap().i32().unwrap());
    assert_eq!(values, [Some(1), Some(4), Some(6)]);
}
866

867
/// Tests that a cast of an aggregate runs in group-by context.
#[test]
#[cfg(feature = "dtype-datetime")]
fn test_lazy_group_by_cast() {
    let input = df! {
        "a" => ["a", "a", "a", "b", "b", "c"],
        "b" => [1, 2, 3, 4, 5, 6]
    }
    .unwrap();

    let mean_as_datetime = col("b")
        .mean()
        .cast(DataType::Datetime(TimeUnit::Nanoseconds, None));

    // Only successful execution is checked.
    let _out = input
        .lazy()
        .group_by([col("a")])
        .agg([mean_as_datetime])
        .collect()
        .unwrap();
}
886

887
/// Tests that a binary expression on an aggregate runs in group-by context
/// and produces twice the group mean.
#[test]
fn test_lazy_group_by_binary_expr() {
    let input = df! {
        "a" => ["a", "a", "a", "b", "b", "c"],
        "b" => [1, 2, 3, 4, 5, 6]
    }
    .unwrap();

    let doubled_mean = col("b").mean() * lit(2);

    let result = input
        .lazy()
        .group_by([col("a")])
        .agg([doubled_mean])
        .sort(["a"], Default::default())
        .collect()
        .unwrap();

    let values = Vec::from(result.column("b").unwrap().f64().unwrap());
    assert_eq!(values, [Some(4.0), Some(9.0), Some(12.0)]);
}
908

909
/// Tests that filters work in group-by context and that the aggregations can
/// deal with empty sets.
#[test]
fn test_lazy_group_by_filter() -> PolarsResult<()> {
    let input = df! {
        "a" => ["a", "a", "a", "b", "b", "c"],
        "b" => [1, 2, 3, 4, 5, 6]
    }?;

    // Predicate that only matches rows of group "a".
    let in_group_a = || col("a").eq(lit("a"));

    let aggregations = [
        col("b").filter(in_group_a()).sum().alias("b_sum"),
        col("b").filter(in_group_a()).first().alias("b_first"),
        // "e" does not occur in column `a`, so this filter is empty in every group.
        col("b")
            .filter(col("a").eq(lit("e")))
            .mean()
            .alias("b_mean"),
        col("b").filter(in_group_a()).last().alias("b_last"),
    ];

    let out = input
        .lazy()
        .group_by([col("a")])
        .agg(aggregations)
        .sort(["a"], SortMultipleOptions::default())
        .collect()?;

    let b_sum = Vec::from(out.column("b_sum").unwrap().i32().unwrap());
    assert_eq!(b_sum, [Some(6), Some(0), Some(0)]);

    let b_first = Vec::from(out.column("b_first").unwrap().i32().unwrap());
    assert_eq!(b_first, [Some(1), None, None]);

    let b_mean = Vec::from(out.column("b_mean").unwrap().f64().unwrap());
    assert_eq!(b_mean, [None, None, None]);

    let b_last = Vec::from(out.column("b_last").unwrap().i32().unwrap());
    assert_eq!(b_last, [Some(3), None, None]);

    Ok(())
}
959

960
/// This query failed when projection pushdown was enabled: a self left-join
/// of two identically aliased projections followed by a single-column select.
#[test]
fn test_group_by_projection_pd_same_column() -> PolarsResult<()> {
    // Builds a fresh lazy frame with both columns renamed.
    let make_lf = || {
        let df = df![
            "col1" => ["a", "ab", "abc"],
            "col2" => [1, 2, 3]
        ]
        .unwrap();

        df.lazy()
            .select([col("col1").alias("foo"), col("col2").alias("bar")])
    };

    let left = make_lf();
    let right = make_lf();

    let out = left
        .left_join(right, col("foo"), col("foo"))
        .select([col("bar")])
        .collect()?;

    let bar = out.column("bar")?.i32()?;
    assert_eq!(Vec::from(bar), &[Some(1), Some(2), Some(3)]);

    Ok(())
}
985

986
/// Gets the two largest values per group in two different ways and checks
/// that both agree.
///
/// Expected:
///
/// | group | values |
/// |-------|--------|
/// | 1     | 1      |
/// | 2     | 6, 5   |
/// | 3     | 9, 8   |
#[test]
fn test_group_by_sort_slice() -> PolarsResult<()> {
    let df = df![
        "groups" => [1, 2, 2, 3, 3, 3],
        "vals" => [1, 5, 6, 3, 9, 8]
    ]?;

    // Variant 1: sort the whole frame descending first, then take the head per group.
    let presorted = df
        .clone()
        .lazy()
        .sort(
            ["vals"],
            SortMultipleOptions::default().with_order_descending(true),
        )
        .group_by([col("groups")])
        .agg([col("vals").head(Some(2)).alias("foo")])
        .sort(["groups"], Default::default())
        .collect()?;

    // Variant 2: sort descending inside the aggregation.
    let top_two_in_group = col("vals")
        .sort(SortOptions::default().with_order_descending(true))
        .head(Some(2))
        .alias("foo");
    let sorted_in_agg = df
        .lazy()
        .group_by([col("groups")])
        .agg([top_two_in_group])
        .sort(["groups"], Default::default())
        .collect()?;

    assert!(presorted.column("foo")?.equals(sorted_in_agg.column("foo")?));
    Ok(())
}
1025

1026
/// Cumulative sum inside a group-by restarts for every group.
#[test]
#[cfg(feature = "cum_agg")]
fn test_group_by_cum_sum() -> PolarsResult<()> {
    let input = df![
        "groups" => [1, 2, 2, 3, 3, 3],
        "vals" => [1, 5, 6, 3, 9, 8]
    ]?;

    let result = input
        .lazy()
        .group_by([col("groups")])
        .agg([col("vals").cum_sum(false)])
        .sort(["groups"], Default::default())
        .collect()?;

    // Flatten the per-group lists to compare against one flat sequence.
    let flattened = result.column("vals")?.explode(false)?;
    let actual = Vec::from(flattened.i32()?);
    let expected = vec![Some(1), Some(5), Some(11), Some(3), Some(12), Some(20)];
    assert_eq!(actual, expected);

    Ok(())
}
1052

1053
/// `arg_sort_by` over several columns with mixed sort directions.
#[test]
#[cfg(feature = "range")]
fn test_arg_sort_multiple() -> PolarsResult<()> {
    let df = df![
        "int" => [1, 2, 3, 1, 2],
        "flt" => [3.0, 2.0, 1.0, 2.0, 1.0],
        "str" => ["a", "a", "a", "b", "b"]
    ]?;

    // First key descending, second key ascending.
    let mixed_order =
        || SortMultipleOptions::default().with_order_descending_multi([true, false]);

    let by_int_flt = df
        .clone()
        .lazy()
        .select([arg_sort_by([col("int"), col("flt")], mixed_order())])
        .collect()?;

    let indices = Vec::from(by_int_flt.column("int")?.idx()?);
    let expected = vec![Some(2), Some(4), Some(1), Some(3), Some(0)];
    assert_eq!(indices, expected);

    // Only checks that sorting on a string key runs.
    let _out = df
        .lazy()
        .select([arg_sort_by([col("str"), col("flt")], mixed_order())])
        .collect()?;
    Ok(())
}
1090

1091
/// Exploding two list columns produced by one aggregation restores the
/// original number of rows.
#[test]
fn test_multiple_explode() -> PolarsResult<()> {
    let frame = df![
        "a" => [0, 1, 2, 0, 2],
        "b" => [5, 4, 3, 2, 1],
        "c" => [2, 3, 4, 1, 5]
    ]?;

    let list_columns = [col("b").alias("b_list"), col("c").alias("c_list")];
    let exploded = frame
        .lazy()
        .group_by([col("a")])
        .agg(list_columns)
        .explode(by_name(["c_list", "b_list"], true))
        .collect()?;

    assert_eq!(exploded.shape(), (5, 3));

    Ok(())
}
1109

1110
/// A filter may reference both an aliased derived column and its source column.
#[test]
fn test_filter_and_alias() -> PolarsResult<()> {
    let frame = df![
        "a" => [0, 1, 2, 0, 2]
    ]?;

    let squared = col("a").pow(2.0).alias("a_squared");
    let predicate = col("a_squared").gt(lit(1)).and(col("a").gt(lit(1)));
    let result = frame
        .lazy()
        .with_column(squared)
        .filter(predicate)
        .collect()?;

    let expected = df![
        "a" => [2, 2],
        "a_squared" => [4.0, 4.0]
    ]?;
    assert!(result.equals(&expected));
    Ok(())
}
1129

1130
/// Filtering on a literal `true` keeps every row.
#[test]
fn test_filter_lit() {
    // see https://github.com/pola-rs/polars/issues/790
    // failed due to broadcasting filters and splitting threads.
    let letters = (0..100).map(|i| ('A'..='Z').nth(i % 26).unwrap().to_string());
    let column = Series::from_iter(letters).into_column();
    let frame = DataFrame::new([column].into()).unwrap();

    let result = frame.lazy().filter(lit(true)).collect().unwrap();
    assert_eq!(result.shape(), (100, 1));
}
1141

1142
/// A `when/then/otherwise` whose `then` branch is a null literal yields nulls
/// only where the predicate matched.
#[test]
fn test_ternary_null() -> PolarsResult<()> {
    let frame = df![
        "a" => ["a", "b", "c"]
    ]?;

    let null_where_c = when(col("a").eq(lit("c")))
        .then(Null {}.lit())
        .otherwise(col("a"))
        .alias("foo");
    let result = frame.lazy().select([null_where_c]).collect()?;

    let null_mask: Vec<_> = result.column("foo")?.is_null().into_iter().collect();
    assert_eq!(null_mask, &[Some(false), Some(false), Some(true)]);
    Ok(())
}
1162

1163
/// Forward fill evaluated as a window expression with `WindowMapping::Join`.
#[test]
fn test_fill_forward() -> PolarsResult<()> {
    let frame = df![
        "a" => ["a", "b", "a"],
        "b" => [Some(1), None, None]
    ]?;

    let filled = col("b")
        .fill_null_with_strategy(FillNullStrategy::Forward(FillNullLimit::None))
        .over_with_options(Some([col("a")]), None, WindowMapping::Join)?;
    let result = frame.lazy().select([filled]).collect()?;
    let lists = result.column("b")?.list()?;

    // Rows 0 and 2 hold the list [1, 1].
    for row in [0, 2] {
        let values: Series = lists.get_as_series(row).unwrap();
        assert!(values.equals(&Series::new("b".into(), &[1, 1])));
    }

    // Row 1 holds a list containing exactly one null.
    let values: Series = lists.get_as_series(1).unwrap();
    assert_eq!(values.null_count(), 1);
    Ok(())
}
1186

1187
/// Cross joining a 3-row frame with a 2-row frame yields 6 rows and all 4 columns.
#[cfg(feature = "cross_join")]
#[test]
fn test_cross_join() -> PolarsResult<()> {
    let left = df![
        "a" => ["a", "b", "a"],
        "b" => [Some(1), None, None]
    ]?;

    let right = df![
        "a" => [1, 2],
        "b" => [None, Some(12)]
    ]?;

    let joined = left.lazy().cross_join(right.lazy(), None).collect()?;
    assert_eq!(joined.shape(), (6, 4));
    Ok(())
}
1204

1205
/// Selecting a column and a literal from an empty frame yields empty columns.
#[test]
fn test_select_empty_df() -> PolarsResult<()> {
    // https://github.com/pola-rs/polars/issues/1056
    let frame = df![
    "a" => [1, 2, 3],
    "b" => [1, 2, 3]
    ]?;

    // No row has a == 0, so the filter leaves an empty frame behind.
    let matches_nothing = col("a").eq(lit(0));
    let result = frame
        .lazy()
        .filter(matches_nothing)
        .select([col("a"), lit(1).alias("c")])
        .collect()?;

    assert_eq!(result.column("a")?.len(), 0);
    assert_eq!(result.column("c")?.len(), 0);

    Ok(())
}
1224

1225
/// `name().keep()` restores the root column name even after an alias.
#[test]
fn test_keep_name() -> PolarsResult<()> {
    let frame = df![
    "a" => [1, 2, 3],
    "b" => [1, 2, 3]
    ]?;

    let kept_a = col("a").alias("bar").name().keep();
    let kept_b = col("b").alias("bar").name().keep();
    let result = frame.lazy().select([kept_a, kept_b]).collect()?;

    assert_eq!(result.get_column_names(), &["a", "b"]);
    Ok(())
}
1243

1244
/// Excluding a column from the `all()` selector drops only that column.
#[test]
fn test_exclude() -> PolarsResult<()> {
    let frame = df![
    "a" => [1, 2, 3],
    "b" => [1, 2, 3],
    "c" => [1, 2, 3]
    ]?;

    let everything_but_b = all().exclude_cols(["b"]).as_expr();
    let result = frame.lazy().select([everything_but_b]).collect()?;

    assert_eq!(result.get_column_names(), &["a", "c"]);
    Ok(())
}
1260

1261
/// A column name written as an anchored regex selects every matching column.
#[test]
#[cfg(feature = "regex")]
fn test_regex_selection() -> PolarsResult<()> {
    let frame = df![
    "anton" => [1, 2, 3],
    "arnold schwars" => [1, 2, 3],
    "annie" => [1, 2, 3]
    ]?;

    let pattern = col("^a.*o.*$");
    let result = frame.lazy().select([pattern]).collect()?;

    assert_eq!(result.get_column_names(), &["anton", "arnold schwars"]);
    Ok(())
}
1275

1276
/// `sort_by` on several key columns, in a select context and inside stable
/// group-by aggregations.
#[test]
fn test_sort_by() -> PolarsResult<()> {
    let frame = df![
        "a" => [1, 2, 3, 4, 5],
        "b" => [1, 1, 1, 2, 2],
        "c" => [2, 3, 1, 2, 1]
    ]?;

    // Every scenario below is expected to produce this ordering of "a".
    let expected: [Option<i32>; 5] = [Some(3), Some(1), Some(2), Some(5), Some(4)];

    // evaluate
    let selected = frame
        .clone()
        .lazy()
        .select([col("a").sort_by([col("b"), col("c")], SortMultipleOptions::default())])
        .collect()?;

    let sorted = selected.column("a")?;
    assert_eq!(Vec::from(sorted.i32().unwrap()), &expected);

    // aggregate
    let aggregated = frame
        .clone()
        .lazy()
        .group_by_stable([col("b")])
        .agg([col("a").sort_by([col("b"), col("c")], SortMultipleOptions::default())])
        .collect()?;
    let sorted = aggregated.column("a")?.explode(false)?;
    assert_eq!(Vec::from(sorted.i32().unwrap()), &expected);

    // evaluate_on_groups
    let aggregated = frame
        .lazy()
        .group_by_stable([col("b")])
        .agg([col("a").sort_by([col("b"), col("c")], SortMultipleOptions::default())])
        .collect()?;

    let sorted = aggregated.column("a")?.explode(false)?;
    assert_eq!(Vec::from(sorted.i32().unwrap()), &expected);

    Ok(())
}
1325

1326
/// A shifted column filtered on its own shifted values, evaluated per group
/// through a window expression with `WindowMapping::Join`.
#[test]
fn test_filter_after_shift_in_groups() -> PolarsResult<()> {
    let frame = fruits_cars();

    let shifted_and_filtered = col("B")
        .shift(lit(1))
        .filter(col("B").shift(lit(1)).gt(lit(4)))
        .over_with_options(Some([col("fruits")]), None, WindowMapping::Join)?
        .alias("filtered");
    let result = frame
        .lazy()
        .select([col("fruits"), shifted_and_filtered])
        .collect()?;

    let lists = result.column("filtered")?.list()?;

    // The first two rows each start with the value 5.
    let first = lists.get_as_series(0).unwrap();
    assert_eq!(first.i32()?.get(0).unwrap(), 5);

    let second = lists.get_as_series(1).unwrap();
    assert_eq!(second.i32()?.get(0).unwrap(), 5);

    // The third row holds an empty list.
    let third = lists.get_as_series(2).unwrap();
    assert_eq!(third.len(), 0);

    Ok(())
}
1373

1374
/// Dropping nulls after a ternary that introduces a null removes exactly the
/// row the ternary nulled out.
#[test]
fn test_lazy_ternary_predicate_pushdown() -> PolarsResult<()> {
    let frame = df![
        "a" => &[10, 1, 2, 3]
    ]?;

    let null_where_ten = when(col("a").eq(lit(10)))
        .then(Null {}.lit())
        .otherwise(col("a"));
    let result = frame
        .lazy()
        .select([null_where_ten])
        .drop_nulls(None)
        .collect()?;

    let remaining = result.get_columns()[0].i32()?;
    assert_eq!(Vec::from(remaining), &[Some(1), Some(2), Some(3)]);

    Ok(())
}
1395

1396
/// Adding categorical columns and a string literal produces a string column.
#[test]
#[cfg(feature = "dtype-categorical")]
fn test_categorical_addition() -> PolarsResult<()> {
    let frame = fruits_cars();

    // test if we can do that arithmetic operation with String and Categorical
    let as_categoricals = [
        col("fruits").cast(DataType::from_categories(Categories::global())),
        col("cars").cast(DataType::from_categories(Categories::global())),
    ];
    let concatenated = (col("fruits") + lit(" ") + col("cars")).alias("foo");
    let result = frame
        .lazy()
        .select(as_categoricals)
        .select([concatenated])
        .collect()?;

    let first = result.column("foo")?.str()?.get(0).unwrap();
    assert_eq!(first, "banana beetle");

    Ok(())
}
1415

1416
/// Selecting the wildcard twice must fail.
#[test]
fn test_error_duplicate_names() {
    let frame = fruits_cars();
    let outcome = frame.lazy().select([col("*"), col("*")]).collect();
    assert!(outcome.is_err());
}
1421

1422
/// Counting a filtered expression counts only the rows that pass the filter.
#[test]
fn test_filter_count() -> PolarsResult<()> {
    let frame = fruits_cars();
    let banana_count = col("fruits")
        .filter(col("fruits").eq(lit("banana")))
        .count();
    let result = frame.lazy().select([banana_count]).collect()?;
    assert_eq!(result.column("fruits")?.idx()?.get(0), Some(3));
    Ok(())
}
1434

1435
/// Group-by on a mix of 16-bit and 32-bit integer key columns.
#[test]
#[cfg(feature = "dtype-i16")]
fn test_group_by_small_ints() -> PolarsResult<()> {
    let frame = df![
        "id_32" => [1i32, 2],
        "id_16" => [1i16, 2]
    ]?;

    // https://github.com/pola-rs/polars/issues/1255
    let largest_first = SortMultipleOptions::default().with_order_descending(true);
    let result = frame
        .lazy()
        .group_by([col("id_16"), col("id_32")])
        .agg([col("id_16").sum().alias("foo")])
        .sort(["foo"], largest_first)
        .collect()?;

    let sums = result.column("foo")?.i64()?;
    assert_eq!(Vec::from(sums), &[Some(2), Some(1)]);
    Ok(())
}
1457

1458
/// The resolved schema of a ternary with a null `then` branch must not report
/// the `Null` dtype.
#[test]
fn test_when_then_schema() -> PolarsResult<()> {
    let frame = fruits_cars();

    let schema = frame
        .lazy()
        .select([when(col("A").gt(lit(1)))
            .then(Null {}.lit())
            .otherwise(col("A"))])
        .collect_schema()?;

    let (_, dtype) = schema.get_at_index(0).unwrap();
    assert_ne!(dtype, &DataType::Null);

    Ok(())
}
1472

1473
/// A literal selected next to a full column ends up longer than one row.
#[test]
fn test_singleton_broadcast() -> PolarsResult<()> {
    let frame = fruits_cars();
    let projection = [col("fruits"), lit(1).alias("foo")];
    let result = frame.lazy().select(projection).collect()?;

    let literal_len = result.column("foo")?.len();
    assert!(literal_len > 1);
    Ok(())
}
1484

1485
/// `implode` in a select context matches a list column built by hand with a
/// list builder.
#[test]
fn test_list_in_select_context() -> PolarsResult<()> {
    let source = Column::new("a".into(), &[1, 2, 3]);

    // Build the expected single-row list column manually.
    let mut list_builder =
        get_list_builder(source.dtype(), source.len(), 1, source.name().clone());
    list_builder
        .append_series(source.as_materialized_series())
        .unwrap();
    let expected = list_builder.finish().into_column();

    let frame = DataFrame::new(vec![source])?;

    let result = frame.lazy().select([col("a").implode()]).collect()?;

    let imploded = result.column("a")?;
    assert!(imploded.equals(&expected));

    Ok(())
}
1501

1502
/// Rounding applied on top of aggregation results inside a group-by.
#[test]
#[cfg(feature = "round_series")]
fn test_round_after_agg() -> PolarsResult<()> {
    let fruits = fruits_cars();

    // Part 1: rounding a Float32 mean must leave a Float32 column.
    let rounded_mean = col("A")
        .cast(DataType::Float32)
        .mean()
        .round(2, polars_ops::series::RoundMode::default())
        .alias("foo");
    let result = fruits
        .lazy()
        .group_by([col("fruits")])
        .agg([rounded_mean])
        .collect()?;

    assert!(result.column("foo")?.f32().is_ok());

    // Part 2: rounding a ratio of two aggregations.
    let frame = df![
        "groups" => ["pigeon",
                 "rabbit",
                 "rabbit",
                 "Chris",
                 "pigeon",
                 "fast",
                 "fast",
                 "pigeon",
                 "rabbit",
                 "Chris"],
        "b" => [5409, 4848, 4864, 3540, 8103, 3083, 8575, 9963, 8809, 5425],
        "c" => [0.4517241160719615,
                  0.2551467646274673,
                  0.8682045191407308,
                  0.9925316385786037,
                  0.5392027792928116,
                  0.7633847828107002,
                  0.7967295231651537,
                  0.01444779067224733,
                  0.23807484087472652,
                  0.10985868798350984]
    ]?;

    let weighted_ratio = ((col("b") * col("c")).sum() / col("b").sum())
        .round(2, polars_ops::series::RoundMode::default())
        .alias("foo");
    let result = frame
        .lazy()
        .group_by_stable([col("groups")])
        .agg([weighted_ratio])
        .collect()?;

    let rounded = result.column("foo")?;
    let rounded = rounded.f64()?;

    assert_eq!(
        Vec::from(rounded),
        &[Some(0.3), Some(0.41), Some(0.46), Some(0.79)]
    );

    Ok(())
}
1561

1562
/// `fill_nan` with a null literal on a frame that also holds a Date column.
#[test]
#[cfg(feature = "dtype-date")]
fn test_fill_nan() -> PolarsResult<()> {
    let dates = Column::new("date".into(), &[1, 2, 3]).cast(&DataType::Date)?;
    let floats = Column::new("float".into(), &[Some(1.0), Some(f32::NAN), Some(3.0)]);

    let frame = DataFrame::new(vec![dates, floats])?;
    let result = frame.lazy().fill_nan(Null {}.lit()).collect()?;
    let filled = result.column("float")?;
    assert_eq!(Vec::from(filled.f32()?), &[Some(1.0), None, Some(3.0)]);

    Ok(())
}
1575

1576
/// Subtracting a regex selector from `all()` removes the matching columns.
#[test]
#[cfg(feature = "regex")]
fn test_exclude_regex() -> PolarsResult<()> {
    let frame = fruits_cars();
    let remaining = (all() - Selector::Matches("^(fruits|cars)$".into())).as_expr();
    let result = frame.lazy().select([remaining]).collect()?;

    assert_eq!(result.get_column_names(), &["A", "B"]);
    Ok(())
}
1588

1589
/// Dense rank used as an aggregation inside a stable group-by.
#[test]
#[cfg(feature = "rank")]
fn test_group_by_rank() -> PolarsResult<()> {
    let frame = fruits_cars();
    let dense = RankOptions {
        method: RankMethod::Dense,
        ..Default::default()
    };
    let result = frame
        .lazy()
        .group_by_stable([col("cars")])
        .agg([col("B").rank(dense, None)])
        .collect()?;

    // Inspect the ranks of the second group only.
    let ranks_per_group = result.column("B")?;
    let second_group = ranks_per_group.list()?.get_as_series(1).unwrap();
    let ranks = second_group.idx()?;

    assert_eq!(Vec::from(ranks), &[Some(1)]);
    Ok(())
}
1612

1613
/// Selecting by dtype returns the matching columns; the assertion expects the
/// String column ahead of the Float32 one.
#[test]
pub fn test_select_by_dtypes() -> PolarsResult<()> {
    let frame = df![
        "bools" => [true, false, true],
        "ints" => [1, 2, 3],
        "strings" => ["a", "b", "c"],
        "floats" => [1.0, 2.0, 3.0f32]
    ]?;
    let by_dtype = dtype_cols([DataType::Float32, DataType::String])
        .as_selector()
        .as_expr();
    let result = frame.lazy().select([by_dtype]).collect()?;
    assert_eq!(result.dtypes(), &[DataType::String, DataType::Float32]);

    Ok(())
}
1631

1632
/// Binary expressions: a comparison that only has to run, and a
/// multiplication whose result dtype is checked.
#[test]
fn test_binary_expr() -> PolarsResult<()> {
    // test panic in schema names
    let fruits = fruits_cars();
    let _ = fruits.lazy().select([col("A").neq(lit(1))]).collect()?;

    // test type coercion
    // https://github.com/pola-rs/polars/issues/1649
    let frame = df!(
            "nrs"=> [Some(1i64), Some(2), Some(3), None, Some(5)],
            "random"=> [0.1f64, 0.6, 0.2, 0.6, 0.3]
    )?;

    let ternary = when(col("random").gt(lit(0.5)))
        .then(lit(2))
        .otherwise(col("random"))
        .alias("other");
    let product = ternary * col("nrs").sum();
    let result = frame.lazy().select([product]).collect()?;
    assert_eq!(result.dtypes(), &[DataType::Float64]);
    Ok(())
}
1653

1654
/// `arg_sort` evaluated over single-row windows yields index 0 for every row.
#[test]
fn test_single_group_result() -> PolarsResult<()> {
    // the arg_sort should not auto explode
    let frame = df![
        "a" => [1, 2],
        "b" => [1, 1]
    ]?;

    let windowed = col("a").arg_sort(false, false).over([col("a")]);
    let result = frame.lazy().select([windowed]).collect()?;

    let indices = result.column("a")?.idx()?;
    assert_eq!(Vec::from(indices), &[Some(0), Some(0)]);

    Ok(())
}
1672

1673
/// Average rank evaluated as a window expression with `WindowMapping::Join`.
#[test]
#[cfg(feature = "rank")]
fn test_single_ranked_group() -> PolarsResult<()> {
    // tests type consistency of rank algorithm
    let frame = df!["group" => [1, 2, 2],
        "value"=> [100, 50, 10]
    ]?;

    let average = RankOptions {
        method: RankMethod::Average,
        ..Default::default()
    };
    let ranked = col("value")
        .rank(average, None)
        .over_with_options(Some([col("group")]), None, WindowMapping::Join)?;
    let result = frame.lazy().with_columns([ranked]).collect()?;

    // Flatten the per-row lists before comparing.
    let flattened = result.column("value")?.explode(false)?;
    let ranks = flattened.f64()?;
    assert_eq!(
        Vec::from(ranks),
        &[Some(1.0), Some(2.0), Some(1.0), Some(2.0), Some(1.0)]
    );

    Ok(())
}
1703

1704
/// Smoke test: a set of shifting, filling, cumulative and diff expressions
/// must evaluate without error on a frame that has no rows.
///
/// The test is gated on both `diff` (for `diff` / `NullBehavior`) and
/// `cum_agg` (for `cum_count` / `cum_max` / `cum_min`), consistent with
/// the gate on `test_group_by_cum_sum` in this file.
#[test]
#[cfg(all(feature = "diff", feature = "cum_agg"))]
fn empty_df() -> PolarsResult<()> {
    let df = fruits_cars();
    // An all-false mask removes every row while keeping the schema.
    let df = df.filter(&BooleanChunked::full("".into(), false, df.height()))?;

    df.lazy()
        .select([
            col("A").shift(lit(1)).alias("1"),
            col("A").shift_and_fill(lit(1), lit(1)).alias("2"),
            col("A").shift_and_fill(lit(-1), lit(1)).alias("3"),
            col("A").fill_null(lit(1)).alias("4"),
            col("A").cum_count(false).alias("5"),
            col("A").diff(lit(1), NullBehavior::Ignore).alias("6"),
            col("A").cum_max(false).alias("7"),
            col("A").cum_min(false).alias("8"),
        ])
        .collect()?;

    Ok(())
}
1725

1726
/// Sum of absolute values per group in a stable group-by.
#[test]
#[cfg(feature = "abs")]
fn test_apply_flatten() -> PolarsResult<()> {
    let frame = df![
         "A"=> [1.1435, 2.223456, 3.44732, -1.5234, -2.1238, -3.2923],
        "B"=> ["a", "b", "a", "b", "a", "b"]
    ]?;

    let abs_sum = col("A").abs().sum().alias("A_sum");
    let result = frame
        .lazy()
        .group_by_stable([col("B")])
        .agg([abs_sum])
        .collect()?;

    let sums = result.column("A_sum")?;
    assert_eq!(sums.get(0)?, AnyValue::Float64(6.71462));
    assert_eq!(sums.get(1)?, AnyValue::Float64(7.039156));

    Ok(())
}
1746

1747
/// `is_in` inside a stable group-by, once against an imploded filtered column
/// and once against an imploded literal series.
#[test]
#[cfg(feature = "is_in")]
fn test_is_in() -> PolarsResult<()> {
    let frame = fruits_cars();

    // Both variants are expected to produce this membership mask.
    let expected: [Option<bool>; 5] =
        [Some(true), Some(false), Some(true), Some(true), Some(true)];

    // this will be executed by apply
    let beetles_only = col("cars").filter(col("cars").eq(lit("beetle"))).implode();
    let result = frame
        .clone()
        .lazy()
        .group_by_stable([col("fruits")])
        .agg([col("cars").is_in(beetles_only, false)])
        .collect()?;
    let membership = result.column("cars").unwrap();
    let membership = membership.explode(false)?;
    let membership = membership.bool().unwrap();
    assert_eq!(Vec::from(membership), &expected);

    // this will be executed by map
    let literal_set = lit(Series::new("a".into(), ["beetle", "vw"])).implode();
    let result = frame
        .lazy()
        .group_by_stable([col("fruits")])
        .agg([col("cars").is_in(literal_set, false)])
        .collect()?;

    let membership = result.column("cars").unwrap();
    let membership = membership.explode(false)?;
    let membership = membership.bool().unwrap();
    assert_eq!(Vec::from(membership), &expected);

    Ok(())
}
1790

1791
/// Group-by that sums boolean comparison results per key.
#[test]
fn test_partitioned_gb_1() -> PolarsResult<()> {
    // don't move these to integration tests
    // keep these dtypes
    let input = df![
        "keys" => [1, 1, 1, 1, 2],
        "vals" => ["a", "b", "c", "a", "a"]
    ]?;

    let count_a = (col("vals").eq(lit("a"))).sum().alias("eq_a");
    let count_b = (col("vals").eq(lit("b"))).sum().alias("eq_b");
    let result = input
        .lazy()
        .group_by([col("keys")])
        .agg([count_a, count_b])
        .sort(["keys"], Default::default())
        .collect()?;

    let expected = df![
        "keys" => [1, 2],
        "eq_a" => [2 as IdxSize, 1],
        "eq_b" => [1 as IdxSize, 0],
    ]?;
    assert!(result.equals(&expected));

    Ok(())
}
1816

1817
/// `len()` and `count()` inside a group-by on a single-valued key column.
#[test]
fn test_partitioned_gb_count() -> PolarsResult<()> {
    // don't move these to integration tests
    let input = df![
        "col" => (0..100).map(|_| Some(0)).collect::<Int32Chunked>().into_series(),
    ]?;

    // we make sure to alias with a different name
    let row_count = len().alias("counted");
    let value_count = col("col").count().alias("count2");
    let result = input
        .lazy()
        .group_by([col("col")])
        .agg([row_count, value_count])
        .collect()?;

    let expected = df![
        "col" => [0],
        "counted" => [100 as IdxSize],
        "count2" => [100 as IdxSize],
    ]?;
    assert!(result.equals(&expected));

    Ok(())
}
1840

1841
/// `mean` inside a group-by: null for a string column, a float for an integer
/// column.
#[test]
fn test_partitioned_gb_mean() -> PolarsResult<()> {
    // don't move these to integration tests
    let input = df![
        "key" => (0..100).map(|_| Some(0)).collect::<Int32Chunked>().into_series(),
    ]?;

    let result = input
        .lazy()
        .with_columns([lit("a").alias("str"), lit(1).alias("int")])
        .group_by([col("key")])
        .agg([
            col("str").mean().alias("mean_str"),
            col("int").mean().alias("mean_int"),
        ])
        .collect()?;

    assert_eq!(result.shape(), (1, 3));

    let mean_of_strings = result.column("mean_str")?;
    assert_eq!(mean_of_strings.get(0)?, AnyValue::Null);

    let mean_of_ints = result.column("mean_int")?;
    assert_eq!(mean_of_ints.get(0)?, AnyValue::Float64(1.0));

    Ok(())
}
1864

1865
/// Summing a binary expression inside a group-by, for integer and Float32 inputs.
#[test]
fn test_partitioned_gb_binary() -> PolarsResult<()> {
    // don't move these to integration tests
    let frame = df![
        "col" => (0..20).map(|_| Some(0)).collect::<Int32Chunked>().into_series(),
    ]?;

    // Integer arithmetic.
    let int_sum = (col("col") + lit(10)).sum().alias("sum");
    let result = frame
        .clone()
        .lazy()
        .group_by([col("col")])
        .agg([int_sum])
        .collect()?;

    let expected = df![
        "col" => [0],
        "sum" => [200],
    ]?;
    assert!(result.equals(&expected));

    // Float32 arithmetic.
    let float_sum = (col("col").cast(DataType::Float32) + lit(10.0))
        .sum()
        .alias("sum");
    let result = frame
        .lazy()
        .group_by([col("col")])
        .agg([float_sum])
        .collect()?;

    let expected = df![
        "col" => [0],
        "sum" => [200.0_f32],
    ]?;
    assert!(result.equals(&expected));

    Ok(())
}
1899

1900
/// Summing a `when/then/otherwise` expression inside a group-by.
#[test]
fn test_partitioned_gb_ternary() -> PolarsResult<()> {
    // don't move these to integration tests
    let frame = df![
        "col" => (0..20).map(|_| Some(0)).collect::<Int32Chunked>().into_series(),
        "val" => (0..20).map(Some).collect::<Int32Chunked>().into_series(),
    ]?;

    // Counts the rows whose "val" is greater than 10.
    let above_ten = when(col("val").gt(lit(10)))
        .then(lit(1))
        .otherwise(lit(0))
        .sum()
        .alias("sum");
    let result = frame
        .lazy()
        .group_by([col("col")])
        .agg([above_ten])
        .collect()?;

    let expected = df![
        "col" => [0],
        "sum" => [9],
    ]?;
    assert!(result.equals(&expected));

    Ok(())
}
1925

1926
/// Sorting on an all-equal key with `maintain_order` followed by a slice keeps
/// the first rows in their original order.
#[test]
fn test_sort_maintain_order_true() -> PolarsResult<()> {
    let query = df![
        "A" => [1, 1, 1, 1],
        "B" => ["A", "B", "C", "D"],
    ]?
    .lazy();

    let stable_sort = SortMultipleOptions::default()
        .with_maintain_order(true)
        .with_nulls_last(true);
    let result = query
        .sort_by_exprs([col("A")], stable_sort)
        .slice(0, 3)
        .collect()?;

    let expected = df![
        "A" => [1, 1, 1],
        "B" => ["A", "B", "C"],
    ]?;
    assert!(result.equals(&expected));
    Ok(())
}
1949

1950
/// A `WindowMapping::Join` window over an empty frame still reports a list
/// dtype in the output schema.
#[test]
fn test_over_with_options_empty_join() -> PolarsResult<()> {
    let no_rows = DataFrame::new(vec![
        Series::new_empty("a".into(), &DataType::Int32).into(),
        Series::new_empty("b".into(), &DataType::Int32).into(),
    ])?;

    let windowed = col("b").over_with_options(Some([col("a")]), None, WindowMapping::Join)?;
    let result = no_rows.lazy().select([windowed]).collect()?;

    let list_field: Field = Field::new("b".into(), DataType::List(Box::new(DataType::Int32)));
    let expected_schema: Schema = Schema::from_iter(vec![list_field]);

    assert_eq!(&**result.schema(), &expected_schema);

    Ok(())
}
1973

1974
/// A named anonymous function is looked up through the registered
/// `ExprRegistry` and applied to the input column.
#[test]
#[cfg(feature = "serde")]
fn test_named_udfs() -> PolarsResult<()> {
    use polars_plan::dsl::named_serde::{ExprRegistry, set_named_serde_registry};

    let query = DataFrame::new(vec![Column::new("a".into(), vec![1, 2, 3, 4])])?.lazy();

    // Registry that checks the requested name and payload and hands back a
    // UDF doubling its first input column.
    struct DoublingRegistry;
    impl ExprRegistry for DoublingRegistry {
        fn get_function(&self, name: &str, payload: &[u8]) -> Option<Arc<dyn AnonymousColumnsUdf>> {
            assert_eq!(name, "test-function");
            assert_eq!(payload, b"check");
            let udf = BaseColumnUdf::new(
                |c: &mut [Column]| Ok(std::mem::take(&mut c[0]) * 2),
                |_: &Schema, f: &[Field]| Ok(f[0].clone()),
            );
            Some(Arc::new(udf))
        }
    }

    set_named_serde_registry(Arc::new(DoublingRegistry) as _);

    let named_call = Expr::AnonymousFunction {
        input: vec![Expr::Column("a".into())],
        function: LazySerde::Named {
            name: "test-function".into(),
            payload: Some(bytes::Bytes::from("check")),
            value: None,
        },
        options: FunctionOptions::default(),
        fmt_str: Box::new("test".into()),
    };

    let expected = DataFrame::new(vec![Column::new("a".into(), vec![2, 4, 6, 8])])?;
    assert_eq!(query.select(&[named_call]).collect()?, expected,);

    Ok(())
}
2013

2014
Product

Resources

Company