Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-lazy/src/tests/io.rs
6939 views
1
use polars_io::RowIndex;
2
#[cfg(feature = "is_between")]
3
use polars_ops::prelude::ClosedInterval;
4
use polars_utils::plpath::PlPath;
5
use polars_utils::slice_enum::Slice;
6
7
use super::*;
8
use crate::dsl;
9
10
/// Collects the foods parquet scan under a filter, a projection, and both combined,
/// once for each value of the flag handed to `scan_foods_parquet`, and checks the
/// shape of every result.
#[test]
#[cfg(feature = "parquet")]
fn test_parquet_exec() -> PolarsResult<()> {
    let _guard = SINGLE_LOCK.lock().unwrap();

    // Fresh expressions are built per query because each query consumes them.
    let seafood_only = || col("category").eq(lit("seafood"));
    let two_columns = || [col("category"), col("sugars_g")];

    // filter
    for par in [true, false] {
        let filtered = scan_foods_parquet(par).filter(seafood_only()).collect()?;
        assert_eq!(filtered.shape(), (8, 4));
    }

    // project
    for par in [true, false] {
        let projected = scan_foods_parquet(par).select(two_columns()).collect()?;
        assert_eq!(projected.shape(), (27, 2));
    }

    // project + filter
    for par in [true, false] {
        let projected_and_filtered = scan_foods_parquet(par)
            .select(two_columns())
            .filter(seafood_only())
            .collect()?;
        assert_eq!(projected_and_filtered.shape(), (8, 2));
    }

    Ok(())
}
41
42
/// Runs predicates against the foods parquet file that match at least one row and
/// checks the shape of each collected result.
#[test]
#[cfg(all(feature = "parquet", feature = "is_between"))]
fn test_parquet_statistics_no_skip() {
    let _guard = SINGLE_LOCK.lock().unwrap();
    init_files();
    let par = true;

    // Scan, filter by `predicate`, collect, and report the resulting shape.
    let shape_after = |predicate| {
        scan_foods_parquet(par)
            .filter(predicate)
            .collect()
            .unwrap()
            .shape()
    };

    assert_eq!(shape_after(col("calories").gt(lit(0i32))), (27, 4));
    assert_eq!(shape_after(col("calories").lt(lit(1000i32))), (27, 4));
    assert_eq!(shape_after(lit(0i32).lt(col("calories"))), (27, 4));
    assert_eq!(shape_after(lit(1000i32).gt(col("calories"))), (27, 4));

    // statistics and `is_between`
    // normal case
    assert_eq!(
        shape_after(col("calories").is_between(40, 300, ClosedInterval::Both)),
        (19, 4)
    );
    // normal case
    assert_eq!(
        shape_after(col("calories").is_between(10, 50, ClosedInterval::Both)),
        (11, 4)
    );
    // edge case: 20 = min(calories) but the right end is closed
    assert_eq!(
        shape_after(col("calories").is_between(5, 20, ClosedInterval::Right)),
        (1, 4)
    );
    // edge case: 200 = max(calories) but the left end is closed
    assert_eq!(
        shape_after(col("calories").is_between(200, 250, ClosedInterval::Left)),
        (3, 4)
    );
    // edge case: left == right but both ends are closed
    assert_eq!(
        shape_after(col("calories").is_between(200, 200, ClosedInterval::Both)),
        (3, 4)
    );

    // Or operation
    let either = col("sugars_g")
        .lt(lit(0i32))
        .or(col("fats_g").lt(lit(1000.0)));
    assert_eq!(shape_after(either), (27, 4));
}
115
116
/// Checks that predicates which cannot match any row produce an empty result while
/// `POLARS_PANIC_IF_PARQUET_PARSED` is set.
///
/// NOTE(review): the variable's name suggests the parquet reader panics if it actually
/// decodes data, which would make every case below a check that statistics alone
/// rule the file out — confirm against the reader implementation.
#[test]
#[cfg(all(feature = "parquet", feature = "is_between"))]
fn test_parquet_statistics() -> PolarsResult<()> {
    /// Removes `POLARS_PANIC_IF_PARQUET_PARSED` when dropped, so the variable is
    /// cleared on every exit path: normal return, early `?` return, or a failed
    /// assertion. Without this, a failure would leave it set for the whole process.
    struct PanicIfParsedReset;

    impl Drop for PanicIfParsedReset {
        fn drop(&mut self) {
            // SAFETY: NOTE(review): environment mutation is process-wide. This guard is
            // created after `SINGLE_LOCK` is taken and is therefore dropped before the
            // lock is released, but tests that do not take the lock are not serialized
            // against it — same caveat as the `set_var` call below.
            unsafe { std::env::remove_var("POLARS_PANIC_IF_PARQUET_PARSED") };
        }
    }

    let _guard = SINGLE_LOCK.lock().unwrap();
    init_files();
    // SAFETY: see the note on `PanicIfParsedReset::drop`.
    unsafe { std::env::set_var("POLARS_PANIC_IF_PARQUET_PARSED", "1") };
    let _env_reset = PanicIfParsedReset;
    let par = true;

    // Shape of the foods scan after applying a single `predicate`.
    let foods_shape = |predicate| {
        scan_foods_parquet(par)
            .filter(predicate)
            .collect()
            .map(|df| df.shape())
    };
    // Same, for the scan returned by `scan_nutri_score_null_column_parquet`.
    let nutri_shape = |predicate| {
        scan_nutri_score_null_column_parquet(par)
            .filter(predicate)
            .collect()
            .map(|df| df.shape())
    };

    // Test single predicates
    assert_eq!(foods_shape(col("calories").lt(lit(0i32)))?, (0, 4));
    assert_eq!(foods_shape(col("calories").gt(lit(1000)))?, (0, 4));
    assert_eq!(foods_shape(lit(0i32).gt(col("calories")))?, (0, 4));

    // issue: 13427
    assert_eq!(
        foods_shape(col("calories").is_in(lit(Series::new("".into(), [0, 500])), false))?,
        (0, 4)
    );

    // statistics and `is_between`
    // 15 < min(calories)=20
    assert_eq!(
        foods_shape(col("calories").is_between(5, 15, ClosedInterval::Both))?,
        (0, 4)
    );

    // 300 > max(calories)=200
    assert_eq!(
        foods_shape(col("calories").is_between(300, 500, ClosedInterval::Both))?,
        (0, 4)
    );

    // 20 == min(calories) but right end is open
    assert_eq!(
        foods_shape(col("calories").is_between(5, 20, ClosedInterval::Left))?,
        (0, 4)
    );

    // 20 == min(calories) but both ends are open
    assert_eq!(
        foods_shape(col("calories").is_between(5, 20, ClosedInterval::None))?,
        (0, 4)
    );

    // 200 == max(calories) but left end is open
    assert_eq!(
        foods_shape(col("calories").is_between(200, 250, ClosedInterval::Right))?,
        (0, 4)
    );

    // 200 == max(calories) but both ends are open
    assert_eq!(
        foods_shape(col("calories").is_between(200, 250, ClosedInterval::None))?,
        (0, 4)
    );

    // between(100, 40) is impossible
    assert_eq!(
        foods_shape(col("calories").is_between(100, 40, ClosedInterval::Both))?,
        (0, 4)
    );

    // with strings
    assert_eq!(
        foods_shape(col("category").is_between(lit("yams"), lit("zest"), ClosedInterval::Both))?,
        (0, 4)
    );

    // with strings
    assert_eq!(
        foods_shape(col("category").is_between(lit("dairy"), lit("eggs"), ClosedInterval::Both))?,
        (0, 4)
    );

    assert_eq!(foods_shape(lit(1000i32).lt(col("calories")))?, (0, 4));

    // not(a > b) => a <= b
    assert_eq!(foods_shape(not(col("calories").gt(5)))?, (0, 4));

    // not(a >= b) => a < b
    // note that min(calories)=20
    assert_eq!(foods_shape(not(col("calories").gt_eq(20)))?, (0, 4));

    // not(a < b) => a >= b
    assert_eq!(foods_shape(not(col("calories").lt(250)))?, (0, 4));

    // not(a <= b) => a > b
    // note that max(calories)=200
    assert_eq!(foods_shape(not(col("calories").lt_eq(200)))?, (0, 4));

    // not(a == b) => a != b
    // note that proteins_g=10 for all rows
    assert_eq!(nutri_shape(not(col("proteins_g").eq(10)))?, (0, 6));

    // not(a != b) => a == b
    // note that proteins_g=10 for all rows
    assert_eq!(nutri_shape(not(col("proteins_g").neq(5)))?, (0, 6));

    // not(col(c) is between [a, b]) => col(c) < a or col(c) > b
    assert_eq!(
        foods_shape(not(col("calories").is_between(20, 200, ClosedInterval::Both)))?,
        (0, 4)
    );

    // not(col(c) is between [a, b[) => col(c) < a or col(c) >= b
    assert_eq!(
        foods_shape(not(col("calories").is_between(20, 201, ClosedInterval::Left)))?,
        (0, 4)
    );

    // not(col(c) is between ]a, b]) => col(c) <= a or col(c) > b
    assert_eq!(
        foods_shape(not(col("calories").is_between(19, 200, ClosedInterval::Right)))?,
        (0, 4)
    );

    // not(col(c) is between ]a, b[) => col(c) <= a or col(c) >= b
    assert_eq!(
        foods_shape(not(col("calories").is_between(19, 201, ClosedInterval::None)))?,
        (0, 4)
    );

    // not (a or b) => not(a) and not(b)
    // note that not(fats_g <= 9) is possible; not(calories > 5) should allow us skip the rg
    assert_eq!(
        foods_shape(not(col("calories").gt(5).or(col("fats_g").lt_eq(9))))?,
        (0, 4)
    );

    // not (a and b) => not(a) or not(b)
    assert_eq!(
        foods_shape(not(col("calories").gt(5).and(col("fats_g").lt_eq(12))))?,
        (0, 4)
    );

    // is_not_null
    assert_eq!(nutri_shape(col("nutri_score").is_not_null())?, (0, 6));

    // not(is_null) (~pl.col('nutri_score').is_null())
    assert_eq!(nutri_shape(not(col("nutri_score").is_null()))?, (0, 6));

    // Test multiple predicates

    // And operation
    // These two queries stay spelled out: they stack several `filter` calls rather
    // than passing one combined predicate.
    let out = scan_foods_parquet(par)
        .filter(col("calories").lt(lit(0i32)))
        .filter(col("calories").gt(lit(1000)))
        .collect()?;
    assert_eq!(out.shape(), (0, 4));

    let out = scan_foods_parquet(par)
        .filter(col("calories").lt(lit(0i32)))
        .filter(col("calories").gt(lit(1000)))
        .filter(col("calories").lt(lit(50i32)))
        .collect()?;
    assert_eq!(out.shape(), (0, 4));

    assert_eq!(
        foods_shape(
            col("calories")
                .lt(lit(0i32))
                .and(col("fats_g").lt(lit(0.0)))
        )?,
        (0, 4)
    );

    // Or operation
    assert_eq!(
        foods_shape(
            col("sugars_g")
                .lt(lit(0i32))
                .or(col("fats_g").gt(lit(1000.0)))
        )?,
        (0, 4)
    );

    // `_env_reset` removes the environment variable when it goes out of scope here.
    Ok(())
}
360
361
/// Scans every parquet file matching the `foods*.parquet` glob and checks the
/// combined shape plus the first and last `calories` values.
#[test]
#[cfg(not(target_os = "windows"))]
fn test_parquet_globbing() -> PolarsResult<()> {
    // for side effects
    init_files();
    let _guard = SINGLE_LOCK.lock().unwrap();

    let scan_args = ScanArgsParquet {
        n_rows: None,
        cache: true,
        parallel: Default::default(),
        ..Default::default()
    };
    let pattern = PlPath::new("../../examples/datasets/foods*.parquet");
    let lf = LazyFrame::scan_parquet(pattern, scan_args)?;
    let df = lf.collect()?;
    assert_eq!(df.shape(), (54, 4));

    let calories = df.column("calories")?;
    let first = calories.get(0)?;
    let last = calories.get(53)?;
    assert_eq!(first, AnyValue::Int64(45));
    assert_eq!(last, AnyValue::Int64(194));

    Ok(())
}
385
386
/// Checks that `limit(3)` on a globbed parquet scan ends up in the optimized plan:
/// every `IR::Union` node must carry a slice of length 3 and every `IR::Scan` node
/// must have `pre_slice` equal to `Slice::Positive { offset: 0, len: 3 }`.
#[test]
fn test_scan_parquet_limit_9001() {
    init_files();
    let path = GLOB_PARQUET;
    let args = ScanArgsParquet {
        n_rows: Some(10000),
        cache: false,
        rechunk: true,
        ..Default::default()
    };
    let q = LazyFrame::scan_parquet(PlPath::new(path), args)
        .unwrap()
        .limit(3);
    let IRPlan {
        lp_top, lp_arena, ..
    } = q.to_alp_optimized().unwrap();

    let limit_pushed_down = lp_arena.iter(lp_top).all(|(_, lp)| match lp {
        // A union without any slice counts as a failure, not as a pass.
        IR::Union { options, .. } => options.slice.is_some_and(|(_, len)| len == 3),
        IR::Scan {
            unified_scan_args, ..
        } => unified_scan_args.pre_slice == Some(Slice::Positive { offset: 0, len: 3 }),
        _ => true,
    });
    // The result of `all` used to be discarded, so the test could never fail on it.
    assert!(
        limit_pushed_down,
        "limit(3) was not pushed down to every union/scan node of the optimized plan"
    );
}
413
414
/// Scans every IPC file matching the `foods*.ipc` glob and checks the combined
/// shape plus the first and last `calories` values.
#[test]
#[cfg(not(target_os = "windows"))]
fn test_ipc_globbing() -> PolarsResult<()> {
    // for side effects
    init_files();

    let scan_args = ScanArgsIpc {
        n_rows: None,
        cache: true,
        rechunk: false,
        row_index: None,
        cloud_options: None,
        hive_options: Default::default(),
        include_file_paths: None,
    };
    let pattern = PlPath::new("../../examples/datasets/foods*.ipc");
    let lf = LazyFrame::scan_ipc(pattern, scan_args)?;
    let df = lf.collect()?;
    assert_eq!(df.shape(), (54, 4));

    let calories = df.column("calories")?;
    let first = calories.get(0)?;
    let last = calories.get(53)?;
    assert_eq!(first, AnyValue::Int64(45));
    assert_eq!(last, AnyValue::Int64(194));

    Ok(())
}
440
441
fn slice_at_union(lp_arena: &Arena<IR>, lp: Node) -> bool {
442
lp_arena.iter(lp).all(|(_, lp)| {
443
if let IR::Union { options, .. } = lp {
444
options.slice.is_some()
445
} else {
446
true
447
}
448
})
449
}
450
451
/// Reads every CSV matching the `foods*.csv` glob, checks the full result, then
/// checks that lazily sliced reads equal slices of the full frame and that the
/// optimized plans satisfy `slice_at_union`.
#[test]
fn test_csv_globbing() -> PolarsResult<()> {
    let glob = "../../examples/datasets/foods*.csv";
    // Every query starts from a fresh lazy reader over the same glob.
    let scan = || LazyCsvReader::new(PlPath::new(glob)).finish();

    let full_df = scan()?.collect()?;

    // all 5 files * 27 rows
    assert_eq!(full_df.shape(), (135, 4));
    let calories = full_df.column("calories")?;
    assert_eq!(calories.get(0)?, AnyValue::Int64(45));
    assert_eq!(calories.get(53)?, AnyValue::Int64(194));

    let head_lf = scan()?.slice(0, 100);
    let head_df = head_lf.clone().collect()?;
    assert_eq!(head_df, full_df.slice(0, 100));

    let window_df = scan()?.slice(20, 60).collect()?;
    assert_eq!(window_df, full_df.slice(20, 60));

    // Both optimization runs below share the same pair of arenas.
    let mut expr_arena = Arena::with_capacity(16);
    let mut lp_arena = Arena::with_capacity(8);
    let root = head_lf.optimize(&mut lp_arena, &mut expr_arena)?;
    assert!(slice_at_union(&lp_arena, root));

    let filtered_lf = scan()?
        .filter(col("sugars_g").lt(lit(1i32)))
        .slice(0, 100);
    let root = filtered_lf.optimize(&mut lp_arena, &mut expr_arena)?;
    assert!(slice_at_union(&lp_arena, root));

    Ok(())
}
489
490
/// Reads every NDJSON file matching the `foods*.ndjson` glob and checks the combined
/// shape plus the first and last `calories` values.
#[test]
#[cfg(feature = "json")]
fn test_ndjson_globbing() -> PolarsResult<()> {
    // for side effects
    init_files();

    let pattern = PlPath::new("../../examples/datasets/foods*.ndjson");
    let lf = LazyJsonLineReader::new(pattern).finish()?;
    let df = lf.collect()?;
    assert_eq!(df.shape(), (54, 4));

    let calories = df.column("calories")?;
    let first = calories.get(0)?;
    let last = calories.get(53)?;
    assert_eq!(first, AnyValue::Int64(45));
    assert_eq!(last, AnyValue::Int64(194));

    Ok(())
}
506
507
/// Collects the foods parquet scan with `limit(3)` and checks that three rows come back.
#[test]
pub fn test_simple_slice() -> PolarsResult<()> {
    let _guard = SINGLE_LOCK.lock().unwrap();

    let limited = scan_foods_parquet(false).limit(3);
    let df = limited.collect()?;
    assert_eq!(df.height(), 3);

    Ok(())
}
515
/// Builds globbed parquet, IPC and CSV scans, then filters and aggregates each one,
/// checking that a single row with three columns comes back.
#[test]
fn test_union_and_agg_projections() -> PolarsResult<()> {
    init_files();
    let _guard = SINGLE_LOCK.lock().unwrap();
    // a union vstacks columns and aggscan optimization determines columns to aggregate in a
    // hashmap, if that doesn't set them sorted the vstack will panic.

    // All three scans use the same unified arguments.
    let ignore_extra_columns = || UnifiedScanArgs {
        extra_columns_policy: ExtraColumnsPolicy::Ignore,
        ..Default::default()
    };

    let parquet_lf: LazyFrame = DslBuilder::scan_parquet(
        ScanSources::Paths([PlPath::new(GLOB_PARQUET)].into()),
        Default::default(),
        ignore_extra_columns(),
    )
    .unwrap()
    .build()
    .into();

    let ipc_lf: LazyFrame = DslBuilder::scan_ipc(
        ScanSources::Paths([PlPath::new(GLOB_IPC)].into()),
        Default::default(),
        ignore_extra_columns(),
    )
    .unwrap()
    .build()
    .into();

    let csv_lf: LazyFrame = DslBuilder::scan_csv(
        ScanSources::Paths([PlPath::new(GLOB_CSV)].into()),
        Default::default(),
        ignore_extra_columns(),
    )
    .unwrap()
    .build()
    .into();

    for scanned in [parquet_lf, ipc_lf, csv_lf] {
        let vegetables = col("category").eq(lit("vegetables"));
        let aggregations = [
            col("fats_g").sum().alias("sum"),
            col("fats_g").cast(DataType::Float64).mean().alias("mean"),
            col("fats_g").min().alias("min"),
        ];

        let out = scanned.filter(vegetables).select(aggregations).collect()?;
        assert_eq!(out.shape(), (1, 3));
    }

    Ok(())
}
570
571
/// For the CSV, parquet and IPC food scans, compares filter-then-slice applied
/// directly to the scan with the same query applied to the fully collected frame;
/// the shapes must agree.
#[test]
#[cfg(all(feature = "ipc", feature = "csv"))]
fn test_slice_filter() -> PolarsResult<()> {
    init_files();
    let _guard = SINGLE_LOCK.lock().unwrap();

    // make sure that the slices are not applied before the predicates.
    let len = 5;
    let offset = 3;

    // The query under test: keep the "fruit" rows, then take the slice.
    let fruit_window = |lf: LazyFrame| {
        lf.filter(col("category").eq(lit("fruit")))
            .slice(offset, len)
            .collect()
    };

    // Query applied directly on top of each scan.
    let csv_scanned = fruit_window(scan_foods_csv())?;
    let parquet_scanned = fruit_window(scan_foods_parquet(false))?;
    let ipc_scanned = fruit_window(scan_foods_ipc())?;

    // Same query applied to a frame that was collected first.
    let csv_collected = fruit_window(scan_foods_csv().collect()?.lazy())?;
    let parquet_collected = fruit_window(scan_foods_parquet(false).collect()?.lazy())?;
    let ipc_collected = fruit_window(scan_foods_ipc().collect()?.lazy())?;

    assert_eq!(csv_scanned.shape(), csv_collected.shape());
    assert_eq!(parquet_scanned.shape(), parquet_collected.shape());
    assert_eq!(ipc_scanned.shape(), ipc_collected.shape());

    Ok(())
}
619
620
/// Reads the foods CSV with four rows skipped and a limit of one row, then checks
/// the single value in the column named "fruit" and the result shape.
///
/// NOTE(review): the column is presumably called "fruit" because skipping rows makes
/// a data row act as the header — confirm against the CSV file.
#[test]
fn skip_rows_and_slice() -> PolarsResult<()> {
    let reader = LazyCsvReader::new(PlPath::new(FOODS_CSV)).with_skip_rows(4);
    let first_row = reader.finish()?.limit(1).collect()?;

    let value = first_row.column("fruit")?.get(0)?;
    assert_eq!(value, AnyValue::String("seafood"));
    assert_eq!(first_row.shape(), (1, 4));
    Ok(())
}
631
632
/// For row-index offsets 0 and 10, attaches an "index" column to CSV, parquet and
/// IPC scans, checks `row_index_at_scan` for each plan, and checks that the collected
/// index runs from `offset` to `offset + 26`.
#[test]
fn test_row_index_on_files() -> PolarsResult<()> {
    let _guard = SINGLE_LOCK.lock().unwrap();

    // Values of the "index" column of a collected frame.
    let index_values = |df: &DataFrame| -> PolarsResult<Vec<IdxSize>> {
        Ok(df
            .column("index")?
            .idx()?
            .into_no_null_iter()
            .collect::<Vec<_>>())
    };

    for offset in [0 as IdxSize, 10] {
        let csv_lf = LazyCsvReader::new(PlPath::new(FOODS_CSV))
            .with_row_index(Some(RowIndex {
                name: PlSmallStr::from_static("index"),
                offset,
            }))
            .finish()?;

        assert!(row_index_at_scan(csv_lf.clone()));
        let csv_df = csv_lf.collect()?;
        assert_eq!(
            index_values(&csv_df)?,
            (offset..27 + offset).collect::<Vec<_>>()
        );

        let parquet_lf = LazyFrame::scan_parquet(PlPath::new(FOODS_PARQUET), Default::default())?
            .with_row_index("index", Some(offset));
        assert!(row_index_at_scan(parquet_lf.clone()));
        let parquet_df = parquet_lf.collect()?;
        assert_eq!(
            index_values(&parquet_df)?,
            (offset..27 + offset).collect::<Vec<_>>()
        );

        let ipc_lf = LazyFrame::scan_ipc(PlPath::new(FOODS_IPC), Default::default())?
            .with_row_index("index", Some(offset));

        assert!(row_index_at_scan(ipc_lf.clone()));
        // Collect a clone: the IPC plan is reused for the filtered query below.
        let ipc_df = ipc_lf.clone().collect()?;
        assert_eq!(
            index_values(&ipc_df)?,
            (offset..27 + offset).collect::<Vec<_>>()
        );

        // Filter on the index column while selecting only a different column.
        let calories_only = ipc_lf
            .filter(col("index").gt(lit(-1)))
            .select([col("calories")])
            .collect()?;
        assert!(calories_only.column("calories").is_ok());
        assert_eq!(calories_only.shape(), (27, 1));
    }

    Ok(())
}
682
683
/// Reads the foods CSV with "0" registered as a null value for `fats_g`, then keeps
/// only the rows where `fats_g` is null and checks the result shape.
#[test]
fn scan_predicate_on_set_null_values() -> PolarsResult<()> {
    let null_values = NullValues::Named(vec![("fats_g".into(), "0".into())]);
    let lf = LazyCsvReader::new(PlPath::new(FOODS_CSV))
        .with_null_values(Some(null_values))
        .with_infer_schema_length(Some(0))
        .finish()?;

    let null_fats = lf
        .select([col("category"), col("fats_g")])
        .filter(col("fats_g").is_null())
        .collect()?;

    assert_eq!(null_fats.shape(), (12, 2));
    Ok(())
}
696
697
/// Runs a query over an anonymous scan and checks, inside the scan itself, that it
/// was handed two columns and `n_rows == Some(3)`; the collected result must have
/// shape (3, 2).
#[test]
fn scan_anonymous_fn_with_options() -> PolarsResult<()> {
    struct MyScan {}

    impl AnonymousScan for MyScan {
        fn as_any(&self) -> &dyn std::any::Any {
            self
        }

        fn allows_projection_pushdown(&self) -> bool {
            true
        }

        fn scan(&self, scan_opts: AnonymousScanArgs) -> PolarsResult<DataFrame> {
            // Panics if no column list was passed to the scan.
            let columns = scan_opts.with_columns.unwrap();
            assert_eq!(columns.len(), 2);
            assert_eq!(scan_opts.n_rows, Some(3));

            let projected = fruits_cars().select(columns.iter().cloned())?;
            let row_count = scan_opts.n_rows.unwrap();
            Ok(projected.slice(0, row_count))
        }
    }

    let scanner = Arc::new(MyScan {});

    let scan_args = ScanArgsAnonymous {
        schema: Some(fruits_cars().schema().clone()),
        ..ScanArgsAnonymous::default()
    };

    let query = LazyFrame::anonymous_scan(scanner, scan_args)?
        .with_column((col("A") * lit(2)).alias("A2"))
        .select([col("A2"), col("fruits")])
        .limit(3);

    let result = query.collect()?;

    assert_eq!(result.shape(), (3, 2));
    Ok(())
}
735
736
/// Selects `dsl::len()` over an anonymous scan. The scan asserts that it was asked
/// for exactly the column "A", and the single resulting value must equal 5 once
/// cast to `UInt32`.
#[test]
fn scan_anonymous_fn_count() -> PolarsResult<()> {
    struct MyScan {}

    impl AnonymousScan for MyScan {
        fn as_any(&self) -> &dyn std::any::Any {
            self
        }

        fn allows_projection_pushdown(&self) -> bool {
            true
        }

        fn scan(&self, scan_opts: AnonymousScanArgs) -> PolarsResult<DataFrame> {
            assert_eq!(scan_opts.with_columns.as_deref(), Some(&["A".into()][..]));

            let requested = scan_opts.with_columns.unwrap();
            let projected = fruits_cars().select(requested.iter().cloned()).unwrap();
            Ok(projected)
        }
    }

    let scanner = Arc::new(MyScan {});

    let scan_args = ScanArgsAnonymous {
        schema: Some(fruits_cars().schema().clone()),
        ..ScanArgsAnonymous::default()
    };

    let df = LazyFrame::anonymous_scan(scanner, scan_args)?
        .select(&[dsl::len()])
        .collect()
        .unwrap();

    let columns = df.get_columns();
    assert_eq!(columns.len(), 1);

    let count_column = &columns[0];
    assert_eq!(count_column.len(), 1);

    let count_as_u32 = count_column.cast(&DataType::UInt32).unwrap();
    assert_eq!(
        count_as_u32.as_materialized_series().first(),
        Scalar::new(DataType::UInt32, AnyValue::UInt32(5))
    );

    Ok(())
}
783
784
/// Reads the `sugars_g` column of the foods CSV with its dtype overwritten to each
/// of the 8- and 16-bit integer types, and checks the resulting frame's dtype.
#[test]
#[cfg(feature = "dtype-full")]
fn scan_small_dtypes() -> PolarsResult<()> {
    for dt in [
        DataType::Int8,
        DataType::UInt8,
        DataType::Int16,
        DataType::UInt16,
    ] {
        let overwrite = Schema::from_iter([Field::new("sugars_g".into(), dt.clone())]);
        let lf = LazyCsvReader::new(PlPath::new(FOODS_CSV))
            .with_has_header(true)
            .with_dtype_overwrite(Some(Arc::new(overwrite)))
            .finish()?;
        let df = lf.select(&[col("sugars_g")]).collect()?;

        assert_eq!(df.dtypes(), &[dt]);
    }
    Ok(())
}
808
809