Book a Demo!
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Policies · Sign Up · Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-lazy/src/tests/io.rs
8446 views
1
use polars_io::RowIndex;
2
#[cfg(feature = "is_between")]
3
use polars_ops::prelude::ClosedInterval;
4
use polars_utils::pl_path::PlRefPath;
5
use polars_utils::slice_enum::Slice;
6
7
use super::*;
8
use crate::dsl;
9
10
/// Runs a filter, a projection, and a projection followed by a filter against
/// the foods parquet scan, once for each value of the flag accepted by
/// `scan_foods_parquet`, and checks the shape of every result.
#[test]
#[cfg(feature = "parquet")]
fn test_parquet_exec() -> PolarsResult<()> {
    let _guard = SINGLE_LOCK.lock().unwrap();

    // Building blocks shared by the three scenarios below.
    let is_seafood = || col("category").eq(lit("seafood"));
    let category_and_sugars = || [col("category"), col("sugars_g")];

    // filter
    for parallel in [true, false] {
        let seafood = scan_foods_parquet(parallel)
            .filter(is_seafood())
            .collect()?;
        assert_eq!(seafood.shape(), (8, 4));
    }

    // project
    for parallel in [true, false] {
        let projected = scan_foods_parquet(parallel)
            .select(category_and_sugars())
            .collect()?;
        assert_eq!(projected.shape(), (27, 2));
    }

    // project + filter
    for parallel in [true, false] {
        let projected_seafood = scan_foods_parquet(parallel)
            .select(category_and_sugars())
            .filter(is_seafood())
            .collect()?;
        assert_eq!(projected_seafood.shape(), (8, 2));
    }

    Ok(())
}
41
42
/// Checks that predicates which do match rows of the foods parquet file return
/// exactly the expected number of rows (all four columns are always kept).
#[test]
#[cfg(all(feature = "parquet", feature = "is_between"))]
fn test_parquet_statistics_no_skip() {
    let _guard = SINGLE_LOCK.lock().unwrap();
    init_files();
    let par = true;

    // Each entry pairs a predicate with the number of rows expected to pass it.
    let cases = [
        (col("calories").gt(lit(0i32)), 27),
        (col("calories").lt(lit(1000i32)), 27),
        (lit(0i32).lt(col("calories")), 27),
        (lit(1000i32).gt(col("calories")), 27),
        // statistics and `is_between`
        // normal case
        (
            col("calories").is_between(40, 300, ClosedInterval::Both),
            19,
        ),
        // normal case
        (
            col("calories").is_between(10, 50, ClosedInterval::Both),
            11,
        ),
        // edge case: 20 = min(calories) but the right end is closed
        (
            col("calories").is_between(5, 20, ClosedInterval::Right),
            1,
        ),
        // edge case: 200 = max(calories) but the left end is closed
        (
            col("calories").is_between(200, 250, ClosedInterval::Left),
            3,
        ),
        // edge case: left == right but both ends are closed
        (
            col("calories").is_between(200, 200, ClosedInterval::Both),
            3,
        ),
        // Or operation
        (
            col("sugars_g")
                .lt(lit(0i32))
                .or(col("fats_g").lt(lit(1000.0))),
            27,
        ),
    ];

    for (predicate, expected_rows) in cases {
        let out = scan_foods_parquet(par)
            .filter(predicate.clone())
            .collect()
            .unwrap();
        assert_eq!(out.shape(), (expected_rows, 4), "predicate: {predicate:?}");
    }
}
115
116
/// Checks that predicates which cannot match any row yield an empty frame while
/// the `POLARS_PANIC_IF_PARQUET_PARSED` environment variable is set.
///
/// NOTE(review): judging by its name, the variable presumably makes the reader
/// panic if parquet data is actually decoded, so passing would require the
/// predicate to be resolved without reading the data — confirm against the reader.
///
/// The variable is removed by a drop guard, so it is also cleaned up when a
/// `?` returns early or an assertion panics.
#[test]
#[cfg(all(feature = "parquet", feature = "is_between"))]
fn test_parquet_statistics() -> PolarsResult<()> {
    /// Removes `POLARS_PANIC_IF_PARQUET_PARSED` when dropped.
    struct PanicIfParsedGuard;

    impl Drop for PanicIfParsedGuard {
        fn drop(&mut self) {
            // SAFETY: NOTE(review) — relies on `SINGLE_LOCK` still being held here so
            // that no other locked test touches the environment concurrently; tests
            // that do not take the lock are not covered by this.
            unsafe { std::env::remove_var("POLARS_PANIC_IF_PARQUET_PARSED") };
        }
    }

    let _guard = SINGLE_LOCK.lock().unwrap();
    init_files();
    // SAFETY: same assumption as in `PanicIfParsedGuard::drop`.
    unsafe { std::env::set_var("POLARS_PANIC_IF_PARQUET_PARSED", "1") };
    // Declared after `_guard`, so it is dropped (and the variable removed) before
    // the lock is released.
    let _env_guard = PanicIfParsedGuard;
    let par = true;

    // Test single predicates
    let impossible_on_foods = [
        col("calories").lt(lit(0i32)),
        col("calories").gt(lit(1000)),
        lit(0i32).gt(col("calories")),
        // issue: 13427
        col("calories").is_in(lit(Series::new("".into(), [0, 500])), false),
        // statistics and `is_between`
        // 15 < min(calories)=20
        col("calories").is_between(5, 15, ClosedInterval::Both),
        // 300 > max(calories)=200
        col("calories").is_between(300, 500, ClosedInterval::Both),
        // 20 == min(calories) but right end is open
        col("calories").is_between(5, 20, ClosedInterval::Left),
        // 20 == min(calories) but both ends are open
        col("calories").is_between(5, 20, ClosedInterval::None),
        // 200 == max(calories) but left end is open
        col("calories").is_between(200, 250, ClosedInterval::Right),
        // 200 == max(calories) but both ends are open
        col("calories").is_between(200, 250, ClosedInterval::None),
        // between(100, 40) is impossible
        col("calories").is_between(100, 40, ClosedInterval::Both),
        // with strings
        col("category").is_between(lit("yams"), lit("zest"), ClosedInterval::Both),
        // with strings
        col("category").is_between(lit("dairy"), lit("eggs"), ClosedInterval::Both),
        lit(1000i32).lt(col("calories")),
        // not(a > b) => a <= b
        not(col("calories").gt(5)),
        // not(a >= b) => a < b
        // note that min(calories)=20
        not(col("calories").gt_eq(20)),
        // not(a < b) => a >= b
        not(col("calories").lt(250)),
        // not(a <= b) => a > b
        // note that max(calories)=200
        not(col("calories").lt_eq(200)),
        // not(col(c) is between [a, b]) => col(c) < a or col(c) > b
        not(col("calories").is_between(20, 200, ClosedInterval::Both)),
        // not(col(c) is between [a, b[) => col(c) < a or col(c) >= b
        not(col("calories").is_between(20, 201, ClosedInterval::Left)),
        // not(col(c) is between ]a, b]) => col(c) <= a or col(c) > b
        not(col("calories").is_between(19, 200, ClosedInterval::Right)),
        // not(col(c) is between ]a, b[) => col(c) <= a or col(c) >= b
        not(col("calories").is_between(19, 201, ClosedInterval::None)),
        // not (a or b) => not(a) and not(b)
        // note that not(fats_g <= 9) is possible; not(calories > 5) should allow us skip the rg
        not(col("calories").gt(5).or(col("fats_g").lt_eq(9))),
        // not (a and b) => not(a) or not(b)
        not(col("calories").gt(5).and(col("fats_g").lt_eq(12))),
    ];
    for predicate in impossible_on_foods {
        let out = scan_foods_parquet(par)
            .filter(predicate.clone())
            .collect()?;
        assert_eq!(out.shape(), (0, 4), "predicate: {predicate:?}");
    }

    let impossible_on_nutri_score = [
        // not(a == b) => a != b
        // note that proteins_g=10 for all rows
        not(col("proteins_g").eq(10)),
        // not(a != b) => a == b
        // note that proteins_g=10 for all rows
        not(col("proteins_g").neq(5)),
        // is_not_null
        col("nutri_score").is_not_null(),
        // not(is_null) (~pl.col('nutri_score').is_null())
        not(col("nutri_score").is_null()),
    ];
    for predicate in impossible_on_nutri_score {
        let out = scan_nutri_score_null_column_parquet(par)
            .filter(predicate.clone())
            .collect()?;
        assert_eq!(out.shape(), (0, 6), "predicate: {predicate:?}");
    }

    // Test multiple predicates

    // And operation
    let out = scan_foods_parquet(par)
        .filter(col("calories").lt(lit(0i32)))
        .filter(col("calories").gt(lit(1000)))
        .collect()?;
    assert_eq!(out.shape(), (0, 4));

    let out = scan_foods_parquet(par)
        .filter(col("calories").lt(lit(0i32)))
        .filter(col("calories").gt(lit(1000)))
        .filter(col("calories").lt(lit(50i32)))
        .collect()?;
    assert_eq!(out.shape(), (0, 4));

    let out = scan_foods_parquet(par)
        .filter(
            col("calories")
                .lt(lit(0i32))
                .and(col("fats_g").lt(lit(0.0))),
        )
        .collect()?;
    assert_eq!(out.shape(), (0, 4));

    // Or operation
    let out = scan_foods_parquet(par)
        .filter(
            col("sugars_g")
                .lt(lit(0i32))
                .or(col("fats_g").gt(lit(1000.0))),
        )
        .collect()?;
    assert_eq!(out.shape(), (0, 4));

    Ok(())
}
360
361
/// Scans every file matching the `foods*.parquet` glob and checks the combined
/// shape plus the first and last `calories` values.
#[test]
#[cfg(not(target_os = "windows"))]
fn test_parquet_globbing() -> PolarsResult<()> {
    // for side effects
    init_files();
    let _guard = SINGLE_LOCK.lock().unwrap();

    let scan_args = ScanArgsParquet {
        n_rows: None,
        cache: true,
        parallel: Default::default(),
        ..Default::default()
    };
    let pattern = PlRefPath::new("../../examples/datasets/foods*.parquet");
    let df = LazyFrame::scan_parquet(pattern, scan_args)?.collect()?;

    assert_eq!(df.shape(), (54, 4));

    let calories = df.column("calories")?;
    assert_eq!(calories.get(0)?, AnyValue::Int64(45));
    assert_eq!(calories.get(53)?, AnyValue::Int64(194));

    Ok(())
}
385
386
/// Checks that a `limit(3)` applied to a globbed parquet scan shows up in the
/// optimized plan: every `IR::Union` must carry a slice of length 3 and every
/// `IR::Scan` must have `pre_slice` equal to `Slice::Positive { offset: 0, len: 3 }`.
#[test]
fn test_scan_parquet_limit_9001() {
    init_files();
    let args = ScanArgsParquet {
        n_rows: Some(10000),
        cache: false,
        rechunk: true,
        ..Default::default()
    };
    let q = LazyFrame::scan_parquet(PlRefPath::new(GLOB_PARQUET), args)
        .unwrap()
        .limit(3);
    let IRPlan {
        lp_top, lp_arena, ..
    } = q.to_alp_optimized().unwrap();

    // The boolean returned by `all` used to be dropped, so the scan-side check
    // never influenced the test outcome; it is now asserted.
    // NOTE(review): this newly enforces the `IR::Scan` condition — confirm it
    // holds on the current optimizer.
    let limit_in_plan = lp_arena.iter(lp_top).all(|(_, lp)| match lp {
        IR::Union { options, .. } => options.slice.is_some_and(|slice| slice.1 == 3),
        IR::Scan {
            unified_scan_args, ..
        } => unified_scan_args.pre_slice == Some(Slice::Positive { offset: 0, len: 3 }),
        _ => true,
    });
    assert!(
        limit_in_plan,
        "expected the limit of 3 on every union and scan node of the optimized plan"
    );
}
413
414
/// Scans every file matching the `foods*.ipc` glob and checks the combined
/// shape plus the first and last `calories` values.
#[test]
#[cfg(not(target_os = "windows"))]
fn test_ipc_globbing() -> PolarsResult<()> {
    // for side effects
    init_files();

    let scan_args = UnifiedScanArgs {
        cache: true,
        glob: true,
        ..Default::default()
    };
    let pattern = PlRefPath::new("../../examples/datasets/foods*.ipc");
    let df = LazyFrame::scan_ipc(pattern, Default::default(), scan_args)?.collect()?;

    assert_eq!(df.shape(), (54, 4));

    let calories = df.column("calories")?;
    assert_eq!(calories.get(0)?, AnyValue::Int64(45));
    assert_eq!(calories.get(53)?, AnyValue::Int64(194));

    Ok(())
}
437
438
fn slice_at_union(lp_arena: &Arena<IR>, lp: Node) -> bool {
439
lp_arena.iter(lp).all(|(_, lp)| {
440
if let IR::Union { options, .. } = lp {
441
options.slice.is_some()
442
} else {
443
true
444
}
445
})
446
}
447
448
/// Reads every file matching the `foods*.csv` glob, compares sliced lazy scans
/// against slices of the fully collected frame, and checks via
/// `slice_at_union` that the optimized plans carry the slice on their unions.
#[test]
fn test_csv_globbing() -> PolarsResult<()> {
    let glob = "../../examples/datasets/foods*.csv";
    // Fresh lazy scan over the glob; every scenario below starts from one.
    let scan = || LazyCsvReader::new(PlRefPath::new(glob)).finish();

    let full_df = scan()?.collect()?;

    // all 5 files * 27 rows
    assert_eq!(full_df.shape(), (135, 4));
    let calories = full_df.column("calories")?;
    assert_eq!(calories.get(0)?, AnyValue::Int64(45));
    assert_eq!(calories.get(53)?, AnyValue::Int64(194));

    let head = scan()?.slice(0, 100);
    let head_df = head.clone().collect()?;
    assert_eq!(head_df, full_df.slice(0, 100));

    let window_df = scan()?.slice(20, 60).collect()?;
    assert_eq!(window_df, full_df.slice(20, 60));

    let mut expr_arena = Arena::with_capacity(16);
    let mut lp_arena = Arena::with_capacity(8);
    let root = head.optimize(&mut lp_arena, &mut expr_arena)?;
    assert!(slice_at_union(&lp_arena, root));

    let filtered_head = scan()?
        .filter(col("sugars_g").lt(lit(1i32)))
        .slice(0, 100);
    let root = filtered_head.optimize(&mut lp_arena, &mut expr_arena)?;
    assert!(slice_at_union(&lp_arena, root));

    Ok(())
}
488
489
/// Reads every file matching the `foods*.ndjson` glob and checks the combined
/// shape plus the first and last `calories` values.
#[test]
#[cfg(feature = "json")]
fn test_ndjson_globbing() -> PolarsResult<()> {
    // for side effects
    init_files();

    let pattern = PlRefPath::new("../../examples/datasets/foods*.ndjson");
    let reader = LazyJsonLineReader::new(pattern);
    let df = reader.finish()?.collect()?;

    assert_eq!(df.shape(), (54, 4));

    let calories = df.column("calories")?;
    assert_eq!(calories.get(0)?, AnyValue::Int64(45));
    assert_eq!(calories.get(53)?, AnyValue::Int64(194));

    Ok(())
}
505
506
/// Applies `limit(3)` to the foods parquet scan (created with its flag set to
/// `false`) and checks that exactly three rows come back.
#[test]
pub fn test_simple_slice() -> PolarsResult<()> {
    let _guard = SINGLE_LOCK.lock().unwrap();

    let first_three = scan_foods_parquet(false).limit(3);
    let collected = first_three.collect()?;
    assert_eq!(collected.height(), 3);

    Ok(())
}
514
/// Runs the same filter + aggregation query over globbed parquet, IPC and CSV
/// scans and checks that each produces a single row with three columns.
#[test]
fn test_union_and_agg_projections() -> PolarsResult<()> {
    init_files();
    let _guard = SINGLE_LOCK.lock().unwrap();

    // a union vstacks columns and aggscan optimization determines columns to aggregate in a
    // hashmap, if that doesn't set them sorted the vstack will panic.

    // All three scans share the same unified arguments.
    let ignore_extra_columns = || UnifiedScanArgs {
        extra_columns_policy: ExtraColumnsPolicy::Ignore,
        ..Default::default()
    };

    let parquet_lf: LazyFrame = DslBuilder::scan_parquet(
        ScanSources::Paths(FromIterator::from_iter([PlRefPath::new(GLOB_PARQUET)])),
        ParquetOptions::default(),
        ignore_extra_columns(),
    )
    .unwrap()
    .build()
    .into();

    let ipc_lf: LazyFrame = DslBuilder::scan_ipc(
        ScanSources::Paths(FromIterator::from_iter([PlRefPath::new(GLOB_IPC)])),
        IpcScanOptions::default(),
        ignore_extra_columns(),
    )
    .unwrap()
    .build()
    .into();

    let csv_lf: LazyFrame = DslBuilder::scan_csv(
        ScanSources::Paths(FromIterator::from_iter([PlRefPath::new(GLOB_CSV)])),
        CsvReadOptions::default(),
        ignore_extra_columns(),
    )
    .unwrap()
    .build()
    .into();

    for lf in [parquet_lf, ipc_lf, csv_lf] {
        let aggregated = lf
            .filter(col("category").eq(lit("vegetables")))
            .select([
                col("fats_g").sum().alias("sum"),
                col("fats_g").cast(DataType::Float64).mean().alias("mean"),
                col("fats_g").min().alias("min"),
            ])
            .collect()?;
        assert_eq!(aggregated.shape(), (1, 3));
    }

    Ok(())
}
571
572
/// For the CSV, parquet and IPC foods scans, compares the shape of
/// filter + slice applied directly on the scan with the shape obtained by
/// collecting first and then applying the same filter + slice.
#[test]
#[cfg(all(feature = "ipc", feature = "csv"))]
fn test_slice_filter() -> PolarsResult<()> {
    init_files();
    let _guard = SINGLE_LOCK.lock().unwrap();

    // make sure that the slices are not applied before the predicates.
    let len = 5;
    let offset = 3;

    // Same filter + slice for every input frame.
    let fruit_window = |lf: LazyFrame| {
        lf.filter(col("category").eq(lit("fruit")))
            .slice(offset, len)
            .collect()
    };

    // Applied directly on the file scans.
    let csv_scan = fruit_window(scan_foods_csv())?;
    let parquet_scan = fruit_window(scan_foods_parquet(false))?;
    let ipc_scan = fruit_window(scan_foods_ipc())?;

    // Applied on frames that were collected first.
    let csv_mem = fruit_window(scan_foods_csv().collect()?.lazy())?;
    let parquet_mem = fruit_window(scan_foods_parquet(false).collect()?.lazy())?;
    let ipc_mem = fruit_window(scan_foods_ipc().collect()?.lazy())?;

    assert_eq!(csv_scan.shape(), csv_mem.shape());
    assert_eq!(parquet_scan.shape(), parquet_mem.shape());
    assert_eq!(ipc_scan.shape(), ipc_mem.shape());

    Ok(())
}
620
621
/// Reads the foods CSV with four rows skipped and a limit of one row, then
/// checks the single resulting row.
#[test]
fn skip_rows_and_slice() -> PolarsResult<()> {
    let reader = LazyCsvReader::new(PlRefPath::new(FOODS_CSV)).with_skip_rows(4);
    let first_row = reader.finish()?.limit(1).collect()?;

    // NOTE(review): the column is named `fruit` here, presumably because the
    // first unskipped line is taken as the header — confirm.
    let category = first_row.column("fruit")?;
    assert_eq!(category.get(0)?, AnyValue::String("seafood"));
    assert_eq!(first_row.shape(), (1, 4));
    Ok(())
}
632
633
/// For offsets 0 and 10, attaches a row index named `index` to the CSV,
/// parquet and IPC foods scans, checks via `row_index_at_scan` how the plan
/// carries it, and checks that the collected index runs from `offset` to
/// `offset + 26`.
#[test]
fn test_row_index_on_files() -> PolarsResult<()> {
    let _guard = SINGLE_LOCK.lock().unwrap();

    let offsets: [IdxSize; 2] = [0, 10];
    for offset in offsets {
        let expected_index: Vec<_> = (offset..27 + offset).collect();
        // Compares the `index` column of a collected frame with the expected range.
        let assert_index = |df: &DataFrame| -> PolarsResult<()> {
            let index: Vec<_> = df.column("index")?.idx()?.into_no_null_iter().collect();
            assert_eq!(index, expected_index);
            Ok(())
        };

        // CSV: the row index is configured on the reader.
        let row_index = RowIndex {
            name: PlSmallStr::from_static("index"),
            offset,
        };
        let csv_lf = LazyCsvReader::new(PlRefPath::new(FOODS_CSV))
            .with_row_index(Some(row_index))
            .finish()?;
        assert!(row_index_at_scan(csv_lf.clone()));
        assert_index(&csv_lf.collect()?)?;

        // Parquet: the row index is added on the lazy frame.
        let parquet_lf =
            LazyFrame::scan_parquet(PlRefPath::new(FOODS_PARQUET), Default::default())?
                .with_row_index("index", Some(offset));
        assert!(row_index_at_scan(parquet_lf.clone()));
        assert_index(&parquet_lf.collect()?)?;

        // IPC: the row index is added on the lazy frame.
        let ipc_lf = LazyFrame::scan_ipc(
            PlRefPath::new(FOODS_IPC),
            Default::default(),
            Default::default(),
        )?
        .with_row_index("index", Some(offset));
        assert!(row_index_at_scan(ipc_lf.clone()));
        assert_index(&ipc_lf.clone().collect()?)?;

        // Filtering on the index and projecting another column must still work.
        let calories_only = ipc_lf
            .filter(col("index").gt(lit(-1)))
            .select([col("calories")])
            .collect()?;
        assert!(calories_only.column("calories").is_ok());
        assert_eq!(calories_only.shape(), (27, 1));
    }

    Ok(())
}
687
688
/// Reads the foods CSV with `"0"` registered as a null value for `fats_g`,
/// then filters on `fats_g` being null and checks the resulting shape.
#[test]
fn scan_predicate_on_set_null_values() -> PolarsResult<()> {
    let zero_fat_is_null = NullValues::Named(vec![("fats_g".into(), "0".into())]);

    let null_fats = LazyCsvReader::new(PlRefPath::new(FOODS_CSV))
        .with_null_values(Some(zero_fat_is_null))
        .with_infer_schema_length(Some(0))
        .finish()?
        .select([col("category"), col("fats_g")])
        .filter(col("fats_g").is_null())
        .collect()?;

    assert_eq!(null_fats.shape(), (12, 2));
    Ok(())
}
701
702
/// Runs a query with a derived column, a two-column projection and a limit of
/// three over an anonymous scan. The scan itself asserts that it receives two
/// columns and `n_rows == Some(3)`.
#[test]
fn scan_anonymous_fn_with_options() -> PolarsResult<()> {
    struct MyScan {}

    impl AnonymousScan for MyScan {
        fn as_any(&self) -> &dyn std::any::Any {
            self
        }

        fn allows_projection_pushdown(&self) -> bool {
            true
        }

        fn scan(&self, scan_opts: AnonymousScanArgs) -> PolarsResult<DataFrame> {
            // Both options must have reached the scan.
            let columns = scan_opts.with_columns.unwrap();
            assert_eq!(columns.len(), 2);
            assert_eq!(scan_opts.n_rows, Some(3));

            let row_limit = scan_opts.n_rows.unwrap();
            let projected = fruits_cars().select(columns.iter().cloned())?;
            Ok(projected.slice(0, row_limit))
        }
    }

    let scan_args = ScanArgsAnonymous {
        schema: Some(fruits_cars().schema().clone()),
        ..ScanArgsAnonymous::default()
    };

    let query = LazyFrame::anonymous_scan(Arc::new(MyScan {}), scan_args)?
        .with_column((col("A") * lit(2)).alias("A2"))
        .select([col("A2"), col("fruits")])
        .limit(3);

    let result = query.collect()?;
    assert_eq!(result.shape(), (3, 2));
    Ok(())
}
740
741
/// Selects `len()` over an anonymous scan and checks that the result is one
/// column holding the single value 5. The scan itself asserts that only
/// column `A` was requested from it.
#[test]
fn scan_anonymous_fn_count() -> PolarsResult<()> {
    struct MyScan {}

    impl AnonymousScan for MyScan {
        fn as_any(&self) -> &dyn std::any::Any {
            self
        }

        fn allows_projection_pushdown(&self) -> bool {
            true
        }

        fn scan(&self, scan_opts: AnonymousScanArgs) -> PolarsResult<DataFrame> {
            assert_eq!(scan_opts.with_columns.as_deref(), Some(&["A".into()][..]));

            let requested = scan_opts.with_columns.unwrap();
            let projected = fruits_cars().select(requested.iter().cloned()).unwrap();
            Ok(projected)
        }
    }

    let scan_args = ScanArgsAnonymous {
        schema: Some(fruits_cars().schema().clone()),
        ..ScanArgsAnonymous::default()
    };

    let df = LazyFrame::anonymous_scan(Arc::new(MyScan {}), scan_args)?
        .select(&[dsl::len()])
        .collect()
        .unwrap();

    let columns = df.columns();
    assert_eq!(columns.len(), 1);

    let count = &columns[0];
    assert_eq!(count.len(), 1);
    assert_eq!(
        count
            .cast(&DataType::UInt32)
            .unwrap()
            .as_materialized_series()
            .first(),
        Scalar::new(DataType::UInt32, AnyValue::UInt32(5))
    );

    Ok(())
}
788
789
/// Overrides the dtype of `sugars_g` with each 8- and 16-bit integer type in
/// turn and checks that the scanned column comes back with that dtype.
#[test]
#[cfg(feature = "dtype-full")]
fn scan_small_dtypes() -> PolarsResult<()> {
    for dt in [
        DataType::Int8,
        DataType::UInt8,
        DataType::Int16,
        DataType::UInt16,
    ] {
        let overwrite = Schema::from_iter([Field::new("sugars_g".into(), dt.clone())]);

        let df = LazyCsvReader::new(PlRefPath::new(FOODS_CSV))
            .with_has_header(true)
            .with_dtype_overwrite(Some(Arc::new(overwrite)))
            .finish()?
            .select(&[col("sugars_g")])
            .collect()?;

        assert_eq!(df.dtypes(), &[dt]);
    }
    Ok(())
}
813
814