Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/lazyframe/test_lazyframe.py
8446 views
1
from __future__ import annotations
2
3
import re
4
from datetime import date, datetime
5
from functools import reduce
6
from inspect import signature
7
from operator import add
8
from string import ascii_letters
9
from typing import TYPE_CHECKING, Any, NoReturn, cast
10
11
import numpy as np
12
import pytest
13
14
import polars as pl
15
import polars.selectors as cs
16
from polars import lit, when
17
from polars.exceptions import (
18
InvalidOperationError,
19
PerformanceWarning,
20
PolarsInefficientMapWarning,
21
)
22
from polars.testing import assert_frame_equal, assert_series_equal
23
from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES
24
25
if TYPE_CHECKING:
26
from collections.abc import Callable
27
28
from _pytest.capture import CaptureFixture
29
30
from polars._typing import MapElementsStrategy, PolarsDataType
31
from tests.conftest import PlMonkeyPatch
32
33
34
def test_init_signature_match() -> None:
    """Eager and lazy frame constructors must expose identical signatures.

    If this fails, a parameter was added to one constructor but not the
    other; either fix the mismatch or add an explicit exemption here,
    with an explanation.
    """
    eager_sig = signature(pl.DataFrame.__init__)
    lazy_sig = signature(pl.LazyFrame.__init__)
    assert eager_sig == lazy_sig
39
40
41
def test_lazy_misc() -> None:
    """Smoke-test basic lazy operations: with_columns/select and when/then."""
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    _ = ldf.with_columns(pl.lit(1).alias("foo")).select([pl.col("a"), pl.col("foo")])

    # verify the conditional expression executes without error
    conditional = (
        when(pl.col("a") > pl.lit(2)).then(pl.lit(10)).otherwise(pl.lit(1)).alias("new")
    )
    _ = ldf.with_columns(conditional).collect()
49
50
51
def test_implode() -> None:
    """Implode inside a group_by agg wraps each group's values in a list column."""
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    eager = (
        ldf.group_by(pl.col("a").alias("grp"), maintain_order=True)
        .agg(pl.implode("a", "b").name.suffix("_imp"))
        .collect()
    )
    # every group contains exactly one row, so each imploded list is a singleton
    assert_frame_equal(
        eager,
        pl.DataFrame(
            {
                "grp": [1, 2, 3],
                "a_imp": [[1], [2], [3]],
                "b_imp": [[1.0], [2.0], [3.0]],
            }
        ),
    )
68
69
70
def test_lazyframe_membership_operator() -> None:
    """`in` tests column membership; boolean coercion must raise."""
    ldf = pl.LazyFrame({"name": ["Jane", "John"], "age": [20, 30]})
    assert "name" in ldf
    assert "phone" not in ldf

    # a LazyFrame has no defined truthiness
    with pytest.raises(TypeError, match="ambiguous"):
        not ldf
78
79
80
def test_apply() -> None:
    """map_batches/map_elements must match the equivalent native expression."""
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    new = ldf.with_columns_seq(
        pl.col("a").map_batches(lambda s: s * 2, return_dtype=pl.Int64).alias("foo")
    )
    expected = ldf.clone().with_columns((pl.col("a") * 2).alias("foo"))
    assert_frame_equal(new, expected)
    assert_frame_equal(new.collect(), expected.collect())

    # map_elements with a trivially-expressible lambda emits an inefficiency
    # warning for both execution strategies
    ldf = pl.LazyFrame({"a": [1, 2, 3] * 20, "b": [1.0, 2.0, 3.0] * 20})
    strategy: MapElementsStrategy
    for strategy in ("thread_local", "threading"):
        with pytest.warns(
            PolarsInefficientMapWarning,
            match="with this one instead",
        ):
            df_new = ldf.with_columns(
                pl.col("a")
                .map_elements(lambda s: s * 2, strategy=strategy, return_dtype=pl.Int64)
                .alias("foo")
            ).collect()

        df_expected = ldf.clone().with_columns((pl.col("a") * 2).alias("foo")).collect()
        assert_frame_equal(df_new, df_expected)
104
105
106
def test_add_eager_column() -> None:
    """A materialized Series can be added to a LazyFrame via pl.lit."""
    lf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    assert lf.collect_schema().len() == 2

    series_col = pl.Series("c", [1, 2, 3])
    result = lf.with_columns(pl.lit(series_col)).collect()
    assert result["c"].sum() == 6
    assert result.collect_schema().len() == 3
113
114
115
def test_set_null() -> None:
    """A when/then branch with a None literal produces nulls in the output."""
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    masked = when(pl.col("a") > 1).then(lit(None)).otherwise(100).alias("foo")
    result = ldf.with_columns(masked).collect()["foo"]
    assert result[0] == 100
    assert result[1] is None
    assert result[2] is None
124
125
126
def test_gather_every() -> None:
    """gather_every keeps every n-th row, starting at an optional offset."""
    ldf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})
    assert_frame_equal(
        pl.DataFrame({"a": [1, 3], "b": ["w", "y"]}),
        ldf.gather_every(2).collect(),
    )
    assert_frame_equal(
        pl.DataFrame({"a": [2, 4], "b": ["x", "z"]}),
        ldf.gather_every(2, offset=1).collect(),
    )
132
133
134
def test_agg() -> None:
    """Frame-level min() aggregates every column down to a single row."""
    lazy_min = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]}).lazy().min()
    result = lazy_min.collect()
    assert result.shape == (1, 2)
    assert result.row(0) == (1, 1.0)
140
141
142
def test_count_suffix_10783() -> None:
    """Regression test for #10783: name.suffix after len().over() is applied."""
    df = pl.DataFrame(
        {
            "a": [["a", "c", "b"], ["a", "b", "c"], ["a", "d", "c"], ["c", "a", "b"]],
            "b": [["a", "c", "b"], ["a", "b", "c"], ["a", "d", "c"], ["c", "a", "b"]],
        }
    )
    # window over a hash of the sorted+joined list, grouping order-insensitive lists
    df_with_cnt = df.with_columns(
        pl.len()
        .over(pl.col("a").list.sort().list.join("").hash())
        .name.suffix("_suffix")
    )
    df_expect = df.with_columns(pl.Series("len_suffix", [3, 3, 1, 3]))
    assert_frame_equal(df_with_cnt, df_expect, check_dtypes=False)
156
157
158
def test_or() -> None:
    """Filter keeps rows where either of two predicates holds."""
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    either = (pl.col("a") == 1) | (pl.col("b") > 2)
    assert ldf.filter(either).collect().rows() == [(1, 1.0), (3, 3.0)]
162
163
164
def test_filter_str() -> None:
    """A bare string predicate is interpreted as a boolean column name."""
    # use a str instead of a column expr
    ldf = pl.LazyFrame(
        {
            "time": ["11:11:00", "11:12:00", "11:13:00", "11:14:00"],
            "bools": [True, False, True, False],
        }
    )

    # last row based on a filter (column expression)
    result = ldf.filter(pl.col("bools")).select_seq(pl.last("*")).collect()
    expected = pl.DataFrame({"time": ["11:13:00"], "bools": [True]})
    assert_frame_equal(result, expected)

    # last row based on a filter (bare column name)
    result = ldf.filter("bools").select(pl.last("*")).collect()
    assert_frame_equal(result, expected)
181
182
183
def test_filter_multiple_predicates() -> None:
    """Multiple filter predicates (positional, list, kwargs) are AND-combined."""
    ldf = pl.LazyFrame(
        {
            "a": [1, 1, 1, 2, 2],
            "b": [1, 1, 2, 2, 2],
            "c": [1, 1, 2, 3, 4],
        }
    )

    # multiple predicates
    expected = pl.DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "c": [1, 1, 2]})
    for out in (
        ldf.filter(pl.col("a") == 1, pl.col("b") <= 2),  # positional/splat
        ldf.filter([pl.col("a") == 1, pl.col("b") <= 2]),  # as list
    ):
        assert_frame_equal(out.collect(), expected)

    # multiple kwargs (each is an equality constraint on the named column)
    assert_frame_equal(
        ldf.filter(a=1, b=2).collect(),
        pl.DataFrame({"a": [1], "b": [2], "c": [2]}),
    )

    # both positional and keyword args
    assert_frame_equal(
        ldf.filter(pl.col("c") < 4, a=2, b=2).collect(),
        pl.DataFrame({"a": [2], "b": [2], "c": [3]}),
    )

    # a kwarg whose name is an actual column ("predicate") still filters on it
    ldf = pl.LazyFrame(
        {
            "description": ["eq", "gt", "ge"],
            "predicate": ["==", ">", ">="],
        },
    )
    assert ldf.filter(predicate="==").select("description").collect().item() == "eq"
219
220
221
@pytest.mark.parametrize(
    "predicate",
    [
        # expression predicates, as list and as (single-use) iterator
        [pl.lit(True)],
        iter([pl.lit(True)]),
        # plain boolean masks, as list and as iterator
        [True, True, True],
        iter([True, True, True]),
        # generator expressions yielding one or more expressions
        (p for p in (pl.col("c") < 9,)),
        (p for p in (pl.col("a") > 0, pl.col("b") > 0)),
    ],
)
def test_filter_seq_iterable_all_true(predicate: Any) -> None:
    """Iterable predicates that are all-true leave the frame unchanged."""
    ldf = pl.LazyFrame(
        {
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "c": [3, 1, 2],
        }
    )
    assert_frame_equal(ldf, ldf.filter(predicate))
241
242
243
def test_apply_custom_function() -> None:
    """Custom map_elements group aggregations agree with the native count."""
    ldf = pl.LazyFrame(
        {
            "A": [1, 2, 3, 4, 5],
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
            "B": [5, 4, 3, 2, 1],
            "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
        }
    )

    # two ways to determine the length groups.
    df = (
        ldf.group_by("fruits")
        .agg(
            [
                pl.col("cars")
                .implode()
                .map_elements(lambda groups: groups.len(), return_dtype=pl.Int64)
                .alias("custom_1"),
                pl.col("cars")
                .implode()
                .map_elements(lambda groups: groups.len(), return_dtype=pl.Int64)
                .alias("custom_2"),
                pl.count("cars").alias("cars_count"),
            ]
        )
        .sort("custom_1", descending=True)
    ).collect()

    expected = pl.DataFrame(
        {
            "fruits": ["banana", "apple"],
            "custom_1": [3, 2],
            "custom_2": [3, 2],
            "cars_count": [3, 2],
        }
    )
    # pl.count returns the platform index dtype, not Int64
    expected = expected.with_columns(pl.col("cars_count").cast(pl.get_index_type()))
    assert_frame_equal(df, expected)
282
283
284
def test_group_by() -> None:
    """group_by accepts both string and expression keys; mean ignores nulls."""
    ldf = pl.LazyFrame(
        {
            "a": [1.0, None, 3.0, 4.0],
            "b": [5.0, 2.5, -3.0, 2.0],
            "grp": ["a", "a", "b", "b"],
        }
    )
    expected_a = pl.DataFrame({"grp": ["a", "b"], "a": [1.0, 3.5]})
    expected_a_b = pl.DataFrame({"grp": ["a", "b"], "a": [1.0, 3.5], "b": [3.75, -0.5]})

    for out in (
        ldf.group_by("grp").agg(pl.mean("a")).collect(),
        ldf.group_by(pl.col("grp")).agg(pl.mean("a")).collect(),
    ):
        assert_frame_equal(out.sort(by="grp"), expected_a)

    # pl.mean accepts multiple column names at once
    out = ldf.group_by("grp").agg(pl.mean("a", "b")).collect()
    assert_frame_equal(out.sort(by="grp"), expected_a_b)
303
304
305
def test_arg_unique() -> None:
    """arg_unique returns first-occurrence indices in the index dtype."""
    result = (
        pl.LazyFrame({"a": [4, 1, 4]}).select(pl.col("a").arg_unique()).collect()["a"]
    )
    expected = pl.Series("a", [0, 1]).cast(pl.get_index_type())
    assert_series_equal(result, expected)
309
310
311
def test_arg_sort() -> None:
    """arg_sort yields the permutation that would sort the column."""
    result = pl.LazyFrame({"a": [4, 1, 3]}).select(pl.col("a").arg_sort()).collect()
    assert result["a"].to_list() == [1, 2, 0]
314
315
316
def test_window_function() -> None:
    """Window expressions via .over add columns without changing the height."""
    lf = pl.LazyFrame(
        {
            "A": [1, 2, 3, 4, 5],
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
            "B": [5, 4, 3, 2, 1],
            "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
        }
    )
    assert lf.collect_schema().len() == 4

    q = lf.with_columns(
        pl.sum("A").over("fruits").alias("fruit_sum_A"),
        pl.first("B").over("fruits").alias("fruit_first_B"),
        pl.max("B").over("cars").alias("cars_max_B"),
    )
    # 4 original columns + 3 window columns
    assert q.collect_schema().len() == 7

    assert q.collect()["cars_max_B"].to_list() == [5, 4, 5, 5, 5]

    # windows may be partitioned over multiple keys
    out = lf.select([pl.first("B").over(["fruits", "cars"]).alias("B_first")])
    assert out.collect()["B_first"].to_list() == [5, 4, 3, 3, 5]
338
339
340
def test_when_then_flatten() -> None:
    """Chained when/then branches are tried in order before the otherwise."""
    ldf = pl.LazyFrame({"foo": [1, 2, 3], "bar": [3, 4, 5]})

    chained = (
        when(pl.col("foo") > 1)
        .then(pl.col("bar"))
        .when(pl.col("bar") < 3)
        .then(10)
        .otherwise(30)
    )
    assert ldf.select(chained).collect()["bar"].to_list() == [30, 4, 5]
350
351
352
def test_describe_plan() -> None:
    """explain() returns a string for both optimized and unoptimized plans."""
    lf = pl.LazyFrame({"a": [1]})
    for optimized in (True, False):
        assert isinstance(lf.explain(optimized=optimized), str)
355
356
357
@pytest.mark.may_fail_cloud  # reason: inspects logs
def test_inspect(capsys: CaptureFixture[str]) -> None:
    """inspect() (frame- and expression-level) prints to stdout during collect."""
    ldf = pl.LazyFrame({"a": [1]})
    ldf.inspect().collect()
    captured = capsys.readouterr()
    assert len(captured.out) > 0

    # expression-level inspect also writes to stdout
    ldf.select(pl.col("a").cum_sum().inspect().alias("bar")).collect()
    res = capsys.readouterr()
    assert len(res.out) > 0
367
368
369
@pytest.mark.may_fail_auto_streaming
def test_fetch(fruits_cars: pl.DataFrame) -> None:
    """fetch() is deprecated and should point users at LazyFrame.collect."""
    with pytest.warns(
        DeprecationWarning,
        match=r"use `LazyFrame\.collect` instead",
    ):
        res = fruits_cars.lazy().select("*").fetch(2)
    assert_frame_equal(res, res[:2])
377
378
379
def test_fold_filter() -> None:
    """pl.fold combines per-column predicates with a custom accumulator."""
    lf = pl.LazyFrame({"a": [1, 2, 3], "b": [0, 1, 2]})

    # AND-fold: keep rows where every column exceeds 1
    out = lf.filter(
        pl.fold(
            acc=pl.lit(True),
            function=lambda a, b: a & b,
            exprs=[pl.col(c) > 1 for c in lf.collect_schema()],
        )
    ).collect()

    assert out.shape == (1, 2)
    assert out.rows() == [(3, 2)]

    # OR-fold with a True accumulator: vacuously true, keeps all rows
    out = lf.filter(
        pl.fold(
            acc=pl.lit(True),
            function=lambda a, b: a | b,
            exprs=[pl.col(c) > 1 for c in lf.collect_schema()],
        )
    ).collect()

    assert out.rows() == [(1, 0), (2, 1), (3, 2)]
402
403
404
def test_head_group_by() -> None:
    """head/tail inside group_by keep the first/last n rows of each group."""
    commodity_prices = {
        "commodity": [
            "Wheat",
            "Wheat",
            "Wheat",
            "Wheat",
            "Corn",
            "Corn",
            "Corn",
            "Corn",
            "Corn",
        ],
        "location": [
            "StPaul",
            "StPaul",
            "StPaul",
            "Chicago",
            "Chicago",
            "Chicago",
            "Chicago",
            "Chicago",
            "Chicago",
        ],
        "seller": [
            "Bob",
            "Charlie",
            "Susan",
            "Paul",
            "Ed",
            "Mary",
            "Paul",
            "Charlie",
            "Norman",
        ],
        "price": [1.0, 0.7, 0.8, 0.55, 2.0, 3.0, 2.4, 1.8, 2.1],
    }
    ldf = pl.LazyFrame(commodity_prices)

    # this query flexes the wildcard exclusion quite a bit:
    # top-2 prices per (commodity, location), then explode back to rows
    keys = ["commodity", "location"]
    out = (
        ldf.sort(by="price", descending=True)
        .group_by(keys, maintain_order=True)
        .agg([pl.col("*").exclude(keys).head(2).name.keep()])
        .explode(cs.all().exclude(keys))
    )

    assert out.collect().rows() == [
        ("Corn", "Chicago", "Mary", 3.0),
        ("Corn", "Chicago", "Paul", 2.4),
        ("Wheat", "StPaul", "Bob", 1.0),
        ("Wheat", "StPaul", "Susan", 0.8),
        ("Wheat", "Chicago", "Paul", 0.55),
    ]

    ldf = pl.LazyFrame(
        {"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]}
    )
    # tail(2): last two rows of each letter group
    out = ldf.group_by("letters").tail(2).sort("letters")
    assert_frame_equal(
        out.collect(),
        pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]}),
    )
    # head(2): first two rows of each letter group
    out = ldf.group_by("letters").head(2).sort("letters")
    assert_frame_equal(
        out.collect(),
        pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]}),
    )
473
474
475
def test_is_null_is_not_null() -> None:
    """is_null and is_not_null are exact complements on a nullable column."""
    ldf = pl.LazyFrame({"nrs": [1, 2, None]}).select(
        pl.col("nrs").is_null().alias("is_null"),
        pl.col("nrs").is_not_null().alias("not_null"),
    )
    collected = ldf.collect()
    assert collected["is_null"].to_list() == [False, False, True]
    assert ldf.collect()["not_null"].to_list() == [True, True, False]
482
483
484
def test_is_nan_is_not_nan() -> None:
    """is_nan and is_not_nan are exact complements on a float column."""
    ldf = pl.LazyFrame({"nrs": np.array([1, 2, np.nan])}).select(
        pl.col("nrs").is_nan().alias("is_nan"),
        pl.col("nrs").is_not_nan().alias("not_nan"),
    )
    collected = ldf.collect()
    assert collected["is_nan"].to_list() == [False, False, True]
    assert ldf.collect()["not_nan"].to_list() == [True, True, False]
491
492
493
def test_is_finite_is_infinite() -> None:
    """is_infinite and is_finite are exact complements on a float column."""
    ldf = pl.LazyFrame({"nrs": np.array([1, 2, np.inf])}).select(
        pl.col("nrs").is_infinite().alias("is_inf"),
        pl.col("nrs").is_finite().alias("not_inf"),
    )
    collected = ldf.collect()
    assert collected["is_inf"].to_list() == [False, False, True]
    assert ldf.collect()["not_inf"].to_list() == [True, True, False]
500
501
502
def test_len() -> None:
    """Expr.len counts the rows of the column."""
    count = pl.LazyFrame({"nrs": [1, 2, 3]}).select(pl.col("nrs").len()).collect()
    assert cast("int", count.item()) == 3
505
506
507
@pytest.mark.parametrize("dtype", NUMERIC_DTYPES)
def test_cum_agg(dtype: PolarsDataType) -> None:
    """Cumulative min/max keep the input dtype; sum/prod may widen to Int64."""
    ldf = pl.LazyFrame({"a": [1, 2, 3, 2]}, schema={"a": dtype})
    assert_series_equal(
        ldf.select(pl.col("a").cum_min()).collect()["a"],
        pl.Series("a", [1, 1, 1, 1], dtype=dtype),
    )
    assert_series_equal(
        ldf.select(pl.col("a").cum_max()).collect()["a"],
        pl.Series("a", [1, 2, 3, 3], dtype=dtype),
    )

    # cum_sum widens 8/16-bit integer dtypes to Int64
    expected_dtype = (
        pl.Int64 if dtype in [pl.Int8, pl.Int16, pl.UInt8, pl.UInt16] else dtype
    )
    assert_series_equal(
        ldf.select(pl.col("a").cum_sum()).collect()["a"],
        pl.Series("a", [1, 3, 6, 8], dtype=expected_dtype),
    )

    # cum_prod widens all 8/16/32-bit integer dtypes to Int64
    expected_dtype = (
        pl.Int64
        if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.UInt8, pl.UInt16, pl.UInt32]
        else dtype
    )
    assert_series_equal(
        ldf.select(pl.col("a").cum_prod()).collect()["a"],
        pl.Series("a", [1, 2, 6, 12], dtype=expected_dtype),
    )
536
537
538
def test_ceil() -> None:
    """ceil rounds floats up; integer columns are returned unchanged."""
    floats = pl.LazyFrame({"a": [1.8, 1.2, 3.0]})
    assert_frame_equal(
        floats.select(pl.col("a").ceil()).collect(),
        pl.DataFrame({"a": [2.0, 2.0, 3.0]}),
    )

    ints = pl.LazyFrame({"a": [1, 2, 3]})
    assert_frame_equal(ints.collect(), ints.select(pl.col("a").ceil()).collect())
546
547
548
def test_floor() -> None:
    """floor rounds floats down; integer columns are returned unchanged."""
    floats = pl.LazyFrame({"a": [1.8, 1.2, 3.0]})
    assert_frame_equal(
        floats.select(pl.col("a").floor()).collect(),
        pl.DataFrame({"a": [1.0, 1.0, 3.0]}),
    )

    ints = pl.LazyFrame({"a": [1, 2, 3]})
    assert_frame_equal(ints.collect(), ints.select(pl.col("a").floor()).collect())
556
557
558
@pytest.mark.parametrize(
    ("n", "ndigits", "expected"),
    [
        (1.005, 2, 1.0),
        (1234.00000254495, 10, 1234.000002545),
        (1835.665, 2, 1835.67),
        (-1835.665, 2, -1835.67),
        (2.49, 0, 2.0),
        (123.45678, 2, 123.46),
        (1254, 2, 1254.0),
        (1254, 0, 1254.0),
        (123.55, 0, 124.0),
        (123.55, 1, 123.6),
        (-1.23456789, 6, -1.234568),
        (1.0e-5, 5, 0.00001),
        (1.0e-20, 20, 1e-20),
        (1.0e20, 2, 100000000000000000000.0),
    ],
)
@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
def test_round(n: float, ndigits: int, expected: float, dtype: pl.DataType) -> None:
    """Expr.round with `decimals` matches the expected value for every float dtype."""
    ldf = pl.LazyFrame({"value": [n]}, schema_overrides={"value": dtype})
    assert_series_equal(
        ldf.select(pl.col("value").round(decimals=ndigits)).collect().to_series(),
        pl.Series("value", [expected], dtype=dtype),
    )
584
585
586
@pytest.mark.parametrize(
    ("n", "ndigits", "expected1", "expected2"),
    [
        # expected1 = half_to_even result, expected2 = half_away_from_zero result
        (0.5, 0, 0.0, 1.0),
        (1.5, 0, 2.0, 2.0),
        (2.5, 0, 2.0, 3.0),
        (-0.5, 0, -0.0, -1.0),
        (-1.5, 0, -2.0, -2.0),
        (2.25, 1, 2.2, 2.3),
        (2.75, 1, 2.8, 2.8),
        (-2.25, 1, -2.2, -2.3),
    ],
)
@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
def test_round_mode(
    n: float, ndigits: int, expected1: float, expected2: float, dtype: pl.DataType
) -> None:
    """The `mode` argument of round selects the tie-breaking behaviour."""
    ldf = pl.LazyFrame({"value": [n]}, schema_overrides={"value": dtype})
    assert_series_equal(
        ldf.select(pl.col("value").round(ndigits, mode="half_to_even"))
        .collect()
        .to_series(),
        pl.Series("value", [expected1], dtype=dtype),
    )
    assert_series_equal(
        ldf.select(pl.col("value").round(ndigits, mode="half_away_from_zero"))
        .collect()
        .to_series(),
        pl.Series("value", [expected2], dtype=dtype),
    )
616
617
618
def test_dot() -> None:
    """dot computes the inner product of two columns."""
    ldf = pl.LazyFrame({"a": [1.8, 1.2, 3.0], "b": [3.2, 1, 2]})
    product = ldf.select(pl.col("a").dot(pl.col("b"))).collect().item()
    assert cast("float", product) == 12.96
623
624
625
def test_sort() -> None:
    """Expr.sort orders the column ascending."""
    sorted_col = pl.LazyFrame({"a": [1, 2, 3, 2]}).select(pl.col("a").sort())
    assert_series_equal(sorted_col.collect()["a"], pl.Series("a", [1, 2, 2, 3]))
628
629
630
def test_custom_group_by() -> None:
    """A Python lambda via implode + map_elements can aggregate each group."""
    ldf = pl.LazyFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]})
    out = (
        ldf.group_by("b", maintain_order=True)
        .agg(
            [
                pl.col("a")
                .implode()
                .map_elements(lambda x: x.sum(), return_dtype=pl.Int64)
            ]
        )
        .collect()
    )
    assert out.rows() == [("a", 1), ("b", 2), ("c", 2)]
644
645
646
def test_lazy_columns() -> None:
    """select narrows the schema to the requested columns, in order."""
    lf = pl.LazyFrame({"a": [1], "b": [1], "c": [1]})
    assert lf.select("a", "c").collect_schema().names() == ["a", "c"]
655
656
657
def test_cast_frame() -> None:
    """Frame-level cast: dtype map, selector map, single dtype, strict mode."""
    lf = pl.LazyFrame(
        {
            "a": [1.0, 2.5, 3.0],
            "b": [4, 5, None],
            "c": [True, False, True],
            "d": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)],
        }
    )

    # cast via col:dtype map; unmapped columns keep their dtype
    assert lf.cast(
        dtypes={"b": pl.Float32, "c": pl.String, "d": pl.Datetime("ms")}
    ).collect_schema() == {
        "a": pl.Float64,
        "b": pl.Float32,
        "c": pl.String,
        "d": pl.Datetime("ms"),
    }

    # cast via selector:dtype map
    lfc = lf.cast(
        {
            cs.float(): pl.UInt8,
            cs.integer(): pl.Int32,
            cs.temporal(): pl.String,
        }
    )
    assert lfc.collect_schema() == {
        "a": pl.UInt8,
        "b": pl.Int32,
        "c": pl.Boolean,
        "d": pl.String,
    }
    assert lfc.collect().rows() == [
        (1, 4, True, "2020-01-02"),
        (2, 5, False, "2021-03-04"),
        (3, None, True, "2022-05-06"),
    ]

    # cast all fields to a single type
    result = lf.cast(pl.String)
    expected = pl.LazyFrame(
        {
            "a": ["1.0", "2.5", "3.0"],
            "b": ["4", "5", None],
            "c": ["true", "false", "true"],
            "d": ["2020-01-02", "2021-03-04", "2022-05-06"],
        }
    )
    assert_frame_equal(result, expected)

    # test 'strict' mode: out-of-range values raise ...
    lf = pl.LazyFrame({"a": [1000, 2000, 3000]})

    with pytest.raises(InvalidOperationError, match=r"conversion .* failed"):
        lf.cast(pl.UInt8).collect()

    # ... while non-strict casting yields nulls instead
    assert lf.cast(pl.UInt8, strict=False).collect().rows() == [
        (None,),
        (None,),
        (None,),
    ]
720
721
722
def test_interpolate() -> None:
    """Linear interpolation fills interior nulls on every API surface."""
    df = pl.DataFrame({"a": [1, None, 3]})
    expected = [1, 2, 3]
    assert df.select(pl.col("a").interpolate())["a"].to_list() == expected
    assert df["a"].interpolate().to_list() == expected
    assert df.interpolate()["a"].to_list() == expected
    assert df.lazy().interpolate().collect()["a"].to_list() == expected
728
729
730
def test_fill_nan() -> None:
    """fill_nan replaces NaN values (not nulls); fill_nan(None) nullifies NaNs."""
    df = pl.DataFrame({"a": [1.0, np.nan, 3.0]})
    assert_series_equal(df.fill_nan(2.0)["a"], pl.Series("a", [1.0, 2.0, 3.0]))
    assert_series_equal(
        df.lazy().fill_nan(2.0).collect()["a"], pl.Series("a", [1.0, 2.0, 3.0])
    )
    assert_series_equal(
        df.lazy().fill_nan(None).collect()["a"], pl.Series("a", [1.0, None, 3.0])
    )
    assert_series_equal(
        df.select(pl.col("a").fill_nan(2))["a"], pl.Series("a", [1.0, 2.0, 3.0])
    )
    # nearest
    # NOTE(review): this asserts interpolate(method="nearest"), which is
    # unrelated to fill_nan — consider moving it to an interpolate test
    assert pl.Series([None, 1, None, None, None, -8, None, None, 10]).interpolate(
        method="nearest"
    ).to_list() == [None, 1, 1, -8, -8, -8, -8, 10, 10]
746
747
748
def test_fill_null() -> None:
    """fill_null via value or strategy; invalid argument combinations raise."""
    df = pl.DataFrame({"a": [1.0, None, 3.0]})

    assert df.select([pl.col("a").fill_null(strategy="min")])["a"][1] == 1.0
    assert df.lazy().fill_null(2).collect()["a"].to_list() == [1.0, 2.0, 3.0]

    # exactly one of `value` / `strategy` must be given; `limit` only
    # combines with the forward/backward strategies
    with pytest.raises(ValueError, match="must specify either"):
        df.fill_null()
    with pytest.raises(ValueError, match="cannot specify both"):
        df.fill_null(value=3.0, strategy="max")
    with pytest.raises(ValueError, match="can only specify `limit`"):
        df.fill_null(strategy="max", limit=2)
760
761
762
def test_backward_fill() -> None:
    """fill_null(strategy='backward') pulls the next valid value backwards."""
    filled = (
        pl.LazyFrame({"a": [1.0, None, 3.0]})
        .select([pl.col("a").fill_null(strategy="backward")])
        .collect()["a"]
    )
    assert_series_equal(filled, pl.Series("a", [1, 3, 3]).cast(pl.Float64))
768
769
770
def test_rolling(fruits_cars: pl.DataFrame) -> None:
    """Rolling aggregations with and without min_samples=1."""
    ldf = fruits_cars.lazy()
    out = ldf.select(
        pl.col("A").rolling_min(3, min_samples=1).alias("1"),
        pl.col("A").rolling_min(3).alias("1b"),
        pl.col("A").rolling_mean(3, min_samples=1).alias("2"),
        pl.col("A").rolling_mean(3).alias("2b"),
        pl.col("A").rolling_max(3, min_samples=1).alias("3"),
        pl.col("A").rolling_max(3).alias("3b"),
        pl.col("A").rolling_sum(3, min_samples=1).alias("4"),
        pl.col("A").rolling_sum(3).alias("4b"),
        # below we use .round purely for the ability to do assert frame equality
        pl.col("A").rolling_std(3).round(1).alias("std"),
        pl.col("A").rolling_var(3).round(1).alias("var"),
    )

    # the "*b" columns (default min_samples) lead with nulls until the
    # window holds 3 samples
    assert_frame_equal(
        out.collect(),
        pl.DataFrame(
            {
                "1": [1, 1, 1, 2, 3],
                "1b": [None, None, 1, 2, 3],
                "2": [1.0, 1.5, 2.0, 3.0, 4.0],
                "2b": [None, None, 2.0, 3.0, 4.0],
                "3": [1, 2, 3, 4, 5],
                "3b": [None, None, 3, 4, 5],
                "4": [1, 3, 6, 9, 12],
                "4b": [None, None, 6, 9, 12],
                "std": [None, None, 1.0, 1.0, 1.0],
                "var": [None, None, 1.0, 1.0, 1.0],
            }
        ),
    )

    # std/var of a single sample is undefined, hence null even with min_samples=1
    out_single_val_variance = ldf.select(
        pl.col("A").rolling_std(3, min_samples=1).round(decimals=4).alias("std"),
        pl.col("A").rolling_var(3, min_samples=1).round(decimals=1).alias("var"),
    ).collect()

    assert cast("float", out_single_val_variance[0, "std"]) is None
    assert cast("float", out_single_val_variance[0, "var"]) is None
811
812
813
def test_arr_namespace(fruits_cars: pl.DataFrame) -> None:
    """List aggregations after .over(join) match aggregating before the window."""
    ldf = fruits_cars.lazy()
    # for each statistic: variant 1 joins first then reduces the list,
    # variant 2 reduces first then broadcasts over the window
    out = ldf.select(
        "fruits",
        pl.col("B")
        .over("fruits", mapping_strategy="join")
        .list.min()
        .alias("B_by_fruits_min1"),
        pl.col("B")
        .min()
        .over("fruits", mapping_strategy="join")
        .alias("B_by_fruits_min2"),
        pl.col("B")
        .over("fruits", mapping_strategy="join")
        .list.max()
        .alias("B_by_fruits_max1"),
        pl.col("B")
        .max()
        .over("fruits", mapping_strategy="join")
        .alias("B_by_fruits_max2"),
        pl.col("B")
        .over("fruits", mapping_strategy="join")
        .list.sum()
        .alias("B_by_fruits_sum1"),
        pl.col("B")
        .sum()
        .over("fruits", mapping_strategy="join")
        .alias("B_by_fruits_sum2"),
        pl.col("B")
        .over("fruits", mapping_strategy="join")
        .list.mean()
        .alias("B_by_fruits_mean1"),
        pl.col("B")
        .mean()
        .over("fruits", mapping_strategy="join")
        .alias("B_by_fruits_mean2"),
    )
    expected = pl.DataFrame(
        {
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
            "B_by_fruits_min1": [1, 1, 2, 2, 1],
            "B_by_fruits_min2": [1, 1, 2, 2, 1],
            "B_by_fruits_max1": [5, 5, 3, 3, 5],
            "B_by_fruits_max2": [5, 5, 3, 3, 5],
            "B_by_fruits_sum1": [10, 10, 5, 5, 10],
            "B_by_fruits_sum2": [10, 10, 5, 5, 10],
            "B_by_fruits_mean1": [
                3.3333333333333335,
                3.3333333333333335,
                2.5,
                2.5,
                3.3333333333333335,
            ],
            "B_by_fruits_mean2": [
                3.3333333333333335,
                3.3333333333333335,
                2.5,
                2.5,
                3.3333333333333335,
            ],
        }
    )
    assert_frame_equal(out.collect(), expected)
876
877
878
def test_arithmetic() -> None:
    """Binary arithmetic works with scalars on either side, plus unary negation."""
    ldf = pl.LazyFrame({"a": [1, 2, 3]})

    out = ldf.select(
        (pl.col("a") % 2).alias("1"),
        (2 % pl.col("a")).alias("2"),
        (1 // pl.col("a")).alias("3"),
        (1 * pl.col("a")).alias("4"),
        (1 + pl.col("a")).alias("5"),
        (1 - pl.col("a")).alias("6"),
        (pl.col("a") // 2).alias("7"),
        (pl.col("a") * 2).alias("8"),
        (pl.col("a") + 2).alias("9"),
        (pl.col("a") - 2).alias("10"),
        (-pl.col("a")).alias("11"),
    )
    expected = pl.DataFrame(
        {
            "1": [1, 0, 1],
            "2": [0, 0, 2],
            "3": [1, 0, 0],
            "4": [1, 2, 3],
            "5": [2, 3, 4],
            "6": [0, -1, -2],
            "7": [0, 1, 1],
            "8": [2, 4, 6],
            "9": [3, 4, 5],
            "10": [-1, 0, 1],
            "11": [-1, -2, -3],
        }
    )
    assert_frame_equal(out.collect(), expected)
910
911
912
def test_float_floor_divide() -> None:
    """Float floor-division matches Python's // semantics."""
    value, divisor = 10.4, 0.5
    result = (
        pl.LazyFrame({"x": [value]})
        .with_columns(pl.col("x") // divisor)
        .collect()
        .item()
    )
    assert result == value // divisor
918
919
920
def test_argminmax() -> None:
    """arg_min/arg_max return positions, globally and per group."""
    ldf = pl.LazyFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 2, 2]})
    out = ldf.select(
        pl.col("a").arg_min().alias("min"),
        pl.col("a").arg_max().alias("max"),
    ).collect()
    assert out["max"][0] == 4
    assert out["min"][0] == 0

    # within a group the indices are relative to that group
    out = (
        ldf.group_by("b", maintain_order=True)
        .agg([pl.col("a").arg_min().alias("min"), pl.col("a").arg_max().alias("max")])
        .collect()
    )
    assert out["max"][0] == 1
    assert out["min"][0] == 0
936
937
938
def test_limit(fruits_cars: pl.DataFrame) -> None:
    """limit(1) keeps only the first row."""
    expected = fruits_cars[0, :]
    assert_frame_equal(fruits_cars.lazy().limit(1).collect(), expected)
940
941
942
def test_head(fruits_cars: pl.DataFrame) -> None:
    """head(2) keeps the first two rows."""
    expected = fruits_cars[:2, :]
    assert_frame_equal(fruits_cars.lazy().head(2).collect(), expected)
944
945
946
def test_tail(fruits_cars: pl.DataFrame) -> None:
    """tail(2) keeps the last two rows of the five-row fixture."""
    expected = fruits_cars[3:, :]
    assert_frame_equal(fruits_cars.lazy().tail(2).collect(), expected)
948
949
950
def test_last(fruits_cars: pl.DataFrame) -> None:
    """last() reduces the frame to its final row."""
    final_row = fruits_cars[(len(fruits_cars) - 1) :, :]
    assert_frame_equal(fruits_cars.lazy().last().collect(), final_row)
954
955
956
def test_first(fruits_cars: pl.DataFrame) -> None:
    """first() reduces the frame to its first row."""
    first_row = fruits_cars[0, :]
    assert_frame_equal(fruits_cars.lazy().first().collect(), first_row)
958
959
960
def test_join_suffix() -> None:
    """A custom join suffix renames colliding right-hand columns (eager and lazy)."""
    df_left = pl.DataFrame(
        {
            "a": ["a", "b", "a", "z"],
            "b": [1, 2, 3, 4],
            "c": [6, 5, 4, 3],
        }
    )
    df_right = pl.DataFrame(
        {
            "a": ["b", "c", "b", "a"],
            "b": [0, 3, 9, 6],
            "c": [1, 0, 2, 1],
        }
    )
    out = df_left.join(df_right, on="a", suffix="_bar")
    assert out.columns == ["a", "b", "c", "b_bar", "c_bar"]
    out = df_left.lazy().join(df_right.lazy(), on="a", suffix="_bar").collect()
    assert out.columns == ["a", "b", "c", "b_bar", "c_bar"]
979
980
981
@pytest.mark.may_fail_cloud  # reason: no
def test_collect_unexpected_kwargs(df: pl.DataFrame) -> None:
    """Unknown keyword arguments to collect() raise a TypeError."""
    with pytest.raises(TypeError, match="unexpected keyword argument"):
        df.lazy().collect(common_subexpr_elim=False)  # type: ignore[call-overload]
985
986
987
def test_spearman_corr() -> None:
    """Spearman correlation per group, with expression and string-name inputs."""
    ldf = pl.LazyFrame(
        {
            "era": [1, 1, 1, 2, 2, 2],
            "prediction": [2, 4, 5, 190, 1, 4],
            "target": [1, 3, 2, 1, 43, 3],
        }
    )

    out = (
        ldf.group_by("era", maintain_order=True).agg(
            pl.corr(pl.col("prediction"), pl.col("target"), method="spearman").alias(
                "c"
            ),
        )
    ).collect()["c"]
    assert np.isclose(out[0], 0.5)
    assert np.isclose(out[1], -1.0)

    # we can also pass in column names directly
    out = (
        ldf.group_by("era", maintain_order=True).agg(
            pl.corr("prediction", "target", method="spearman").alias("c"),
        )
    ).collect()["c"]
    assert np.isclose(out[0], 0.5)
    assert np.isclose(out[1], -1.0)
1014
1015
1016
def test_spearman_corr_ties() -> None:
    """In Spearman correlation, ranks are computed using the average method."""
    df = pl.DataFrame({"a": [1, 1, 1, 2, 3, 7, 4], "b": [4, 3, 2, 2, 4, 3, 1]})

    # a1 (spearman) must equal a3 (pearson on average ranks), not a2 (min ranks)
    result = df.select(
        pl.corr("a", "b", method="spearman").alias("a1"),
        pl.corr(pl.col("a").rank("min"), pl.col("b").rank("min")).alias("a2"),
        pl.corr(pl.col("a").rank(), pl.col("b").rank()).alias("a3"),
    )
    expected = pl.DataFrame(
        [
            pl.Series("a1", [-0.19048482943986483], dtype=pl.Float64),
            pl.Series("a2", [-0.17223653586587362], dtype=pl.Float64),
            pl.Series("a3", [-0.19048482943986483], dtype=pl.Float64),
        ]
    )
    assert_frame_equal(result, expected)
1033
1034
1035
def test_pearson_corr() -> None:
    """Pearson correlation per group, with expression and string-name inputs."""
    ldf = pl.LazyFrame(
        {
            "era": [1, 1, 1, 2, 2, 2],
            "prediction": [2, 4, 5, 190, 1, 4],
            "target": [1, 3, 2, 1, 43, 3],
        }
    )

    out = (
        ldf.group_by("era", maintain_order=True).agg(
            pl.corr(
                pl.col("prediction"),
                pl.col("target"),
                method="pearson",
            ).alias("c"),
        )
    ).collect()["c"]
    assert out.to_list() == pytest.approx([0.6546536707079772, -5.477514993831792e-1])

    # we can also pass in column names directly
    out = (
        ldf.group_by("era", maintain_order=True).agg(
            pl.corr("prediction", "target", method="pearson").alias("c"),
        )
    ).collect()["c"]
    assert out.to_list() == pytest.approx([0.6546536707079772, -5.477514993831792e-1])
1062
1063
1064
def test_null_count() -> None:
    """null_count reports per-column null totals as a single row."""
    counts = pl.LazyFrame(
        {"a": [1, 2, None, 2], "b": [None, 3, None, 3]}
    ).null_count()
    assert counts.collect().rows() == [(1, 2)]
1067
1068
1069
def test_lazy_concat(df: pl.DataFrame) -> None:
    """Concatenating a lazy frame with itself doubles the height."""
    result = pl.concat([df.lazy(), df.lazy()]).collect()
    assert result.shape == (df.shape[0] * 2, df.shape[1])
    assert_frame_equal(result, df.vstack(df))
1076
1077
1078
def test_self_join() -> None:
    """Self-join resolves manager names from employee ids."""
    # 2720
    ldf = pl.from_dict(
        data={
            "employee_id": [100, 101, 102],
            "employee_name": ["James", "Alice", "Bob"],
            "manager_id": [None, 100, 101],
        }
    ).lazy()

    out = (
        ldf.join(other=ldf, left_on="manager_id", right_on="employee_id", how="left")
        .select(
            pl.col("employee_id"),
            pl.col("employee_name"),
            pl.col("employee_name_right").alias("manager_name"),
        )
        .collect()
    )
    assert set(out.rows()) == {
        (100, "James", None),
        (101, "Alice", "James"),
        (102, "Bob", "Alice"),
    }
def test_group_lengths() -> None:
    """Unique counts normalized by group length sum to 1.0 per group."""
    ldf = pl.LazyFrame(
        {
            "group": ["A", "A", "A", "B", "B", "B", "B"],
            "id": ["1", "1", "2", "3", "4", "3", "5"],
        }
    )

    result = ldf.group_by(["group"], maintain_order=True).agg(
        [
            (pl.col("id").unique_counts() / pl.col("id").len())
            .sum()
            .alias("unique_counts_sum"),
            pl.col("id").unique().len().alias("unique_len"),
        ]
    )
    expected = pl.DataFrame(
        {
            "group": ["A", "B"],
            "unique_counts_sum": [1.0, 1.0],
            "unique_len": [2, 3],
        },
        schema_overrides={"unique_len": pl.get_index_type()},
    )
    assert_frame_equal(result.collect(), expected)
def test_quantile_filtered_agg() -> None:
    """Quantile over a filtered aggregation yields the filtered median."""
    assert (
        pl.LazyFrame(
            {
                "group": [0, 0, 0, 0, 1, 1, 1, 1],
                "value": [1, 2, 3, 4, 1, 2, 3, 4],
            }
        )
        .group_by("group")
        .agg(pl.col("value").filter(pl.col("value") < 2).quantile(0.5))
        .collect()["value"]
        .to_list()
    ) == [1.0, 1.0]
def test_predicate_count_vstack() -> None:
    """A window `len` predicate over a vstacked concat keeps all rows."""
    l1 = pl.LazyFrame(
        {
            "k": ["x", "y"],
            "v": [3, 2],
        }
    )
    l2 = pl.LazyFrame(
        {
            "k": ["x", "y"],
            "v": [5, 7],
        }
    )
    assert pl.concat([l1, l2]).filter(pl.len().over("k") == 2).collect()[
        "v"
    ].to_list() == [3, 2, 5, 7]
def test_lazy_method() -> None:
    """Calling `.lazy()` on a LazyFrame is a no-op (supports generic code)."""
    # We want to support `.lazy()` on a Lazy DataFrame to allow more generic user code.
    df = pl.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]})
    assert_frame_equal(df.lazy(), df.lazy().lazy())
def test_update_schema_after_projection_pd_t4157() -> None:
    """Projection pushdown keeps the schema consistent after rename + drop."""
    ldf = pl.LazyFrame({"c0": [], "c1": [], "c2": []}).rename({"c2": "c2_"})
    assert ldf.drop("c2_").select(pl.col("c0")).collect().columns == ["c0"]
def test_type_coercion_unknown_4190() -> None:
    """Bitwise-and with a fill_null expression coerces without error."""
    df = (
        pl.LazyFrame({"a": [1, 2, 3], "b": [1, 2, 3]}).with_columns(
            pl.col("a") & pl.col("a").fill_null(True)
        )
    ).collect()
    assert df.shape == (3, 2)
    assert df.rows() == [(1, 1), (2, 2), (3, 3)]
def test_lazy_cache_same_key() -> None:
    """Distinct cached plans with identical schemas must not share a cache."""
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": ["x", "y", "z"]})

    # these have the same schema, but should not be used by cache as they are different
    add_node = ldf.select([(pl.col("a") + pl.col("b")).alias("a"), pl.col("c")]).cache()
    mult_node = ldf.select((pl.col("a") * pl.col("b")).alias("a"), pl.col("c")).cache()

    result = mult_node.join(add_node, on="c", suffix="_mult").select(
        (pl.col("a") - pl.col("a_mult")).alias("a"), pl.col("c")
    )
    expected = pl.LazyFrame({"a": [-1, 2, 7], "c": ["x", "y", "z"]})
    assert_frame_equal(result, expected, check_row_order=False)
@pytest.mark.may_fail_cloud # reason: inspects logs
@pytest.mark.may_fail_auto_streaming
def test_lazy_cache_hit(plmonkeypatch: PlMonkeyPatch, capfd: Any) -> None:
    """Joining a cached node with itself logs a CACHE HIT in verbose mode."""
    plmonkeypatch.setenv("POLARS_VERBOSE", "1")

    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": ["x", "y", "z"]})
    add_node = ldf.select([(pl.col("a") + pl.col("b")).alias("a"), pl.col("c")]).cache()

    result = add_node.join(add_node, on="c", suffix="_mult").select(
        (pl.col("a") - pl.col("a_mult")).alias("a"), pl.col("c")
    )
    expected = pl.LazyFrame({"a": [0, 0, 0], "c": ["x", "y", "z"]})
    assert_frame_equal(result, expected, check_row_order=False)

    (_, err) = capfd.readouterr()
    assert "CACHE HIT" in err
@pytest.mark.may_fail_cloud # reason: impure udf
def test_lazy_cache_parallel() -> None:
    """A cached map_batches node is evaluated exactly once across parallel branches."""
    df_evaluated = 0

    def map_df(df: pl.DataFrame) -> pl.DataFrame:
        # impure on purpose: counts how many times the cache source runs
        nonlocal df_evaluated
        df_evaluated += 1
        return df

    df = pl.LazyFrame({"a": [1]}).map_batches(map_df).cache()

    df = pl.concat(
        [
            df.select(pl.col("a") + 1),
            df.select(pl.col("a") + 2),
            df.select(pl.col("a") + 3),
        ],
        parallel=True,
    )

    assert df_evaluated == 0

    df.collect()
    assert df_evaluated == 1
@pytest.mark.may_fail_cloud # reason: impure udf
def test_lazy_cache_nested_parallel() -> None:
    """Nested caches each evaluate once, even when consumed in parallel."""
    df_inner_evaluated = 0
    df_outer_evaluated = 0

    def map_df_inner(df: pl.DataFrame) -> pl.DataFrame:
        # impure on purpose: counts evaluations of the inner cache source
        nonlocal df_inner_evaluated
        df_inner_evaluated += 1
        return df

    def map_df_outer(df: pl.DataFrame) -> pl.DataFrame:
        # impure on purpose: counts evaluations of the outer cache source
        nonlocal df_outer_evaluated
        df_outer_evaluated += 1
        return df

    df_inner = pl.LazyFrame({"a": [1]}).map_batches(map_df_inner).cache()
    df_outer = df_inner.select(pl.col("a") + 1).map_batches(map_df_outer).cache()

    df = pl.concat(
        [
            df_outer.select(pl.col("a") + 2),
            df_outer.select(pl.col("a") + 3),
        ],
        parallel=True,
    )

    assert df_inner_evaluated == 0
    assert df_outer_evaluated == 0

    df.collect()
    assert df_inner_evaluated == 1
    assert df_outer_evaluated == 1
def test_quadratic_behavior_4736() -> None:
    """Building a wide sum expression must not exhibit quadratic planning time."""
    # no assert; if this function does not stall our tests it has passed!
    lf = pl.LazyFrame(schema=list(ascii_letters))
    lf.select(reduce(add, (pl.col(c) for c in lf.collect_schema())))
@pytest.mark.parametrize("input_dtype", [pl.Int64, pl.Float64])
def test_from_epoch(input_dtype: PolarsDataType) -> None:
    """`from_epoch` maps each time unit to the expected temporal dtype."""
    ldf = pl.LazyFrame(
        [
            pl.Series("timestamp_d", [13285]).cast(input_dtype),
            pl.Series("timestamp_s", [1147880044]).cast(input_dtype),
            pl.Series("timestamp_ms", [1147880044 * 1_000]).cast(input_dtype),
            pl.Series("timestamp_us", [1147880044 * 1_000_000]).cast(input_dtype),
            pl.Series("timestamp_ns", [1147880044 * 1_000_000_000]).cast(input_dtype),
        ]
    )

    exp_dt = datetime(2006, 5, 17, 15, 34, 4)
    expected = pl.DataFrame(
        [
            # 'd' → Date, 'ns' → Datetime('ns'), otherwise → Datetime('us')
            pl.Series("timestamp_d", [date(2006, 5, 17)]),
            pl.Series("timestamp_s", [exp_dt]),
            pl.Series("timestamp_ms", [exp_dt]),
            pl.Series("timestamp_us", [exp_dt]),
            pl.Series("timestamp_ns", [exp_dt]).cast(pl.Datetime("ns")),
        ]
    )

    ldf_result = ldf.select(
        pl.from_epoch(pl.col("timestamp_d"), time_unit="d"),
        pl.from_epoch(pl.col("timestamp_s"), time_unit="s"),
        pl.from_epoch(pl.col("timestamp_ms"), time_unit="ms"),
        pl.from_epoch(pl.col("timestamp_us"), time_unit="us"),
        pl.from_epoch(pl.col("timestamp_ns"), time_unit="ns"),
    ).collect()

    assert_frame_equal(ldf_result, expected)

    # an invalid time unit is rejected eagerly
    ts_col = pl.col("timestamp_s")
    with pytest.raises(ValueError):
        _ = ldf.select(pl.from_epoch(ts_col, time_unit="s2"))  # type: ignore[call-overload]
def test_from_epoch_str() -> None:
    """`from_epoch` on String columns raises InvalidOperationError."""
    ldf = pl.LazyFrame(
        [
            pl.Series("timestamp_ms", [1147880044 * 1_000]).cast(pl.String),
            pl.Series("timestamp_us", [1147880044 * 1_000_000]).cast(pl.String),
        ]
    )

    with pytest.raises(InvalidOperationError):
        ldf.select(
            pl.from_epoch(pl.col("timestamp_ms"), time_unit="ms"),
            pl.from_epoch(pl.col("timestamp_us"), time_unit="us"),
        ).collect()
def test_cum_agg_types() -> None:
    """cum_sum/cum_prod output dtypes match between planned and collected schemas."""
    ldf = pl.LazyFrame({"a": [1, 2], "b": [True, False], "c": [1.3, 2.4]})
    cum_sum_lf = ldf.select(
        pl.col("a").cum_sum(),
        pl.col("b").cum_sum(),
        pl.col("c").cum_sum(),
    )
    assert cum_sum_lf.collect_schema()["a"] == pl.Int64
    assert cum_sum_lf.collect_schema()["b"] == pl.UInt32
    assert cum_sum_lf.collect_schema()["c"] == pl.Float64
    collected_cumsum_lf = cum_sum_lf.collect()
    assert collected_cumsum_lf.schema == cum_sum_lf.collect_schema()

    cum_prod_lf = ldf.select(
        pl.col("a").cast(pl.UInt64).cum_prod(),
        pl.col("b").cum_prod(),
        pl.col("c").cum_prod(),
    )
    assert cum_prod_lf.collect_schema()["a"] == pl.UInt64
    assert cum_prod_lf.collect_schema()["b"] == pl.Int64
    assert cum_prod_lf.collect_schema()["c"] == pl.Float64
    collected_cum_prod_lf = cum_prod_lf.collect()
    assert collected_cum_prod_lf.schema == cum_prod_lf.collect_schema()
def test_compare_schema_between_lazy_and_eager_6904() -> None:
    """Lazy and eager produce the same dtypes for sqrt/pow/diff on empty frames."""
    float32_df = pl.DataFrame({"x": pl.Series(values=[], dtype=pl.Float32)})
    eager_result = float32_df.select(pl.col("x").sqrt()).select(pl.col(pl.Float32))
    lazy_result = (
        float32_df.lazy()
        .select(pl.col("x").sqrt())
        .select(pl.col(pl.Float32))
        .collect()
    )
    assert eager_result.shape == lazy_result.shape

    eager_result = float32_df.select(pl.col("x").pow(2)).select(pl.col(pl.Float32))
    lazy_result = (
        float32_df.lazy()
        .select(pl.col("x").pow(2))
        .select(pl.col(pl.Float32))
        .collect()
    )
    assert eager_result.shape == lazy_result.shape

    int32_df = pl.DataFrame({"x": pl.Series(values=[], dtype=pl.Int32)})
    eager_result = int32_df.select(pl.col("x").pow(2)).select(pl.col(pl.Float64))
    lazy_result = (
        int32_df.lazy().select(pl.col("x").pow(2)).select(pl.col(pl.Float64)).collect()
    )
    assert eager_result.shape == lazy_result.shape

    int8_df = pl.DataFrame({"x": pl.Series(values=[], dtype=pl.Int8)})
    eager_result = int8_df.select(pl.col("x").diff()).select(pl.col(pl.Int16))
    lazy_result = (
        int8_df.lazy().select(pl.col("x").diff()).select(pl.col(pl.Int16)).collect()
    )
    assert eager_result.shape == lazy_result.shape
@pytest.mark.slow
@pytest.mark.parametrize(
    "dtype",
    [
        pl.UInt8,
        pl.UInt16,
        pl.UInt32,
        pl.UInt64,
        pl.Int8,
        pl.Int16,
        pl.Int32,
        pl.Int64,
        pl.Float32,
        pl.Float64,
    ],
)
@pytest.mark.parametrize(
    "func",
    [
        pl.col("x").arg_max(),
        pl.col("x").arg_min(),
        pl.col("x").max(),
        pl.col("x").mean(),
        pl.col("x").median(),
        pl.col("x").min(),
        pl.col("x").nan_max(),
        pl.col("x").nan_min(),
        pl.col("x").product(),
        pl.col("x").quantile(0.5),
        pl.col("x").std(),
        pl.col("x").sum(),
        pl.col("x").var(),
    ],
)
def test_compare_aggregation_between_lazy_and_eager_6904(
    dtype: PolarsDataType, func: pl.Expr
) -> None:
    """Windowed aggregations yield identical results and dtypes lazy vs eager."""
    df = pl.DataFrame(
        {
            "x": pl.Series(values=[1, 2, 3] * 2, dtype=dtype),
            "y": pl.Series(values=["a"] * 3 + ["b"] * 3),
        }
    )
    result_eager = df.select(func.over("y")).select("x")
    dtype_eager = result_eager["x"].dtype
    result_lazy = df.lazy().select(func.over("y")).select(pl.col(dtype_eager)).collect()
    assert_frame_equal(result_eager, result_lazy)
@pytest.mark.parametrize(
    "comparators",
    [
        ("==", pl.LazyFrame.__eq__),
        ("!=", pl.LazyFrame.__ne__),
        (">", pl.LazyFrame.__gt__),
        ("<", pl.LazyFrame.__lt__),
        (">=", pl.LazyFrame.__ge__),
        ("<=", pl.LazyFrame.__le__),
    ],
)
def test_lazy_comparison_operators(
    comparators: tuple[str, Callable[[pl.LazyFrame, Any], NoReturn]],
) -> None:
    """Every rich-comparison operator on LazyFrame raises TypeError."""
    # we cannot compare lazy frames, so all should raise a TypeError
    with pytest.raises(
        TypeError,
        match=f'"{comparators[0]!r}" comparison not supported for LazyFrame objects',
    ):
        comparators[1](pl.LazyFrame(), pl.LazyFrame())
def test_lf_properties() -> None:
    """Schema-resolving properties on LazyFrame emit a PerformanceWarning."""
    lf = pl.LazyFrame(
        {
            "foo": [1, 2, 3],
            "bar": [6.0, 7.0, 8.0],
            "ham": ["a", "b", "c"],
        }
    )
    with pytest.warns(PerformanceWarning):
        assert lf.schema == {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String}
    with pytest.warns(PerformanceWarning):
        assert lf.columns == ["foo", "bar", "ham"]
    with pytest.warns(PerformanceWarning):
        assert lf.dtypes == [pl.Int64, pl.Float64, pl.String]
    with pytest.warns(PerformanceWarning):
        assert lf.width == 3
def test_lf_unnest() -> None:
    """Unnesting multiple struct columns flattens their fields side by side."""
    lf = pl.DataFrame(
        [
            pl.Series(
                "a",
                [{"ab": [1, 2, 3], "ac": [3, 4, 5]}],
                dtype=pl.Struct({"ab": pl.List(pl.Int64), "ac": pl.List(pl.Int64)}),
            ),
            pl.Series(
                "b",
                [{"ba": [5, 6, 7], "bb": [7, 8, 9]}],
                dtype=pl.Struct({"ba": pl.List(pl.Int64), "bb": pl.List(pl.Int64)}),
            ),
        ]
    ).lazy()

    expected = pl.DataFrame(
        [
            pl.Series("ab", [[1, 2, 3]], dtype=pl.List(pl.Int64)),
            pl.Series("ac", [[3, 4, 5]], dtype=pl.List(pl.Int64)),
            pl.Series("ba", [[5, 6, 7]], dtype=pl.List(pl.Int64)),
            pl.Series("bb", [[7, 8, 9]], dtype=pl.List(pl.Int64)),
        ]
    )
    assert_frame_equal(lf.unnest("a", "b").collect(), expected)
def test_type_coercion_cast_boolean_after_comparison() -> None:
    """Boolean casts after comparisons are optimized away; after bitwise ops they stay."""
    import operator

    lf = pl.LazyFrame({"a": 1, "b": 2})

    for op in [
        operator.eq,
        operator.ne,
        operator.lt,
        operator.le,
        operator.gt,
        operator.ge,
        pl.Expr.eq_missing,
        pl.Expr.ne_missing,
    ]:
        # comparisons already produce Boolean, so the cast should be elided
        e = op(pl.col("a"), pl.col("b")).cast(pl.Boolean).alias("o")
        assert "cast" not in lf.with_columns(e).explain()

        e = op(pl.col("a"), pl.col("b")).cast(pl.Boolean).cast(pl.Boolean).alias("o")
        assert "cast" not in lf.with_columns(e).explain()

    for op in [operator.and_, operator.or_, operator.xor]:
        # bitwise ops on ints are not Boolean-typed, so the cast must remain
        e = op(pl.col("a"), pl.col("b")).cast(pl.Boolean)
        assert "cast" in lf.with_columns(e).explain()
def test_unique_length_multiple_columns() -> None:
    """`unique` over all columns counts distinct (a, b) pairs."""
    lf = pl.LazyFrame(
        {
            "a": [1, 1, 1, 2, 3],
            "b": [100, 100, 200, 100, 300],
        }
    )
    assert lf.unique().select(pl.len()).collect().item() == 4
def test_asof_cross_join() -> None:
    """Asof join on sorted keys yields one row per left-hand row."""
    left = pl.LazyFrame({"a": [-10, 5, 10], "left_val": ["a", "b", "c"]}).with_columns(
        pl.col("a").set_sorted()
    )
    right = pl.LazyFrame(
        {"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}
    ).with_columns(pl.col("a").set_sorted())

    out = left.join_asof(right, on="a").collect()
    assert out.shape == (3, 3)
def test_join_bad_input_type() -> None:
    """`join` rejects non-LazyFrame `other`, but accepts LazyFrame subclasses."""
    left = pl.LazyFrame({"a": [1, 2, 3]})
    right = pl.LazyFrame({"a": [1, 2, 3]})

    with pytest.raises(
        TypeError,
        match=r"expected `other` .*to be a 'LazyFrame'.* not 'DataFrame'",
    ):
        left.join(right.collect(), on="a")  # type: ignore[arg-type]

    with pytest.raises(
        TypeError,
        match=r"expected `other` .*to be a 'LazyFrame'.* not 'Series'",
    ):
        left.join(pl.Series([1, 2, 3]), on="a")  # type: ignore[arg-type]

    class DummyLazyFrameSubclass(pl.LazyFrame):
        pass

    a = DummyLazyFrameSubclass(left.collect())
    b = DummyLazyFrameSubclass(right.collect())

    a.join(b, on="a").collect()
def test_join_where() -> None:
    """Inequality join keeps only rows satisfying all predicates."""
    east = pl.LazyFrame(
        {
            "id": [100, 101, 102],
            "dur": [120, 140, 160],
            "rev": [12, 14, 16],
            "cores": [2, 8, 4],
        }
    )
    west = pl.LazyFrame(
        {
            "t_id": [404, 498, 676, 742],
            "time": [90, 130, 150, 170],
            "cost": [9, 13, 15, 16],
            "cores": [4, 2, 1, 4],
        }
    )
    out = east.join_where(
        west,
        pl.col("dur") < pl.col("time"),
        pl.col("rev") < pl.col("cost"),
    ).collect()

    expected = pl.DataFrame(
        {
            "id": [100, 100, 100, 101, 101],
            "dur": [120, 120, 120, 140, 140],
            "rev": [12, 12, 12, 14, 14],
            "cores": [2, 2, 2, 8, 8],
            "t_id": [498, 676, 742, 676, 742],
            "time": [130, 150, 170, 150, 170],
            "cost": [13, 15, 16, 15, 16],
            "cores_right": [2, 1, 4, 1, 4],
        }
    )

    assert_frame_equal(out, expected)
def test_join_where_bad_input_type() -> None:
    """`join_where` rejects non-LazyFrame `other`, but accepts subclasses."""
    east = pl.LazyFrame(
        {
            "id": [100, 101, 102],
            "dur": [120, 140, 160],
            "rev": [12, 14, 16],
            "cores": [2, 8, 4],
        }
    )
    west = pl.LazyFrame(
        {
            "t_id": [404, 498, 676, 742],
            "time": [90, 130, 150, 170],
            "cost": [9, 13, 15, 16],
            "cores": [4, 2, 1, 4],
        }
    )
    with pytest.raises(
        TypeError,
        match=r"expected `other` .*to be a 'LazyFrame'.* not 'DataFrame'",
    ):
        east.join_where(
            west.collect(),  # type: ignore[arg-type]
            pl.col("dur") < pl.col("time"),
            pl.col("rev") < pl.col("cost"),
        )

    with pytest.raises(
        TypeError,
        match=r"expected `other` .*to be a 'LazyFrame'.* not 'Series'",
    ):
        east.join_where(
            pl.Series(west.collect()),  # type: ignore[arg-type]
            pl.col("dur") < pl.col("time"),
            pl.col("rev") < pl.col("cost"),
        )

    class DummyLazyFrameSubclass(pl.LazyFrame):
        pass

    a = DummyLazyFrameSubclass(east.collect())
    b = DummyLazyFrameSubclass(west.collect())

    a.join_where(
        b,
        pl.col("dur") < pl.col("time"),
        pl.col("rev") < pl.col("cost"),
    ).collect()
def test_cache_hit_with_proj_and_pred_pushdown() -> None:
    """Projection/predicate pushdown must not split a shared cache node."""
    rgx = re.compile(r"CACHE\[id: (.*)\]")

    lf = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": ["x", "y", "z"]}).cache()

    q = pl.concat([lf, lf]).select("a", "b")
    assert_frame_equal(
        q.collect(), pl.DataFrame({"a": [1, 2, 3] * 2, "b": [3, 4, 5] * 2})
    )
    e = rgx.findall(q.explain())

    assert len(e) == 2  # there are only 2 caches
    assert e[0] == e[1]  # all caches are the same

    q = pl.concat([lf, lf]).filter(pl.col.a != 0)
    assert_frame_equal(
        q.collect(),
        pl.DataFrame(
            {"a": [1, 2, 3] * 2, "b": [3, 4, 5] * 2, "c": ["x", "y", "z"] * 2}
        ),
    )
    e = rgx.findall(q.explain())

    assert len(e) == 2  # there are only 2 caches
    assert e[0] == e[1]  # all caches are the same
def test_cache_hit_child_removal() -> None:
    """Removing a cache child (e.g. distinct variants) keeps a single shared cache."""
    df = pl.DataFrame(
        {
            "a": [1, 2, 3],
        }
    )

    q = df.lazy().sort("a").cache()

    q1 = pl.concat([q.unique(), q.unique()])
    q2 = pl.concat([q.unique(), q.unique(keep="none")])

    e1 = q1.explain()
    e2 = q2.explain()

    # the sort is performed inside the cache, not re-planned outside it
    assert "SORT" not in e1
    assert "SORT" not in e2

    rgx = re.compile(r"CACHE\[id: (.*)\]")

    e1m = rgx.findall(e1)
    e2m = rgx.findall(e2)

    assert len(e1m) == 2  # there are only 2 caches
    assert len(e2m) == 2  # there are only 2 caches
    assert e1m[0] == e1m[1]  # all caches are the same
    assert e2m[0] == e2m[1]  # all caches are the same

    df1 = q1.collect()
    df2 = q2.collect()

    assert_frame_equal(df1.head(3), df, check_row_order=False)
    assert_frame_equal(df1.tail(3), df, check_row_order=False)
    assert_frame_equal(df2.head(3), df, check_row_order=False)
    assert_frame_equal(df2.tail(3), df, check_row_order=False)