Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/functions/test_functions.py
8408 views
1
from __future__ import annotations

from typing import TYPE_CHECKING, Any

import numpy as np
import pytest

import polars as pl
from polars.exceptions import DuplicateError, InvalidOperationError
from polars.testing import assert_frame_equal, assert_series_equal
from tests.unit.conftest import NUMERIC_DTYPES, TEMPORAL_DTYPES

if TYPE_CHECKING:
    # Import-cycle-safe typing-only imports (lazy thanks to the
    # `from __future__ import annotations` above).
    from polars._typing import ConcatMethod, CorrelationMethod, PolarsDataType
def test_concat_align() -> None:
    """Concatenating with the 'align*' strategies joins frames on their common column.

    Frames a/b/c share only column "a"; each strategy keeps a different subset
    of the aligned rows (full outer, left, right, inner respectively).
    """
    a = pl.DataFrame({"a": ["a", "b", "d", "e", "e"], "b": [1, 2, 4, 5, 6]})
    b = pl.DataFrame({"a": ["a", "b", "c"], "c": [5.5, 6.0, 7.5]})
    c = pl.DataFrame({"a": ["a", "b", "c", "d", "e"], "d": ["w", "x", "y", "z", None]})

    # "align" is an alias for "align_full" (full outer alignment)
    for align_full in ("align", "align_full"):
        result = pl.concat([a, b, c], how=align_full)
        expected = pl.DataFrame(
            {
                "a": ["a", "b", "c", "d", "e", "e"],
                "b": [1, 2, None, 4, 5, 6],
                "c": [5.5, 6.0, 7.5, None, None, None],
                "d": ["w", "x", "y", "z", None, None],
            }
        )
        assert_frame_equal(result, expected)

    result = pl.concat([a, b, c], how="align_left")
    expected = pl.DataFrame(
        {
            "a": ["a", "b", "d", "e", "e"],
            "b": [1, 2, 4, 5, 6],
            "c": [5.5, 6.0, None, None, None],
            "d": ["w", "x", "z", None, None],
        }
    )
    assert_frame_equal(result, expected)

    result = pl.concat([a, b, c], how="align_right")
    expected = pl.DataFrame(
        {
            "a": ["a", "b", "c", "d", "e"],
            "b": [1, 2, None, None, None],
            "c": [5.5, 6.0, 7.5, None, None],
            "d": ["w", "x", "y", "z", None],
        }
    )
    assert_frame_equal(result, expected)

    result = pl.concat([a, b, c], how="align_inner")
    expected = pl.DataFrame(
        {
            "a": ["a", "b"],
            "b": [1, 2],
            "c": [5.5, 6.0],
            "d": ["w", "x"],
        }
    )
    assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "strategy", ["align", "align_full", "align_left", "align_right"]
)
def test_concat_align_no_common_cols(strategy: ConcatMethod) -> None:
    """Aligned concat requires at least one shared column; otherwise it raises."""
    df1 = pl.DataFrame({"a": [1, 2], "b": [1, 2]})
    df2 = pl.DataFrame({"c": [3, 4], "d": [3, 4]})

    with pytest.raises(
        InvalidOperationError,
        match=f"{strategy!r} strategy requires at least one common column",
    ):
        pl.concat((df1, df2), how=strategy)
@pytest.mark.parametrize(
    ("a", "b", "c", "strategy"),
    [
        (
            pl.DataFrame({"a": [1, 2]}),
            pl.DataFrame({"b": ["a", "b"], "c": [3, 4]}),
            pl.DataFrame({"a": [5, 6], "c": [5, 6], "d": [5, 6], "b": ["x", "y"]}),
            "diagonal",
        ),
        (
            pl.DataFrame(
                {"a": [1, 2]},
                schema_overrides={"a": pl.Int32},
            ),
            pl.DataFrame(
                {"b": ["a", "b"], "c": [3, 4]},
                schema_overrides={"c": pl.UInt8},
            ),
            pl.DataFrame(
                {"a": [5, 6], "c": [5, 6], "d": [5, 6], "b": ["x", "y"]},
                schema_overrides={"b": pl.Categorical},
            ),
            "diagonal_relaxed",
        ),
    ],
)
def test_concat_diagonal(
    a: pl.DataFrame, b: pl.DataFrame, c: pl.DataFrame, strategy: ConcatMethod
) -> None:
    """Diagonal concat unions the schemas, null-filling missing columns.

    Checked through both the eager and the lazy engines; "diagonal_relaxed"
    additionally supercasts mismatched dtypes.
    """
    for out in [
        pl.concat([a, b, c], how=strategy),
        pl.concat([a.lazy(), b.lazy(), c.lazy()], how=strategy).collect(),
    ]:
        expected = pl.DataFrame(
            {
                "a": [1, 2, None, None, 5, 6],
                "b": [None, None, "a", "b", "x", "y"],
                "c": [None, None, 3, 4, 5, 6],
                "d": [None, None, None, None, 5, 6],
            }
        )
        assert_frame_equal(out, expected)
def test_concat_diagonal_relaxed_with_empty_frame() -> None:
    """An empty frame contributes nothing to a relaxed diagonal concat."""
    df1 = pl.DataFrame()
    df2 = pl.DataFrame(
        {
            "a": ["a", "b"],
            "b": [1, 2],
        }
    )
    out = pl.concat((df1, df2), how="diagonal_relaxed")
    expected = df2
    assert_frame_equal(out, expected)
@pytest.mark.parametrize("lazy", [False, True])
def test_concat_horizontal(lazy: bool) -> None:
    """Horizontal concat pads shorter frames with nulls (eager and lazy)."""
    a = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})
    b = pl.DataFrame({"c": [5, 7, 8, 9], "d": [1, 2, 1, 2], "e": [1, 2, 1, 2]})

    if lazy:
        out = pl.concat([a.lazy(), b.lazy()], how="horizontal").collect()
    else:
        out = pl.concat([a, b], how="horizontal")

    expected = pl.DataFrame(
        {
            "a": ["a", "b", None, None],
            "b": [1, 2, None, None],
            "c": [5, 7, 8, 9],
            "d": [1, 2, 1, 2],
            "e": [1, 2, 1, 2],
        }
    )
    assert_frame_equal(out, expected)
@pytest.mark.parametrize("lazy", [False, True])
def test_concat_horizontal_three_dfs(lazy: bool) -> None:
    """Horizontal concat of three frames of differing heights null-pads each."""
    a = pl.DataFrame({"a1": [1, 2, 3], "a2": ["a", "b", "c"]})
    b = pl.DataFrame({"b1": [0.25, 0.5]})
    c = pl.DataFrame({"c1": [1, 2, 3, 4], "c2": [5, 6, 7, 8], "c3": [9, 10, 11, 12]})

    if lazy:
        out = pl.concat([a.lazy(), b.lazy(), c.lazy()], how="horizontal").collect()
    else:
        out = pl.concat([a, b, c], how="horizontal")

    expected = pl.DataFrame(
        {
            "a1": [1, 2, 3, None],
            "a2": ["a", "b", "c", None],
            "b1": [0.25, 0.5, None, None],
            "c1": [1, 2, 3, 4],
            "c2": [5, 6, 7, 8],
            "c3": [9, 10, 11, 12],
        }
    )
    assert_frame_equal(out, expected)
@pytest.mark.parametrize("lazy", [False, True])
def test_concat_horizontal_single_df(lazy: bool) -> None:
    """Horizontal concat of a single frame is a no-op."""
    a = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})

    if lazy:
        out = pl.concat([a.lazy()], how="horizontal").collect()
    else:
        out = pl.concat([a], how="horizontal")

    expected = a
    assert_frame_equal(out, expected)
def test_concat_horizontal_duplicate_col() -> None:
    """Horizontal concat raises DuplicateError when frames share a column name."""
    a = pl.LazyFrame({"a": ["a", "b"], "b": [1, 2]})
    b = pl.LazyFrame({"c": [5, 7, 8, 9], "d": [1, 2, 1, 2], "a": [1, 2, 1, 2]})

    with pytest.raises(DuplicateError):
        pl.concat([a, b], how="horizontal").collect()
def test_concat_vertical() -> None:
    """Vertical concat stacks rows of schema-identical frames."""
    a = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})
    b = pl.DataFrame({"a": ["c", "d", "e"], "b": [3, 4, 5]})

    result = pl.concat([a, b], how="vertical")
    expected = pl.DataFrame(
        {
            "a": ["a", "b", "c", "d", "e"],
            "b": [1, 2, 3, 4, 5],
        }
    )
    assert_frame_equal(result, expected)
def test_cov() -> None:
    """pl.cov agrees between expression (lazy) and eager/Series usage, incl. ddof."""
    s1 = pl.Series("a", [10, 37, -40])
    s2 = pl.Series("b", [70, -10, 35])

    # lazy/expression
    lf = pl.LazyFrame([s1, s2])
    res1 = lf.select(
        x=pl.cov("a", "b"),
        y=pl.cov("a", "b", ddof=2),
    ).collect()

    # eager/series
    res2 = (
        pl.cov(s1, s2, eager=True).alias("x"),
        pl.cov(s1, s2, eager=True, ddof=2).alias("y"),
    )

    # expect same result from both approaches
    for idx, (r1, r2) in enumerate(zip(res1, res2, strict=True)):
        expected_value = -645.8333333333 if idx == 0 else -1291.6666666666
        assert pytest.approx(expected_value) == r1.item()
        assert_series_equal(r1, r2)
def test_corr() -> None:
    """pl.corr agrees between expression (lazy) and eager/Series usage.

    Covers both the default Pearson and the Spearman rank method.
    """
    s1 = pl.Series("a", [10, 37, -40])
    s2 = pl.Series("b", [70, -10, 35])

    # lazy/expression
    lf = pl.LazyFrame([s1, s2])
    res1 = lf.select(
        x=pl.corr("a", "b"),
        y=pl.corr("a", "b", method="spearman"),
    ).collect()

    # eager/series
    res2 = (
        pl.corr(s1, s2, eager=True).alias("x"),
        pl.corr(s1, s2, method="spearman", eager=True).alias("y"),
    )

    # expect same result from both approaches
    for idx, (r1, r2) in enumerate(zip(res1, res2, strict=True)):
        assert pytest.approx(-0.412199756 if idx == 0 else -0.5) == r1.item()
        assert_series_equal(r1, r2)
def test_extend_ints() -> None:
    """DataFrame.extend raises SchemaError when integer dtypes do not match."""
    a = pl.DataFrame({"a": [1 for _ in range(1)]}, schema={"a": pl.Int64})
    with pytest.raises(pl.exceptions.SchemaError):
        # extending an Int64 column with an Int32 column must fail
        a.extend(a.select(pl.lit(0, dtype=pl.Int32).alias("a")))
def test_null_handling_correlation() -> None:
    """Nulls are dropped by corr, while NaNs can be propagated explicitly."""
    df = pl.DataFrame({"a": [1, 2, 3, None, 4], "b": [1, 2, 3, 10, 4]})

    out = df.select(
        pl.corr("a", "b").alias("pearson"),
        pl.corr("a", "b", method="spearman").alias("spearman"),
    )
    assert out["pearson"][0] == pytest.approx(1.0)
    assert out["spearman"][0] == pytest.approx(1.0)

    # see #4930
    df1 = pl.DataFrame({"a": [None, 1, 2], "b": [None, 2, 1]})
    df2 = pl.DataFrame({"a": [np.nan, 1, 2], "b": [np.nan, 2, 1]})

    # nulls are ignored...
    assert np.isclose(df1.select(pl.corr("a", "b", method="spearman")).item(), -1.0)
    # ...but NaN propagates when requested
    assert (
        str(
            df2.select(pl.corr("a", "b", method="spearman", propagate_nans=True)).item()
        )
        == "nan"
    )
# see #25407
def test_spearman_propagate_nans_with_all_nulls_does_not_panic() -> None:
    """Spearman corr with propagate_nans on all-null columns returns NaN, not a panic."""
    df = pl.select(x=None, y=None).cast(pl.Float64)

    out = df.select(pl.corr("x", "y", method="spearman", propagate_nans=True))

    assert str(out.item()) == "nan"
def test_align_frames() -> None:
    """align_frames matches pandas index-aligned arithmetic; works lazily too.

    Also checks the no-input edge case and that mixing DataFrame with
    LazyFrame inputs raises TypeError.
    """
    import numpy as np
    import pandas as pd

    # setup some test frames
    pdf1 = pd.DataFrame(
        {
            "date": pd.date_range(start="2019-01-02", periods=9),
            "a": np.array([0, 1, 2, np.nan, 4, 5, 6, 7, 8], dtype=np.float64),
            "b": np.arange(9, 18, dtype=np.float64),
        }
    ).set_index("date")

    pdf2 = pd.DataFrame(
        {
            "date": pd.date_range(start="2019-01-04", periods=7),
            "a": np.arange(9, 16, dtype=np.float64),
            "b": np.arange(10, 17, dtype=np.float64),
        }
    ).set_index("date")

    # calculate dot-product in pandas
    pd_dot = (pdf1 * pdf2).sum(axis="columns").to_frame("dot").reset_index()

    # use "align_frames" to calculate dot-product from disjoint rows. pandas uses an
    # index to automatically infer the correct frame-alignment for the calculation;
    # we need to do it explicitly (which also makes it clearer what is happening)
    pf1, pf2 = pl.align_frames(
        pl.from_pandas(pdf1.reset_index()),
        pl.from_pandas(pdf2.reset_index()),
        on="date",
    )
    pl_dot = (
        (pf1[["a", "b"]] * pf2[["a", "b"]])
        .fill_null(0)
        .select(pl.sum_horizontal("*").alias("dot"))
        .insert_column(0, pf1["date"])
    )
    # confirm we match the same operation in pandas
    assert_frame_equal(pl_dot, pl.from_pandas(pd_dot))
    pd.testing.assert_frame_equal(pd_dot, pl_dot.to_pandas())

    # confirm alignment function works with lazy frames
    lf1, lf2 = pl.align_frames(
        pl.from_pandas(pdf1.reset_index()).lazy(),
        pl.from_pandas(pdf2.reset_index()).lazy(),
        on="date",
    )
    assert isinstance(lf1, pl.LazyFrame)
    assert_frame_equal(lf1.collect(), pf1)
    assert_frame_equal(lf2.collect(), pf2)

    # misc: no frames results in an empty list
    assert pl.align_frames(on="date") == []

    # expected error condition
    with pytest.raises(TypeError):
        pl.align_frames(  # type: ignore[type-var]
            pl.from_pandas(pdf1.reset_index()).lazy(),
            pl.from_pandas(pdf2.reset_index()),
            on="date",
        )
def test_align_frames_misc() -> None:
    """align_frames accepts list/generator inputs and a descending sort order."""
    df1 = pl.DataFrame([[3, 5, 6], [5, 8, 9]], orient="row")
    df2 = pl.DataFrame([[2, 5, 6], [3, 8, 9], [4, 2, 0]], orient="row")

    # descending result
    pf1, pf2 = pl.align_frames(
        [df1, df2],  # list input
        on="column_0",
        descending=True,
    )
    assert pf1.rows() == [(5, 8, 9), (4, None, None), (3, 5, 6), (2, None, None)]
    assert pf2.rows() == [(5, None, None), (4, 2, 0), (3, 8, 9), (2, 5, 6)]

    # handle identical frames
    pf1, pf2, pf3 = pl.align_frames(
        (df for df in (df1, df2, df2)),  # generator input
        on="column_0",
        descending=True,
    )
    assert pf1.rows() == [(5, 8, 9), (4, None, None), (3, 5, 6), (2, None, None)]
    for pf in (pf2, pf3):
        assert pf.rows() == [(5, None, None), (4, 2, 0), (3, 8, 9), (2, 5, 6)]
def test_align_frames_with_nulls() -> None:
    """Null key values participate in alignment and sort first."""
    df1 = pl.DataFrame({"key": ["x", "y", None], "value": [1, 2, 0]})
    df2 = pl.DataFrame({"key": ["x", None, "z", "y"], "value": [4, 3, 6, 5]})

    a1, a2 = pl.align_frames(df1, df2, on="key")

    aligned_frame_data = a1.to_dict(as_series=False), a2.to_dict(as_series=False)
    assert aligned_frame_data == (
        {"key": [None, "x", "y", "z"], "value": [0, 1, 2, None]},
        {"key": [None, "x", "y", "z"], "value": [3, 4, 5, 6]},
    )
def test_align_frames_duplicate_key() -> None:
    """Duplicate alignment-key values expand rows (join semantics), preserving column order."""
    # setup some test frames with duplicate key/alignment values
    df1 = pl.DataFrame({"x": ["a", "a", "a", "e"], "y": [1, 2, 4, 5]})
    df2 = pl.DataFrame({"y": [0, 0, -1], "z": [5.5, 6.0, 7.5], "x": ["a", "b", "b"]})

    # align rows, confirming correctness and original column order
    af1, af2 = pl.align_frames(df1, df2, on="x")

    # shape: (6, 2)   shape: (6, 3)
    # ┌─────┬──────┐  ┌──────┬──────┬─────┐
    # │ x   ┆ y    │  │ y    ┆ z    ┆ x   │
    # │ --- ┆ ---  │  │ ---  ┆ ---  ┆ --- │
    # │ str ┆ i64  │  │ i64  ┆ f64  ┆ str │
    # ╞═════╪══════╡  ╞══════╪══════╪═════╡
    # │ a   ┆ 1    │  │ 0    ┆ 5.5  ┆ a   │
    # │ a   ┆ 2    │  │ 0    ┆ 5.5  ┆ a   │
    # │ a   ┆ 4    │  │ 0    ┆ 5.5  ┆ a   │
    # │ b   ┆ null │  │ 0    ┆ 6.0  ┆ b   │
    # │ b   ┆ null │  │ -1   ┆ 7.5  ┆ b   │
    # │ e   ┆ 5    │  │ null ┆ null ┆ e   │
    # └─────┴──────┘  └──────┴──────┴─────┘
    assert af1.rows() == [
        ("a", 1),
        ("a", 2),
        ("a", 4),
        ("b", None),
        ("b", None),
        ("e", 5),
    ]
    assert af2.rows() == [
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 6.0, "b"),
        (-1, 7.5, "b"),
        (None, None, "e"),
    ]

    # align frames the other way round, using "left" alignment strategy
    af1, af2 = pl.align_frames(df2, df1, on="x", how="left")

    # shape: (5, 3)        shape: (5, 2)
    # ┌─────┬─────┬─────┐  ┌─────┬──────┐
    # │ y   ┆ z   ┆ x   │  │ x   ┆ y    │
    # │ --- ┆ --- ┆ --- │  │ --- ┆ ---  │
    # │ i64 ┆ f64 ┆ str │  │ str ┆ i64  │
    # ╞═════╪═════╪═════╡  ╞═════╪══════╡
    # │ 0   ┆ 5.5 ┆ a   │  │ a   ┆ 1    │
    # │ 0   ┆ 5.5 ┆ a   │  │ a   ┆ 2    │
    # │ 0   ┆ 5.5 ┆ a   │  │ a   ┆ 4    │
    # │ 0   ┆ 6.0 ┆ b   │  │ b   ┆ null │
    # │ -1  ┆ 7.5 ┆ b   │  │ b   ┆ null │
    # └─────┴─────┴─────┘  └─────┴──────┘
    assert af1.rows() == [
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 6.0, "b"),
        (-1, 7.5, "b"),
    ]
    assert af2.rows() == [
        ("a", 1),
        ("a", 2),
        ("a", 4),
        ("b", None),
        ("b", None),
    ]
def test_align_frames_single_row_20445() -> None:
    """Regression test (#20445): left-aligning single-row frames is a no-op."""
    left = pl.DataFrame({"a": [1], "b": [2]})
    right = pl.DataFrame({"a": [1], "c": [3]})
    result = pl.align_frames(left, right, how="left", on="a")
    assert_frame_equal(result[0], left)
    assert_frame_equal(result[1], right)
def test_coalesce() -> None:
    """pl.coalesce picks the first non-null value per row, with literal fallback."""
    df = pl.DataFrame(
        {
            "a": [1, None, None, None],
            "b": [1, 2, None, None],
            "c": [5, None, 3, None],
        }
    )
    # list inputs
    expected = pl.Series("d", [1, 2, 3, 10]).to_frame()
    result = df.select(pl.coalesce(["a", "b", "c", 10]).alias("d"))
    assert_frame_equal(expected, result)

    # positional inputs
    expected = pl.Series("d", [1.0, 2.0, 3.0, 10.0]).to_frame()
    result = df.select(pl.coalesce(pl.col(["a", "b", "c"]), 10.0).alias("d"))
    assert_frame_equal(result, expected)
def test_coalesce_eager() -> None:
    """Eager coalesce over Series inputs; requires at least one Series input."""
    # eager/series inputs
    s1 = pl.Series("colx", [None, 2, None])
    s2 = pl.Series("coly", [1, None, None])
    s3 = pl.Series("colz", [None, None, 3])

    res = pl.coalesce(s1, s2, s3, eager=True)
    expected = pl.Series("colx", [1, 2, 3])
    assert_series_equal(expected, res)

    for zero in (0, pl.lit(0)):
        res = pl.coalesce(s1, zero, eager=True)
        expected = pl.Series("colx", [0, 2, 0])
        assert_series_equal(expected, res)

        res = pl.coalesce(zero, s1, eager=True)
        expected = pl.Series("literal", [0, 0, 0])
        assert_series_equal(expected, res)

    with pytest.raises(
        ValueError,
        match="expected at least one Series in 'coalesce' if 'eager=True'",
    ):
        pl.coalesce("x", "y", eager=True)
def test_overflow_diff() -> None:
    """diff on an unsigned column yields signed results rather than wrapping."""
    df = pl.DataFrame({"a": [20, 10, 30]})
    assert df.select(pl.col("a").cast(pl.UInt64).diff()).to_dict(as_series=False) == {
        "a": [None, -10, 20]
    }
@pytest.mark.may_fail_cloud  # reason: unknown type
def test_fill_null_unknown_output_type() -> None:
    """fill_null works even when the input dtype is unknown (numpy ufunc output)."""
    df = pl.DataFrame({"a": [None, 2, 3, 4, 5]})
    assert df.with_columns(
        np.exp(pl.col("a")).fill_null(pl.lit(1, pl.Float64))
    ).to_dict(as_series=False) == {
        "a": [
            1.0,
            7.38905609893065,
            20.085536923187668,
            54.598150033144236,
            148.4131591025766,
        ]
    }
@pytest.mark.parametrize(("dtype"), [*NUMERIC_DTYPES, *TEMPORAL_DTYPES])
def test_approx_n_unique(dtype: pl.DataType) -> None:
    """approx_n_unique on 100 distinct values stays within ~8% of the true count."""
    df = pl.DataFrame({"a": pl.arange(100, eager=True).cast(dtype)})
    cardinality = df.select(pl.col("a").approx_n_unique()).to_series()[0]
    # HyperLogLog-style estimate: allow a tolerance band around 100
    assert 92 <= cardinality <= 108
def test_approx_n_unique_null() -> None:
    """An all-null column counts as exactly one approximate unique value."""
    df = pl.DataFrame({"a": 100 * [None]})
    cardinality = df.select(pl.col("a").approx_n_unique()).to_series()[0]
    assert cardinality == 1
def test_lazy_functions() -> None:
    """Top-level aggregation helpers (var/std/max/min/sum/...) over frames and series.

    Also checks regex column selection (`^...$`) inside struct expressions.
    """
    df = pl.DataFrame(
        {
            "a": ["foo", "bar", "foo"],
            "b": [1, 2, 3],
            "c": [-1.0, 2.0, 4.0],
        }
    )

    # test function expressions against frame
    out = df.select(
        pl.var("b").name.suffix("_var"),
        pl.std("b").name.suffix("_std"),
        pl.max("a", "b").name.suffix("_max"),
        pl.min("a", "b").name.suffix("_min"),
        pl.sum("b", "c").name.suffix("_sum"),
        pl.mean("b", "c").name.suffix("_mean"),
        pl.median("c", "b").name.suffix("_median"),
        pl.n_unique("b", "a").name.suffix("_n_unique"),
        pl.first("a").name.suffix("_first"),
        pl.first("b", "c").name.suffix("_first"),
        pl.last("c", "b", "a").name.suffix("_last"),
    )
    expected: dict[str, list[Any]] = {
        "b_var": [1.0],
        "b_std": [1.0],
        "a_max": ["foo"],
        "b_max": [3],
        "a_min": ["bar"],
        "b_min": [1],
        "b_sum": [6],
        "c_sum": [5.0],
        "b_mean": [2.0],
        "c_mean": [5 / 3],
        "c_median": [2.0],
        "b_median": [2.0],
        "b_n_unique": [3],
        "a_n_unique": [2],
        "a_first": ["foo"],
        "b_first": [1],
        "c_first": [-1.0],
        "c_last": [4.0],
        "b_last": [3],
        "a_last": ["foo"],
    }
    assert_frame_equal(
        out,
        pl.DataFrame(
            data=expected,
            schema_overrides={
                # n_unique returns the platform index type, not i64
                "a_n_unique": pl.get_index_type(),
                "b_n_unique": pl.get_index_type(),
            },
        ),
    )

    # test function expressions against series
    for name, value in expected.items():
        col, fn = name.split("_", 1)
        if series_fn := getattr(df[col], fn, None):
            assert series_fn() == value[0]

    # regex selection
    out = df.select(
        pl.struct(pl.max("^a|b$")).alias("x"),
        pl.struct(pl.min("^.*[bc]$")).alias("y"),
        pl.struct(pl.sum("^[^a]$")).alias("z"),
    )
    assert out.rows() == [
        ({"a": "foo", "b": 3}, {"b": 1, "c": -1.0}, {"b": 6, "c": 5.0})
    ]
def test_count() -> None:
    """pl.count counts non-null values; multi-column and list forms agree."""
    df = pl.DataFrame({"a": [1, 1, 1], "b": [None, "xx", "yy"]})
    out = df.select(pl.count("a"))
    assert list(out["a"]) == [3]

    for count_expr in (
        pl.count("b", "a"),
        [pl.count("b"), pl.count("a")],
    ):
        out = df.select(count_expr)
        # "b" has one null, so its count is 2
        assert out.rows() == [(2, 3)]
def test_head_tail(fruits_cars: pl.DataFrame) -> None:
    """pl.head / pl.tail expression forms return the first/last n values."""
    res_expr = fruits_cars.select(pl.head("A", 2))
    expected = pl.Series("A", [1, 2])
    assert_series_equal(res_expr.to_series(), expected)

    res_expr = fruits_cars.select(pl.tail("A", 2))
    expected = pl.Series("A", [4, 5])
    assert_series_equal(res_expr.to_series(), expected)
@pytest.mark.parametrize(
    "dtype", [pl.Int32, pl.Boolean, pl.String, pl.Categorical, pl.List]
)
def test_first_last(dtype: PolarsDataType) -> None:
    """first/last across dtypes, incl. ignore_nulls, empty input, and no-null input.

    The series is deliberately built from multiple chunks (append) so the
    chunked code path is exercised.
    """
    # Ensure multiple chunks.
    s1 = pl.Series("a", [None, None], dtype=pl.Int32)
    s2 = pl.Series("a", [None, 3, 4, None], dtype=pl.Int32)
    s3 = pl.Series("a", [None, None], dtype=pl.Int32)
    s = s1.append(s2).append(s3)
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        s = s.cast(pl.String)
    s = s.cast(dtype)
    lf = s.to_frame().lazy()

    result = lf.select(pl.col("a").first()).collect()
    expected_value = pl.Series("a", [None])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
    expected = expected_value.cast(dtype).to_frame()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").first(ignore_nulls=True)).collect()
    expected_value = pl.Series("a", [3])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)

    expected = expected_value.cast(dtype).to_frame()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last()).collect()
    expected_value = pl.Series("a", [None])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
    expected = expected_value.cast(dtype).to_frame()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last(ignore_nulls=True)).collect()
    expected_value = pl.Series("a", [4])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
    expected = expected_value.cast(dtype).to_frame()
    assert_frame_equal(result, expected)

    # Test with empty
    lf = pl.Series("a", [], dtype=dtype).to_frame().lazy()
    expected = pl.Series("a", [None], dtype=dtype).to_frame()

    result = lf.select(pl.col("a").first()).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").first(ignore_nulls=True)).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last()).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last(ignore_nulls=True)).collect()
    assert_frame_equal(result, expected)

    # Test with no nulls
    lf = pl.Series("a", [1, 2, 3, 4, 5], dtype=pl.Int32).to_frame().lazy()
    expected_value = pl.Series("a", [1])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
        lf = lf.with_columns(pl.col("a").cast(pl.String))

    lf = lf.with_columns(pl.col("a").cast(dtype))
    expected = expected_value.cast(dtype).to_frame()

    result = lf.select(pl.col("a").first()).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").first(ignore_nulls=True)).collect()
    assert_frame_equal(result, expected)

    expected_value = pl.Series("a", [5])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
    expected = expected_value.cast(dtype).to_frame()

    result = lf.select(pl.col("a").last()).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last(ignore_nulls=True)).collect()
    assert_frame_equal(result, expected)
def test_escape_regex() -> None:
    """pl.escape_regex escapes str input and rejects Expr/non-str arguments."""
    result = pl.escape_regex("abc(\\w+)")
    expected = "abc\\(\\\\w\\+\\)"
    assert result == expected

    df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    with pytest.raises(
        TypeError,
        match=r"escape_regex function is unsupported for `Expr`, you may want use `Expr\.str\.escape_regex` instead",
    ):
        df.with_columns(escaped=pl.escape_regex(pl.col("text")))  # type: ignore[arg-type]

    with pytest.raises(
        TypeError,
        match="escape_regex function supports only `str` type, got `int`",
    ):
        pl.escape_regex(3)  # type: ignore[arg-type]
@pytest.mark.parametrize("func", ["var", "std"])
def test_var_std_lit_23156(func: str) -> None:
    """Regression test (#23156): var/std of a broadcast literal column.

    With fewer than two rows the result is null; otherwise the spread of a
    constant column is exactly 0.0.
    """
    for n in range(100):
        input = pl.DataFrame({"x": list(range(n))}).select(pl.col("x"), pl.lit(0))
        out = getattr(input, func)()
        if n <= 1:
            assert_series_equal(
                out["literal"], pl.Series("literal", [None], dtype=pl.Float64)
            )
        else:
            assert_series_equal(
                out["literal"], pl.Series("literal", [0.0], dtype=pl.Float64)
            )
def test_row_index_expr() -> None:
    """pl.row_index works in with_columns, group_by aggregations, and select."""
    lf = pl.LazyFrame({"x": ["A", "A", "B", "B", "B"]})

    assert_frame_equal(
        lf.with_columns(pl.row_index(), pl.row_index("another_index")).collect(),
        pl.DataFrame(
            {
                "x": ["A", "A", "B", "B", "B"],
                "index": [0, 1, 2, 3, 4],
                "another_index": [0, 1, 2, 3, 4],
            },
            schema={
                "x": pl.String,
                "index": pl.get_index_type(),
                "another_index": pl.get_index_type(),
            },
        ),
    )

    # in a group context, each group gets its own index sequence
    assert_frame_equal(
        (
            lf.group_by("x")
            .agg(pl.row_index(), pl.row_index("another_index"))
            .sort("x")
            .collect()
        ),
        pl.DataFrame(
            {
                "x": ["A", "B"],
                "index": [[0, 1], [0, 1, 2]],
                "another_index": [[0, 1], [0, 1, 2]],
            },
            schema={
                "x": pl.String,
                "index": pl.List(pl.get_index_type()),
                "another_index": pl.List(pl.get_index_type()),
            },
        ),
    )

    assert_frame_equal(
        lf.select(pl.row_index()).collect(),
        pl.DataFrame(
            {"index": [0, 1, 2, 3, 4]},
            schema={"index": pl.get_index_type()},
        ),
    )
@pytest.mark.parametrize("dt", [pl.Float16, pl.Float32, pl.Float64])
@pytest.mark.parametrize("method", ["pearson", "spearman"])
def test_corr_spearman_float_dtype_26335(
    dt: pl.DataType, method: CorrelationMethod
) -> None:
    """Regression test (#26335): corr preserves the input float dtype.

    Checked in both a plain select and a group_by aggregation context.
    """
    df = pl.DataFrame(
        {
            "a": [1, 8, 3],
            "b": [4, 5, 2],
            "c": ["foo", "foo", "foo"],
        },
        schema_overrides={"a": dt, "b": dt},
    )

    q = df.lazy().select(pl.corr("a", "b", method=method))
    out = q.collect()
    assert out.schema["a"] == dt

    q = df.lazy().group_by("c").agg(pl.corr("a", "b", method=method))
    out = q.collect()
    assert out.schema["a"] == dt