Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/functions/test_functions.py
6939 views
1
from __future__ import annotations
2
3
from typing import TYPE_CHECKING, Any
4
5
import numpy as np
6
import pytest
7
8
import polars as pl
9
from polars.exceptions import DuplicateError, InvalidOperationError
10
from polars.testing import assert_frame_equal, assert_series_equal
11
12
if TYPE_CHECKING:
13
from polars._typing import ConcatMethod
14
15
16
def test_concat_align() -> None:
    """Exercise every ``align*`` strategy of ``pl.concat`` on three frames sharing "a"."""
    left = pl.DataFrame({"a": ["a", "b", "d", "e", "e"], "b": [1, 2, 4, 5, 6]})
    middle = pl.DataFrame({"a": ["a", "b", "c"], "c": [5.5, 6.0, 7.5]})
    right = pl.DataFrame(
        {"a": ["a", "b", "c", "d", "e"], "d": ["w", "x", "y", "z", None]}
    )
    frames = [left, middle, right]

    # "align" and "align_full" are checked against the same expected frame
    expected_full = pl.DataFrame(
        {
            "a": ["a", "b", "c", "d", "e", "e"],
            "b": [1, 2, None, 4, 5, 6],
            "c": [5.5, 6.0, 7.5, None, None, None],
            "d": ["w", "x", "y", "z", None, None],
        }
    )
    for strategy in ("align", "align_full"):
        assert_frame_equal(pl.concat(frames, how=strategy), expected_full)

    # keys are taken from the first frame only
    expected_left = pl.DataFrame(
        {
            "a": ["a", "b", "d", "e", "e"],
            "b": [1, 2, 4, 5, 6],
            "c": [5.5, 6.0, None, None, None],
            "d": ["w", "x", "z", None, None],
        }
    )
    assert_frame_equal(pl.concat(frames, how="align_left"), expected_left)

    # keys are taken from the last frame only
    expected_right = pl.DataFrame(
        {
            "a": ["a", "b", "c", "d", "e"],
            "b": [1, 2, None, None, None],
            "c": [5.5, 6.0, 7.5, None, None],
            "d": ["w", "x", "y", "z", None],
        }
    )
    assert_frame_equal(pl.concat(frames, how="align_right"), expected_right)

    # only keys present in every frame survive
    expected_inner = pl.DataFrame(
        {
            "a": ["a", "b"],
            "b": [1, 2],
            "c": [5.5, 6.0],
            "d": ["w", "x"],
        }
    )
    assert_frame_equal(pl.concat(frames, how="align_inner"), expected_inner)
65
66
67
@pytest.mark.parametrize(
    "strategy", ["align", "align_full", "align_left", "align_right"]
)
def test_concat_align_no_common_cols(strategy: ConcatMethod) -> None:
    """Aligning frames that share no column name must raise InvalidOperationError."""
    disjoint_frames = (
        pl.DataFrame({"a": [1, 2], "b": [1, 2]}),
        pl.DataFrame({"c": [3, 4], "d": [3, 4]}),
    )
    expected_message = f"{strategy!r} strategy requires at least one common column"

    with pytest.raises(InvalidOperationError, match=expected_message):
        pl.concat(disjoint_frames, how=strategy)
79
80
81
@pytest.mark.parametrize(
    ("a", "b", "c", "strategy"),
    [
        (
            pl.DataFrame({"a": [1, 2]}),
            pl.DataFrame({"b": ["a", "b"], "c": [3, 4]}),
            pl.DataFrame({"a": [5, 6], "c": [5, 6], "d": [5, 6], "b": ["x", "y"]}),
            "diagonal",
        ),
        (
            pl.DataFrame(
                {"a": [1, 2]},
                schema_overrides={"a": pl.Int32},
            ),
            pl.DataFrame(
                {"b": ["a", "b"], "c": [3, 4]},
                schema_overrides={"c": pl.UInt8},
            ),
            pl.DataFrame(
                {"a": [5, 6], "c": [5, 6], "d": [5, 6], "b": ["x", "y"]},
                schema_overrides={"b": pl.Categorical},
            ),
            "diagonal_relaxed",
        ),
    ],
)
def test_concat_diagonal(
    a: pl.DataFrame, b: pl.DataFrame, c: pl.DataFrame, strategy: ConcatMethod
) -> None:
    """Diagonal concatenation must give the same frame in eager and lazy mode."""
    # both results are produced before either is checked
    eager_out = pl.concat([a, b, c], how=strategy)
    lazy_out = pl.concat([a.lazy(), b.lazy(), c.lazy()], how=strategy).collect()

    for out in (eager_out, lazy_out):
        expected = pl.DataFrame(
            {
                "a": [1, 2, None, None, 5, 6],
                "b": [None, None, "a", "b", "x", "y"],
                "c": [None, None, 3, 4, 5, 6],
                "d": [None, None, None, None, 5, 6],
            }
        )
        assert_frame_equal(out, expected)
123
124
125
def test_concat_diagonal_relaxed_with_empty_frame() -> None:
    """Concatenating an empty frame with a populated one returns the populated one."""
    empty = pl.DataFrame()
    populated = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})

    result = pl.concat((empty, populated), how="diagonal_relaxed")

    assert_frame_equal(result, populated)
136
137
138
@pytest.mark.parametrize("lazy", [False, True])
def test_concat_horizontal(lazy: bool) -> None:
    """Horizontal concat of frames of unequal height null-pads the shorter one."""
    short = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})
    tall = pl.DataFrame({"c": [5, 7, 8, 9], "d": [1, 2, 1, 2], "e": [1, 2, 1, 2]})

    out = (
        pl.concat([short.lazy(), tall.lazy()], how="horizontal").collect()
        if lazy
        else pl.concat([short, tall], how="horizontal")
    )

    assert_frame_equal(
        out,
        pl.DataFrame(
            {
                "a": ["a", "b", None, None],
                "b": [1, 2, None, None],
                "c": [5, 7, 8, 9],
                "d": [1, 2, 1, 2],
                "e": [1, 2, 1, 2],
            }
        ),
    )
158
159
160
@pytest.mark.parametrize("lazy", [False, True])
def test_concat_horizontal_three_dfs(lazy: bool) -> None:
    """Horizontal concat of three frames pads each one to the tallest height."""
    first = pl.DataFrame({"a1": [1, 2, 3], "a2": ["a", "b", "c"]})
    second = pl.DataFrame({"b1": [0.25, 0.5]})
    third = pl.DataFrame(
        {"c1": [1, 2, 3, 4], "c2": [5, 6, 7, 8], "c3": [9, 10, 11, 12]}
    )

    out = (
        pl.concat(
            [first.lazy(), second.lazy(), third.lazy()], how="horizontal"
        ).collect()
        if lazy
        else pl.concat([first, second, third], how="horizontal")
    )

    assert_frame_equal(
        out,
        pl.DataFrame(
            {
                "a1": [1, 2, 3, None],
                "a2": ["a", "b", "c", None],
                "b1": [0.25, 0.5, None, None],
                "c1": [1, 2, 3, 4],
                "c2": [5, 6, 7, 8],
                "c3": [9, 10, 11, 12],
            }
        ),
    )
182
183
184
@pytest.mark.parametrize("lazy", [False, True])
def test_concat_horizontal_single_df(lazy: bool) -> None:
    """Horizontal concat of a single frame returns an equal frame."""
    frame = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})

    out = (
        pl.concat([frame.lazy()], how="horizontal").collect()
        if lazy
        else pl.concat([frame], how="horizontal")
    )

    assert_frame_equal(out, frame)
195
196
197
def test_concat_horizontal_duplicate_col() -> None:
    """Lazy horizontal concat with a repeated column name ("a") raises DuplicateError."""
    lf_left = pl.LazyFrame({"a": ["a", "b"], "b": [1, 2]})
    lf_right = pl.LazyFrame(
        {"c": [5, 7, 8, 9], "d": [1, 2, 1, 2], "a": [1, 2, 1, 2]}
    )

    with pytest.raises(DuplicateError):
        pl.concat([lf_left, lf_right], how="horizontal").collect()
203
204
205
def test_concat_vertical() -> None:
    """Vertical concat stacks the rows of the second frame under the first."""
    top = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})
    bottom = pl.DataFrame({"a": ["c", "d", "e"], "b": [3, 4, 5]})

    stacked = pl.concat([top, bottom], how="vertical")

    assert_frame_equal(
        stacked,
        pl.DataFrame(
            {
                "a": ["a", "b", "c", "d", "e"],
                "b": [1, 2, 3, 4, 5],
            }
        ),
    )
217
218
219
def test_cov() -> None:
    """``pl.cov`` must agree between the lazy/expression and eager/series paths."""
    s1 = pl.Series("a", [10, 37, -40])
    s2 = pl.Series("b", [70, -10, 35])

    # lazy/expression
    lazy_result = (
        pl.LazyFrame([s1, s2])
        .select(
            x=pl.cov("a", "b"),
            y=pl.cov("a", "b", ddof=2),
        )
        .collect()
    )

    # eager/series
    eager_result = (
        pl.cov(s1, s2, eager=True).alias("x"),
        pl.cov(s1, s2, eager=True, ddof=2).alias("y"),
    )

    # expect same result from both approaches; values are for "x" then "y"
    expected_values = (-645.8333333333, -1291.6666666666)
    for lazy_col, eager_col, expected_value in zip(
        lazy_result, eager_result, expected_values
    ):
        assert pytest.approx(expected_value) == lazy_col.item()
        assert_series_equal(lazy_col, eager_col)
241
242
243
def test_corr() -> None:
    """``pl.corr`` (default and spearman) must agree between lazy and eager paths."""
    s1 = pl.Series("a", [10, 37, -40])
    s2 = pl.Series("b", [70, -10, 35])

    # lazy/expression
    lazy_result = (
        pl.LazyFrame([s1, s2])
        .select(
            x=pl.corr("a", "b"),
            y=pl.corr("a", "b", method="spearman"),
        )
        .collect()
    )

    # eager/series
    eager_result = (
        pl.corr(s1, s2, eager=True).alias("x"),
        pl.corr(s1, s2, method="spearman", eager=True).alias("y"),
    )

    # expect same result from both approaches; values are for "x" then "y"
    expected_values = (-0.412199756, -0.5)
    for lazy_col, eager_col, expected_value in zip(
        lazy_result, eager_result, expected_values
    ):
        assert pytest.approx(expected_value) == lazy_col.item()
        assert_series_equal(lazy_col, eager_col)
264
265
266
def test_extend_ints() -> None:
    """Extending an Int64 frame with an Int32 column of the same name must raise."""
    frame = pl.DataFrame({"a": [1]}, schema={"a": pl.Int64})
    int32_literal = pl.lit(0, dtype=pl.Int32).alias("a")

    with pytest.raises(pl.exceptions.SchemaError):
        frame.extend(frame.select(int32_literal))
270
271
272
def test_null_handling_correlation() -> None:
    """Check how ``pl.corr`` treats null and NaN inputs.

    With a null in one column, pearson and spearman are both expected to be
    1.0 for this data. For the #4930 regression, null inputs are expected to
    give a spearman of -1.0, while NaN inputs are expected to give NaN when
    ``propagate_nans=True``.
    """
    df = pl.DataFrame({"a": [1, 2, 3, None, 4], "b": [1, 2, 3, 10, 4]})

    out = df.select(
        pl.corr("a", "b").alias("pearson"),
        pl.corr("a", "b", method="spearman").alias("spearman"),
    )
    assert out["pearson"][0] == pytest.approx(1.0)
    assert out["spearman"][0] == pytest.approx(1.0)

    # see #4930
    df_nulls = pl.DataFrame({"a": [None, 1, 2], "b": [None, 2, 1]})
    df_nans = pl.DataFrame({"a": [np.nan, 1, 2], "b": [np.nan, 2, 1]})

    spearman_nulls = df_nulls.select(pl.corr("a", "b", method="spearman")).item()
    assert np.isclose(spearman_nulls, -1.0)

    spearman_nans = df_nans.select(
        pl.corr("a", "b", method="spearman", propagate_nans=True)
    ).item()
    # test NaN numerically rather than via its string representation
    assert np.isnan(spearman_nans)
293
294
295
def test_align_frames() -> None:
    """Cross-check ``pl.align_frames`` against pandas' index-based alignment.

    A row-wise dot product of two frames with partially overlapping dates is
    computed once with pandas and once with polars after an explicit
    ``align_frames`` call; the results must match. The same alignment is then
    repeated with lazy frames, with no frames at all, and with a mix of lazy
    and eager frames, which must raise ``TypeError``.
    """
    # numpy comes from the module-level import; only pandas is imported locally
    import pandas as pd

    # setup some test frames
    pdf1 = pd.DataFrame(
        {
            "date": pd.date_range(start="2019-01-02", periods=9),
            "a": np.array([0, 1, 2, np.nan, 4, 5, 6, 7, 8], dtype=np.float64),
            "b": np.arange(9, 18, dtype=np.float64),
        }
    ).set_index("date")

    pdf2 = pd.DataFrame(
        {
            "date": pd.date_range(start="2019-01-04", periods=7),
            "a": np.arange(9, 16, dtype=np.float64),
            "b": np.arange(10, 17, dtype=np.float64),
        }
    ).set_index("date")

    # calculate dot-product in pandas
    pd_dot = (pdf1 * pdf2).sum(axis="columns").to_frame("dot").reset_index()

    # polars equivalents of the pandas frames, with "date" as a regular column
    plf1 = pl.from_pandas(pdf1.reset_index())
    plf2 = pl.from_pandas(pdf2.reset_index())

    # use "align_frames" to calculate dot-product from disjoint rows. pandas uses an
    # index to automatically infer the correct frame-alignment for the calculation;
    # we need to do it explicitly (which also makes it clearer what is happening)
    pf1, pf2 = pl.align_frames(plf1, plf2, on="date")
    pl_dot = (
        (pf1[["a", "b"]] * pf2[["a", "b"]])
        .fill_null(0)
        .select(pl.sum_horizontal("*").alias("dot"))
        .insert_column(0, pf1["date"])
    )
    # confirm we match the same operation in pandas
    assert_frame_equal(pl_dot, pl.from_pandas(pd_dot))
    pd.testing.assert_frame_equal(pd_dot, pl_dot.to_pandas())

    # confirm alignment function works with lazy frames
    lf1, lf2 = pl.align_frames(plf1.lazy(), plf2.lazy(), on="date")
    assert isinstance(lf1, pl.LazyFrame)
    assert isinstance(lf2, pl.LazyFrame)
    assert_frame_equal(lf1.collect(), pf1)
    assert_frame_equal(lf2.collect(), pf2)

    # misc: no frames results in an empty list
    assert pl.align_frames(on="date") == []

    # expected error condition
    with pytest.raises(TypeError):
        pl.align_frames(  # type: ignore[type-var]
            plf1.lazy(),
            plf2,
            on="date",
        )
357
358
359
def test_align_frames_misc() -> None:
    """``align_frames`` accepts list and generator inputs and honours ``descending``."""
    df1 = pl.DataFrame([[3, 5, 6], [5, 8, 9]], orient="row")
    df2 = pl.DataFrame([[2, 5, 6], [3, 8, 9], [4, 2, 0]], orient="row")

    expected_rows_df1 = [(5, 8, 9), (4, None, None), (3, 5, 6), (2, None, None)]
    expected_rows_df2 = [(5, None, None), (4, 2, 0), (3, 8, 9), (2, 5, 6)]

    # descending result
    aligned1, aligned2 = pl.align_frames(
        [df1, df2],  # list input
        on="column_0",
        descending=True,
    )
    assert aligned1.rows() == expected_rows_df1
    assert aligned2.rows() == expected_rows_df2

    # handle identical frames
    aligned1, aligned2, aligned3 = pl.align_frames(
        (df for df in (df1, df2, df2)),  # generator input
        on="column_0",
        descending=True,
    )
    assert aligned1.rows() == expected_rows_df1
    for aligned in (aligned2, aligned3):
        assert aligned.rows() == expected_rows_df2
381
382
383
def test_align_frames_with_nulls() -> None:
    """A null alignment key is aligned like any other value and comes first."""
    df1 = pl.DataFrame({"key": ["x", "y", None], "value": [1, 2, 0]})
    df2 = pl.DataFrame({"key": ["x", None, "z", "y"], "value": [4, 3, 6, 5]})

    aligned = pl.align_frames(df1, df2, on="key")
    first, second = aligned

    shared_keys = [None, "x", "y", "z"]
    assert (first.to_dict(as_series=False), second.to_dict(as_series=False)) == (
        {"key": shared_keys, "value": [0, 1, 2, None]},
        {"key": shared_keys, "value": [3, 4, 5, 6]},
    )
394
395
396
def test_align_frames_duplicate_key() -> None:
    """Duplicate alignment keys keep each frame's column order in the result."""
    # setup some test frames with duplicate key/alignment values
    df1 = pl.DataFrame({"x": ["a", "a", "a", "e"], "y": [1, 2, 4, 5]})
    df2 = pl.DataFrame({"y": [0, 0, -1], "z": [5.5, 6.0, 7.5], "x": ["a", "b", "b"]})

    # expected rows for the default alignment; the "left" case below expects the
    # same rows minus the trailing "e" row, which only exists in df1
    rows_xy = [
        ("a", 1),
        ("a", 2),
        ("a", 4),
        ("b", None),
        ("b", None),
        ("e", 5),
    ]
    rows_yzx = [
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 6.0, "b"),
        (-1, 7.5, "b"),
        (None, None, "e"),
    ]

    # align rows, confirming correctness and original column order
    aligned_xy, aligned_yzx = pl.align_frames(df1, df2, on="x")

    # shape: (6, 2)   shape: (6, 3)
    # ┌─────┬──────┐  ┌──────┬──────┬─────┐
    # │ x   ┆ y    │  │ y    ┆ z    ┆ x   │
    # │ --- ┆ ---  │  │ ---  ┆ ---  ┆ --- │
    # │ str ┆ i64  │  │ i64  ┆ f64  ┆ str │
    # ╞═════╪══════╡  ╞══════╪══════╪═════╡
    # │ a   ┆ 1    │  │ 0    ┆ 5.5  ┆ a   │
    # │ a   ┆ 2    │  │ 0    ┆ 5.5  ┆ a   │
    # │ a   ┆ 4    │  │ 0    ┆ 5.5  ┆ a   │
    # │ b   ┆ null │  │ 0    ┆ 6.0  ┆ b   │
    # │ b   ┆ null │  │ -1   ┆ 7.5  ┆ b   │
    # │ e   ┆ 5    │  │ null ┆ null ┆ e   │
    # └─────┴──────┘  └──────┴──────┴─────┘
    assert aligned_xy.rows() == rows_xy
    assert aligned_yzx.rows() == rows_yzx

    # align frames the other way round, using "left" alignment strategy
    aligned_yzx, aligned_xy = pl.align_frames(df2, df1, on="x", how="left")

    # shape: (5, 3)        shape: (5, 2)
    # ┌─────┬─────┬─────┐  ┌─────┬──────┐
    # │ y   ┆ z   ┆ x   │  │ x   ┆ y    │
    # │ --- ┆ --- ┆ --- │  │ --- ┆ ---  │
    # │ i64 ┆ f64 ┆ str │  │ str ┆ i64  │
    # ╞═════╪═════╪═════╡  ╞═════╪══════╡
    # │ 0   ┆ 5.5 ┆ a   │  │ a   ┆ 1    │
    # │ 0   ┆ 5.5 ┆ a   │  │ a   ┆ 2    │
    # │ 0   ┆ 5.5 ┆ a   │  │ a   ┆ 4    │
    # │ 0   ┆ 6.0 ┆ b   │  │ b   ┆ null │
    # │ -1  ┆ 7.5 ┆ b   │  │ b   ┆ null │
    # └─────┴─────┴─────┘  └─────┴──────┘
    assert aligned_yzx.rows() == rows_yzx[:-1]
    assert aligned_xy.rows() == rows_xy[:-1]
463
464
465
def test_align_frames_single_row_20445() -> None:
    """Left-aligning two single-row frames on "a" returns both frames unchanged."""
    left = pl.DataFrame({"a": [1], "b": [2]})
    right = pl.DataFrame({"a": [1], "c": [3]})

    aligned = pl.align_frames(left, right, how="left", on="a")

    for position, original in enumerate((left, right)):
        assert_frame_equal(aligned[position], original)
471
472
473
def test_coalesce() -> None:
    """``pl.coalesce`` takes the first non-null value across its inputs, row by row.

    Checked with a single list argument (integer fallback literal) and with
    positional arguments (float fallback literal, for which a float result
    column is expected).
    """
    df = pl.DataFrame(
        {
            "a": [1, None, None, None],
            "b": [1, 2, None, None],
            "c": [5, None, 3, None],
        }
    )
    # list inputs
    expected = pl.Series("d", [1, 2, 3, 10]).to_frame()
    result = df.select(pl.coalesce(["a", "b", "c", 10]).alias("d"))
    # actual value first, expected second, matching the rest of this module
    assert_frame_equal(result, expected)

    # positional inputs
    expected = pl.Series("d", [1.0, 2.0, 3.0, 10.0]).to_frame()
    result = df.select(pl.coalesce(pl.col(["a", "b", "c"]), 10.0).alias("d"))
    assert_frame_equal(result, expected)
490
491
492
def test_coalesce_eager() -> None:
    """``pl.coalesce(..., eager=True)`` works on Series and names the result after the first input.

    Also checks scalar/literal fallbacks on either side of a Series, and that
    passing no Series at all with ``eager=True`` raises ``ValueError``.
    """
    # eager/series inputs
    s1 = pl.Series("colx", [None, 2, None])
    s2 = pl.Series("coly", [1, None, None])
    s3 = pl.Series("colz", [None, None, 3])

    # actual value first, expected second, matching the rest of this module
    res = pl.coalesce(s1, s2, s3, eager=True)
    expected = pl.Series("colx", [1, 2, 3])
    assert_series_equal(res, expected)

    for zero in (0, pl.lit(0)):
        res = pl.coalesce(s1, zero, eager=True)
        expected = pl.Series("colx", [0, 2, 0])
        assert_series_equal(res, expected)

        # with the literal first, the result is expected to be named "literal"
        res = pl.coalesce(zero, s1, eager=True)
        expected = pl.Series("literal", [0, 0, 0])
        assert_series_equal(res, expected)

    with pytest.raises(
        ValueError,
        match="expected at least one Series in 'coalesce' if 'eager=True'",
    ):
        pl.coalesce("x", "y", eager=True)
516
517
518
def test_overflow_diff() -> None:
    """``diff`` on a UInt64 column is expected to yield a negative step, not wrap."""
    df = pl.DataFrame({"a": [20, 10, 30]})

    diffed = df.select(pl.col("a").cast(pl.UInt64).diff())

    assert diffed.to_dict(as_series=False) == {"a": [None, -10, 20]}
523
524
525
@pytest.mark.may_fail_cloud  # reason: unknown type
def test_fill_null_unknown_output_type() -> None:
    """``fill_null`` applied to the result of a numpy ufunc over an expression."""
    df = pl.DataFrame({"a": [None, 2, 3, 4, 5]})

    exp_filled = np.exp(pl.col("a")).fill_null(pl.lit(1, pl.Float64))
    result = df.with_columns(exp_filled).to_dict(as_series=False)

    expected_values = [
        1.0,
        7.38905609893065,
        20.085536923187668,
        54.598150033144236,
        148.4131591025766,
    ]
    assert result == {"a": expected_values}
539
540
541
def test_approx_n_unique() -> None:
    """``approx_n_unique`` gives the same UInt32 count as a function and as a method."""
    df1 = pl.DataFrame({"a": [None, 1, 2], "b": [None, 2, 1]})

    function_form = df1.select(pl.approx_n_unique("b"))
    assert_frame_equal(
        function_form,
        pl.DataFrame({"b": pl.Series(values=[3], dtype=pl.UInt32)}),
    )

    method_form = df1.select(pl.col("b").approx_n_unique())
    assert_frame_equal(
        method_form,
        pl.DataFrame({"b": pl.Series(values=[3], dtype=pl.UInt32)}),
    )
553
554
555
def test_lazy_functions() -> None:
    """Top-level aggregation functions, checked against a frame, Series methods and regex."""
    df = pl.DataFrame(
        {
            "a": ["foo", "bar", "foo"],
            "b": [1, 2, 3],
            "c": [-1.0, 2.0, 4.0],
        }
    )

    # test function expressions against frame
    aggregations = [
        pl.var("b").name.suffix("_var"),
        pl.std("b").name.suffix("_std"),
        pl.max("a", "b").name.suffix("_max"),
        pl.min("a", "b").name.suffix("_min"),
        pl.sum("b", "c").name.suffix("_sum"),
        pl.mean("b", "c").name.suffix("_mean"),
        pl.median("c", "b").name.suffix("_median"),
        pl.n_unique("b", "a").name.suffix("_n_unique"),
        pl.first("a").name.suffix("_first"),
        pl.first("b", "c").name.suffix("_first"),
        pl.last("c", "b", "a").name.suffix("_last"),
    ]
    out = df.select(*aggregations)

    # keys are "<column>_<function>"; each value is a single-element list
    expected: dict[str, list[Any]] = {
        "b_var": [1.0],
        "b_std": [1.0],
        "a_max": ["foo"],
        "b_max": [3],
        "a_min": ["bar"],
        "b_min": [1],
        "b_sum": [6],
        "c_sum": [5.0],
        "b_mean": [2.0],
        "c_mean": [5 / 3],
        "c_median": [2.0],
        "b_median": [2.0],
        "b_n_unique": [3],
        "a_n_unique": [2],
        "a_first": ["foo"],
        "b_first": [1],
        "c_first": [-1.0],
        "c_last": [4.0],
        "b_last": [3],
        "a_last": ["foo"],
    }
    expected_frame = pl.DataFrame(
        data=expected,
        schema_overrides={
            "a_n_unique": pl.UInt32,
            "b_n_unique": pl.UInt32,
        },
    )
    assert_frame_equal(out, expected_frame)

    # test function expressions against series
    for key, (expected_value,) in expected.items():
        column, _, method_name = key.partition("_")
        # functions with no Series method of the same name are skipped
        series_method = getattr(df[column], method_name, None)
        if series_method:
            assert series_method() == expected_value

    # regex selection
    out = df.select(
        pl.struct(pl.max("^a|b$")).alias("x"),
        pl.struct(pl.min("^.*[bc]$")).alias("y"),
        pl.struct(pl.sum("^[^a]$")).alias("z"),
    )
    expected_row = (
        {"a": "foo", "b": 3},
        {"b": 1, "c": -1.0},
        {"b": 6, "c": 5.0},
    )
    assert out.rows() == [expected_row]
626
627
628
def test_count() -> None:
    """``pl.count`` counts non-null values, for one column or for several."""
    df = pl.DataFrame({"a": [1, 1, 1], "b": [None, "xx", "yy"]})

    single = df.select(pl.count("a"))
    assert list(single["a"]) == [3]

    # the same two counts, given as one multi-column call or a list of calls
    multi_column_call = pl.count("b", "a")
    list_of_calls = [pl.count("b"), pl.count("a")]
    for count_expr in (multi_column_call, list_of_calls):
        assert df.select(count_expr).rows() == [(2, 3)]
639
640
641
def test_head_tail(fruits_cars: pl.DataFrame) -> None:
    """``pl.head`` / ``pl.tail`` select the first / last two values of column "A"."""
    head_series = fruits_cars.select(pl.head("A", 2)).to_series()
    assert_series_equal(head_series, pl.Series("A", [1, 2]))

    tail_series = fruits_cars.select(pl.tail("A", 2)).to_series()
    assert_series_equal(tail_series, pl.Series("A", [4, 5]))
649
650
651
def test_escape_regex() -> None:
    """``pl.escape_regex`` escapes a plain ``str`` and rejects ``Expr`` and non-str input."""
    assert pl.escape_regex("abc(\\w+)") == "abc\\(\\\\w\\+\\)"

    df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    unsupported_expr_msg = "escape_regex function is unsupported for `Expr`, you may want use `Expr.str.escape_regex` instead"
    with pytest.raises(TypeError, match=unsupported_expr_msg):
        df.with_columns(escaped=pl.escape_regex(pl.col("text")))  # type: ignore[arg-type]

    wrong_type_msg = "escape_regex function supports only `str` type, got `int`"
    with pytest.raises(TypeError, match=wrong_type_msg):
        pl.escape_regex(3)  # type: ignore[arg-type]
668
669
670
@pytest.mark.parametrize("func", ["var", "std"])
def test_var_std_lit_23156(func: str) -> None:
    """``DataFrame.var`` / ``DataFrame.std`` on a literal column, for frame heights 0..99.

    With at most one row the "literal" result is expected to be null; with
    more rows it is expected to be 0.0. Both are Float64.
    """
    for n in range(100):
        # `frame` rather than `input`, so the builtin is not shadowed
        frame = pl.DataFrame({"x": list(range(n))}).select(pl.col("x"), pl.lit(0))
        out = getattr(frame, func)()

        expected_value = None if n <= 1 else 0.0
        assert_series_equal(
            out["literal"],
            pl.Series("literal", [expected_value], dtype=pl.Float64),
        )
683
684
685
def test_row_index_expr() -> None:
    """``pl.row_index`` in ``with_columns``, in a ``group_by`` aggregation and in ``select``."""
    lf = pl.LazyFrame({"x": ["A", "A", "B", "B", "B"]})

    # added as columns: default name "index" plus a custom name
    with_index = lf.with_columns(
        pl.row_index(), pl.row_index("another_index")
    ).collect()
    assert_frame_equal(
        with_index,
        pl.DataFrame(
            {
                "x": ["A", "A", "B", "B", "B"],
                "index": [0, 1, 2, 3, 4],
                "another_index": [0, 1, 2, 3, 4],
            },
            schema={
                "x": pl.String,
                "index": pl.get_index_type(),
                "another_index": pl.get_index_type(),
            },
        ),
    )

    # inside an aggregation the expected index restarts at 0 for each group
    grouped = (
        lf.group_by("x")
        .agg(pl.row_index(), pl.row_index("another_index"))
        .sort("x")
        .collect()
    )
    assert_frame_equal(
        grouped,
        pl.DataFrame(
            {
                "x": ["A", "B"],
                "index": [[0, 1], [0, 1, 2]],
                "another_index": [[0, 1], [0, 1, 2]],
            },
            schema={
                "x": pl.String,
                "index": pl.List(pl.get_index_type()),
                "another_index": pl.List(pl.get_index_type()),
            },
        ),
    )

    # selected on its own
    index_only = lf.select(pl.row_index()).collect()
    assert_frame_equal(
        index_only,
        pl.DataFrame(
            {"index": [0, 1, 2, 3, 4]},
            schema={"index": pl.get_index_type()},
        ),
    )
732
733