Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/functions/test_functions.py
8408 views
1
from __future__ import annotations

from typing import TYPE_CHECKING, Any

import numpy as np
import pytest

import polars as pl
from polars.exceptions import DuplicateError, InvalidOperationError
from polars.testing import assert_frame_equal, assert_series_equal
from tests.unit.conftest import NUMERIC_DTYPES, TEMPORAL_DTYPES

if TYPE_CHECKING:
    # Import-cycle-safe typing-only imports (lazy thanks to the
    # `from __future__ import annotations` above).
    from polars._typing import ConcatMethod, CorrelationMethod, PolarsDataType
def test_concat_align() -> None:
    """Concatenating with the 'align*' strategies joins frames on their common column.

    Frames a/b/c share only column "a"; each strategy keeps a different subset
    of the aligned rows (full outer, left, right, inner respectively).
    """
    a = pl.DataFrame({"a": ["a", "b", "d", "e", "e"], "b": [1, 2, 4, 5, 6]})
    b = pl.DataFrame({"a": ["a", "b", "c"], "c": [5.5, 6.0, 7.5]})
    c = pl.DataFrame({"a": ["a", "b", "c", "d", "e"], "d": ["w", "x", "y", "z", None]})

    # "align" is an alias for "align_full" (full outer alignment)
    for align_full in ("align", "align_full"):
        result = pl.concat([a, b, c], how=align_full)
        expected = pl.DataFrame(
            {
                "a": ["a", "b", "c", "d", "e", "e"],
                "b": [1, 2, None, 4, 5, 6],
                "c": [5.5, 6.0, 7.5, None, None, None],
                "d": ["w", "x", "y", "z", None, None],
            }
        )
        assert_frame_equal(result, expected)

    result = pl.concat([a, b, c], how="align_left")
    expected = pl.DataFrame(
        {
            "a": ["a", "b", "d", "e", "e"],
            "b": [1, 2, 4, 5, 6],
            "c": [5.5, 6.0, None, None, None],
            "d": ["w", "x", "z", None, None],
        }
    )
    assert_frame_equal(result, expected)

    result = pl.concat([a, b, c], how="align_right")
    expected = pl.DataFrame(
        {
            "a": ["a", "b", "c", "d", "e"],
            "b": [1, 2, None, None, None],
            "c": [5.5, 6.0, 7.5, None, None],
            "d": ["w", "x", "y", "z", None],
        }
    )
    assert_frame_equal(result, expected)

    result = pl.concat([a, b, c], how="align_inner")
    expected = pl.DataFrame(
        {
            "a": ["a", "b"],
            "b": [1, 2],
            "c": [5.5, 6.0],
            "d": ["w", "x"],
        }
    )
    assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "strategy", ["align", "align_full", "align_left", "align_right"]
)
def test_concat_align_no_common_cols(strategy: ConcatMethod) -> None:
    """Aligned concat requires at least one shared column; otherwise it raises."""
    df1 = pl.DataFrame({"a": [1, 2], "b": [1, 2]})
    df2 = pl.DataFrame({"c": [3, 4], "d": [3, 4]})

    with pytest.raises(
        InvalidOperationError,
        match=f"{strategy!r} strategy requires at least one common column",
    ):
        pl.concat((df1, df2), how=strategy)
@pytest.mark.parametrize(
    ("a", "b", "c", "strategy"),
    [
        (
            pl.DataFrame({"a": [1, 2]}),
            pl.DataFrame({"b": ["a", "b"], "c": [3, 4]}),
            pl.DataFrame({"a": [5, 6], "c": [5, 6], "d": [5, 6], "b": ["x", "y"]}),
            "diagonal",
        ),
        (
            pl.DataFrame(
                {"a": [1, 2]},
                schema_overrides={"a": pl.Int32},
            ),
            pl.DataFrame(
                {"b": ["a", "b"], "c": [3, 4]},
                schema_overrides={"c": pl.UInt8},
            ),
            pl.DataFrame(
                {"a": [5, 6], "c": [5, 6], "d": [5, 6], "b": ["x", "y"]},
                schema_overrides={"b": pl.Categorical},
            ),
            "diagonal_relaxed",
        ),
    ],
)
def test_concat_diagonal(
    a: pl.DataFrame, b: pl.DataFrame, c: pl.DataFrame, strategy: ConcatMethod
) -> None:
    """Diagonal concat unions the schemas, null-filling missing columns.

    Checked through both the eager and the lazy engines; "diagonal_relaxed"
    additionally supercasts mismatched dtypes.
    """
    for out in [
        pl.concat([a, b, c], how=strategy),
        pl.concat([a.lazy(), b.lazy(), c.lazy()], how=strategy).collect(),
    ]:
        expected = pl.DataFrame(
            {
                "a": [1, 2, None, None, 5, 6],
                "b": [None, None, "a", "b", "x", "y"],
                "c": [None, None, 3, 4, 5, 6],
                "d": [None, None, None, None, 5, 6],
            }
        )
        assert_frame_equal(out, expected)
def test_concat_diagonal_relaxed_with_empty_frame() -> None:
    """An empty frame contributes nothing to a relaxed diagonal concat."""
    df1 = pl.DataFrame()
    df2 = pl.DataFrame(
        {
            "a": ["a", "b"],
            "b": [1, 2],
        }
    )
    out = pl.concat((df1, df2), how="diagonal_relaxed")
    expected = df2
    assert_frame_equal(out, expected)
@pytest.mark.parametrize("lazy", [False, True])
def test_concat_horizontal(lazy: bool) -> None:
    """Horizontal concat pads shorter frames with nulls (eager and lazy)."""
    a = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})
    b = pl.DataFrame({"c": [5, 7, 8, 9], "d": [1, 2, 1, 2], "e": [1, 2, 1, 2]})

    if lazy:
        out = pl.concat([a.lazy(), b.lazy()], how="horizontal").collect()
    else:
        out = pl.concat([a, b], how="horizontal")

    expected = pl.DataFrame(
        {
            "a": ["a", "b", None, None],
            "b": [1, 2, None, None],
            "c": [5, 7, 8, 9],
            "d": [1, 2, 1, 2],
            "e": [1, 2, 1, 2],
        }
    )
    assert_frame_equal(out, expected)
@pytest.mark.parametrize("lazy", [False, True])
def test_concat_horizontal_three_dfs(lazy: bool) -> None:
    """Horizontal concat of three frames of differing heights null-pads each."""
    a = pl.DataFrame({"a1": [1, 2, 3], "a2": ["a", "b", "c"]})
    b = pl.DataFrame({"b1": [0.25, 0.5]})
    c = pl.DataFrame({"c1": [1, 2, 3, 4], "c2": [5, 6, 7, 8], "c3": [9, 10, 11, 12]})

    if lazy:
        out = pl.concat([a.lazy(), b.lazy(), c.lazy()], how="horizontal").collect()
    else:
        out = pl.concat([a, b, c], how="horizontal")

    expected = pl.DataFrame(
        {
            "a1": [1, 2, 3, None],
            "a2": ["a", "b", "c", None],
            "b1": [0.25, 0.5, None, None],
            "c1": [1, 2, 3, 4],
            "c2": [5, 6, 7, 8],
            "c3": [9, 10, 11, 12],
        }
    )
    assert_frame_equal(out, expected)
@pytest.mark.parametrize("lazy", [False, True])
def test_concat_horizontal_single_df(lazy: bool) -> None:
    """Horizontal concat of a single frame is a no-op."""
    a = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})

    if lazy:
        out = pl.concat([a.lazy()], how="horizontal").collect()
    else:
        out = pl.concat([a], how="horizontal")

    expected = a
    assert_frame_equal(out, expected)
def test_concat_horizontal_duplicate_col() -> None:
    """Horizontal concat raises DuplicateError when frames share a column name."""
    a = pl.LazyFrame({"a": ["a", "b"], "b": [1, 2]})
    b = pl.LazyFrame({"c": [5, 7, 8, 9], "d": [1, 2, 1, 2], "a": [1, 2, 1, 2]})

    with pytest.raises(DuplicateError):
        pl.concat([a, b], how="horizontal").collect()
def test_concat_vertical() -> None:
    """Vertical concat stacks rows of schema-identical frames."""
    a = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})
    b = pl.DataFrame({"a": ["c", "d", "e"], "b": [3, 4, 5]})

    result = pl.concat([a, b], how="vertical")
    expected = pl.DataFrame(
        {
            "a": ["a", "b", "c", "d", "e"],
            "b": [1, 2, 3, 4, 5],
        }
    )
    assert_frame_equal(result, expected)
def test_cov() -> None:
    """pl.cov agrees between expression (lazy) and eager/Series usage, incl. ddof."""
    s1 = pl.Series("a", [10, 37, -40])
    s2 = pl.Series("b", [70, -10, 35])

    # lazy/expression
    lf = pl.LazyFrame([s1, s2])
    res1 = lf.select(
        x=pl.cov("a", "b"),
        y=pl.cov("a", "b", ddof=2),
    ).collect()

    # eager/series
    res2 = (
        pl.cov(s1, s2, eager=True).alias("x"),
        pl.cov(s1, s2, eager=True, ddof=2).alias("y"),
    )

    # expect same result from both approaches
    for idx, (r1, r2) in enumerate(zip(res1, res2, strict=True)):
        expected_value = -645.8333333333 if idx == 0 else -1291.6666666666
        assert pytest.approx(expected_value) == r1.item()
        assert_series_equal(r1, r2)
def test_corr() -> None:
    """pl.corr agrees between expression (lazy) and eager/Series usage.

    Covers both the default Pearson and the Spearman rank method.
    """
    s1 = pl.Series("a", [10, 37, -40])
    s2 = pl.Series("b", [70, -10, 35])

    # lazy/expression
    lf = pl.LazyFrame([s1, s2])
    res1 = lf.select(
        x=pl.corr("a", "b"),
        y=pl.corr("a", "b", method="spearman"),
    ).collect()

    # eager/series
    res2 = (
        pl.corr(s1, s2, eager=True).alias("x"),
        pl.corr(s1, s2, method="spearman", eager=True).alias("y"),
    )

    # expect same result from both approaches
    for idx, (r1, r2) in enumerate(zip(res1, res2, strict=True)):
        assert pytest.approx(-0.412199756 if idx == 0 else -0.5) == r1.item()
        assert_series_equal(r1, r2)
def test_extend_ints() -> None:
    """DataFrame.extend raises SchemaError when integer dtypes do not match."""
    a = pl.DataFrame({"a": [1 for _ in range(1)]}, schema={"a": pl.Int64})
    with pytest.raises(pl.exceptions.SchemaError):
        # extending an Int64 column with an Int32 column must fail
        a.extend(a.select(pl.lit(0, dtype=pl.Int32).alias("a")))
def test_null_handling_correlation() -> None:
    """Nulls are dropped by corr, while NaNs can be propagated explicitly."""
    df = pl.DataFrame({"a": [1, 2, 3, None, 4], "b": [1, 2, 3, 10, 4]})

    out = df.select(
        pl.corr("a", "b").alias("pearson"),
        pl.corr("a", "b", method="spearman").alias("spearman"),
    )
    assert out["pearson"][0] == pytest.approx(1.0)
    assert out["spearman"][0] == pytest.approx(1.0)

    # see #4930
    df1 = pl.DataFrame({"a": [None, 1, 2], "b": [None, 2, 1]})
    df2 = pl.DataFrame({"a": [np.nan, 1, 2], "b": [np.nan, 2, 1]})

    # nulls are ignored...
    assert np.isclose(df1.select(pl.corr("a", "b", method="spearman")).item(), -1.0)
    # ...but NaN propagates when requested
    assert (
        str(
            df2.select(pl.corr("a", "b", method="spearman", propagate_nans=True)).item()
        )
        == "nan"
    )
# see #25407
def test_spearman_propagate_nans_with_all_nulls_does_not_panic() -> None:
    """Spearman corr with propagate_nans on all-null columns returns NaN, not a panic."""
    df = pl.select(x=None, y=None).cast(pl.Float64)

    out = df.select(pl.corr("x", "y", method="spearman", propagate_nans=True))

    assert str(out.item()) == "nan"
def test_align_frames() -> None:
    """align_frames matches pandas index-aligned arithmetic; works lazily too.

    Also checks the no-input edge case and that mixing DataFrame with
    LazyFrame inputs raises TypeError.
    """
    import numpy as np
    import pandas as pd

    # setup some test frames
    pdf1 = pd.DataFrame(
        {
            "date": pd.date_range(start="2019-01-02", periods=9),
            "a": np.array([0, 1, 2, np.nan, 4, 5, 6, 7, 8], dtype=np.float64),
            "b": np.arange(9, 18, dtype=np.float64),
        }
    ).set_index("date")

    pdf2 = pd.DataFrame(
        {
            "date": pd.date_range(start="2019-01-04", periods=7),
            "a": np.arange(9, 16, dtype=np.float64),
            "b": np.arange(10, 17, dtype=np.float64),
        }
    ).set_index("date")

    # calculate dot-product in pandas
    pd_dot = (pdf1 * pdf2).sum(axis="columns").to_frame("dot").reset_index()

    # use "align_frames" to calculate dot-product from disjoint rows. pandas uses an
    # index to automatically infer the correct frame-alignment for the calculation;
    # we need to do it explicitly (which also makes it clearer what is happening)
    pf1, pf2 = pl.align_frames(
        pl.from_pandas(pdf1.reset_index()),
        pl.from_pandas(pdf2.reset_index()),
        on="date",
    )
    pl_dot = (
        (pf1[["a", "b"]] * pf2[["a", "b"]])
        .fill_null(0)
        .select(pl.sum_horizontal("*").alias("dot"))
        .insert_column(0, pf1["date"])
    )
    # confirm we match the same operation in pandas
    assert_frame_equal(pl_dot, pl.from_pandas(pd_dot))
    pd.testing.assert_frame_equal(pd_dot, pl_dot.to_pandas())

    # confirm alignment function works with lazy frames
    lf1, lf2 = pl.align_frames(
        pl.from_pandas(pdf1.reset_index()).lazy(),
        pl.from_pandas(pdf2.reset_index()).lazy(),
        on="date",
    )
    assert isinstance(lf1, pl.LazyFrame)
    assert_frame_equal(lf1.collect(), pf1)
    assert_frame_equal(lf2.collect(), pf2)

    # misc: no frames results in an empty list
    assert pl.align_frames(on="date") == []

    # expected error condition
    with pytest.raises(TypeError):
        pl.align_frames(  # type: ignore[type-var]
            pl.from_pandas(pdf1.reset_index()).lazy(),
            pl.from_pandas(pdf2.reset_index()),
            on="date",
        )
def test_align_frames_misc() -> None:
    """align_frames accepts list/generator inputs and a descending sort order."""
    df1 = pl.DataFrame([[3, 5, 6], [5, 8, 9]], orient="row")
    df2 = pl.DataFrame([[2, 5, 6], [3, 8, 9], [4, 2, 0]], orient="row")

    # descending result
    pf1, pf2 = pl.align_frames(
        [df1, df2],  # list input
        on="column_0",
        descending=True,
    )
    assert pf1.rows() == [(5, 8, 9), (4, None, None), (3, 5, 6), (2, None, None)]
    assert pf2.rows() == [(5, None, None), (4, 2, 0), (3, 8, 9), (2, 5, 6)]

    # handle identical frames
    pf1, pf2, pf3 = pl.align_frames(
        (df for df in (df1, df2, df2)),  # generator input
        on="column_0",
        descending=True,
    )
    assert pf1.rows() == [(5, 8, 9), (4, None, None), (3, 5, 6), (2, None, None)]
    for pf in (pf2, pf3):
        assert pf.rows() == [(5, None, None), (4, 2, 0), (3, 8, 9), (2, 5, 6)]
def test_align_frames_with_nulls() -> None:
    """Null key values participate in alignment and sort first."""
    df1 = pl.DataFrame({"key": ["x", "y", None], "value": [1, 2, 0]})
    df2 = pl.DataFrame({"key": ["x", None, "z", "y"], "value": [4, 3, 6, 5]})

    a1, a2 = pl.align_frames(df1, df2, on="key")

    aligned_frame_data = a1.to_dict(as_series=False), a2.to_dict(as_series=False)
    assert aligned_frame_data == (
        {"key": [None, "x", "y", "z"], "value": [0, 1, 2, None]},
        {"key": [None, "x", "y", "z"], "value": [3, 4, 5, 6]},
    )
def test_align_frames_duplicate_key() -> None:
    """Duplicate alignment-key values expand rows (join semantics), preserving column order."""
    # setup some test frames with duplicate key/alignment values
    df1 = pl.DataFrame({"x": ["a", "a", "a", "e"], "y": [1, 2, 4, 5]})
    df2 = pl.DataFrame({"y": [0, 0, -1], "z": [5.5, 6.0, 7.5], "x": ["a", "b", "b"]})

    # align rows, confirming correctness and original column order
    af1, af2 = pl.align_frames(df1, df2, on="x")

    # shape: (6, 2)   shape: (6, 3)
    # ┌─────┬──────┐  ┌──────┬──────┬─────┐
    # │ x   ┆ y    │  │ y    ┆ z    ┆ x   │
    # │ --- ┆ ---  │  │ ---  ┆ ---  ┆ --- │
    # │ str ┆ i64  │  │ i64  ┆ f64  ┆ str │
    # ╞═════╪══════╡  ╞══════╪══════╪═════╡
    # │ a   ┆ 1    │  │ 0    ┆ 5.5  ┆ a   │
    # │ a   ┆ 2    │  │ 0    ┆ 5.5  ┆ a   │
    # │ a   ┆ 4    │  │ 0    ┆ 5.5  ┆ a   │
    # │ b   ┆ null │  │ 0    ┆ 6.0  ┆ b   │
    # │ b   ┆ null │  │ -1   ┆ 7.5  ┆ b   │
    # │ e   ┆ 5    │  │ null ┆ null ┆ e   │
    # └─────┴──────┘  └──────┴──────┴─────┘
    assert af1.rows() == [
        ("a", 1),
        ("a", 2),
        ("a", 4),
        ("b", None),
        ("b", None),
        ("e", 5),
    ]
    assert af2.rows() == [
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 6.0, "b"),
        (-1, 7.5, "b"),
        (None, None, "e"),
    ]

    # align frames the other way round, using "left" alignment strategy
    af1, af2 = pl.align_frames(df2, df1, on="x", how="left")

    # shape: (5, 3)        shape: (5, 2)
    # ┌─────┬─────┬─────┐  ┌─────┬──────┐
    # │ y   ┆ z   ┆ x   │  │ x   ┆ y    │
    # │ --- ┆ --- ┆ --- │  │ --- ┆ ---  │
    # │ i64 ┆ f64 ┆ str │  │ str ┆ i64  │
    # ╞═════╪═════╪═════╡  ╞═════╪══════╡
    # │ 0   ┆ 5.5 ┆ a   │  │ a   ┆ 1    │
    # │ 0   ┆ 5.5 ┆ a   │  │ a   ┆ 2    │
    # │ 0   ┆ 5.5 ┆ a   │  │ a   ┆ 4    │
    # │ 0   ┆ 6.0 ┆ b   │  │ b   ┆ null │
    # │ -1  ┆ 7.5 ┆ b   │  │ b   ┆ null │
    # └─────┴─────┴─────┘  └─────┴──────┘
    assert af1.rows() == [
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 6.0, "b"),
        (-1, 7.5, "b"),
    ]
    assert af2.rows() == [
        ("a", 1),
        ("a", 2),
        ("a", 4),
        ("b", None),
        ("b", None),
    ]
def test_align_frames_single_row_20445() -> None:
    """Regression test (#20445): left-aligning single-row frames is a no-op."""
    left = pl.DataFrame({"a": [1], "b": [2]})
    right = pl.DataFrame({"a": [1], "c": [3]})
    result = pl.align_frames(left, right, how="left", on="a")
    assert_frame_equal(result[0], left)
    assert_frame_equal(result[1], right)
def test_coalesce() -> None:
    """pl.coalesce picks the first non-null value per row, with literal fallback."""
    df = pl.DataFrame(
        {
            "a": [1, None, None, None],
            "b": [1, 2, None, None],
            "c": [5, None, 3, None],
        }
    )
    # list inputs
    expected = pl.Series("d", [1, 2, 3, 10]).to_frame()
    result = df.select(pl.coalesce(["a", "b", "c", 10]).alias("d"))
    assert_frame_equal(expected, result)

    # positional inputs
    expected = pl.Series("d", [1.0, 2.0, 3.0, 10.0]).to_frame()
    result = df.select(pl.coalesce(pl.col(["a", "b", "c"]), 10.0).alias("d"))
    assert_frame_equal(result, expected)
def test_coalesce_eager() -> None:
    """Eager coalesce over Series inputs; requires at least one Series input."""
    # eager/series inputs
    s1 = pl.Series("colx", [None, 2, None])
    s2 = pl.Series("coly", [1, None, None])
    s3 = pl.Series("colz", [None, None, 3])

    res = pl.coalesce(s1, s2, s3, eager=True)
    expected = pl.Series("colx", [1, 2, 3])
    assert_series_equal(expected, res)

    for zero in (0, pl.lit(0)):
        res = pl.coalesce(s1, zero, eager=True)
        expected = pl.Series("colx", [0, 2, 0])
        assert_series_equal(expected, res)

        res = pl.coalesce(zero, s1, eager=True)
        expected = pl.Series("literal", [0, 0, 0])
        assert_series_equal(expected, res)

    with pytest.raises(
        ValueError,
        match="expected at least one Series in 'coalesce' if 'eager=True'",
    ):
        pl.coalesce("x", "y", eager=True)
def test_overflow_diff() -> None:
    """diff on an unsigned column yields signed results rather than wrapping."""
    df = pl.DataFrame({"a": [20, 10, 30]})
    assert df.select(pl.col("a").cast(pl.UInt64).diff()).to_dict(as_series=False) == {
        "a": [None, -10, 20]
    }
@pytest.mark.may_fail_cloud  # reason: unknown type
def test_fill_null_unknown_output_type() -> None:
    """fill_null works even when the input dtype is unknown (numpy ufunc output)."""
    df = pl.DataFrame({"a": [None, 2, 3, 4, 5]})
    assert df.with_columns(
        np.exp(pl.col("a")).fill_null(pl.lit(1, pl.Float64))
    ).to_dict(as_series=False) == {
        "a": [
            1.0,
            7.38905609893065,
            20.085536923187668,
            54.598150033144236,
            148.4131591025766,
        ]
    }
@pytest.mark.parametrize(("dtype"), [*NUMERIC_DTYPES, *TEMPORAL_DTYPES])
def test_approx_n_unique(dtype: pl.DataType) -> None:
    """approx_n_unique on 100 distinct values stays within ~8% of the true count."""
    df = pl.DataFrame({"a": pl.arange(100, eager=True).cast(dtype)})
    cardinality = df.select(pl.col("a").approx_n_unique()).to_series()[0]
    # HyperLogLog-style estimate: allow a tolerance band around 100
    assert 92 <= cardinality <= 108
def test_approx_n_unique_null() -> None:
    """An all-null column counts as exactly one approximate unique value."""
    df = pl.DataFrame({"a": 100 * [None]})
    cardinality = df.select(pl.col("a").approx_n_unique()).to_series()[0]
    assert cardinality == 1
def test_lazy_functions() -> None:
    """Top-level aggregation helpers (var/std/max/min/sum/...) over frames and series.

    Also checks regex column selection (`^...$`) inside struct expressions.
    """
    df = pl.DataFrame(
        {
            "a": ["foo", "bar", "foo"],
            "b": [1, 2, 3],
            "c": [-1.0, 2.0, 4.0],
        }
    )

    # test function expressions against frame
    out = df.select(
        pl.var("b").name.suffix("_var"),
        pl.std("b").name.suffix("_std"),
        pl.max("a", "b").name.suffix("_max"),
        pl.min("a", "b").name.suffix("_min"),
        pl.sum("b", "c").name.suffix("_sum"),
        pl.mean("b", "c").name.suffix("_mean"),
        pl.median("c", "b").name.suffix("_median"),
        pl.n_unique("b", "a").name.suffix("_n_unique"),
        pl.first("a").name.suffix("_first"),
        pl.first("b", "c").name.suffix("_first"),
        pl.last("c", "b", "a").name.suffix("_last"),
    )
    expected: dict[str, list[Any]] = {
        "b_var": [1.0],
        "b_std": [1.0],
        "a_max": ["foo"],
        "b_max": [3],
        "a_min": ["bar"],
        "b_min": [1],
        "b_sum": [6],
        "c_sum": [5.0],
        "b_mean": [2.0],
        "c_mean": [5 / 3],
        "c_median": [2.0],
        "b_median": [2.0],
        "b_n_unique": [3],
        "a_n_unique": [2],
        "a_first": ["foo"],
        "b_first": [1],
        "c_first": [-1.0],
        "c_last": [4.0],
        "b_last": [3],
        "a_last": ["foo"],
    }
    assert_frame_equal(
        out,
        pl.DataFrame(
            data=expected,
            schema_overrides={
                # n_unique returns the platform index type, not i64
                "a_n_unique": pl.get_index_type(),
                "b_n_unique": pl.get_index_type(),
            },
        ),
    )

    # test function expressions against series
    for name, value in expected.items():
        col, fn = name.split("_", 1)
        if series_fn := getattr(df[col], fn, None):
            assert series_fn() == value[0]

    # regex selection
    out = df.select(
        pl.struct(pl.max("^a|b$")).alias("x"),
        pl.struct(pl.min("^.*[bc]$")).alias("y"),
        pl.struct(pl.sum("^[^a]$")).alias("z"),
    )
    assert out.rows() == [
        ({"a": "foo", "b": 3}, {"b": 1, "c": -1.0}, {"b": 6, "c": 5.0})
    ]
def test_count() -> None:
    """pl.count counts non-null values; multi-column and list forms agree."""
    df = pl.DataFrame({"a": [1, 1, 1], "b": [None, "xx", "yy"]})
    out = df.select(pl.count("a"))
    assert list(out["a"]) == [3]

    for count_expr in (
        pl.count("b", "a"),
        [pl.count("b"), pl.count("a")],
    ):
        out = df.select(count_expr)
        # "b" has one null, so its count is 2
        assert out.rows() == [(2, 3)]
def test_head_tail(fruits_cars: pl.DataFrame) -> None:
    """pl.head / pl.tail expression forms return the first/last n values."""
    res_expr = fruits_cars.select(pl.head("A", 2))
    expected = pl.Series("A", [1, 2])
    assert_series_equal(res_expr.to_series(), expected)

    res_expr = fruits_cars.select(pl.tail("A", 2))
    expected = pl.Series("A", [4, 5])
    assert_series_equal(res_expr.to_series(), expected)
@pytest.mark.parametrize(
    "dtype", [pl.Int32, pl.Boolean, pl.String, pl.Categorical, pl.List]
)
def test_first_last(dtype: PolarsDataType) -> None:
    """first/last across dtypes, incl. ignore_nulls, empty input, and no-null input.

    The series is deliberately built from multiple chunks (append) so the
    chunked code path is exercised.
    """
    # Ensure multiple chunks.
    s1 = pl.Series("a", [None, None], dtype=pl.Int32)
    s2 = pl.Series("a", [None, 3, 4, None], dtype=pl.Int32)
    s3 = pl.Series("a", [None, None], dtype=pl.Int32)
    s = s1.append(s2).append(s3)
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        s = s.cast(pl.String)
    s = s.cast(dtype)
    lf = s.to_frame().lazy()

    result = lf.select(pl.col("a").first()).collect()
    expected_value = pl.Series("a", [None])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
    expected = expected_value.cast(dtype).to_frame()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").first(ignore_nulls=True)).collect()
    expected_value = pl.Series("a", [3])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)

    expected = expected_value.cast(dtype).to_frame()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last()).collect()
    expected_value = pl.Series("a", [None])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
    expected = expected_value.cast(dtype).to_frame()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last(ignore_nulls=True)).collect()
    expected_value = pl.Series("a", [4])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
    expected = expected_value.cast(dtype).to_frame()
    assert_frame_equal(result, expected)

    # Test with empty
    lf = pl.Series("a", [], dtype=dtype).to_frame().lazy()
    expected = pl.Series("a", [None], dtype=dtype).to_frame()

    result = lf.select(pl.col("a").first()).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").first(ignore_nulls=True)).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last()).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last(ignore_nulls=True)).collect()
    assert_frame_equal(result, expected)

    # Test with no nulls
    lf = pl.Series("a", [1, 2, 3, 4, 5], dtype=pl.Int32).to_frame().lazy()
    expected_value = pl.Series("a", [1])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
        lf = lf.with_columns(pl.col("a").cast(pl.String))

    lf = lf.with_columns(pl.col("a").cast(dtype))
    expected = expected_value.cast(dtype).to_frame()

    result = lf.select(pl.col("a").first()).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").first(ignore_nulls=True)).collect()
    assert_frame_equal(result, expected)

    expected_value = pl.Series("a", [5])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
    expected = expected_value.cast(dtype).to_frame()

    result = lf.select(pl.col("a").last()).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last(ignore_nulls=True)).collect()
    assert_frame_equal(result, expected)
def test_escape_regex() -> None:
    """pl.escape_regex escapes str input and rejects Expr/non-str arguments."""
    result = pl.escape_regex("abc(\\w+)")
    expected = "abc\\(\\\\w\\+\\)"
    assert result == expected

    df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    with pytest.raises(
        TypeError,
        match=r"escape_regex function is unsupported for `Expr`, you may want use `Expr\.str\.escape_regex` instead",
    ):
        df.with_columns(escaped=pl.escape_regex(pl.col("text")))  # type: ignore[arg-type]

    with pytest.raises(
        TypeError,
        match="escape_regex function supports only `str` type, got `int`",
    ):
        pl.escape_regex(3)  # type: ignore[arg-type]
@pytest.mark.parametrize("func", ["var", "std"])
def test_var_std_lit_23156(func: str) -> None:
    """Regression test (#23156): var/std of a broadcast literal column.

    With fewer than two rows the result is null; otherwise the spread of a
    constant column is exactly 0.0.
    """
    for n in range(100):
        input = pl.DataFrame({"x": list(range(n))}).select(pl.col("x"), pl.lit(0))
        out = getattr(input, func)()
        if n <= 1:
            assert_series_equal(
                out["literal"], pl.Series("literal", [None], dtype=pl.Float64)
            )
        else:
            assert_series_equal(
                out["literal"], pl.Series("literal", [0.0], dtype=pl.Float64)
            )
def test_row_index_expr() -> None:
    """pl.row_index works in with_columns, group_by aggregations, and select."""
    lf = pl.LazyFrame({"x": ["A", "A", "B", "B", "B"]})

    assert_frame_equal(
        lf.with_columns(pl.row_index(), pl.row_index("another_index")).collect(),
        pl.DataFrame(
            {
                "x": ["A", "A", "B", "B", "B"],
                "index": [0, 1, 2, 3, 4],
                "another_index": [0, 1, 2, 3, 4],
            },
            schema={
                "x": pl.String,
                "index": pl.get_index_type(),
                "another_index": pl.get_index_type(),
            },
        ),
    )

    # in a group context, each group gets its own index sequence
    assert_frame_equal(
        (
            lf.group_by("x")
            .agg(pl.row_index(), pl.row_index("another_index"))
            .sort("x")
            .collect()
        ),
        pl.DataFrame(
            {
                "x": ["A", "B"],
                "index": [[0, 1], [0, 1, 2]],
                "another_index": [[0, 1], [0, 1, 2]],
            },
            schema={
                "x": pl.String,
                "index": pl.List(pl.get_index_type()),
                "another_index": pl.List(pl.get_index_type()),
            },
        ),
    )

    assert_frame_equal(
        lf.select(pl.row_index()).collect(),
        pl.DataFrame(
            {"index": [0, 1, 2, 3, 4]},
            schema={"index": pl.get_index_type()},
        ),
    )
@pytest.mark.parametrize("dt", [pl.Float16, pl.Float32, pl.Float64])
@pytest.mark.parametrize("method", ["pearson", "spearman"])
def test_corr_spearman_float_dtype_26335(
    dt: pl.DataType, method: CorrelationMethod
) -> None:
    """Regression test (#26335): corr preserves the input float dtype.

    Checked in both a plain select and a group_by aggregation context.
    """
    df = pl.DataFrame(
        {
            "a": [1, 8, 3],
            "b": [4, 5, 2],
            "c": ["foo", "foo", "foo"],
        },
        schema_overrides={"a": dt, "b": dt},
    )

    q = df.lazy().select(pl.corr("a", "b", method=method))
    out = q.collect()
    assert out.schema["a"] == dt

    q = df.lazy().group_by("c").agg(pl.corr("a", "b", method=method))
    out = q.collect()
    assert out.schema["a"] == dt