CoCalc -- test

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/dataframe/test_df.py
⁶⁹³⁹ views
1
from __future__ import annotations
2

3
import sys
4
import typing
5
from collections import OrderedDict
6
from collections.abc import Iterator, Mapping
7
from datetime import date, datetime, time, timedelta, timezone
8
from decimal import Decimal
9
from io import BytesIO
10
from operator import floordiv, truediv
11
from typing import TYPE_CHECKING, Any, Callable, cast
12
from zoneinfo import ZoneInfo
13

14
import numpy as np
15
import pyarrow as pa
16
import pytest
17

18
import polars as pl
19
import polars.selectors as cs
20
from polars._plr import PySeries
21
from polars._utils.construction import iterable_to_pydf
22
from polars.datatypes import DTYPE_TEMPORAL_UNITS
23
from polars.exceptions import (
24
    ColumnNotFoundError,
25
    ComputeError,
26
    DuplicateError,
27
    InvalidOperationError,
28
    OutOfBoundsError,
29
    ShapeError,
30
)
31
from polars.testing import (
32
    assert_frame_equal,
33
    assert_frame_not_equal,
34
    assert_series_equal,
35
)
36
from tests.unit.conftest import INTEGER_DTYPES
37

38
if TYPE_CHECKING:
39
    from collections.abc import Iterator, Sequence
40

41
    from polars import Expr
42
    from polars._typing import JoinStrategy, UniqueKeepStrategy
43

44

45
class MappingObject(Mapping[str, Any]):  # noqa: D101
46
    def __init__(self, **values: Any) -> None:
47
        self._data = {**values}
48

49
    def __getitem__(self, key: str) -> Any:
50
        return self._data[key]
51

52
    def __iter__(self) -> Iterator[str]:
53
        yield from self._data
54

55
    def __len__(self) -> int:
56
        return len(self._data)
57

58

59
def test_version() -> None:
60
    isinstance(pl.__version__, str)
61

62

63
def test_null_count() -> None:
64
    df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", None]})
65
    assert df.null_count().shape == (1, 2)
66
    assert df.null_count().row(0) == (0, 1)
67
    assert df.null_count().row(np.int64(0)) == (0, 1)  # type: ignore[call-overload]
68

69

70
@pytest.mark.parametrize("input", [None, (), [], {}, pa.Table.from_arrays([])])
71
def test_init_empty(input: Any) -> None:
72
    # test various flavours of empty init
73
    df = pl.DataFrame(input)
74
    assert df.shape == (0, 0)
75
    assert df.is_empty()
76

77

78
def test_df_bool_ambiguous() -> None:
79
    empty_df = pl.DataFrame()
80
    with pytest.raises(TypeError, match="ambiguous"):
81
        not empty_df
82

83

84
def test_special_char_colname_init() -> None:
85
    from string import punctuation
86

87
    cols = [(c, pl.Int8) for c in punctuation]
88
    df = pl.DataFrame(schema=cols)
89

90
    assert len(cols) == df.width
91
    assert len(df.rows()) == 0
92
    assert df.is_empty()
93

94

95
def test_comparisons() -> None:
96
    df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
97

98
    # Constants
99
    assert_frame_equal(df == 2, pl.DataFrame({"a": [False, True], "b": [False, False]}))
100
    assert_frame_equal(df != 2, pl.DataFrame({"a": [True, False], "b": [True, True]}))
101
    assert_frame_equal(df < 3.0, pl.DataFrame({"a": [True, True], "b": [False, False]}))
102
    assert_frame_equal(df >= 2, pl.DataFrame({"a": [False, True], "b": [True, True]}))
103
    assert_frame_equal(df <= 2, pl.DataFrame({"a": [True, True], "b": [False, False]}))
104

105
    with pytest.raises(ComputeError):
106
        df > "2"  # noqa: B015
107

108
    # Series
109
    s = pl.Series([3, 1])
110
    assert_frame_equal(df >= s, pl.DataFrame({"a": [False, True], "b": [True, True]}))
111

112
    # DataFrame
113
    other = pl.DataFrame({"a": [1, 2], "b": [2, 3]})
114
    assert_frame_equal(
115
        df == other, pl.DataFrame({"a": [True, True], "b": [False, False]})
116
    )
117
    assert_frame_equal(
118
        df != other, pl.DataFrame({"a": [False, False], "b": [True, True]})
119
    )
120
    assert_frame_equal(
121
        df > other, pl.DataFrame({"a": [False, False], "b": [True, True]})
122
    )
123
    assert_frame_equal(
124
        df < other, pl.DataFrame({"a": [False, False], "b": [False, False]})
125
    )
126
    assert_frame_equal(
127
        df >= other, pl.DataFrame({"a": [True, True], "b": [True, True]})
128
    )
129
    assert_frame_equal(
130
        df <= other, pl.DataFrame({"a": [True, True], "b": [False, False]})
131
    )
132

133
    # DataFrame columns mismatch
134
    with pytest.raises(ValueError):
135
        df == pl.DataFrame({"a": [1, 2], "c": [3, 4]})  # noqa: B015
136
    with pytest.raises(ValueError):
137
        df == pl.DataFrame({"b": [3, 4], "a": [1, 2]})  # noqa: B015
138

139
    # DataFrame shape mismatch
140
    with pytest.raises(ValueError):
141
        df == pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})  # noqa: B015
142

143
    # Type mismatch
144
    with pytest.raises(ComputeError):
145
        df == pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})  # noqa: B015
146

147

148
def test_column_selection() -> None:
149
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})
150

151
    # get column by name
152
    b = pl.Series("b", [1.0, 2.0, 3.0])
153
    assert_series_equal(df["b"], b)
154
    assert_series_equal(df.get_column("b"), b)
155

156
    with pytest.raises(ColumnNotFoundError, match="x"):
157
        df.get_column("x")
158

159
    default_series = pl.Series("x", ["?", "?", "?"])
160
    assert_series_equal(df.get_column("x", default=default_series), default_series)
161

162
    assert df.get_column("x", default=None) is None
163

164
    # get column by index
165
    assert_series_equal(df.to_series(1), pl.Series("b", [1.0, 2.0, 3.0]))
166
    assert_series_equal(df.to_series(-1), pl.Series("c", ["a", "b", "c"]))
167

168

169
def test_mixed_sequence_selection() -> None:
170
    df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
171
    result = df.select(["a", pl.col("b"), pl.lit("c")])
172
    expected = pl.DataFrame({"a": [1, 2], "b": [3, 4], "literal": ["c", "c"]})
173
    assert_frame_equal(result, expected)
174

175

176
def test_from_arrow(monkeypatch: Any) -> None:
177
    tbl = pa.table(
178
        {
179
            "a": pa.array([1, 2], pa.timestamp("s")),
180
            "b": pa.array([1, 2], pa.timestamp("ms")),
181
            "c": pa.array([1, 2], pa.timestamp("us")),
182
            "d": pa.array([1, 2], pa.timestamp("ns")),
183
            "e": pa.array([1, 2], pa.int32()),
184
            "decimal1": pa.array([1, 2], pa.decimal128(2, 1)),
185
            "struct": pa.array(
186
                [{"a": 1}, {"a": 2}], pa.struct([pa.field("a", pa.int32())])
187
            ),
188
        }
189
    )
190
    record_batches = tbl.to_batches(max_chunksize=1)
191
    expected_schema = {
192
        "a": pl.Datetime("ms"),
193
        "b": pl.Datetime("ms"),
194
        "c": pl.Datetime("us"),
195
        "d": pl.Datetime("ns"),
196
        "e": pl.Int32,
197
        "decimal1": pl.Decimal(2, 1),
198
        "struct": pl.Struct({"a": pl.Int32()}),
199
    }
200
    expected_data = [
201
        (
202
            datetime(1970, 1, 1, 0, 0, 1),
203
            datetime(1970, 1, 1, 0, 0, 0, 1000),
204
            datetime(1970, 1, 1, 0, 0, 0, 1),
205
            datetime(1970, 1, 1, 0, 0),
206
            1,
207
            Decimal("1.0"),
208
            {"a": 1},
209
        ),
210
        (
211
            datetime(1970, 1, 1, 0, 0, 2),
212
            datetime(1970, 1, 1, 0, 0, 0, 2000),
213
            datetime(1970, 1, 1, 0, 0, 0, 2),
214
            datetime(1970, 1, 1, 0, 0),
215
            2,
216
            Decimal("2.0"),
217
            {"a": 2},
218
        ),
219
    ]
220
    for arrow_data in (tbl, record_batches, (rb for rb in record_batches)):
221
        df = cast("pl.DataFrame", pl.from_arrow(arrow_data))
222
        assert df.schema == expected_schema
223
        assert df.rows() == expected_data
224

225
    # record batches (inc. empty)
226
    for b, n_expected in (
227
        (record_batches[0], 1),
228
        (record_batches[0][:0], 0),
229
    ):
230
        df = cast("pl.DataFrame", pl.from_arrow(b))
231
        assert df.schema == expected_schema
232
        assert df.rows() == expected_data[:n_expected]
233

234
    empty_tbl = tbl[:0]  # no rows
235
    df = cast("pl.DataFrame", pl.from_arrow(empty_tbl))
236
    assert df.schema == expected_schema
237
    assert df.rows() == []
238

239
    # try a single column dtype override
240
    for t in (tbl, empty_tbl):
241
        df = pl.DataFrame(t, schema_overrides={"e": pl.Int8})
242
        override_schema = expected_schema.copy()
243
        override_schema["e"] = pl.Int8
244
        assert df.schema == override_schema
245
        assert df.rows() == expected_data[: (df.height)]
246

247
    # init from record batches with overrides
248
    df = pl.DataFrame(
249
        {
250
            "id": ["a123", "b345", "c567", "d789", "e101"],
251
            "points": [99, 45, 50, 85, 35],
252
        }
253
    )
254
    tbl = df.to_arrow()
255
    batches = tbl.to_batches(max_chunksize=3)
256

257
    df0: pl.DataFrame = pl.from_arrow(batches)  # type: ignore[assignment]
258
    df1: pl.DataFrame = pl.from_arrow(  # type: ignore[assignment]
259
        data=batches,
260
        schema=["x", "y"],
261
        schema_overrides={"y": pl.Int32},
262
    )
263
    df2: pl.DataFrame = pl.from_arrow(  # type: ignore[assignment]
264
        data=batches[0],
265
        schema=["x", "y"],
266
        schema_overrides={"y": pl.Int32},
267
    )
268

269
    assert df0.rows() == df.rows()
270
    assert df1.rows() == df.rows()
271
    assert df2.rows() == df.rows()[:3]
272

273
    assert df0.schema == {"id": pl.String, "points": pl.Int64}
274
    print(df1.schema)
275
    assert df1.schema == {"x": pl.String, "y": pl.Int32}
276
    assert df2.schema == {"x": pl.String, "y": pl.Int32}
277

278
    with pytest.raises(TypeError, match="Cannot convert str"):
279
        pl.from_arrow(data="xyz")
280

281
    with pytest.raises(TypeError, match="Cannot convert int"):
282
        pl.from_arrow(data=(x for x in (1, 2, 3)))
283

284

285
@pytest.mark.parametrize(
286
    "data",
287
    [
288
        pa.Table.from_pydict(
289
            {
290
                "struct": pa.array(
291
                    [{"a": 1}, {"a": 2}], pa.struct([pa.field("a", pa.int32())])
292
                ),
293
            }
294
        ),
295
        pa.Table.from_pydict(
296
            {
297
                "struct": pa.chunked_array(
298
                    [[{"a": 1}], [{"a": 2}]], pa.struct([pa.field("a", pa.int32())])
299
                ),
300
            }
301
        ),
302
    ],
303
)
304
def test_from_arrow_struct_column(data: pa.Table) -> None:
305
    df = cast("pl.DataFrame", pl.from_arrow(data=data))
306
    expected_schema = pl.Schema({"struct": pl.Struct({"a": pl.Int32()})})
307
    expected_data = [({"a": 1},), ({"a": 2},)]
308
    assert df.schema == expected_schema
309
    assert df.rows() == expected_data
310

311

312
def test_dataframe_membership_operator() -> None:
313
    # cf. issue #4032
314
    df = pl.DataFrame({"name": ["Jane", "John"], "age": [20, 30]})
315
    assert "name" in df
316
    assert "phone" not in df
317
    assert df._ipython_key_completions_() == ["name", "age"]
318

319

320
def test_sort() -> None:
321
    df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})
322
    expected = pl.DataFrame({"a": [1, 2, 3], "b": [2, 1, 3]})
323
    assert_frame_equal(df.sort("a"), expected)
324
    assert_frame_equal(df.sort(["a", "b"]), expected)
325

326

327
def test_sort_multi_output_exprs_01() -> None:
328
    df = pl.DataFrame(
329
        {
330
            "dts": [date(2077, 10, 3), date(2077, 10, 2), date(2077, 10, 2)],
331
            "strs": ["abc", "def", "ghi"],
332
            "vals": [10.5, 20.3, 15.7],
333
        }
334
    )
335

336
    expected = pl.DataFrame(
337
        {
338
            "dts": [date(2077, 10, 2), date(2077, 10, 2), date(2077, 10, 3)],
339
            "strs": ["ghi", "def", "abc"],
340
            "vals": [15.7, 20.3, 10.5],
341
        }
342
    )
343
    assert_frame_equal(expected, df.sort(pl.col("^(d|v).*$")))
344
    assert_frame_equal(expected, df.sort(cs.temporal() | cs.numeric()))
345
    assert_frame_equal(expected, df.sort(cs.temporal(), cs.numeric(), cs.binary()))
346

347
    expected = pl.DataFrame(
348
        {
349
            "dts": [date(2077, 10, 3), date(2077, 10, 2), date(2077, 10, 2)],
350
            "strs": ["abc", "def", "ghi"],
351
            "vals": [10.5, 20.3, 15.7],
352
        }
353
    )
354
    assert_frame_equal(
355
        expected,
356
        df.sort(pl.col("^(d|v).*$"), descending=[True]),
357
    )
358
    assert_frame_equal(
359
        expected,
360
        df.sort(cs.temporal() | cs.numeric(), descending=[True]),
361
    )
362
    assert_frame_equal(
363
        expected,
364
        df.sort(cs.temporal(), cs.numeric(), descending=[True, True]),
365
    )
366

367
    with pytest.raises(
368
        ValueError,
369
        match=r"the length of `descending` \(2\) does not match the length of `by` \(1\)",
370
    ):
371
        df.sort(by=[cs.temporal()], descending=[True, False])
372

373
    with pytest.raises(
374
        ValueError,
375
        match=r"the length of `nulls_last` \(3\) does not match the length of `by` \(2\)",
376
    ):
377
        df.sort("dts", "strs", nulls_last=[True, False, True])
378

379
    # No columns selected - return original input.
380
    assert_frame_equal(df, df.sort(pl.col("^xxx$")))
381

382

383
@pytest.mark.parametrize(
384
    ("by_explicit", "desc_explicit", "by_multi", "desc_multi"),
385
    [
386
        (
387
            ["w", "x", "y", "z"],
388
            [False, False, True, True],
389
            [cs.integer(), cs.string()],
390
            [False, True],
391
        ),
392
        (
393
            ["w", "y", "z"],
394
            [True, True, False],
395
            [pl.col("^(w|y)$"), pl.col("^z.*$")],
396
            [True, False],
397
        ),
398
        (
399
            ["z", "w", "x"],
400
            [True, False, False],
401
            [pl.col("z"), cs.numeric()],
402
            [True, False],
403
        ),
404
    ],
405
)
406
def test_sort_multi_output_exprs_02(
407
    by_explicit: list[str],
408
    desc_explicit: list[bool],
409
    by_multi: list[Expr],
410
    desc_multi: list[bool],
411
) -> None:
412
    df = pl.DataFrame(
413
        {
414
            "w": [100, 100, 100, 100, 200, 200, 200, 200],
415
            "x": [888, 888, 444, 444, 888, 888, 444, 888],
416
            "y": ["b", "b", "a", "a", "b", "b", "a", "a"],
417
            "z": ["x", "y", "x", "y", "x", "y", "x", "y"],
418
        }
419
    )
420
    res1 = df.sort(*by_explicit, descending=desc_explicit)
421
    res2 = df.sort(*by_multi, descending=desc_multi)
422
    assert_frame_equal(res1, res2)
423

424

425
def test_sort_maintain_order() -> None:
426
    l1 = (
427
        pl.LazyFrame({"A": [1] * 4, "B": ["A", "B", "C", "D"]})
428
        .sort("A", maintain_order=True)
429
        .slice(0, 3)
430
        .collect()["B"]
431
        .to_list()
432
    )
433
    l2 = (
434
        pl.LazyFrame({"A": [1] * 4, "B": ["A", "B", "C", "D"]})
435
        .sort("A")
436
        .collect()
437
        .slice(0, 3)["B"]
438
        .to_list()
439
    )
440
    assert l1 == l2 == ["A", "B", "C"]
441

442

443
@pytest.mark.parametrize("nulls_last", [False, True], ids=["nulls_first", "nulls_last"])
444
def test_sort_maintain_order_descending_repeated_nulls(nulls_last: bool) -> None:
445
    got = (
446
        pl.LazyFrame({"A": [None, -1, 1, 1, None], "B": [1, 2, 3, 4, 5]})
447
        .sort("A", descending=True, maintain_order=True, nulls_last=nulls_last)
448
        .collect()
449
    )
450
    if nulls_last:
451
        expect = pl.DataFrame({"A": [1, 1, -1, None, None], "B": [3, 4, 2, 1, 5]})
452
    else:
453
        expect = pl.DataFrame({"A": [None, None, 1, 1, -1], "B": [1, 5, 3, 4, 2]})
454
    assert_frame_equal(got, expect)
455

456

457
def test_replace() -> None:
458
    df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})
459
    s = pl.Series("c", [True, False, True])
460
    df._replace("a", s)
461
    assert_frame_equal(df, pl.DataFrame({"a": [True, False, True], "b": [1, 2, 3]}))
462

463

464
def test_assignment() -> None:
465
    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [2, 3, 4]})
466
    df = df.with_columns(pl.col("foo").alias("foo"))
467
    # make sure that assignment does not change column order
468
    assert df.columns == ["foo", "bar"]
469
    df = df.with_columns(
470
        pl.when(pl.col("foo") > 1).then(9).otherwise(pl.col("foo")).alias("foo")
471
    )
472
    assert df["foo"].to_list() == [1, 9, 9]
473

474

475
def test_insert_column() -> None:
476
    # insert series
477
    df = (
478
        pl.DataFrame({"z": [3, 4, 5]})
479
        .insert_column(0, pl.Series("x", [1, 2, 3]))
480
        .insert_column(-1, pl.Series("y", [2, 3, 4]))
481
    )
482
    expected_df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
483
    assert_frame_equal(expected_df, df)
484

485
    # insert expressions
486
    df = pl.DataFrame(
487
        {
488
            "id": ["xx", "yy", "zz"],
489
            "v1": [5, 4, 6],
490
            "v2": [7, 3, 3],
491
        }
492
    )
493
    df.insert_column(3, (pl.col("v1") * pl.col("v2")).alias("v3"))
494
    df.insert_column(1, (pl.col("v2") - pl.col("v1")).alias("v0"))
495

496
    expected = pl.DataFrame(
497
        {
498
            "id": ["xx", "yy", "zz"],
499
            "v0": [2, -1, -3],
500
            "v1": [5, 4, 6],
501
            "v2": [7, 3, 3],
502
            "v3": [35, 12, 18],
503
        }
504
    )
505
    assert_frame_equal(df, expected)
506

507
    # check that we raise suitable index errors
508
    for idx, column in (
509
        (10, pl.col("v1").sqrt().alias("v1_sqrt")),
510
        (-10, pl.Series("foo", [1, 2, 3])),
511
    ):
512
        with pytest.raises(
513
            IndexError,
514
            match=rf"column index {idx} is out of range \(frame has 5 columns\)",
515
        ):
516
            df.insert_column(idx, column)
517

518

519
def test_replace_column() -> None:
520
    df = (
521
        pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
522
        .replace_column(0, pl.Series("a", [4, 5, 6]))
523
        .replace_column(-2, pl.Series("b", [5, 6, 7]))
524
        .replace_column(-1, pl.Series("c", [6, 7, 8]))
525
    )
526
    expected_df = pl.DataFrame({"a": [4, 5, 6], "b": [5, 6, 7], "c": [6, 7, 8]})
527
    assert_frame_equal(expected_df, df)
528

529

530
def test_to_series() -> None:
531
    df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
532

533
    assert_series_equal(df.to_series(), df["x"])
534
    assert_series_equal(df.to_series(0), df["x"])
535
    assert_series_equal(df.to_series(-3), df["x"])
536

537
    assert_series_equal(df.to_series(1), df["y"])
538
    assert_series_equal(df.to_series(-2), df["y"])
539

540
    assert_series_equal(df.to_series(2), df["z"])
541
    assert_series_equal(df.to_series(-1), df["z"])
542

543

544
def test_to_series_bad_inputs() -> None:
545
    df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
546

547
    with pytest.raises(IndexError, match="index 5 is out of bounds"):
548
        df.to_series(5)
549

550
    with pytest.raises(IndexError, match="index -100 is out of bounds"):
551
        df.to_series(-100)
552

553
    with pytest.raises(
554
        TypeError, match="'str' object cannot be interpreted as an integer"
555
    ):
556
        df.to_series("x")  # type: ignore[arg-type]
557

558

559
def test_gather_every() -> None:
560
    df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})
561
    expected_df = pl.DataFrame({"a": [1, 3], "b": ["w", "y"]})
562
    assert_frame_equal(expected_df, df.gather_every(2))
563

564
    expected_df = pl.DataFrame({"a": [2, 4], "b": ["x", "z"]})
565
    assert_frame_equal(expected_df, df.gather_every(2, offset=1))
566

567

568
def test_gather_every_agg() -> None:
569
    df = pl.DataFrame(
570
        {
571
            "g": [1, 1, 1, 2, 2, 2],
572
            "a": ["a", "b", "c", "d", "e", "f"],
573
        }
574
    )
575
    out = df.group_by(pl.col("g")).agg(pl.col("a").gather_every(2)).sort("g")
576
    expected = pl.DataFrame(
577
        {
578
            "g": [1, 2],
579
            "a": [["a", "c"], ["d", "f"]],
580
        }
581
    )
582
    assert_frame_equal(out, expected)
583

584

585
def test_take_misc(fruits_cars: pl.DataFrame) -> None:
586
    df = fruits_cars
587

588
    # Out of bounds error.
589
    with pytest.raises(OutOfBoundsError):
590
        df.sort("fruits").select(
591
            pl.col("B").reverse().gather([1, 2]).implode().over("fruits"),
592
            "fruits",
593
        )
594

595
    # Null indices.
596
    assert_frame_equal(
597
        df.select(pl.col("fruits").gather(pl.Series([0, None]))),
598
        pl.DataFrame({"fruits": ["banana", None]}),
599
    )
600

601
    for index in [[0, 1], pl.Series([0, 1]), np.array([0, 1])]:
602
        out = df.sort("fruits").select(
603
            pl.col("B")
604
            .reverse()
605
            .gather(index)  # type: ignore[arg-type]
606
            .over("fruits", mapping_strategy="join"),
607
            "fruits",
608
        )
609

610
        assert out[0, "B"].to_list() == [2, 3]
611
        assert out[4, "B"].to_list() == [1, 4]
612

613
    out = df.sort("fruits").select(
614
        pl.col("B").reverse().get(pl.lit(1)).over("fruits"),
615
        "fruits",
616
    )
617
    assert out[0, "B"] == 3
618
    assert out[4, "B"] == 4
619

620

621
def test_pipe() -> None:
622
    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, None, 8]})
623

624
    def _multiply(data: pl.DataFrame, mul: int) -> pl.DataFrame:
625
        return data * mul
626

627
    result = df.pipe(_multiply, mul=3)
628

629
    assert_frame_equal(result, df * 3)
630

631

632
def test_explode() -> None:
633
    df = pl.DataFrame({"letters": ["c", "a"], "nrs": [[1, 2], [1, 3]]})
634
    out = df.explode("nrs")
635
    assert out["letters"].to_list() == ["c", "c", "a", "a"]
636
    assert out["nrs"].to_list() == [1, 2, 1, 3]
637

638

639
@pytest.mark.parametrize(
640
    ("stack", "exp_shape", "exp_columns"),
641
    [
642
        ([pl.Series("stacked", [-1, -1, -1])], (3, 3), ["a", "b", "stacked"]),
643
        (
644
            [pl.Series("stacked2", [-1, -1, -1]), pl.Series("stacked3", [-1, -1, -1])],
645
            (3, 4),
646
            ["a", "b", "stacked2", "stacked3"],
647
        ),
648
    ],
649
)
650
@pytest.mark.parametrize("in_place", [True, False])
651
def test_hstack_list_of_series(
652
    stack: list[pl.Series],
653
    exp_shape: tuple[int, int],
654
    exp_columns: list[str],
655
    in_place: bool,
656
) -> None:
657
    df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
658
    if in_place:
659
        df.hstack(stack, in_place=True)
660
        assert df.shape == exp_shape
661
        assert df.columns == exp_columns
662
    else:
663
        df_out = df.hstack(stack, in_place=False)
664
        assert df_out.shape == exp_shape
665
        assert df_out.columns == exp_columns
666

667

668
@pytest.mark.parametrize("in_place", [True, False])
669
def test_hstack_dataframe(in_place: bool) -> None:
670
    df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
671
    df2 = pl.DataFrame({"c": [2, 1, 3], "d": ["a", "b", "c"]})
672
    expected = pl.DataFrame(
673
        {"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [2, 1, 3], "d": ["a", "b", "c"]}
674
    )
675
    if in_place:
676
        df.hstack(df2, in_place=True)
677
        assert_frame_equal(df, expected)
678
    else:
679
        df_out = df.hstack(df2, in_place=False)
680
        assert_frame_equal(df_out, expected)
681

682

683
@pytest.mark.may_fail_cloud
684
def test_file_buffer() -> None:
685
    f = BytesIO()
686
    f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12")
687
    f.seek(0)
688
    df = pl.read_csv(f, has_header=False)
689
    assert df.shape == (2, 6)
690

691
    f = BytesIO()
692
    f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12")
693
    f.seek(0)
694
    # check if not fails on TryClone and Length impl in file.rs
695
    with pytest.raises(ComputeError):
696
        pl.read_parquet(f)
697

698

699
def test_shift() -> None:
700
    df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})
701
    a = df.shift(1)
702
    b = pl.DataFrame(
703
        {"A": [None, "a", "b"], "B": [None, 1, 3]},
704
    )
705
    assert_frame_equal(a, b)
706

707

708
def test_multiple_columns_drop() -> None:
709
    df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
710
    # List input
711
    out = df.drop(["a", "b"])
712
    assert out.columns == ["c"]
713
    # Positional input
714
    out = df.drop("b", "c")
715
    assert out.columns == ["a"]
716

717

718
def test_arg_where() -> None:
719
    s = pl.Series([True, False, True, False])
720
    assert_series_equal(
721
        pl.arg_where(s, eager=True).cast(int),
722
        pl.Series([0, 2]),
723
    )
724

725

726
def test_to_dummies() -> None:
727
    df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})
728
    dummies = df.to_dummies()
729

730
    assert dummies["A_a"].to_list() == [1, 0, 0]
731
    assert dummies["A_b"].to_list() == [0, 1, 0]
732
    assert dummies["A_c"].to_list() == [0, 0, 1]
733

734
    df = pl.DataFrame({"a": [1, 2, 3]})
735
    res = df.to_dummies()
736

737
    expected = pl.DataFrame(
738
        {"a_1": [1, 0, 0], "a_2": [0, 1, 0], "a_3": [0, 0, 1]}
739
    ).with_columns(pl.all().cast(pl.UInt8))
740
    assert_frame_equal(res, expected)
741

742
    df = pl.DataFrame(
743
        {
744
            "i": [1, 2, 3],
745
            "category": ["dog", "cat", "cat"],
746
        },
747
        schema={"i": pl.Int32, "category": pl.Categorical("lexical")},
748
    )
749
    expected = pl.DataFrame(
750
        {
751
            "i": [1, 2, 3],
752
            "category|cat": [0, 1, 1],
753
            "category|dog": [1, 0, 0],
754
        },
755
        schema={"i": pl.Int32, "category|cat": pl.UInt8, "category|dog": pl.UInt8},
756
    )
757
    for _cols in ("category", cs.string()):
758
        result = df.to_dummies(columns=["category"], separator="|")
759
        assert_frame_equal(result, expected)
760

761
    # test sorted fast path
762
    result = pl.DataFrame({"x": pl.arange(0, 3, eager=True)}).to_dummies("x")
763
    expected = pl.DataFrame(
764
        {"x_0": [1, 0, 0], "x_1": [0, 1, 0], "x_2": [0, 0, 1]}
765
    ).with_columns(pl.all().cast(pl.UInt8))
766
    assert_frame_equal(result, expected)
767

768

769
def test_to_dummies_drop_first() -> None:
770
    df = pl.DataFrame(
771
        {
772
            "foo": [0, 1, 2],
773
            "bar": [3, 4, 5],
774
            "baz": ["x", "y", "z"],
775
        }
776
    )
777
    dm = df.to_dummies()
778
    dd = df.to_dummies(drop_first=True)
779

780
    assert dd.columns == ["foo_1", "foo_2", "bar_4", "bar_5", "baz_y", "baz_z"]
781
    assert set(dm.columns) - set(dd.columns) == {"foo_0", "bar_3", "baz_x"}
782
    assert_frame_equal(dm.select(dd.columns), dd)
783
    assert dd.rows() == [
784
        (0, 0, 0, 0, 0, 0),
785
        (1, 0, 1, 0, 1, 0),
786
        (0, 1, 0, 1, 0, 1),
787
    ]
788

789

790
def test_to_dummies_drop_nulls() -> None:
791
    df = pl.DataFrame(
792
        {
793
            "foo": [0, 1, None],
794
            "bar": [3, None, 5],
795
            "baz": [None, "y", "z"],
796
        }
797
    )
798

799
    dm = df.to_dummies(drop_nulls=True)
800

801
    expected = pl.DataFrame(
802
        {
803
            "foo_0": [1, 0, 0],
804
            "foo_1": [0, 1, 0],
805
            "bar_3": [1, 0, 0],
806
            "bar_5": [0, 0, 1],
807
            "baz_y": [0, 1, 0],
808
            "baz_z": [0, 0, 1],
809
        },
810
        schema={
811
            "foo_0": pl.UInt8,
812
            "foo_1": pl.UInt8,
813
            "bar_3": pl.UInt8,
814
            "bar_5": pl.UInt8,
815
            "baz_y": pl.UInt8,
816
            "baz_z": pl.UInt8,
817
        },
818
    )
819
    assert_frame_equal(dm, expected)
820

821

822
def test_to_pandas(df: pl.DataFrame) -> None:
823
    # pyarrow cannot deal with unsigned dictionary integer yet.
824
    # pyarrow cannot convert a time64 w/ non-zero nanoseconds
825
    df = df.drop(["cat", "time", "enum"])
826
    df.to_arrow()
827
    df.to_pandas()
828
    # test shifted df
829
    df.shift(2).to_pandas()
830
    df = pl.DataFrame({"col": pl.Series([True, False, True])})
831
    df.shift(2).to_pandas()
832

833

834
def test_from_arrow_table() -> None:
835
    data = {"a": [1, 2], "b": [1, 2]}
836
    tbl = pa.table(data)
837

838
    df = cast("pl.DataFrame", pl.from_arrow(tbl))
839
    assert_frame_equal(df, pl.DataFrame(data))
840

841

842
def test_df_stats(df: pl.DataFrame) -> None:
843
    df.var()
844
    df.std()
845
    df.min()
846
    df.max()
847
    df.sum()
848
    df.mean()
849
    df.median()
850
    df.quantile(0.4, "nearest")
851

852

853
def test_df_fold() -> None:
854
    df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
855

856
    assert_series_equal(
857
        df.fold(lambda s1, s2: s1 + s2), pl.Series("a", [4.0, 5.0, 9.0])
858
    )
859
    assert_series_equal(
860
        df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)),
861
        pl.Series("a", [1.0, 1.0, 3.0]),
862
    )
863

864
    df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
865
    out = df.fold(lambda s1, s2: s1 + s2)
866
    assert_series_equal(out, pl.Series("a", ["foo11.0", "bar22.0", "233.0"]))
867

868
    df = pl.DataFrame({"a": [3, 2, 1], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
869
    # just check dispatch. values are tested on rust side.
870
    assert len(df.sum_horizontal()) == 3
871
    assert len(df.mean_horizontal()) == 3
872
    assert len(df.min_horizontal()) == 3
873
    assert len(df.max_horizontal()) == 3
874

875
    df_width_one = df[["a"]]
876
    assert_series_equal(df_width_one.fold(lambda s1, s2: s1), df["a"])
877

878

879
@pytest.mark.may_fail_cloud  # TODO: make pickleable
880
def test_fold_filter() -> None:
881
    df = pl.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
882

883
    out = df.filter(
884
        pl.fold(
885
            acc=pl.lit(True),
886
            function=lambda a, b: a & b,
887
            exprs=[pl.col(c) > 1 for c in df.columns],
888
        )
889
    )
890

891
    assert out.shape == (1, 2)
892
    assert out.rows() == [(3, 2)]
893

894
    out = df.filter(
895
        pl.fold(
896
            acc=pl.lit(True),
897
            function=lambda a, b: a | b,
898
            exprs=[pl.col(c) > 1 for c in df.columns],
899
        )
900
    )
901

902
    assert out.shape == (3, 2)
903
    assert out.rows() == [(1, 0), (2, 1), (3, 2)]
904

905

906
def test_column_names() -> None:
907
    tbl = pa.table(
908
        {
909
            "a": pa.array([1, 2, 3, 4, 5], pa.decimal128(38, 2)),
910
            "b": pa.array([1, 2, 3, 4, 5], pa.int64()),
911
        }
912
    )
913
    for a in (tbl, tbl[:0]):
914
        df = cast("pl.DataFrame", pl.from_arrow(a))
915
        assert df.columns == ["a", "b"]
916

917

918
def test_init_series_edge_cases() -> None:
919
    # confirm that we don't modify the name of the input series in-place
920
    s1 = pl.Series("X", [1, 2, 3])
921
    df1 = pl.DataFrame({"A": s1}, schema_overrides={"A": pl.UInt8})
922
    assert s1.name == "X"
923
    assert df1["A"].name == "A"
924

925
    # init same series object under different names
926
    df2 = pl.DataFrame({"A": s1, "B": s1})
927
    assert df2.rows(named=True) == [
928
        {"A": 1, "B": 1},
929
        {"A": 2, "B": 2},
930
        {"A": 3, "B": 3},
931
    ]
932

933
    # empty series names should not be overwritten
934
    s2 = pl.Series([1, 2, 3])
935
    s3 = pl.Series([2, 3, 4])
936
    df3 = pl.DataFrame([s2, s3])
937
    assert s2.name == s3.name == ""
938
    assert df3.columns == ["column_0", "column_1"]
939

940

941
def test_head_group_by() -> None:
942
    commodity_prices = {
943
        "commodity": [
944
            "Wheat",
945
            "Wheat",
946
            "Wheat",
947
            "Wheat",
948
            "Corn",
949
            "Corn",
950
            "Corn",
951
            "Corn",
952
            "Corn",
953
        ],
954
        "location": [
955
            "StPaul",
956
            "StPaul",
957
            "StPaul",
958
            "Chicago",
959
            "Chicago",
960
            "Chicago",
961
            "Chicago",
962
            "Chicago",
963
            "Chicago",
964
        ],
965
        "seller": [
966
            "Bob",
967
            "Charlie",
968
            "Susan",
969
            "Paul",
970
            "Ed",
971
            "Mary",
972
            "Paul",
973
            "Charlie",
974
            "Norman",
975
        ],
976
        "price": [1.0, 0.7, 0.8, 0.55, 2.0, 3.0, 2.4, 1.8, 2.1],
977
    }
978
    df = pl.DataFrame(commodity_prices)
979

980
    # this query flexes the wildcard exclusion quite a bit.
981
    keys = ["commodity", "location"]
982
    out = (
983
        df.sort(by="price", descending=True)
984
        .group_by(keys, maintain_order=True)
985
        .agg([pl.col("*").exclude(keys).head(2).name.keep()])
986
        .explode(cs.all().exclude(keys))
987
    )
988

989
    assert out.shape == (5, 4)
990
    assert out.rows() == [
991
        ("Corn", "Chicago", "Mary", 3.0),
992
        ("Corn", "Chicago", "Paul", 2.4),
993
        ("Wheat", "StPaul", "Bob", 1.0),
994
        ("Wheat", "StPaul", "Susan", 0.8),
995
        ("Wheat", "Chicago", "Paul", 0.55),
996
    ]
997

998
    df = pl.DataFrame(
999
        {"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]}
1000
    )
1001
    out = df.group_by("letters").tail(2).sort("letters")
1002
    assert_frame_equal(
1003
        out,
1004
        pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]}),
1005
    )
1006
    out = df.group_by("letters").head(2).sort("letters")
1007
    assert_frame_equal(
1008
        out,
1009
        pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]}),
1010
    )
1011

1012

1013
def test_is_null_is_not_null() -> None:
1014
    df = pl.DataFrame({"nrs": [1, 2, None]})
1015
    assert df.select(pl.col("nrs").is_null())["nrs"].to_list() == [False, False, True]
1016
    assert df.select(pl.col("nrs").is_not_null())["nrs"].to_list() == [
1017
        True,
1018
        True,
1019
        False,
1020
    ]
1021

1022

1023
def test_is_nan_is_not_nan() -> None:
1024
    df = pl.DataFrame({"nrs": np.array([1, 2, np.nan])})
1025
    assert df.select(pl.col("nrs").is_nan())["nrs"].to_list() == [False, False, True]
1026
    assert df.select(pl.col("nrs").is_not_nan())["nrs"].to_list() == [True, True, False]
1027

1028

1029
def test_is_finite_is_infinite() -> None:
1030
    df = pl.DataFrame({"nrs": np.array([1, 2, np.inf])})
1031
    assert df.select(pl.col("nrs").is_infinite())["nrs"].to_list() == [
1032
        False,
1033
        False,
1034
        True,
1035
    ]
1036
    assert df.select(pl.col("nrs").is_finite())["nrs"].to_list() == [True, True, False]
1037

1038

1039
def test_is_finite_is_infinite_null_series() -> None:
1040
    df = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Null)})
1041
    result = df.select(
1042
        pl.col("a").is_finite().alias("finite"),
1043
        pl.col("a").is_infinite().alias("infinite"),
1044
    )
1045
    expected = pl.DataFrame(
1046
        {
1047
            "finite": pl.Series([None, None, None], dtype=pl.Boolean),
1048
            "infinite": pl.Series([None, None, None], dtype=pl.Boolean),
1049
        }
1050
    )
1051
    assert_frame_equal(result, expected)
1052

1053

1054
def test_is_nan_null_series() -> None:
1055
    df = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Null)})
1056
    result = df.select(pl.col("a").is_nan())
1057
    expected = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Boolean)})
1058
    assert_frame_equal(result, expected)
1059

1060

1061
def test_len() -> None:
1062
    df = pl.DataFrame({"nrs": [1, 2, 3]})
1063
    assert cast("int", df.select(pl.col("nrs").len()).item()) == 3
1064
    assert len(pl.DataFrame()) == 0
1065

1066

1067
def test_multiple_column_sort() -> None:
1068
    df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [2, 2, 3], "c": [1.0, 2.0, 3.0]})
1069
    out = df.sort([pl.col("b"), pl.col("c").reverse()])
1070
    assert list(out["c"]) == [2.0, 1.0, 3.0]
1071
    assert list(out["b"]) == [2, 2, 3]
1072

1073
    # Explicitly specify numpy dtype because of different defaults on Windows
1074
    df = pl.DataFrame({"a": np.arange(1, 4, dtype=np.int64), "b": ["a", "a", "b"]})
1075

1076
    assert_frame_equal(
1077
        df.sort("a", descending=True),
1078
        pl.DataFrame({"a": [3, 2, 1], "b": ["b", "a", "a"]}),
1079
    )
1080
    assert_frame_equal(
1081
        df.sort("b", descending=True, maintain_order=True),
1082
        pl.DataFrame({"a": [3, 1, 2], "b": ["b", "a", "a"]}),
1083
    )
1084
    assert_frame_equal(
1085
        df.sort(["b", "a"], descending=[False, True]),
1086
        pl.DataFrame({"a": [2, 1, 3], "b": ["a", "a", "b"]}),
1087
    )
1088

1089

1090
def test_cast_frame() -> None:
1091
    df = pl.DataFrame(
1092
        {
1093
            "a": [1.0, 2.5, 3.0],
1094
            "b": [4, 5, None],
1095
            "c": [True, False, True],
1096
            "d": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)],
1097
        }
1098
    )
1099

1100
    # cast via col:dtype map
1101
    assert df.cast(
1102
        dtypes={"b": pl.Float32, "c": pl.String, "d": pl.Datetime("ms")},
1103
    ).schema == {
1104
        "a": pl.Float64,
1105
        "b": pl.Float32,
1106
        "c": pl.String,
1107
        "d": pl.Datetime("ms"),
1108
    }
1109

1110
    # cast via col:pytype map
1111
    assert df.cast(
1112
        dtypes={"b": float, "c": str, "d": datetime},
1113
    ).schema == {
1114
        "a": pl.Float64,
1115
        "b": pl.Float64,
1116
        "c": pl.String,
1117
        "d": pl.Datetime("us"),
1118
    }
1119

1120
    # cast via selector:dtype map
1121
    assert df.cast(
1122
        {
1123
            cs.numeric(): pl.UInt8,
1124
            cs.temporal(): pl.String,
1125
        }
1126
    ).rows() == [
1127
        (1, 4, True, "2020-01-02"),
1128
        (2, 5, False, "2021-03-04"),
1129
        (3, None, True, "2022-05-06"),
1130
    ]
1131

1132
    # cast all fields to a single type
1133
    assert df.cast(pl.String).to_dict(as_series=False) == {
1134
        "a": ["1.0", "2.5", "3.0"],
1135
        "b": ["4", "5", None],
1136
        "c": ["true", "false", "true"],
1137
        "d": ["2020-01-02", "2021-03-04", "2022-05-06"],
1138
    }
1139

1140

1141
def test_duration_arithmetic() -> None:
1142
    df = pl.DataFrame(
1143
        {"a": [datetime(2022, 1, 1, 0, 0, 0), datetime(2022, 1, 2, 0, 0, 0)]}
1144
    )
1145
    d1 = pl.duration(days=3, microseconds=987000)
1146
    d2 = pl.duration(days=6, milliseconds=987)
1147

1148
    assert_frame_equal(
1149
        df.with_columns(
1150
            b=(df["a"] + d1),
1151
            c=(pl.col("a") + d2),
1152
        ),
1153
        pl.DataFrame(
1154
            {
1155
                "a": [
1156
                    datetime(2022, 1, 1, 0, 0, 0),
1157
                    datetime(2022, 1, 2, 0, 0, 0),
1158
                ],
1159
                "b": [
1160
                    datetime(2022, 1, 4, 0, 0, 0, 987000),
1161
                    datetime(2022, 1, 5, 0, 0, 0, 987000),
1162
                ],
1163
                "c": [
1164
                    datetime(2022, 1, 7, 0, 0, 0, 987000),
1165
                    datetime(2022, 1, 8, 0, 0, 0, 987000),
1166
                ],
1167
            }
1168
        ),
1169
    )
1170

1171

1172
def test_assign() -> None:
1173
    # check if can assign in case of a single column
1174
    df = pl.DataFrame({"a": [1, 2, 3]})
1175
    # test if we can assign in case of single column
1176
    df = df.with_columns(pl.col("a") * 2)
1177
    assert list(df["a"]) == [2, 4, 6]
1178

1179

1180
def test_arg_sort_by(df: pl.DataFrame) -> None:
1181
    idx_df = df.select(
1182
        pl.arg_sort_by(["int_nulls", "floats"], descending=[False, True]).alias("idx")
1183
    )
1184
    assert (idx_df["idx"] == [1, 0, 2]).all()
1185

1186
    idx_df = df.select(
1187
        pl.arg_sort_by(["int_nulls", "floats"], descending=False).alias("idx")
1188
    )
1189
    assert (idx_df["idx"] == [1, 0, 2]).all()
1190

1191
    df = pl.DataFrame({"x": [0, 0, 0, 1, 1, 2], "y": [9, 9, 8, 7, 6, 6]})
1192
    for expr, expected in (
1193
        (pl.arg_sort_by(["x", "y"]), [2, 0, 1, 4, 3, 5]),
1194
        (pl.arg_sort_by(["x", "y"], descending=[True, True]), [5, 3, 4, 0, 1, 2]),
1195
        (pl.arg_sort_by(["x", "y"], descending=[True, False]), [5, 4, 3, 2, 0, 1]),
1196
        (pl.arg_sort_by(["x", "y"], descending=[False, True]), [0, 1, 2, 3, 4, 5]),
1197
    ):
1198
        assert (df.select(expr.alias("idx"))["idx"] == expected).all()
1199

1200

1201
def test_literal_series() -> None:
1202
    df = pl.DataFrame(
1203
        {
1204
            "a": np.array([21.7, 21.8, 21], dtype=np.float32),
1205
            "b": np.array([1, 3, 2], dtype=np.int8),
1206
            "c": ["reg1", "reg2", "reg3"],
1207
            "d": np.array(
1208
                [datetime(2022, 8, 16), datetime(2022, 8, 17), datetime(2022, 8, 18)],
1209
                dtype="<M8[ns]",
1210
            ),
1211
        },
1212
        schema_overrides={"a": pl.Float64},
1213
    )
1214
    out = (
1215
        df.lazy()
1216
        .with_columns(pl.Series("e", [2, 1, 3], pl.Int32))
1217
        .with_columns(pl.col("e").cast(pl.Float32))
1218
        .collect()
1219
    )
1220
    expected_schema = {
1221
        "a": pl.Float64,
1222
        "b": pl.Int8,
1223
        "c": pl.String,
1224
        "d": pl.Datetime("ns"),
1225
        "e": pl.Float32,
1226
    }
1227
    assert_frame_equal(
1228
        pl.DataFrame(
1229
            [
1230
                (21.7, 1, "reg1", datetime(2022, 8, 16, 0), 2),
1231
                (21.8, 3, "reg2", datetime(2022, 8, 17, 0), 1),
1232
                (21.0, 2, "reg3", datetime(2022, 8, 18, 0), 3),
1233
            ],
1234
            schema=expected_schema,  # type: ignore[arg-type]
1235
            orient="row",
1236
        ),
1237
        out,
1238
        abs_tol=0.00001,
1239
    )
1240

1241

1242
def test_write_csv() -> None:
1243
    df = pl.DataFrame(
1244
        {
1245
            "foo": [1, 2, 3, 4, 5],
1246
            "bar": [6, 7, 8, 9, 10],
1247
            "ham": ["a", "b", "c", "d", "e"],
1248
        }
1249
    )
1250
    expected = "foo,bar,ham\n1,6,a\n2,7,b\n3,8,c\n4,9,d\n5,10,e\n"
1251

1252
    # if no file argument is supplied, write_csv() will return the string
1253
    s = df.write_csv()
1254
    assert s == expected
1255

1256
    # otherwise it will write to the file/iobuffer
1257
    file = BytesIO()
1258
    df.write_csv(file)
1259
    file.seek(0)
1260
    s = file.read().decode("utf8")
1261
    assert s == expected
1262

1263

1264
def test_from_generator_or_iterable() -> None:
1265
    # generator function
1266
    def gen(n: int, *, strkey: bool = True) -> Iterator[Any]:
1267
        for i in range(n):
1268
            yield (str(i) if strkey else i), 1 * i, 2**i, 3**i
1269

1270
    # iterable object
1271
    class Rows:
1272
        def __init__(self, n: int, *, strkey: bool = True) -> None:
1273
            self._n = n
1274
            self._strkey = strkey
1275

1276
        def __iter__(self) -> Iterator[Any]:
1277
            yield from gen(self._n, strkey=self._strkey)
1278

1279
    # check init from column-oriented generator
1280
    assert_frame_equal(
1281
        pl.DataFrame(data=gen(4, strkey=False), orient="col"),
1282
        pl.DataFrame(
1283
            data=[(0, 0, 1, 1), (1, 1, 2, 3), (2, 2, 4, 9), (3, 3, 8, 27)], orient="col"
1284
        ),
1285
    )
1286
    # check init from row-oriented generators (more common)
1287
    expected = pl.DataFrame(
1288
        data=list(gen(4)), schema=["a", "b", "c", "d"], orient="row"
1289
    )
1290
    for generated_frame in (
1291
        pl.DataFrame(data=gen(4), schema=["a", "b", "c", "d"]),
1292
        pl.DataFrame(data=Rows(4), schema=["a", "b", "c", "d"]),
1293
        pl.DataFrame(data=(x for x in Rows(4)), schema=["a", "b", "c", "d"]),
1294
    ):
1295
        assert_frame_equal(expected, generated_frame)
1296
        assert generated_frame.schema == {
1297
            "a": pl.String,
1298
            "b": pl.Int64,
1299
            "c": pl.Int64,
1300
            "d": pl.Int64,
1301
        }
1302

1303
    # test 'iterable_to_pydf' directly to validate 'chunk_size' behaviour
1304
    cols = ["a", "b", ("c", pl.Int8), "d"]
1305

1306
    expected_data = [("0", 0, 1, 1), ("1", 1, 2, 3), ("2", 2, 4, 9), ("3", 3, 8, 27)]
1307
    expected_schema = [
1308
        ("a", pl.String),
1309
        ("b", pl.Int64),
1310
        ("c", pl.Int8),
1311
        ("d", pl.Int64),
1312
    ]
1313

1314
    for params in (
1315
        {"data": Rows(4)},
1316
        {"data": gen(4), "chunk_size": 2},
1317
        {"data": Rows(4), "chunk_size": 3},
1318
        {"data": gen(4), "infer_schema_length": None},
1319
        {"data": Rows(4), "infer_schema_length": 1},
1320
        {"data": gen(4), "chunk_size": 2},
1321
        {"data": Rows(4), "infer_schema_length": 5},
1322
        {"data": gen(4), "infer_schema_length": 3, "chunk_size": 2},
1323
        {"data": gen(4), "infer_schema_length": None, "chunk_size": 3},
1324
    ):
1325
        d = iterable_to_pydf(schema=cols, **params)  # type: ignore[arg-type]
1326
        assert expected_data == d.row_tuples()
1327
        assert expected_schema == list(zip(d.columns(), d.dtypes()))
1328

1329
    # ref: issue #6489 (initial chunk_size cannot be smaller than 'infer_schema_length')
1330
    df = pl.DataFrame(
1331
        data=iter(([{"col": None}] * 1000) + [{"col": ["a", "b", "c"]}]),
1332
        infer_schema_length=1001,
1333
    )
1334
    assert df.schema == {"col": pl.List(pl.String)}
1335
    assert df[-2:]["col"].to_list() == [None, ["a", "b", "c"]]
1336

1337
    # empty iterator
1338
    assert_frame_equal(
1339
        pl.DataFrame(data=gen(0), schema=["a", "b", "c", "d"]),
1340
        pl.DataFrame(schema=["a", "b", "c", "d"]),
1341
    )
1342

1343

1344
def test_from_rows() -> None:
1345
    df = pl.from_records([[1, 2, "foo"], [2, 3, "bar"]], orient="row")
1346
    assert_frame_equal(
1347
        df,
1348
        pl.DataFrame(
1349
            {"column_0": [1, 2], "column_1": [2, 3], "column_2": ["foo", "bar"]}
1350
        ),
1351
    )
1352
    df = pl.from_records(
1353
        [[1, datetime.fromtimestamp(100)], [2, datetime.fromtimestamp(2398754908)]],
1354
        schema_overrides={"column_0": pl.UInt32},
1355
        orient="row",
1356
    )
1357
    assert df.dtypes == [pl.UInt32, pl.Datetime]
1358

1359
    # auto-inference with same num rows/cols
1360
    data = [(1, 2, "foo"), (2, 3, "bar"), (3, 4, "baz")]
1361
    df = pl.from_records(data, orient="row")
1362
    assert data == df.rows()
1363

1364

1365
@pytest.mark.parametrize(
1366
    "records",
1367
    [
1368
        [
1369
            {"id": 1, "value": 100, "_meta": "a"},
1370
            {"id": 2, "value": 101, "_meta": "b"},
1371
        ],
1372
        [
1373
            None,
1374
            {"id": 1, "value": 100, "_meta": "a"},
1375
            {"id": 2, "value": 101, "_meta": "b"},
1376
        ],
1377
        [
1378
            {"id": 1, "value": 100, "_meta": "a"},
1379
            {"id": 2, "value": 101, "_meta": "b"},
1380
            None,
1381
        ],
1382
        [
1383
            MappingObject(id=1, value=100, _meta="a"),
1384
            MappingObject(id=2, value=101, _meta="b"),
1385
        ],
1386
        [
1387
            None,
1388
            MappingObject(id=1, value=100, _meta="a"),
1389
            MappingObject(id=2, value=101, _meta="b"),
1390
        ],
1391
        [
1392
            MappingObject(id=1, value=100, _meta="a"),
1393
            MappingObject(id=2, value=101, _meta="b"),
1394
            None,
1395
        ],
1396
    ],
1397
)
1398
def test_from_rows_of_dicts(records: list[dict[str, Any]]) -> None:
1399
    for df_init in (pl.from_dicts, pl.DataFrame):
1400
        df1 = df_init(records).remove(pl.col("id").is_null())
1401
        assert df1.rows() == [(1, 100, "a"), (2, 101, "b")]
1402

1403
        overrides = {
1404
            "id": pl.Int16,
1405
            "value": pl.Int32,
1406
        }
1407
        df2 = df_init(records, schema_overrides=overrides).remove(
1408
            pl.col("id").is_null()
1409
        )
1410
        assert df2.rows() == [(1, 100, "a"), (2, 101, "b")]
1411
        assert df2.schema == {"id": pl.Int16, "value": pl.Int32, "_meta": pl.String}
1412

1413
        df3 = df_init(records, schema=overrides).remove(pl.col("id").is_null())
1414
        assert df3.rows() == [(1, 100), (2, 101)]
1415
        assert df3.schema == {"id": pl.Int16, "value": pl.Int32}
1416

1417
        # explicitly check "anyvalue" conversion for dict/mapping dtypes
1418
        py_s = PySeries.new_from_any_values("s", records, True)
1419
        assert py_s.dtype() == pl.Struct(
1420
            {
1421
                "id": pl.Int64,
1422
                "value": pl.Int64,
1423
                "_meta": pl.String,
1424
            }
1425
        )
1426

1427

1428
def test_from_records_with_schema_overrides_12032() -> None:
1429
    # the 'id' fields contains an int value that exceeds Int64 and doesn't have an exact
1430
    # Float64 representation; confirm that the override is applied *during* inference,
1431
    # not as a post-inference cast, so we maintain the accuracy of the original value.
1432
    rec = [
1433
        {"id": 9187643043065364490, "x": 333, "y": None},
1434
        {"id": 9223671840084328467, "x": 666.5, "y": 1698177261953686},
1435
        {"id": 9187643043065364505, "x": 999, "y": 9223372036854775807},
1436
    ]
1437
    df = pl.from_records(rec, schema_overrides={"x": pl.Float32, "id": pl.UInt64})
1438
    assert df.schema == OrderedDict(
1439
        [
1440
            ("id", pl.UInt64),
1441
            ("x", pl.Float32),
1442
            ("y", pl.Int64),
1443
        ]
1444
    )
1445
    assert rec == df.rows(named=True)
1446

1447

1448
def test_from_large_uint64_misc() -> None:
1449
    uint_data = [[9187643043065364490, 9223671840084328467, 9187643043065364505]]
1450

1451
    df = pl.DataFrame(uint_data, orient="col", schema_overrides={"column_0": pl.UInt64})
1452
    assert df["column_0"].dtype == pl.UInt64
1453
    assert df["column_0"].to_list() == uint_data[0]
1454

1455
    for overrides in ({}, {"column_1": pl.UInt64}):
1456
        df = pl.DataFrame(
1457
            uint_data,
1458
            orient="row",
1459
            schema_overrides=overrides,
1460
        )
1461
        assert df.schema == OrderedDict(
1462
            [
1463
                ("column_0", pl.Int64),
1464
                ("column_1", pl.Int128 if overrides == {} else pl.UInt64),
1465
                ("column_2", pl.Int64),
1466
            ]
1467
        )
1468
        assert df.row(0) == tuple(uint_data[0])
1469

1470

1471
def test_repeat_by_unequal_lengths_panic() -> None:
1472
    df = pl.DataFrame(
1473
        {
1474
            "a": ["x", "y", "z"],
1475
        }
1476
    )
1477
    with pytest.raises(ShapeError):
1478
        df.select(pl.col("a").repeat_by(pl.Series([2, 2])))
1479

1480

1481
@pytest.mark.parametrize(
1482
    ("value", "values_expect"),
1483
    [
1484
        (1.2, [[1.2], [1.2, 1.2], [1.2, 1.2, 1.2]]),
1485
        (True, [[True], [True, True], [True, True, True]]),
1486
        ("x", [["x"], ["x", "x"], ["x", "x", "x"]]),
1487
        (b"a", [[b"a"], [b"a", b"a"], [b"a", b"a", b"a"]]),
1488
    ],
1489
)
1490
def test_repeat_by_broadcast_left(
1491
    value: float | bool | str, values_expect: list[list[float | bool | str]]
1492
) -> None:
1493
    df = pl.DataFrame(
1494
        {
1495
            "n": [1, 2, 3],
1496
        }
1497
    )
1498
    expected = pl.DataFrame({"values": values_expect})
1499
    result = df.select(pl.lit(value).repeat_by(pl.col("n")).alias("values"))
1500
    assert_frame_equal(result, expected)
1501

1502

1503
@pytest.mark.parametrize(
1504
    ("a", "a_expected"),
1505
    [
1506
        ([1.2, 2.2, 3.3], [[1.2, 1.2, 1.2], [2.2, 2.2, 2.2], [3.3, 3.3, 3.3]]),
1507
        ([True, False], [[True, True, True], [False, False, False]]),
1508
        (["x", "y", "z"], [["x", "x", "x"], ["y", "y", "y"], ["z", "z", "z"]]),
1509
        (
1510
            [b"a", b"b", b"c"],
1511
            [[b"a", b"a", b"a"], [b"b", b"b", b"b"], [b"c", b"c", b"c"]],
1512
        ),
1513
    ],
1514
)
1515
def test_repeat_by_broadcast_right(
1516
    a: list[float | bool | str], a_expected: list[list[float | bool | str]]
1517
) -> None:
1518
    df = pl.DataFrame(
1519
        {
1520
            "a": a,
1521
        }
1522
    )
1523
    expected = pl.DataFrame({"a": a_expected})
1524
    result = df.select(pl.col("a").repeat_by(3))
1525
    assert_frame_equal(result, expected)
1526
    result = df.select(pl.col("a").repeat_by(pl.lit(3)))
1527
    assert_frame_equal(result, expected)
1528

1529

1530
@pytest.mark.parametrize(
1531
    ("a", "a_expected"),
1532
    [
1533
        (["foo", "bar"], [["foo", "foo"], ["bar", "bar", "bar"]]),
1534
        ([1, 2], [[1, 1], [2, 2, 2]]),
1535
        ([True, False], [[True, True], [False, False, False]]),
1536
        (
1537
            [b"a", b"b"],
1538
            [[b"a", b"a"], [b"b", b"b", b"b"]],
1539
        ),
1540
    ],
1541
)
1542
def test_repeat_by(
1543
    a: list[float | bool | str], a_expected: list[list[float | bool | str]]
1544
) -> None:
1545
    df = pl.DataFrame({"a": a, "n": [2, 3]})
1546
    expected = pl.DataFrame({"a": a_expected})
1547
    result = df.select(pl.col("a").repeat_by("n"))
1548
    assert_frame_equal(result, expected)
1549

1550

1551
def test_join_dates() -> None:
1552
    dts_in = pl.datetime_range(
1553
        datetime(2021, 6, 24),
1554
        datetime(2021, 6, 24, 10, 0, 0),
1555
        interval=timedelta(hours=1),
1556
        closed="left",
1557
        eager=True,
1558
    )
1559
    dts = (
1560
        dts_in.cast(int)
1561
        .map_elements(lambda x: x + np.random.randint(1_000 * 60, 60_000 * 60))
1562
        .cast(pl.Datetime)
1563
    )
1564

1565
    # some df with sensor id, (randomish) datetime and some value
1566
    df = pl.DataFrame(
1567
        {
1568
            "sensor": ["a"] * 5 + ["b"] * 5,
1569
            "datetime": dts,
1570
            "value": [2, 3, 4, 1, 2, 3, 5, 1, 2, 3],
1571
        }
1572
    )
1573
    out = df.join(df, on="datetime")
1574
    assert out.height == df.height
1575

1576

1577
def test_asof_cross_join() -> None:
1578
    left = pl.DataFrame({"a": [-10, 5, 10], "left_val": ["a", "b", "c"]}).with_columns(
1579
        pl.col("a").set_sorted()
1580
    )
1581
    right = pl.DataFrame(
1582
        {"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}
1583
    ).with_columns(pl.col("a").set_sorted())
1584

1585
    # only test dispatch of asof join
1586
    out = left.join_asof(right, on="a")
1587
    assert out.shape == (3, 3)
1588

1589
    left.lazy().join_asof(right.lazy(), on="a").collect()
1590
    assert out.shape == (3, 3)
1591

1592
    # only test dispatch of cross join
1593
    out = left.join(right, how="cross")
1594
    assert out.shape == (15, 4)
1595

1596
    left.lazy().join(right.lazy(), how="cross").collect()
1597
    assert out.shape == (15, 4)
1598

1599

1600
def test_join_bad_input_type() -> None:
1601
    left = pl.DataFrame({"a": [1, 2, 3]})
1602
    right = pl.DataFrame({"a": [1, 2, 3]})
1603

1604
    with pytest.raises(
1605
        TypeError,
1606
        match="expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",
1607
    ):
1608
        left.join(right.lazy(), on="a")  # type: ignore[arg-type]
1609

1610
    with pytest.raises(
1611
        TypeError,
1612
        match="expected `other` .*to be a 'DataFrame'.* not 'Series'",
1613
    ):
1614
        left.join(pl.Series([1, 2, 3]), on="a")  # type: ignore[arg-type]
1615

1616
    class DummyDataFrameSubclass(pl.DataFrame):
1617
        pass
1618

1619
    right = DummyDataFrameSubclass(right)
1620

1621
    left.join(right, on="a")
1622

1623

1624
def test_join_where() -> None:
1625
    east = pl.DataFrame(
1626
        {
1627
            "id": [100, 101, 102],
1628
            "dur": [120, 140, 160],
1629
            "rev": [12, 14, 16],
1630
            "cores": [2, 8, 4],
1631
        }
1632
    )
1633
    west = pl.DataFrame(
1634
        {
1635
            "t_id": [404, 498, 676, 742],
1636
            "time": [90, 130, 150, 170],
1637
            "cost": [9, 13, 15, 16],
1638
            "cores": [4, 2, 1, 4],
1639
        }
1640
    )
1641
    out = east.join_where(
1642
        west,
1643
        pl.col("dur") < pl.col("time"),
1644
        pl.col("rev") < pl.col("cost"),
1645
    )
1646

1647
    expected = pl.DataFrame(
1648
        {
1649
            "id": [100, 100, 100, 101, 101],
1650
            "dur": [120, 120, 120, 140, 140],
1651
            "rev": [12, 12, 12, 14, 14],
1652
            "cores": [2, 2, 2, 8, 8],
1653
            "t_id": [498, 676, 742, 676, 742],
1654
            "time": [130, 150, 170, 150, 170],
1655
            "cost": [13, 15, 16, 15, 16],
1656
            "cores_right": [2, 1, 4, 1, 4],
1657
        }
1658
    )
1659

1660
    assert_frame_equal(out, expected)
1661

1662

1663
def test_join_where_bad_input_type() -> None:
1664
    east = pl.DataFrame(
1665
        {
1666
            "id": [100, 101, 102],
1667
            "dur": [120, 140, 160],
1668
            "rev": [12, 14, 16],
1669
            "cores": [2, 8, 4],
1670
        }
1671
    )
1672
    west = pl.DataFrame(
1673
        {
1674
            "t_id": [404, 498, 676, 742],
1675
            "time": [90, 130, 150, 170],
1676
            "cost": [9, 13, 15, 16],
1677
            "cores": [4, 2, 1, 4],
1678
        }
1679
    )
1680
    with pytest.raises(
1681
        TypeError,
1682
        match="expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",
1683
    ):
1684
        east.join_where(
1685
            west.lazy(),  # type: ignore[arg-type]
1686
            pl.col("dur") < pl.col("time"),
1687
            pl.col("rev") < pl.col("cost"),
1688
        )
1689

1690
    with pytest.raises(
1691
        TypeError,
1692
        match="expected `other` .*to be a 'DataFrame'.* not 'Series'",
1693
    ):
1694
        east.join_where(
1695
            pl.Series(west),  # type: ignore[arg-type]
1696
            pl.col("dur") < pl.col("time"),
1697
            pl.col("rev") < pl.col("cost"),
1698
        )
1699

1700
    class DummyDataFrameSubclass(pl.DataFrame):
1701
        pass
1702

1703
    west = DummyDataFrameSubclass(west)
1704

1705
    east.join_where(
1706
        west,
1707
        pl.col("dur") < pl.col("time"),
1708
        pl.col("rev") < pl.col("cost"),
1709
    )
1710

1711

1712
def test_str_concat() -> None:
1713
    df = pl.DataFrame(
1714
        {
1715
            "nrs": [1, 2, 3, 4],
1716
            "name": ["ham", "spam", "foo", None],
1717
        }
1718
    )
1719
    out = df.with_columns((pl.lit("Dr. ") + pl.col("name")).alias("graduated_name"))
1720
    assert out["graduated_name"][0] == "Dr. ham"
1721
    assert out["graduated_name"][1] == "Dr. spam"
1722

1723

1724
def test_dot_product() -> None:
1725
    df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [2, 2, 2, 2]})
1726

1727
    assert df["a"].dot(df["b"]) == 20
1728
    assert typing.cast("int", df.select([pl.col("a").dot("b")])[0, "a"]) == 20
1729

1730
    result = pl.Series([1, 2, 3]) @ pl.Series([4, 5, 6])
1731
    assert isinstance(result, int)
1732
    assert result == 32
1733

1734
    result = pl.Series([1, 2, 3]) @ pl.Series([4.0, 5.0, 6.0])
1735
    assert isinstance(result, float)
1736
    assert result == 32.0
1737

1738
    result = pl.Series([1.0, 2.0, 3.0]) @ pl.Series([4.0, 5.0, 6.0])
1739
    assert isinstance(result, float)
1740
    assert result == 32.0
1741

1742
    with pytest.raises(
1743
        InvalidOperationError, match="`dot` operation not supported for dtype `bool`"
1744
    ):
1745
        pl.Series([True, False, False, True]) @ pl.Series([4, 5, 6, 7])
1746

1747
    with pytest.raises(
1748
        InvalidOperationError, match="`dot` operation not supported for dtype `str`"
1749
    ):
1750
        pl.Series([1, 2, 3, 4]) @ pl.Series(["True", "False", "False", "True"])
1751

1752

1753
def test_hash_rows() -> None:
1754
    df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [2, 2, 2, 2]})
1755
    assert df.hash_rows().dtype == pl.UInt64
1756
    assert df["a"].hash().dtype == pl.UInt64
1757
    assert df.select([pl.col("a").hash().alias("foo")])["foo"].dtype == pl.UInt64
1758

1759

1760
def test_reproducible_hash_with_seeds() -> None:
1761
    """
1762
    Test the reproducibility of DataFrame.hash_rows, Series.hash, and Expr.hash.
1763

1764
    cf. issue #3966, hashes must always be reproducible across sessions when using
1765
    the same seeds.
1766
    """
1767
    df = pl.DataFrame({"s": [1234, None, 5678]})
1768
    seeds = (11, 22, 33, 44)
1769
    expected = pl.Series(
1770
        "s",
1771
        [10832467230526607564, 3044502640115867787, 17228373233104406792],
1772
        dtype=pl.UInt64,
1773
    )
1774
    result = df.hash_rows(*seeds)
1775
    assert_series_equal(expected, result, check_names=False, check_exact=True)
1776
    result = df["s"].hash(*seeds)
1777
    assert_series_equal(expected, result, check_names=False, check_exact=True)
1778
    result = df.select([pl.col("s").hash(*seeds)])["s"]
1779
    assert_series_equal(expected, result, check_names=False, check_exact=True)
1780

1781

1782
@pytest.mark.slow
1783
@pytest.mark.parametrize(
1784
    "e",
1785
    [
1786
        pl.int_range(1_000_000),
1787
        # Test code path for null_count > 0
1788
        pl.when(pl.int_range(1_000_000) != 0).then(pl.int_range(1_000_000)),
1789
    ],
1790
)
1791
def test_hash_collision_multiple_columns_equal_values_15390(e: pl.Expr) -> None:
1792
    df = pl.select(e.alias("a"))
1793

1794
    for n_columns in (1, 2, 3, 4):
1795
        s = df.select(pl.col("a").alias(f"x{i}") for i in range(n_columns)).hash_rows()
1796

1797
        vc = s.sort().value_counts(sort=True)
1798
        max_bucket_size = vc["count"][0]
1799

1800
        assert max_bucket_size == 1
1801

1802

1803
@pytest.mark.may_fail_auto_streaming  # Python objects not yet supported in row encoding
1804
@pytest.mark.may_fail_cloud
1805
def test_hashing_on_python_objects() -> None:
1806
    # see if we can do a group_by, drop_duplicates on a DataFrame with objects.
1807
    # this requires that the hashing and aggregations are done on python objects
1808

1809
    df = pl.DataFrame({"a": [1, 1, 3, 4], "b": [1, 1, 2, 2]})
1810

1811
    class Foo:
1812
        def __hash__(self) -> int:
1813
            return 0
1814

1815
        def __eq__(self, other: object) -> bool:
1816
            return True
1817

1818
    df = df.with_columns(pl.col("a").map_elements(lambda x: Foo()).alias("foo"))
1819
    assert df.group_by(["foo"]).first().shape == (1, 3)
1820
    assert df.unique().shape == (3, 3)
1821

1822

1823
def test_unique_unit_rows() -> None:
1824
    df = pl.DataFrame({"a": [1], "b": [None]}, schema={"a": pl.Int64, "b": pl.Float32})
1825

1826
    # 'unique' one-row frame should be equal to the original frame
1827
    assert_frame_equal(df, df.unique(subset="a"))
1828
    for col in df.columns:
1829
        assert df.n_unique(subset=[col]) == 1
1830

1831

1832
def test_panic() -> None:
1833
    # may contain some tests that yielded a panic in polars or pl_arrow
1834
    # https://github.com/pola-rs/polars/issues/1110
1835
    a = pl.DataFrame(
1836
        {
1837
            "col1": ["a"] * 500 + ["b"] * 500,
1838
        }
1839
    )
1840
    a.filter(pl.col("col1") != "b")
1841

1842

1843
def test_horizontal_agg() -> None:
1844
    df = pl.DataFrame({"a": [1, None, 3], "b": [1, 2, 3]})
1845

1846
    assert_series_equal(df.sum_horizontal(), pl.Series("sum", [2, 2, 6]))
1847
    assert_series_equal(
1848
        df.sum_horizontal(ignore_nulls=False), pl.Series("sum", [2, None, 6])
1849
    )
1850
    assert_series_equal(
1851
        df.mean_horizontal(ignore_nulls=False), pl.Series("mean", [1.0, None, 3.0])
1852
    )
1853

1854

1855
def test_slicing() -> None:
1856
    # https://github.com/pola-rs/polars/issues/1322
1857
    n = 20
1858

1859
    df = pl.DataFrame(
1860
        {
1861
            "d": ["u", "u", "d", "c", "c", "d", "d"] * n,
1862
            "v1": [None, "help", None, None, None, None, None] * n,
1863
        }
1864
    )
1865

1866
    assert (df.filter(pl.col("d") != "d").select([pl.col("v1").unique()])).shape == (
1867
        2,
1868
        1,
1869
    )
1870

1871

1872
def test_group_by_cat_list() -> None:
1873
    grouped = (
1874
        pl.DataFrame(
1875
            [
1876
                pl.Series("str_column", ["a", "b", "b", "a", "b"]),
1877
                pl.Series("int_column", [1, 1, 2, 2, 3]),
1878
            ]
1879
        )
1880
        .with_columns(pl.col("str_column").cast(pl.Categorical).alias("cat_column"))
1881
        .group_by("int_column", maintain_order=True)
1882
        .agg([pl.col("cat_column")])["cat_column"]
1883
    )
1884

1885
    out = grouped.explode()
1886
    assert out.dtype == pl.Categorical
1887
    assert out[0] == "a"
1888

1889

1890
def test_group_by_agg_n_unique_floats() -> None:
1891
    # tests proper dispatch
1892
    df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1893

1894
    for dtype in [pl.Float32, pl.Float64]:
1895
        out = df.group_by("a", maintain_order=True).agg(
1896
            [pl.col("b").cast(dtype).n_unique()]
1897
        )
1898
        assert out["b"].to_list() == [2, 1]
1899

1900

1901
def test_group_by_agg_n_unique_empty_group_idx_path() -> None:
1902
    df = pl.DataFrame(
1903
        {
1904
            "key": [1, 1, 1, 2, 2, 2],
1905
            "value": [1, 2, 3, 4, 5, 6],
1906
            "filt": [True, True, True, False, False, False],
1907
        }
1908
    )
1909
    out = df.group_by("key", maintain_order=True).agg(
1910
        pl.col("value").filter("filt").n_unique().alias("n_unique")
1911
    )
1912
    expected = pl.DataFrame(
1913
        {
1914
            "key": [1, 2],
1915
            "n_unique": pl.Series([3, 0], dtype=pl.UInt32),
1916
        }
1917
    )
1918
    assert_frame_equal(out, expected)
1919

1920

1921
def test_group_by_agg_n_unique_empty_group_slice_path() -> None:
1922
    df = pl.DataFrame(
1923
        {
1924
            "key": [1, 1, 1, 2, 2, 2],
1925
            "value": [1, 2, 3, 4, 5, 6],
1926
            "filt": [False, False, False, False, False, False],
1927
        }
1928
    )
1929
    out = df.group_by("key", maintain_order=True).agg(
1930
        pl.col("value").filter("filt").n_unique().alias("n_unique")
1931
    )
1932
    expected = pl.DataFrame(
1933
        {
1934
            "key": [1, 2],
1935
            "n_unique": pl.Series([0, 0], dtype=pl.UInt32),
1936
        }
1937
    )
1938
    assert_frame_equal(out, expected)
1939

1940

1941
def test_select_by_dtype(df: pl.DataFrame) -> None:
1942
    out = df.select(pl.col(pl.String))
1943
    assert out.columns == ["strings", "strings_nulls"]
1944
    out = df.select(pl.col([pl.String, pl.Boolean]))
1945
    assert out.columns == ["bools", "bools_nulls", "strings", "strings_nulls"]
1946
    out = df.select(pl.col(INTEGER_DTYPES))
1947
    assert out.columns == ["int", "int_nulls"]
1948

1949
    out = df.select(ints=pl.struct(pl.col(INTEGER_DTYPES)))
1950
    assert out.schema == {
1951
        "ints": pl.Struct([pl.Field("int", pl.Int64), pl.Field("int_nulls", pl.Int64)])
1952
    }
1953

1954

1955
def test_with_row_index() -> None:
1956
    df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1957

1958
    out = df.with_row_index()
1959
    assert out["index"].to_list() == [0, 1, 2]
1960

1961
    out = df.lazy().with_row_index().collect()
1962
    assert out["index"].to_list() == [0, 1, 2]
1963

1964

1965
def test_with_row_index_bad_offset() -> None:
1966
    df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1967

1968
    with pytest.raises(ValueError, match="cannot be negative"):
1969
        df.with_row_index(offset=-1)
1970
    with pytest.raises(
1971
        ValueError, match="cannot be greater than the maximum index value"
1972
    ):
1973
        df.with_row_index(offset=2**32)
1974

1975

1976
def test_with_row_index_bad_offset_lazy() -> None:
1977
    lf = pl.LazyFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1978

1979
    with pytest.raises(ValueError, match="cannot be negative"):
1980
        lf.with_row_index(offset=-1)
1981
    with pytest.raises(
1982
        ValueError, match="cannot be greater than the maximum index value"
1983
    ):
1984
        lf.with_row_index(offset=2**32)
1985

1986

1987
def test_with_row_count_deprecated() -> None:
1988
    df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1989

1990
    with pytest.deprecated_call():
1991
        out = df.with_row_count()
1992
    assert out["row_nr"].to_list() == [0, 1, 2]
1993

1994
    with pytest.deprecated_call():
1995
        out = df.lazy().with_row_count().collect()
1996
    assert out["row_nr"].to_list() == [0, 1, 2]
1997

1998

1999
@pytest.mark.may_fail_cloud
2000
def test_filter_with_all_expansion() -> None:
2001
    df = pl.DataFrame(
2002
        {
2003
            "b": [1, 2, None],
2004
            "c": [1, 2, None],
2005
            "a": [None, None, None],
2006
        }
2007
    )
2008
    out = df.filter(~pl.fold(True, lambda acc, s: acc & s.is_null(), pl.all()))
2009
    assert out.shape == (2, 3)
2010

2011

2012
# TODO: investigate this discrepancy in auto streaming
2013
@pytest.mark.may_fail_auto_streaming
2014
@pytest.mark.may_fail_cloud
2015
def test_extension() -> None:
2016
    class Foo:
2017
        def __init__(self, value: Any) -> None:
2018
            self.value = value
2019

2020
        def __repr__(self) -> str:
2021
            return f"foo({self.value})"
2022

2023
    foos = [Foo(1), Foo(2), Foo(3)]
2024

2025
    # foos and sys.getrefcount both have a reference.
2026
    base_count = 2
2027

2028
    # We compute the refcount on a separate line otherwise pytest's assert magic
2029
    # might add reference counts.
2030
    rc = sys.getrefcount(foos[0])
2031
    assert rc == base_count
2032

2033
    df = pl.DataFrame({"groups": [1, 1, 2], "a": foos})
2034
    rc = sys.getrefcount(foos[0])
2035
    assert rc == base_count + 1
2036
    del df
2037
    rc = sys.getrefcount(foos[0])
2038
    assert rc == base_count
2039

2040
    df = pl.DataFrame({"groups": [1, 1, 2], "a": foos})
2041
    rc = sys.getrefcount(foos[0])
2042
    assert rc == base_count + 1
2043

2044
    out = df.group_by("groups", maintain_order=True).agg(pl.col("a").alias("a"))
2045
    rc = sys.getrefcount(foos[0])
2046
    assert rc == base_count + 2
2047
    s = out["a"].list.explode()
2048
    rc = sys.getrefcount(foos[0])
2049
    assert rc == base_count + 3
2050
    del s
2051
    rc = sys.getrefcount(foos[0])
2052
    assert rc == base_count + 2
2053

2054
    assert out["a"].list.explode().to_list() == foos
2055
    rc = sys.getrefcount(foos[0])
2056
    assert rc == base_count + 2
2057
    del out
2058
    rc = sys.getrefcount(foos[0])
2059
    assert rc == base_count + 1
2060
    del df
2061
    rc = sys.getrefcount(foos[0])
2062
    assert rc == base_count
2063

2064

2065
@pytest.mark.parametrize("name", [None, "n", ""])
2066
def test_group_by_order_dispatch(name: str | None) -> None:
2067
    df = pl.DataFrame({"x": list("bab"), "y": range(3)})
2068
    lf = df.lazy()
2069

2070
    result = df.group_by("x", maintain_order=True).len(name=name)
2071
    lazy_result = lf.group_by("x").len(name=name).sort(by="x", descending=True)
2072

2073
    name = "len" if name is None else name
2074
    expected = pl.DataFrame(
2075
        data={"x": ["b", "a"], name: [2, 1]},
2076
        schema_overrides={name: pl.UInt32},
2077
    )
2078
    assert_frame_equal(result, expected)
2079
    assert_frame_equal(lazy_result.collect(), expected)
2080

2081
    result = df.group_by("x", maintain_order=True).all()
2082
    expected = pl.DataFrame({"x": ["b", "a"], "y": [[0, 2], [1]]})
2083
    assert_frame_equal(result, expected)
2084

2085

2086
def test_partitioned_group_by_order() -> None:
2087
    # check if group ordering is maintained.
2088
    # we only have 30 groups, so this triggers a partitioned group by
2089
    df = pl.DataFrame({"x": [chr(v) for v in range(33, 63)], "y": range(30)})
2090
    out = df.group_by("x", maintain_order=True).agg(pl.all().implode())
2091
    assert_series_equal(out["x"], df["x"])
2092

2093

2094
def test_schema() -> None:
2095
    df = pl.DataFrame(
2096
        {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
2097
    )
2098
    expected = {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String}
2099
    assert df.schema == expected
2100

2101

2102
def test_schema_equality() -> None:
2103
    lf = pl.LazyFrame({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]})
2104
    lf_rev = lf.select("bar", "foo")
2105

2106
    assert lf.collect_schema() != lf_rev.collect_schema()
2107
    assert lf.collect().schema != lf_rev.collect().schema
2108

2109

2110
def test_df_schema_unique() -> None:
2111
    df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
2112
    with pytest.raises(DuplicateError):
2113
        df.columns = ["a", "a"]
2114

2115
    with pytest.raises(DuplicateError):
2116
        df.rename({"b": "a"})
2117

2118

2119
def test_empty_projection() -> None:
2120
    empty_df = pl.DataFrame({"a": [1, 2], "b": [3, 4]}).select([])
2121
    assert empty_df.rows() == []
2122
    assert empty_df.schema == {}
2123
    assert empty_df.shape == (0, 0)
2124

2125

2126
def test_fill_null() -> None:
2127
    df = pl.DataFrame({"a": [1, 2], "b": [3, None]})
2128
    assert_frame_equal(df.fill_null(4), pl.DataFrame({"a": [1, 2], "b": [3, 4]}))
2129
    assert_frame_equal(
2130
        df.fill_null(strategy="max"), pl.DataFrame({"a": [1, 2], "b": [3, 3]})
2131
    )
2132

2133
    # string and list data
2134
    # string goes via binary
2135
    df = pl.DataFrame(
2136
        {
2137
            "c": [
2138
                ["Apple", "Orange"],
2139
                ["Apple", "Orange"],
2140
                None,
2141
                ["Carrot"],
2142
                None,
2143
                None,
2144
            ],
2145
            "b": ["Apple", "Orange", None, "Carrot", None, None],
2146
        }
2147
    )
2148

2149
    assert df.select(
2150
        pl.all().fill_null(strategy="forward").name.suffix("_forward"),
2151
        pl.all().fill_null(strategy="backward").name.suffix("_backward"),
2152
    ).to_dict(as_series=False) == {
2153
        "c_forward": [
2154
            ["Apple", "Orange"],
2155
            ["Apple", "Orange"],
2156
            ["Apple", "Orange"],
2157
            ["Carrot"],
2158
            ["Carrot"],
2159
            ["Carrot"],
2160
        ],
2161
        "b_forward": ["Apple", "Orange", "Orange", "Carrot", "Carrot", "Carrot"],
2162
        "c_backward": [
2163
            ["Apple", "Orange"],
2164
            ["Apple", "Orange"],
2165
            ["Carrot"],
2166
            ["Carrot"],
2167
            None,
2168
            None,
2169
        ],
2170
        "b_backward": ["Apple", "Orange", "Carrot", "Carrot", None, None],
2171
    }
2172
    # categoricals
2173
    df = pl.DataFrame(pl.Series("cat", ["a", None], dtype=pl.Categorical))
2174
    s = df.select(pl.col("cat").fill_null(strategy="forward"))["cat"]
2175
    assert s.dtype == pl.Categorical
2176
    assert s.to_list() == ["a", "a"]
2177

2178

2179
def test_fill_nan() -> None:
2180
    df = pl.DataFrame({"a": [1, 2], "b": [3.0, float("nan")]})
2181
    assert_frame_equal(
2182
        df.fill_nan(4),
2183
        pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}),
2184
    )
2185
    assert_frame_equal(
2186
        df.fill_nan(None),
2187
        pl.DataFrame({"a": [1, 2], "b": [3.0, None]}),
2188
    )
2189
    assert df["b"].fill_nan(5.0).to_list() == [3.0, 5.0]
2190
    df = pl.DataFrame(
2191
        {
2192
            "a": [1.0, np.nan, 3.0],
2193
            "b": [datetime(1, 2, 2), datetime(2, 2, 2), datetime(3, 2, 2)],
2194
        }
2195
    )
2196
    assert df.fill_nan(2.0).dtypes == [pl.Float64, pl.Datetime]
2197

2198

2199
#
2200
def test_forward_fill() -> None:
2201
    df = pl.DataFrame({"a": [1.0, None, 3.0]})
2202
    fill = df.select(pl.col("a").forward_fill())["a"]
2203
    assert_series_equal(fill, pl.Series("a", [1, 1, 3]).cast(pl.Float64))
2204

2205
    df = pl.DataFrame({"a": [None, 1, None]})
2206
    fill = df.select(pl.col("a").forward_fill())["a"]
2207
    assert_series_equal(fill, pl.Series("a", [None, 1, 1]).cast(pl.Int64))
2208

2209

2210
def test_backward_fill() -> None:
2211
    df = pl.DataFrame({"a": [1.0, None, 3.0]})
2212
    fill = df.select(pl.col("a").backward_fill())["a"]
2213
    assert_series_equal(fill, pl.Series("a", [1, 3, 3]).cast(pl.Float64))
2214

2215
    df = pl.DataFrame({"a": [None, 1, None]})
2216
    fill = df.select(pl.col("a").backward_fill())["a"]
2217
    assert_series_equal(fill, pl.Series("a", [1, 1, None]).cast(pl.Int64))
2218

2219

2220
def test_shrink_to_fit() -> None:
2221
    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]})
2222

2223
    assert df.shrink_to_fit(in_place=True) is df
2224
    assert df.shrink_to_fit(in_place=False) is not df
2225
    assert_frame_equal(df.shrink_to_fit(in_place=False), df)
2226

2227

2228
def test_add_string() -> None:
2229
    df = pl.DataFrame({"a": ["hi", "there"], "b": ["hello", "world"]})
2230
    expected = pl.DataFrame(
2231
        {"a": ["hi hello", "there hello"], "b": ["hello hello", "world hello"]}
2232
    )
2233
    assert_frame_equal((df + " hello"), expected)
2234

2235
    expected = pl.DataFrame(
2236
        {"a": ["hello hi", "hello there"], "b": ["hello hello", "hello world"]}
2237
    )
2238
    assert_frame_equal(("hello " + df), expected)
2239

2240

2241
def test_df_broadcast() -> None:
2242
    df = pl.DataFrame({"a": [1, 2, 3]}, schema_overrides={"a": pl.UInt8})
2243
    out = df.with_columns(pl.lit(pl.Series("s", [[1, 2]])).first())
2244
    assert out.shape == (3, 2)
2245
    assert out.schema == {"a": pl.UInt8, "s": pl.List(pl.Int64)}
2246
    assert out.rows() == [(1, [1, 2]), (2, [1, 2]), (3, [1, 2])]
2247

2248

2249
@pytest.mark.may_fail_cloud  # not a lazyframe method
2250
def test_product() -> None:
2251
    df = pl.DataFrame(
2252
        {
2253
            "int": [1, 2, 3],
2254
            "flt": [-1.0, 12.0, 9.0],
2255
            "bool_0": [True, False, True],
2256
            "bool_1": [True, True, True],
2257
            "str": ["a", "b", "c"],
2258
        },
2259
        schema_overrides={
2260
            "int": pl.UInt16,
2261
            "flt": pl.Float32,
2262
        },
2263
    )
2264
    out = df.product()
2265
    expected = pl.DataFrame(
2266
        {"int": [6], "flt": [-108.0], "bool_0": [0], "bool_1": [1], "str": [None]}
2267
    )
2268
    assert_frame_not_equal(out, expected, check_dtypes=True)
2269
    assert_frame_equal(out, expected, check_dtypes=False)
2270

2271

2272
def test_first_last_nth_expressions(fruits_cars: pl.DataFrame) -> None:
2273
    df = fruits_cars
2274
    out = df.select(pl.first())
2275
    assert out.columns == ["A"]
2276

2277
    out = df.select(pl.last())
2278
    assert out.columns == ["cars"]
2279

2280
    out = df.select(pl.nth(0))
2281
    assert out.columns == ["A"]
2282

2283
    out = df.select(pl.nth(1))
2284
    assert out.columns == ["fruits"]
2285

2286
    out = df.select(pl.nth(-2))
2287
    assert out.columns == ["B"]
2288

2289

2290
def test_is_between(fruits_cars: pl.DataFrame) -> None:
2291
    result = fruits_cars.select(pl.col("A").is_between(2, 4)).to_series()
2292
    assert_series_equal(result, pl.Series("A", [False, True, True, True, False]))
2293

2294
    result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="none")).to_series()
2295
    assert_series_equal(result, pl.Series("A", [False, False, True, False, False]))
2296

2297
    result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="both")).to_series()
2298
    assert_series_equal(result, pl.Series("A", [False, True, True, True, False]))
2299

2300
    result = fruits_cars.select(
2301
        pl.col("A").is_between(2, 4, closed="right")
2302
    ).to_series()
2303
    assert_series_equal(result, pl.Series("A", [False, False, True, True, False]))
2304

2305
    result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="left")).to_series()
2306
    assert_series_equal(result, pl.Series("A", [False, True, True, False, False]))
2307

2308

2309
def test_is_between_data_types() -> None:
2310
    df = pl.DataFrame(
2311
        {
2312
            "flt": [1.4, 1.2, 2.5],
2313
            "int": [2, 3, 4],
2314
            "str": ["xyz", "str", "abc"],
2315
            "date": [date(2020, 1, 1), date(2020, 2, 2), date(2020, 3, 3)],
2316
            "datetime": [
2317
                datetime(2020, 1, 1, 0, 0, 0),
2318
                datetime(2020, 1, 1, 10, 0, 0),
2319
                datetime(2020, 1, 1, 12, 0, 0),
2320
            ],
2321
            "tm": [time(10, 30), time(0, 45), time(15, 15)],
2322
        }
2323
    )
2324

2325
    # on purpose, for float and int, we pass in a mixture of bound data types
2326
    assert_series_equal(
2327
        df.select(pl.col("flt").is_between(1, 2.3))[:, 0],
2328
        pl.Series("flt", [True, True, False]),
2329
    )
2330
    assert_series_equal(
2331
        df.select(pl.col("int").is_between(1.5, 3))[:, 0],
2332
        pl.Series("int", [True, True, False]),
2333
    )
2334
    assert_series_equal(
2335
        df.select(pl.col("date").is_between(date(2019, 1, 1), date(2020, 2, 5)))[:, 0],
2336
        pl.Series("date", [True, True, False]),
2337
    )
2338
    assert_series_equal(
2339
        df.select(
2340
            pl.col("datetime").is_between(
2341
                datetime(2020, 1, 1, 5, 0, 0), datetime(2020, 1, 1, 11, 0, 0)
2342
            )
2343
        )[:, 0],
2344
        pl.Series("datetime", [False, True, False]),
2345
    )
2346
    assert_series_equal(
2347
        df.select(
2348
            pl.col("str").is_between(pl.lit("str"), pl.lit("zzz"), closed="left")
2349
        )[:, 0],
2350
        pl.Series("str", [True, True, False]),
2351
    )
2352
    assert_series_equal(
2353
        df.select(
2354
            pl.col("tm")
2355
            .is_between(time(0, 45), time(10, 30), closed="right")
2356
            .alias("tm_between")
2357
        )[:, 0],
2358
        pl.Series("tm_between", [True, False, False]),
2359
    )
2360

2361

2362
def test_empty_is_in() -> None:
2363
    df_empty_isin = pl.DataFrame({"foo": ["a", "b", "c", "d"]}).filter(
2364
        pl.col("foo").is_in([])
2365
    )
2366
    assert df_empty_isin.shape == (0, 1)
2367
    assert df_empty_isin.rows() == []
2368
    assert df_empty_isin.schema == {"foo": pl.String}
2369

2370

2371
def test_group_by_slice_expression_args() -> None:
2372
    df = pl.DataFrame({"groups": ["a"] * 10 + ["b"] * 20, "vals": range(30)})
2373

2374
    out = (
2375
        df.group_by("groups", maintain_order=True)
2376
        .agg([pl.col("vals").slice((pl.len() * 0.1).cast(int), (pl.len() // 5))])
2377
        .explode("vals")
2378
    )
2379

2380
    expected = pl.DataFrame(
2381
        {"groups": ["a", "a", "b", "b", "b", "b"], "vals": [1, 2, 12, 13, 14, 15]}
2382
    )
2383
    assert_frame_equal(out, expected)
2384

2385

2386
def test_join_suffixes() -> None:
2387
    df_a = pl.DataFrame({"A": [1], "B": [1]})
2388
    df_b = pl.DataFrame({"A": [1], "B": [1]})
2389

2390
    join_strategies: list[JoinStrategy] = ["left", "inner", "full", "cross"]
2391
    for how in join_strategies:
2392
        # no need for an assert, we error if wrong
2393
        df_a.join(df_b, on="A" if how != "cross" else None, suffix="_y", how=how)["B_y"]
2394

2395
    df_a.join_asof(df_b, on=pl.col("A").set_sorted(), suffix="_y")["B_y"]
2396

2397

2398
def test_explode_empty() -> None:
2399
    df = (
2400
        pl.DataFrame({"x": ["a", "a", "b", "b"], "y": [1, 1, 2, 2]})
2401
        .group_by("x", maintain_order=True)
2402
        .agg(pl.col("y").gather([]))
2403
    )
2404
    assert df.explode("y").to_dict(as_series=False) == {
2405
        "x": ["a", "b"],
2406
        "y": [None, None],
2407
    }
2408

2409
    df = pl.DataFrame({"x": ["1", "2", "4"], "y": [["a", "b", "c"], ["d"], []]})
2410
    assert_frame_equal(
2411
        df.explode("y"),
2412
        pl.DataFrame({"x": ["1", "1", "1", "2", "4"], "y": ["a", "b", "c", "d", None]}),
2413
    )
2414

2415
    df = pl.DataFrame(
2416
        {
2417
            "letters": ["a"],
2418
            "numbers": [[]],
2419
        }
2420
    )
2421
    assert df.explode("numbers").to_dict(as_series=False) == {
2422
        "letters": ["a"],
2423
        "numbers": [None],
2424
    }
2425

2426

2427
def test_asof_by_multiple_keys() -> None:
2428
    lhs = pl.DataFrame(
2429
        {
2430
            "a": [-20, -19, 8, 12, 14],
2431
            "by": [1, 1, 2, 2, 2],
2432
            "by2": [1, 1, 2, 2, 2],
2433
        }
2434
    )
2435

2436
    rhs = pl.DataFrame(
2437
        {
2438
            "a": [-19, -15, 3, 5, 13],
2439
            "by": [1, 1, 2, 2, 2],
2440
            "by2": [1, 1, 2, 2, 2],
2441
        }
2442
    )
2443

2444
    result = lhs.join_asof(
2445
        rhs, on=pl.col("a").set_sorted(), by=["by", "by2"], strategy="backward"
2446
    ).select(["a", "by"])
2447
    expected = pl.DataFrame({"a": [-20, -19, 8, 12, 14], "by": [1, 1, 2, 2, 2]})
2448
    assert_frame_equal(
2449
        result.group_by("by").agg("a"),
2450
        expected.group_by("by").agg("a"),
2451
        check_row_order=False,
2452
    )
2453

2454

2455
def test_asof_bad_input_type() -> None:
2456
    lhs = pl.DataFrame({"a": [1, 2, 3]})
2457
    rhs = pl.DataFrame({"a": [1, 2, 3]})
2458

2459
    with pytest.raises(
2460
        TypeError,
2461
        match="expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",
2462
    ):
2463
        lhs.join_asof(rhs.lazy(), on="a")  # type: ignore[arg-type]
2464

2465
    with pytest.raises(
2466
        TypeError,
2467
        match="expected `other` .*to be a 'DataFrame'.* not 'Series'",
2468
    ):
2469
        lhs.join_asof(pl.Series([1, 2, 3]), on="a")  # type: ignore[arg-type]
2470

2471
    class DummyDataFrameSubclass(pl.DataFrame):
2472
        pass
2473

2474
    rhs = DummyDataFrameSubclass(rhs)
2475

2476
    lhs.join_asof(rhs, on="a")
2477

2478

2479
def test_list_of_list_of_struct() -> None:
2480
    expected = [{"list_of_list_of_struct": [[{"a": 1}, {"a": 2}]]}]
2481
    pa_df = pa.Table.from_pylist(expected)
2482

2483
    df = pl.from_arrow(pa_df)
2484
    assert df.rows() == [([[{"a": 1}, {"a": 2}]],)]  # type: ignore[union-attr]
2485
    assert df.to_dicts() == expected  # type: ignore[union-attr]
2486

2487
    df = pl.from_arrow(pa_df[:0])
2488
    assert df.to_dicts() == []  # type: ignore[union-attr]
2489

2490

2491
def test_fill_null_limits() -> None:
2492
    assert pl.DataFrame(
2493
        {
2494
            "a": [1, None, None, None, 5, 6, None, None, None, 10],
2495
            "b": ["a", None, None, None, "b", "c", None, None, None, "d"],
2496
            "c": [True, None, None, None, False, True, None, None, None, False],
2497
        }
2498
    ).select(
2499
        pl.all().fill_null(strategy="forward", limit=2),
2500
        pl.all().fill_null(strategy="backward", limit=2).name.suffix("_backward"),
2501
    ).to_dict(as_series=False) == {
2502
        "a": [1, 1, 1, None, 5, 6, 6, 6, None, 10],
2503
        "b": ["a", "a", "a", None, "b", "c", "c", "c", None, "d"],
2504
        "c": [True, True, True, None, False, True, True, True, None, False],
2505
        "a_backward": [1, None, 5, 5, 5, 6, None, 10, 10, 10],
2506
        "b_backward": ["a", None, "b", "b", "b", "c", None, "d", "d", "d"],
2507
        "c_backward": [
2508
            True,
2509
            None,
2510
            False,
2511
            False,
2512
            False,
2513
            True,
2514
            None,
2515
            False,
2516
            False,
2517
            False,
2518
        ],
2519
    }
2520

2521

2522
def test_lower_bound_upper_bound(fruits_cars: pl.DataFrame) -> None:
2523
    res_expr = fruits_cars.select(pl.col("A").lower_bound())
2524
    assert res_expr.item() == -9223372036854775808
2525

2526
    res_expr = fruits_cars.select(pl.col("B").upper_bound())
2527
    assert res_expr.item() == 9223372036854775807
2528

2529
    with pytest.raises(ComputeError):
2530
        fruits_cars.select(pl.col("fruits").upper_bound())
2531

2532

2533
def test_selection_misc() -> None:
2534
    df = pl.DataFrame({"x": "abc"}, schema={"x": pl.String})
2535

2536
    # literal values (as scalar/list)
2537
    for zero in (0, [0]):
2538
        assert df.select(zero)["literal"].to_list() == [0]
2539
    assert df.select(literal=0)["literal"].to_list() == [0]
2540

2541
    # expect string values to be interpreted as cols
2542
    for x in ("x", ["x"], pl.col("x")):
2543
        assert df.select(x).rows() == [("abc",)]
2544

2545
    # string col + lit
2546
    assert df.with_columns(["x", 0]).to_dicts() == [{"x": "abc", "literal": 0}]
2547

2548

2549
def test_selection_regex_and_multicol() -> None:
2550
    test_df = pl.DataFrame(
2551
        {
2552
            "a": [1, 2, 3, 4],
2553
            "b": [5, 6, 7, 8],
2554
            "c": [9, 10, 11, 12],
2555
            "foo": [13, 14, 15, 16],
2556
        },
2557
        schema_overrides={"foo": pl.UInt8},
2558
    )
2559

2560
    # Selection only
2561
    test_df.select(
2562
        pl.col(["a", "b", "c"]).name.suffix("_list"),
2563
        pl.all().exclude("foo").name.suffix("_wild"),
2564
        pl.col("^\\w$").name.suffix("_regex"),
2565
    )
2566

2567
    # Multi * Single
2568
    assert test_df.select(pl.col(["a", "b", "c"]) * pl.col("foo")).to_dict(
2569
        as_series=False
2570
    ) == {
2571
        "a": [13, 28, 45, 64],
2572
        "b": [65, 84, 105, 128],
2573
        "c": [117, 140, 165, 192],
2574
    }
2575
    assert test_df.select(pl.all().exclude("foo") * pl.col("foo")).to_dict(
2576
        as_series=False
2577
    ) == {
2578
        "a": [13, 28, 45, 64],
2579
        "b": [65, 84, 105, 128],
2580
        "c": [117, 140, 165, 192],
2581
    }
2582

2583
    assert test_df.select(pl.col("^\\w$") * pl.col("foo")).to_dict(as_series=False) == {
2584
        "a": [13, 28, 45, 64],
2585
        "b": [65, 84, 105, 128],
2586
        "c": [117, 140, 165, 192],
2587
    }
2588

2589
    # Multi * Multi
2590
    result = test_df.select(pl.col(["a", "b", "c"]) * pl.col(["a", "b", "c"]))
2591
    expected = {"a": [1, 4, 9, 16], "b": [25, 36, 49, 64], "c": [81, 100, 121, 144]}
2592

2593
    assert result.to_dict(as_series=False) == expected
2594
    assert test_df.select(pl.exclude("foo") * pl.exclude("foo")).to_dict(
2595
        as_series=False
2596
    ) == {
2597
        "a": [1, 4, 9, 16],
2598
        "b": [25, 36, 49, 64],
2599
        "c": [81, 100, 121, 144],
2600
    }
2601
    assert test_df.select(pl.col("^\\w$") * pl.col("^\\w$")).to_dict(
2602
        as_series=False
2603
    ) == {
2604
        "a": [1, 4, 9, 16],
2605
        "b": [25, 36, 49, 64],
2606
        "c": [81, 100, 121, 144],
2607
    }
2608

2609
    df = test_df.select(
2610
        re=pl.struct(pl.col("^\\w$")),
2611
        odd=pl.struct((pl.col(INTEGER_DTYPES) % 2).name.suffix("_is_odd")),
2612
        maxes=pl.struct(pl.all().max().name.suffix("_max")),
2613
    ).head(2)
2614
    # ┌───────────┬───────────┬─────────────┐
2615
    # │ re        ┆ odd       ┆ maxes       │
2616
    # │ ---       ┆ ---       ┆ ---         │
2617
    # │ struct[3] ┆ struct[4] ┆ struct[4]   │
2618
    # ╞═══════════╪═══════════╪═════════════╡
2619
    # │ {1,5,9}   ┆ {1,1,1,1} ┆ {4,8,12,16} │
2620
    # │ {2,6,10}  ┆ {0,0,0,0} ┆ {4,8,12,16} │
2621
    # └───────────┴───────────┴─────────────┘
2622
    assert df.rows() == [
2623
        (
2624
            {"a": 1, "b": 5, "c": 9},
2625
            {"a_is_odd": 1, "b_is_odd": 1, "c_is_odd": 1, "foo_is_odd": 1},
2626
            {"a_max": 4, "b_max": 8, "c_max": 12, "foo_max": 16},
2627
        ),
2628
        (
2629
            {"a": 2, "b": 6, "c": 10},
2630
            {"a_is_odd": 0, "b_is_odd": 0, "c_is_odd": 0, "foo_is_odd": 0},
2631
            {"a_max": 4, "b_max": 8, "c_max": 12, "foo_max": 16},
2632
        ),
2633
    ]
2634

2635

2636
@pytest.mark.parametrize("subset", ["a", cs.starts_with("x", "a")])
2637
@pytest.mark.may_fail_auto_streaming  # Flaky in CI, see https://github.com/pola-rs/polars/issues/20943
2638
@pytest.mark.may_fail_cloud
2639
def test_unique_on_sorted(subset: Any) -> None:
2640
    df = pl.DataFrame(data={"a": [1, 1, 3], "b": [1, 2, 3]})
2641

2642
    result = df.with_columns([pl.col("a").set_sorted()]).unique(
2643
        subset=subset,
2644
        keep="last",
2645
    )
2646

2647
    expected = pl.DataFrame({"a": [1, 3], "b": [2, 3]})
2648
    assert_frame_equal(result, expected)
2649

2650

2651
def test_len_compute(df: pl.DataFrame) -> None:
2652
    df = df.with_columns(pl.struct(["list_bool", "cat"]).alias("struct"))
2653
    filtered = df.filter(pl.col("bools"))
2654
    for col in filtered.columns:
2655
        assert len(filtered[col]) == 1
2656

2657
    taken = df[[1, 2], :]
2658
    for col in taken.columns:
2659
        assert len(taken[col]) == 2
2660

2661

2662
def test_filter_sequence() -> None:
2663
    df = pl.DataFrame({"a": [1, 2, 3]})
2664
    assert df.filter([True, False, True])["a"].to_list() == [1, 3]
2665
    assert df.filter(np.array([True, False, True]))["a"].to_list() == [1, 3]
2666

2667

2668
def test_filter_multiple_predicates() -> None:
2669
    df = pl.DataFrame(
2670
        {
2671
            "a": [1, 1, 1, 2, 2],
2672
            "b": [1, 1, 2, 2, 2],
2673
            "c": [1, 1, 2, 3, 4],
2674
        }
2675
    )
2676

2677
    # multiple predicates
2678
    expected = pl.DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "c": [1, 1, 2]})
2679
    for out in (
2680
        df.filter(pl.col("a") == 1, pl.col("b") <= 2),  # positional/splat
2681
        df.filter([pl.col("a") == 1, pl.col("b") <= 2]),  # as list
2682
    ):
2683
        assert_frame_equal(out, expected)
2684

2685
    # multiple kwargs
2686
    assert_frame_equal(
2687
        df.filter(a=1, b=2),
2688
        pl.DataFrame({"a": [1], "b": [2], "c": [2]}),
2689
    )
2690

2691
    # both positional and keyword args
2692
    assert_frame_equal(
2693
        pl.DataFrame({"a": [2], "b": [2], "c": [3]}),
2694
        df.filter(pl.col("c") < 4, a=2, b=2),
2695
    )
2696

2697
    # boolean mask
2698
    out = df.filter([True, False, False, False, True])
2699
    expected = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 4]})
2700
    assert_frame_equal(out, expected)
2701

2702
    # multiple boolean masks
2703
    out = df.filter(
2704
        np.array([True, True, False, True, False]),
2705
        np.array([True, False, True, True, False]),
2706
    )
2707
    expected = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 3]})
2708
    assert_frame_equal(out, expected)
2709

2710

2711
def test_indexing_set() -> None:
2712
    df = pl.DataFrame({"bool": [True, True], "str": ["N/A", "N/A"], "nr": [1, 2]})
2713

2714
    df[0, "bool"] = False
2715
    df[0, "nr"] = 100
2716
    df[0, "str"] = "foo"
2717

2718
    assert df.to_dict(as_series=False) == {
2719
        "bool": [False, True],
2720
        "str": ["foo", "N/A"],
2721
        "nr": [100, 2],
2722
    }
2723

2724

2725
def test_set() -> None:
2726
    # Setting a dataframe using indices is deprecated.
2727
    # We keep these tests because we only generate a warning.
2728
    np.random.seed(1)
2729
    df = pl.DataFrame(
2730
        {"foo": np.random.rand(10), "bar": np.arange(10), "ham": ["h"] * 10}
2731
    )
2732
    with pytest.raises(
2733
        TypeError,
2734
        match=r"DataFrame object does not support `Series` assignment by index"
2735
        r"\n\nUse `DataFrame.with_columns`.",
2736
    ):
2737
        df["new"] = np.random.rand(10)
2738

2739
    with pytest.raises(
2740
        TypeError,
2741
        match=r"not allowed to set DataFrame by boolean mask in the row position"
2742
        r"\n\nConsider using `DataFrame.with_columns`.",
2743
    ):
2744
        df[df["ham"] > 0.5, "ham"] = "a"
2745
    with pytest.raises(
2746
        TypeError,
2747
        match=r"not allowed to set DataFrame by boolean mask in the row position"
2748
        r"\n\nConsider using `DataFrame.with_columns`.",
2749
    ):
2750
        df[[True, False], "ham"] = "a"
2751

2752
    # set 2D
2753
    df = pl.DataFrame({"b": [0, 0]})
2754
    df[["A", "B"]] = [[1, 2], [1, 2]]
2755

2756
    with pytest.raises(ValueError):
2757
        df[["C", "D"]] = 1
2758
    with pytest.raises(ValueError):
2759
        df[["C", "D"]] = [1, 1]
2760
    with pytest.raises(ValueError):
2761
        df[["C", "D"]] = [[1, 2, 3], [1, 2, 3]]
2762

2763
    # set tuple
2764
    df = pl.DataFrame({"b": [0, 0]})
2765
    df[0, "b"] = 1
2766
    assert df[0, "b"] == 1
2767

2768
    df[0, 0] = 2
2769
    assert df[0, "b"] == 2
2770

2771
    # row and col selection have to be int or str
2772
    with pytest.raises(TypeError):
2773
        df[:, [1]] = 1  # type: ignore[index]
2774
    with pytest.raises(TypeError):
2775
        df[True, :] = 1  # type: ignore[index]
2776

2777
    # needs to be a 2 element tuple
2778
    with pytest.raises(ValueError):
2779
        df[1, 2, 3] = 1
2780

2781
    # we cannot index with any type, such as bool
2782
    with pytest.raises(TypeError):
2783
        df[True] = 1  # type: ignore[index]
2784

2785

2786
def test_series_iter_over_frame() -> None:
2787
    df = pl.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]})
2788

2789
    expected = {
2790
        0: pl.Series("a", [1, 2, 3]),
2791
        1: pl.Series("b", [2, 3, 4]),
2792
        2: pl.Series("c", [3, 4, 5]),
2793
    }
2794
    for idx, s in enumerate(df):
2795
        assert_series_equal(s, expected[idx])
2796

2797
    expected = {
2798
        0: pl.Series("c", [3, 4, 5]),
2799
        1: pl.Series("b", [2, 3, 4]),
2800
        2: pl.Series("a", [1, 2, 3]),
2801
    }
2802
    for idx, s in enumerate(reversed(df)):
2803
        assert_series_equal(s, expected[idx])
2804

2805

2806
def test_union_with_aliases_4770() -> None:
2807
    lf = pl.DataFrame(
2808
        {
2809
            "a": [1, None],
2810
            "b": [3, 4],
2811
        }
2812
    ).lazy()
2813

2814
    lf = pl.concat(
2815
        [
2816
            lf.select([pl.col("a").alias("x")]),
2817
            lf.select([pl.col("b").alias("x")]),
2818
        ]
2819
    ).filter(pl.col("x").is_not_null())
2820

2821
    assert lf.collect()["x"].to_list() == [1, 3, 4]
2822

2823

2824
def test_init_datetimes_with_timezone() -> None:
2825
    tz_us = "America/New_York"
2826
    tz_europe = "Europe/Amsterdam"
2827

2828
    dtm = datetime(2022, 10, 12, 12, 30)
2829
    for time_unit in DTYPE_TEMPORAL_UNITS:
2830
        for type_overrides in (
2831
            {
2832
                "schema": [
2833
                    ("d1", pl.Datetime(time_unit, tz_us)),
2834
                    ("d2", pl.Datetime(time_unit, tz_europe)),
2835
                ]
2836
            },
2837
            {
2838
                "schema_overrides": {
2839
                    "d1": pl.Datetime(time_unit, tz_us),
2840
                    "d2": pl.Datetime(time_unit, tz_europe),
2841
                }
2842
            },
2843
        ):
2844
            result = pl.DataFrame(
2845
                data={
2846
                    "d1": [dtm.replace(tzinfo=ZoneInfo(tz_us))],
2847
                    "d2": [dtm.replace(tzinfo=ZoneInfo(tz_europe))],
2848
                },
2849
                **type_overrides,
2850
            )
2851
            expected = pl.DataFrame(
2852
                {"d1": ["2022-10-12 12:30"], "d2": ["2022-10-12 12:30"]}
2853
            ).with_columns(
2854
                pl.col("d1").str.to_datetime(time_unit=time_unit, time_zone=tz_us),
2855
                pl.col("d2").str.to_datetime(time_unit=time_unit, time_zone=tz_europe),
2856
            )
2857
            assert_frame_equal(result, expected)
2858

2859

2860
@pytest.mark.parametrize(
2861
    (
2862
        "tzinfo",
2863
        "offset",
2864
        "dtype_time_zone",
2865
        "expected_time_zone",
2866
        "expected_item",
2867
    ),
2868
    [
2869
        (None, "", None, None, datetime(2020, 1, 1)),
2870
        (
2871
            timezone(timedelta(hours=-8)),
2872
            "-08:00",
2873
            "UTC",
2874
            "UTC",
2875
            datetime(2020, 1, 1, 8, tzinfo=timezone.utc),
2876
        ),
2877
        (
2878
            timezone(timedelta(hours=-8)),
2879
            "-08:00",
2880
            None,
2881
            "UTC",
2882
            datetime(2020, 1, 1, 8, tzinfo=timezone.utc),
2883
        ),
2884
    ],
2885
)
2886
@pytest.mark.may_fail_cloud
2887
def test_init_vs_strptime_consistency(
2888
    tzinfo: timezone | None,
2889
    offset: str,
2890
    dtype_time_zone: str | None,
2891
    expected_time_zone: str,
2892
    expected_item: datetime,
2893
) -> None:
2894
    result_init = pl.Series(
2895
        [datetime(2020, 1, 1, tzinfo=tzinfo)],
2896
        dtype=pl.Datetime("us", dtype_time_zone),
2897
    )
2898
    result_strptime = pl.Series([f"2020-01-01 00:00{offset}"]).str.strptime(
2899
        pl.Datetime("us", dtype_time_zone)
2900
    )
2901
    assert result_init.dtype == pl.Datetime("us", expected_time_zone)
2902
    assert result_init.item() == expected_item
2903
    assert_series_equal(result_init, result_strptime)
2904

2905

2906
def test_init_vs_strptime_consistency_converts() -> None:
2907
    result = pl.Series(
2908
        [datetime(2020, 1, 1, tzinfo=timezone(timedelta(hours=-8)))],
2909
        dtype=pl.Datetime("us", "US/Pacific"),
2910
    ).item()
2911
    assert result == datetime(2020, 1, 1, 0, 0, tzinfo=ZoneInfo(key="US/Pacific"))
2912
    result = (
2913
        pl.Series(["2020-01-01 00:00-08:00"])
2914
        .str.strptime(pl.Datetime("us", "US/Pacific"))
2915
        .item()
2916
    )
2917
    assert result == datetime(2020, 1, 1, 0, 0, tzinfo=ZoneInfo(key="US/Pacific"))
2918

2919

2920
def test_init_physical_with_timezone() -> None:
2921
    tz_uae = "Asia/Dubai"
2922
    tz_asia = "Asia/Tokyo"
2923

2924
    dtm_us = 1665577800000000
2925
    for time_unit in DTYPE_TEMPORAL_UNITS:
2926
        dtm = {"ms": dtm_us // 1_000, "ns": dtm_us * 1_000}.get(str(time_unit), dtm_us)
2927
        df = pl.DataFrame(
2928
            data={"d1": [dtm], "d2": [dtm]},
2929
            schema=[
2930
                ("d1", pl.Datetime(time_unit, tz_uae)),
2931
                ("d2", pl.Datetime(time_unit, tz_asia)),
2932
            ],
2933
        )
2934
        assert (df["d1"].to_physical() == df["d2"].to_physical()).all()
2935
        assert df.rows() == [
2936
            (
2937
                datetime(2022, 10, 12, 16, 30, tzinfo=ZoneInfo(tz_uae)),
2938
                datetime(2022, 10, 12, 21, 30, tzinfo=ZoneInfo(tz_asia)),
2939
            )
2940
        ]
2941

2942

2943
@pytest.mark.parametrize("divop", [floordiv, truediv])
2944
def test_floordiv_truediv(divop: Callable[..., Any]) -> None:
2945
    # validate truediv/floordiv dataframe ops against python
2946
    df1 = pl.DataFrame(
2947
        data={
2948
            "x": [0, -1, -2, -3],
2949
            "y": [-0.0, -3.0, 5.0, -7.0],
2950
            "z": [10, 3, -5, 7],
2951
        }
2952
    )
2953

2954
    # scalar
2955
    for df in [df1, df1.slice(0, 0)]:
2956
        for n in (3, 3.0, -3, -3.0):
2957
            py_div = [tuple(divop(elem, n) for elem in row) for row in df.rows()]
2958
            df_div = divop(df, n).rows()
2959
            assert py_div == df_div
2960

2961
    # series
2962
    xdf, s = df1["x"].to_frame(), pl.Series([2] * 4)
2963
    assert list(divop(xdf, s)["x"]) == [divop(x, 2) for x in list(df1["x"])]
2964

2965
    # frame
2966
    df2 = pl.DataFrame(
2967
        data={
2968
            "x": [2, -2, 2, 3],
2969
            "y": [4, 4, -4, 8],
2970
            "z": [0.5, 2.0, -2.0, -3],
2971
        }
2972
    )
2973
    df_div = divop(df1, df2).rows()
2974
    for i, (row1, row2) in enumerate(zip(df1.rows(), df2.rows())):
2975
        for j, (elem1, elem2) in enumerate(zip(row1, row2)):
2976
            assert divop(elem1, elem2) == df_div[i][j]
2977

2978

2979
@pytest.mark.parametrize(
2980
    ("subset", "keep", "expected_mask"),
2981
    [
2982
        (None, "first", [True, True, True, False]),
2983
        ("a", "first", [True, True, False, False]),
2984
        (["a", "b"], "first", [True, True, False, False]),
2985
        (("a", "b"), "last", [True, False, False, True]),
2986
        (("a", "b"), "none", [True, False, False, False]),
2987
    ],
2988
)
2989
def test_unique(
2990
    subset: str | Sequence[str], keep: UniqueKeepStrategy, expected_mask: list[bool]
2991
) -> None:
2992
    df = pl.DataFrame({"a": [1, 2, 2, 2], "b": [3, 4, 4, 4], "c": [5, 6, 7, 7]})
2993

2994
    result = df.unique(maintain_order=True, subset=subset, keep=keep).sort(pl.all())
2995
    expected = df.filter(expected_mask).sort(pl.all())
2996
    assert_frame_equal(result, expected)
2997

2998

2999
def test_iter_slices() -> None:
3000
    df = pl.DataFrame(
3001
        {
3002
            "a": range(95),
3003
            "b": date(2023, 1, 1),
3004
            "c": "klmnopqrstuvwxyz",
3005
        }
3006
    )
3007
    batches = list(df.iter_slices(n_rows=50))
3008

3009
    assert len(batches[0]) == 50
3010
    assert len(batches[1]) == 45
3011
    assert batches[1].rows() == df[50:].rows()
3012

3013

3014
def test_format_empty_df() -> None:
3015
    df = pl.DataFrame(
3016
        [
3017
            pl.Series("val1", [], dtype=pl.Categorical),
3018
            pl.Series("val2", [], dtype=pl.Categorical),
3019
        ]
3020
    ).select(
3021
        pl.format("{}:{}", pl.col("val1"), pl.col("val2")).alias("cat"),
3022
    )
3023
    assert df.shape == (0, 1)
3024
    assert df.dtypes == [pl.String]
3025

3026

3027
def test_deadlocks_3409() -> None:
3028
    assert (
3029
        pl.DataFrame({"col1": [[1, 2, 3]]})
3030
        .with_columns(
3031
            pl.col("col1").list.eval(
3032
                pl.element().map_elements(lambda x: x, return_dtype=pl.Int64)
3033
            )
3034
        )
3035
        .to_dict(as_series=False)
3036
    ) == {"col1": [[1, 2, 3]]}
3037

3038
    assert (
3039
        pl.DataFrame({"col1": [1, 2, 3]})
3040
        .with_columns(
3041
            pl.col("col1").cumulative_eval(
3042
                pl.element().map_batches(lambda x: 0, pl.Int64, returns_scalar=True)
3043
            )
3044
        )
3045
        .to_dict(as_series=False)
3046
    ) == {"col1": [0, 0, 0]}
3047

3048

3049
def test_ceil() -> None:
3050
    df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})
3051
    result = df.select(pl.col("a").ceil())
3052
    assert_frame_equal(result, pl.DataFrame({"a": [2.0, 2.0, 3.0]}))
3053

3054
    df = pl.DataFrame({"a": [1, 2, 3]})
3055
    result = df.select(pl.col("a").ceil())
3056
    assert_frame_equal(df, result)
3057

3058

3059
def test_floor() -> None:
3060
    df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})
3061
    result = df.select(pl.col("a").floor())
3062
    assert_frame_equal(result, pl.DataFrame({"a": [1.0, 1.0, 3.0]}))
3063

3064
    df = pl.DataFrame({"a": [1, 2, 3]})
3065
    result = df.select(pl.col("a").floor())
3066
    assert_frame_equal(df, result)
3067

3068

3069
def test_floor_divide() -> None:
3070
    x = 10.4
3071
    step = 0.5
3072
    df = pl.DataFrame({"x": [x]})
3073
    assert df.with_columns(pl.col("x") // step)[0, 0] == x // step
3074

3075

3076
def test_round() -> None:
3077
    df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})
3078
    col_a_rounded = df.select(pl.col("a").round(decimals=0))["a"]
3079
    assert_series_equal(col_a_rounded, pl.Series("a", [2, 1, 3]).cast(pl.Float64))
3080

3081

3082
def test_dot() -> None:
3083
    df = pl.DataFrame({"a": [1.8, 1.2, 3.0], "b": [3.2, 1, 2]})
3084
    assert df.select(pl.col("a").dot(pl.col("b"))).item() == 12.96
3085

3086

3087
def test_unstack() -> None:
3088
    from string import ascii_uppercase
3089

3090
    df = pl.DataFrame(
3091
        {
3092
            "col1": list(ascii_uppercase[0:9]),
3093
            "col2": pl.int_range(0, 9, eager=True),
3094
            "col3": pl.int_range(-9, 0, eager=True),
3095
        }
3096
    )
3097
    assert df.unstack(step=3, how="vertical").to_dict(as_series=False) == {
3098
        "col1_0": ["A", "B", "C"],
3099
        "col1_1": ["D", "E", "F"],
3100
        "col1_2": ["G", "H", "I"],
3101
        "col2_0": [0, 1, 2],
3102
        "col2_1": [3, 4, 5],
3103
        "col2_2": [6, 7, 8],
3104
        "col3_0": [-9, -8, -7],
3105
        "col3_1": [-6, -5, -4],
3106
        "col3_2": [-3, -2, -1],
3107
    }
3108

3109
    assert df.unstack(step=3, how="horizontal").to_dict(as_series=False) == {
3110
        "col1_0": ["A", "D", "G"],
3111
        "col1_1": ["B", "E", "H"],
3112
        "col1_2": ["C", "F", "I"],
3113
        "col2_0": [0, 3, 6],
3114
        "col2_1": [1, 4, 7],
3115
        "col2_2": [2, 5, 8],
3116
        "col3_0": [-9, -6, -3],
3117
        "col3_1": [-8, -5, -2],
3118
        "col3_2": [-7, -4, -1],
3119
    }
3120

3121
    for column_subset in (("col2", "col3"), cs.integer()):
3122
        assert df.unstack(
3123
            step=3,
3124
            how="horizontal",
3125
            columns=column_subset,
3126
        ).to_dict(as_series=False) == {
3127
            "col2_0": [0, 3, 6],
3128
            "col2_1": [1, 4, 7],
3129
            "col2_2": [2, 5, 8],
3130
            "col3_0": [-9, -6, -3],
3131
            "col3_1": [-8, -5, -2],
3132
            "col3_2": [-7, -4, -1],
3133
        }
3134

3135

3136
def test_window_deadlock() -> None:
3137
    np.random.seed(12)
3138

3139
    df = pl.DataFrame(
3140
        {
3141
            "nrs": [1, 2, 3, None, 5],
3142
            "names": ["foo", "ham", "spam", "egg", None],
3143
            "random": np.random.rand(5),
3144
            "groups": ["A", "A", "B", "C", "B"],
3145
        }
3146
    )
3147

3148
    _df = df.select(
3149
        pl.col("*"),  # select all
3150
        pl.col("random").sum().over("groups").alias("sum[random]/groups"),
3151
        pl.col("random").implode().over("names").alias("random/name"),
3152
    )
3153

3154

3155
def test_sum_empty_column_names() -> None:
3156
    df = pl.DataFrame({"x": [], "y": []}, schema={"x": pl.Boolean, "y": pl.Boolean})
3157
    expected = pl.DataFrame(
3158
        {"x": [0], "y": [0]}, schema={"x": pl.UInt32, "y": pl.UInt32}
3159
    )
3160
    assert_frame_equal(df.sum(), expected)
3161

3162

3163
def test_flags() -> None:
3164
    df = pl.DataFrame({"a": [1, 2, 3], "b": [9, 5, 6]})
3165
    assert df.flags == {
3166
        "a": {"SORTED_ASC": False, "SORTED_DESC": False},
3167
        "b": {"SORTED_ASC": False, "SORTED_DESC": False},
3168
    }
3169
    assert df.set_sorted("a").flags == {
3170
        "a": {"SORTED_ASC": True, "SORTED_DESC": False},
3171
        "b": {"SORTED_ASC": False, "SORTED_DESC": False},
3172
    }
3173

3174

3175
def test_interchange() -> None:
3176
    df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["foo", "bar"]})
3177
    dfi = df.__dataframe__()
3178

3179
    # Testing some random properties to make sure conversion happened correctly
3180
    assert dfi.num_rows() == 2
3181
    assert dfi.get_column(0).dtype[1] == 64
3182
    assert dfi.get_column_by_name("c").get_buffers()["data"][0].bufsize == 6
3183

3184

3185
def test_from_dicts_undeclared_column_dtype() -> None:
3186
    data = [{"a": 1, "b": 2}]
3187
    result = pl.from_dicts(data, schema=["x"])
3188
    assert result.schema == {"x": pl.Null}
3189

3190

3191
def test_from_dicts_with_override() -> None:
3192
    data = [
3193
        {"a": "1", "b": str(2**64 - 1), "c": "1"},
3194
        {"a": "1", "b": "1", "c": "-5.0"},
3195
    ]
3196
    override = {"a": pl.Int32, "b": pl.UInt64, "c": pl.Float32}
3197
    result = pl.from_dicts(data, schema_overrides=override)
3198
    assert_frame_equal(
3199
        result,
3200
        pl.DataFrame(
3201
            {
3202
                "a": pl.Series([1, 1], dtype=pl.Int32),
3203
                "b": pl.Series([2**64 - 1, 1], dtype=pl.UInt64),
3204
                "c": pl.Series([1.0, -5.0], dtype=pl.Float32),
3205
            }
3206
        ),
3207
    )
3208

3209

3210
def test_from_records_u64_12329() -> None:
3211
    s = pl.from_records([{"a": 9908227375760408577}])
3212
    assert s.dtypes == [pl.Int128]
3213
    assert s["a"][0] == 9908227375760408577
3214

3215

3216
def test_negative_slice_12642() -> None:
3217
    df = pl.DataFrame({"x": range(5)})
3218
    assert_frame_equal(df.slice(-2, 1), df.tail(2).head(1))
3219

3220

3221
def test_iter_columns() -> None:
3222
    df = pl.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]})
3223
    iter_columns = df.iter_columns()
3224
    assert_series_equal(next(iter_columns), pl.Series("a", [1, 1, 2]))
3225
    assert_series_equal(next(iter_columns), pl.Series("b", [4, 5, 6]))
3226

3227

3228
def test_get_column_index() -> None:
3229
    df = pl.DataFrame({"actual": [1001], "expected": [1000]})
3230

3231
    assert df.get_column_index("actual") == 0
3232
    assert df.get_column_index("expected") == 1
3233

3234
    with pytest.raises(ColumnNotFoundError, match="missing"):
3235
        df.get_column_index("missing")
3236

3237

3238
def test_dataframe_creation_with_different_series_lengths_19795() -> None:
3239
    with pytest.raises(
3240
        ShapeError,
3241
        match=r"could not create a new DataFrame: height of column 'b' \(1\) does not match height of column 'a' \(2\)",
3242
    ):
3243
        pl.DataFrame({"a": [1, 2], "b": [1]})
3244

3245

3246
def test_get_column_after_drop_20119() -> None:
3247
    df = pl.DataFrame({"a": ["A"], "b": ["B"], "c": ["C"]})
3248
    df.drop_in_place("a")
3249
    c = df.get_column("c")
3250
    assert_series_equal(c, pl.Series("c", ["C"]))
3251

3252

3253
def test_select_oob_row_20775() -> None:
3254
    df = pl.DataFrame({"a": [1, 2, 3]})
3255
    with pytest.raises(
3256
        IndexError,
3257
        match="index 99 is out of bounds for DataFrame of height 3",
3258
    ):
3259
        df[99]
3260

3261

3262
@pytest.mark.parametrize("idx", [3, 99, -4, -99])
3263
def test_select_oob_element_20775_too_large(idx: int) -> None:
3264
    df = pl.DataFrame({"a": [1, 2, 3]})
3265
    with pytest.raises(
3266
        IndexError,
3267
        match=f"index {idx} is out of bounds for sequence of length 3",
3268
    ):
3269
        df[idx, "a"]
3270

3271

3272
def test_nan_to_null() -> None:
3273
    a = np.array([np.nan, 1])
3274

3275
    df1 = pl.DataFrame(a, nan_to_null=True)
3276
    df2 = pl.DataFrame(
3277
        (a,),
3278
        nan_to_null=True,
3279
    )
3280

3281
    assert_frame_equal(df1, df2)
3282

3283

3284
# Below 3 tests for https://github.com/pola-rs/polars/issues/17879
3285

3286

3287
def test_with_columns_dict_direct_typeerror() -> None:
3288
    data = {"a": pl.col("a") * 2}
3289
    df = pl.select(a=1)
3290
    with pytest.raises(
3291
        TypeError, match="Cannot pass a dictionary as a single positional argument"
3292
    ):
3293
        df.with_columns(data)
3294

3295

3296
def test_with_columns_dict_unpacking() -> None:
3297
    data = {"a": pl.col("a") * 2}
3298
    df = pl.select(a=1).with_columns(**data)
3299
    expected = pl.DataFrame({"a": [2]})
3300
    assert df.equals(expected)
3301

3302

3303
def test_with_columns_generator_alias() -> None:
3304
    data = {"a": pl.col("a") * 2}
3305
    df = pl.select(a=1).with_columns(expr.alias(name) for name, expr in data.items())
3306
    expected = pl.DataFrame({"a": [2]})
3307
    assert df.equals(expected)
3308

3309
Product

Resources

Company