Path: blob/main/py-polars/tests/unit/dataframe/test_df.py
6939 views
"""Unit tests for polars DataFrame construction, comparison, sorting and conversion.

NOTE(review): this file was recovered from a mangled copy in which all line
breaks were lost and viewer line numbers were fused into the code; it has been
reformatted to standard layout. Two small fixes were applied during recovery:
`test_version` now actually asserts (the original discarded the `isinstance`
result), and a leftover debug `print` was removed from `test_from_arrow`.
"""

from __future__ import annotations

import sys
import typing
from collections import OrderedDict
from collections.abc import Iterator, Mapping
from datetime import date, datetime, time, timedelta, timezone
from decimal import Decimal
from io import BytesIO
from operator import floordiv, truediv
from typing import TYPE_CHECKING, Any, Callable, cast
from zoneinfo import ZoneInfo

import numpy as np
import pyarrow as pa
import pytest

import polars as pl
import polars.selectors as cs
from polars._plr import PySeries
from polars._utils.construction import iterable_to_pydf
from polars.datatypes import DTYPE_TEMPORAL_UNITS
from polars.exceptions import (
    ColumnNotFoundError,
    ComputeError,
    DuplicateError,
    InvalidOperationError,
    OutOfBoundsError,
    ShapeError,
)
from polars.testing import (
    assert_frame_equal,
    assert_frame_not_equal,
    assert_series_equal,
)
from tests.unit.conftest import INTEGER_DTYPES

if TYPE_CHECKING:
    from collections.abc import Iterator, Sequence

    from polars import Expr
    from polars._typing import JoinStrategy, UniqueKeepStrategy


class MappingObject(Mapping[str, Any]):  # noqa: D101
    def __init__(self, **values: Any) -> None:
        self._data = {**values}

    def __getitem__(self, key: str) -> Any:
        return self._data[key]

    def __iter__(self) -> Iterator[str]:
        yield from self._data

    def __len__(self) -> int:
        return len(self._data)


def test_version() -> None:
    # FIX(review): the original discarded the isinstance result, making this
    # test a no-op; assert it so the check actually runs.
    assert isinstance(pl.__version__, str)


def test_null_count() -> None:
    df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", None]})
    assert df.null_count().shape == (1, 2)
    assert df.null_count().row(0) == (0, 1)
    assert df.null_count().row(np.int64(0)) == (0, 1)  # type: ignore[call-overload]


@pytest.mark.parametrize("input", [None, (), [], {}, pa.Table.from_arrays([])])
def test_init_empty(input: Any) -> None:
    # test various flavours of empty init
    df = pl.DataFrame(input)
    assert df.shape == (0, 0)
    assert df.is_empty()


def test_df_bool_ambiguous() -> None:
    empty_df = pl.DataFrame()
    with pytest.raises(TypeError, match="ambiguous"):
        not empty_df


def test_special_char_colname_init() -> None:
    from string import punctuation

    cols = [(c, pl.Int8) for c in punctuation]
    df = pl.DataFrame(schema=cols)

    assert len(cols) == df.width
    assert len(df.rows()) == 0
    assert df.is_empty()


def test_comparisons() -> None:
    df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

    # Constants
    assert_frame_equal(df == 2, pl.DataFrame({"a": [False, True], "b": [False, False]}))
    assert_frame_equal(df != 2, pl.DataFrame({"a": [True, False], "b": [True, True]}))
    assert_frame_equal(df < 3.0, pl.DataFrame({"a": [True, True], "b": [False, False]}))
    assert_frame_equal(df >= 2, pl.DataFrame({"a": [False, True], "b": [True, True]}))
    assert_frame_equal(df <= 2, pl.DataFrame({"a": [True, True], "b": [False, False]}))

    with pytest.raises(ComputeError):
        df > "2"  # noqa: B015

    # Series
    s = pl.Series([3, 1])
    assert_frame_equal(df >= s, pl.DataFrame({"a": [False, True], "b": [True, True]}))

    # DataFrame
    other = pl.DataFrame({"a": [1, 2], "b": [2, 3]})
    assert_frame_equal(
        df == other, pl.DataFrame({"a": [True, True], "b": [False, False]})
    )
    assert_frame_equal(
        df != other, pl.DataFrame({"a": [False, False], "b": [True, True]})
    )
    assert_frame_equal(
        df > other, pl.DataFrame({"a": [False, False], "b": [True, True]})
    )
    assert_frame_equal(
        df < other, pl.DataFrame({"a": [False, False], "b": [False, False]})
    )
    assert_frame_equal(
        df >= other, pl.DataFrame({"a": [True, True], "b": [True, True]})
    )
    assert_frame_equal(
        df <= other, pl.DataFrame({"a": [True, True], "b": [False, False]})
    )

    # DataFrame columns mismatch
    with pytest.raises(ValueError):
        df == pl.DataFrame({"a": [1, 2], "c": [3, 4]})  # noqa: B015
    with pytest.raises(ValueError):
        df == pl.DataFrame({"b": [3, 4], "a": [1, 2]})  # noqa: B015

    # DataFrame shape mismatch
    with pytest.raises(ValueError):
        df == pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})  # noqa: B015

    # Type mismatch
    with pytest.raises(ComputeError):
        df == pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})  # noqa: B015


def test_column_selection() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})

    # get column by name
    b = pl.Series("b", [1.0, 2.0, 3.0])
    assert_series_equal(df["b"], b)
    assert_series_equal(df.get_column("b"), b)

    with pytest.raises(ColumnNotFoundError, match="x"):
        df.get_column("x")

    default_series = pl.Series("x", ["?", "?", "?"])
    assert_series_equal(df.get_column("x", default=default_series), default_series)

    assert df.get_column("x", default=None) is None

    # get column by index
    assert_series_equal(df.to_series(1), pl.Series("b", [1.0, 2.0, 3.0]))
    assert_series_equal(df.to_series(-1), pl.Series("c", ["a", "b", "c"]))


def test_mixed_sequence_selection() -> None:
    df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
    result = df.select(["a", pl.col("b"), pl.lit("c")])
    expected = pl.DataFrame({"a": [1, 2], "b": [3, 4], "literal": ["c", "c"]})
    assert_frame_equal(result, expected)


def test_from_arrow(monkeypatch: Any) -> None:
    tbl = pa.table(
        {
            "a": pa.array([1, 2], pa.timestamp("s")),
            "b": pa.array([1, 2], pa.timestamp("ms")),
            "c": pa.array([1, 2], pa.timestamp("us")),
            "d": pa.array([1, 2], pa.timestamp("ns")),
            "e": pa.array([1, 2], pa.int32()),
            "decimal1": pa.array([1, 2], pa.decimal128(2, 1)),
            "struct": pa.array(
                [{"a": 1}, {"a": 2}], pa.struct([pa.field("a", pa.int32())])
            ),
        }
    )
    record_batches = tbl.to_batches(max_chunksize=1)
    expected_schema = {
        "a": pl.Datetime("ms"),
        "b": pl.Datetime("ms"),
        "c": pl.Datetime("us"),
        "d": pl.Datetime("ns"),
        "e": pl.Int32,
        "decimal1": pl.Decimal(2, 1),
        "struct": pl.Struct({"a": pl.Int32()}),
    }
    expected_data = [
        (
            datetime(1970, 1, 1, 0, 0, 1),
            datetime(1970, 1, 1, 0, 0, 0, 1000),
            datetime(1970, 1, 1, 0, 0, 0, 1),
            datetime(1970, 1, 1, 0, 0),
            1,
            Decimal("1.0"),
            {"a": 1},
        ),
        (
            datetime(1970, 1, 1, 0, 0, 2),
            datetime(1970, 1, 1, 0, 0, 0, 2000),
            datetime(1970, 1, 1, 0, 0, 0, 2),
            datetime(1970, 1, 1, 0, 0),
            2,
            Decimal("2.0"),
            {"a": 2},
        ),
    ]
    for arrow_data in (tbl, record_batches, (rb for rb in record_batches)):
        df = cast("pl.DataFrame", pl.from_arrow(arrow_data))
        assert df.schema == expected_schema
        assert df.rows() == expected_data

    # record batches (inc. empty)
    for b, n_expected in (
        (record_batches[0], 1),
        (record_batches[0][:0], 0),
    ):
        df = cast("pl.DataFrame", pl.from_arrow(b))
        assert df.schema == expected_schema
        assert df.rows() == expected_data[:n_expected]

    empty_tbl = tbl[:0]  # no rows
    df = cast("pl.DataFrame", pl.from_arrow(empty_tbl))
    assert df.schema == expected_schema
    assert df.rows() == []

    # try a single column dtype override
    for t in (tbl, empty_tbl):
        df = pl.DataFrame(t, schema_overrides={"e": pl.Int8})
        override_schema = expected_schema.copy()
        override_schema["e"] = pl.Int8
        assert df.schema == override_schema
        assert df.rows() == expected_data[: (df.height)]

    # init from record batches with overrides
    df = pl.DataFrame(
        {
            "id": ["a123", "b345", "c567", "d789", "e101"],
            "points": [99, 45, 50, 85, 35],
        }
    )
    tbl = df.to_arrow()
    batches = tbl.to_batches(max_chunksize=3)

    df0: pl.DataFrame = pl.from_arrow(batches)  # type: ignore[assignment]
    df1: pl.DataFrame = pl.from_arrow(  # type: ignore[assignment]
        data=batches,
        schema=["x", "y"],
        schema_overrides={"y": pl.Int32},
    )
    df2: pl.DataFrame = pl.from_arrow(  # type: ignore[assignment]
        data=batches[0],
        schema=["x", "y"],
        schema_overrides={"y": pl.Int32},
    )

    assert df0.rows() == df.rows()
    assert df1.rows() == df.rows()
    assert df2.rows() == df.rows()[:3]

    assert df0.schema == {"id": pl.String, "points": pl.Int64}
    # FIX(review): removed a leftover debug `print(df1.schema)` here.
    assert df1.schema == {"x": pl.String, "y": pl.Int32}
    assert df2.schema == {"x": pl.String, "y": pl.Int32}

    with pytest.raises(TypeError, match="Cannot convert str"):
        pl.from_arrow(data="xyz")

    with pytest.raises(TypeError, match="Cannot convert int"):
        pl.from_arrow(data=(x for x in (1, 2, 3)))


@pytest.mark.parametrize(
    "data",
    [
        pa.Table.from_pydict(
            {
                "struct": pa.array(
                    [{"a": 1}, {"a": 2}], pa.struct([pa.field("a", pa.int32())])
                ),
            }
        ),
        pa.Table.from_pydict(
            {
                "struct": pa.chunked_array(
                    [[{"a": 1}], [{"a": 2}]], pa.struct([pa.field("a", pa.int32())])
                ),
            }
        ),
    ],
)
def test_from_arrow_struct_column(data: pa.Table) -> None:
    df = cast("pl.DataFrame", pl.from_arrow(data=data))
    expected_schema = pl.Schema({"struct": pl.Struct({"a": pl.Int32()})})
    expected_data = [({"a": 1},), ({"a": 2},)]
    assert df.schema == expected_schema
    assert df.rows() == expected_data


def test_dataframe_membership_operator() -> None:
    # cf. issue #4032
    df = pl.DataFrame({"name": ["Jane", "John"], "age": [20, 30]})
    assert "name" in df
    assert "phone" not in df
    assert df._ipython_key_completions_() == ["name", "age"]


def test_sort() -> None:
    df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})
    expected = pl.DataFrame({"a": [1, 2, 3], "b": [2, 1, 3]})
    assert_frame_equal(df.sort("a"), expected)
    assert_frame_equal(df.sort(["a", "b"]), expected)


def test_sort_multi_output_exprs_01() -> None:
    df = pl.DataFrame(
        {
            "dts": [date(2077, 10, 3), date(2077, 10, 2), date(2077, 10, 2)],
            "strs": ["abc", "def", "ghi"],
            "vals": [10.5, 20.3, 15.7],
        }
    )

    expected = pl.DataFrame(
        {
            "dts": [date(2077, 10, 2), date(2077, 10, 2), date(2077, 10, 3)],
            "strs": ["ghi", "def", "abc"],
            "vals": [15.7, 20.3, 10.5],
        }
    )
    assert_frame_equal(expected, df.sort(pl.col("^(d|v).*$")))
    assert_frame_equal(expected, df.sort(cs.temporal() | cs.numeric()))
    assert_frame_equal(expected, df.sort(cs.temporal(), cs.numeric(), cs.binary()))

    expected = pl.DataFrame(
        {
            "dts": [date(2077, 10, 3), date(2077, 10, 2), date(2077, 10, 2)],
            "strs": ["abc", "def", "ghi"],
            "vals": [10.5, 20.3, 15.7],
        }
    )
    assert_frame_equal(
        expected,
        df.sort(pl.col("^(d|v).*$"), descending=[True]),
    )
    assert_frame_equal(
        expected,
        df.sort(cs.temporal() | cs.numeric(), descending=[True]),
    )
    assert_frame_equal(
        expected,
        df.sort(cs.temporal(), cs.numeric(), descending=[True, True]),
    )

    with pytest.raises(
        ValueError,
        match=r"the length of `descending` \(2\) does not match the length of `by` \(1\)",
    ):
        df.sort(by=[cs.temporal()], descending=[True, False])

    with pytest.raises(
        ValueError,
        match=r"the length of `nulls_last` \(3\) does not match the length of `by` \(2\)",
    ):
        df.sort("dts", "strs", nulls_last=[True, False, True])

    # No columns selected - return original input.
    assert_frame_equal(df, df.sort(pl.col("^xxx$")))


@pytest.mark.parametrize(
    ("by_explicit", "desc_explicit", "by_multi", "desc_multi"),
    [
        (
            ["w", "x", "y", "z"],
            [False, False, True, True],
            [cs.integer(), cs.string()],
            [False, True],
        ),
        (
            ["w", "y", "z"],
            [True, True, False],
            [pl.col("^(w|y)$"), pl.col("^z.*$")],
            [True, False],
        ),
        (
            ["z", "w", "x"],
            [True, False, False],
            [pl.col("z"), cs.numeric()],
            [True, False],
        ),
    ],
)
def test_sort_multi_output_exprs_02(
    by_explicit: list[str],
    desc_explicit: list[bool],
    by_multi: list[Expr],
    desc_multi: list[bool],
) -> None:
    df = pl.DataFrame(
        {
            "w": [100, 100, 100, 100, 200, 200, 200, 200],
            "x": [888, 888, 444, 444, 888, 888, 444, 888],
            "y": ["b", "b", "a", "a", "b", "b", "a", "a"],
            "z": ["x", "y", "x", "y", "x", "y", "x", "y"],
        }
    )
    res1 = df.sort(*by_explicit, descending=desc_explicit)
    res2 = df.sort(*by_multi, descending=desc_multi)
    assert_frame_equal(res1, res2)


def test_sort_maintain_order() -> None:
    l1 = (
        pl.LazyFrame({"A": [1] * 4, "B": ["A", "B", "C", "D"]})
        .sort("A", maintain_order=True)
        .slice(0, 3)
        .collect()["B"]
        .to_list()
    )
    l2 = (
        pl.LazyFrame({"A": [1] * 4, "B": ["A", "B", "C", "D"]})
        .sort("A")
        .collect()
        .slice(0, 3)["B"]
        .to_list()
    )
    assert l1 == l2 == ["A", "B", "C"]


@pytest.mark.parametrize("nulls_last", [False, True], ids=["nulls_first", "nulls_last"])
def test_sort_maintain_order_descending_repeated_nulls(nulls_last: bool) -> None:
    got = (
        pl.LazyFrame({"A": [None, -1, 1, 1, None], "B": [1, 2, 3, 4, 5]})
        .sort("A", descending=True, maintain_order=True, nulls_last=nulls_last)
        .collect()
    )
    if nulls_last:
        expect = pl.DataFrame({"A": [1, 1, -1, None, None], "B": [3, 4, 2, 1, 5]})
    else:
        expect = pl.DataFrame({"A": [None, None, 1, 1, -1], "B": [1, 5, 3, 4, 2]})
    assert_frame_equal(got, expect)


def test_replace() -> None:
    df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})
    s = pl.Series("c", [True, False, True])
    df._replace("a", s)
    assert_frame_equal(df, pl.DataFrame({"a": [True, False, True], "b": [1, 2, 3]}))


def test_assignment() -> None:
    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [2, 3, 4]})
    df = df.with_columns(pl.col("foo").alias("foo"))
    # make sure that assignment does not change column order
    assert df.columns == ["foo", "bar"]
    df = df.with_columns(
        pl.when(pl.col("foo") > 1).then(9).otherwise(pl.col("foo")).alias("foo")
    )
    assert df["foo"].to_list() == [1, 9, 9]


def test_insert_column() -> None:
    # insert series
    df = (
        pl.DataFrame({"z": [3, 4, 5]})
        .insert_column(0, pl.Series("x", [1, 2, 3]))
        .insert_column(-1, pl.Series("y", [2, 3, 4]))
    )
    expected_df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
    assert_frame_equal(expected_df, df)

    # insert expressions
    df = pl.DataFrame(
        {
            "id": ["xx", "yy", "zz"],
            "v1": [5, 4, 6],
            "v2": [7, 3, 3],
        }
    )
    df.insert_column(3, (pl.col("v1") * pl.col("v2")).alias("v3"))
    df.insert_column(1, (pl.col("v2") - pl.col("v1")).alias("v0"))

    expected = pl.DataFrame(
        {
            "id": ["xx", "yy", "zz"],
            "v0": [2, -1, -3],
            "v1": [5, 4, 6],
            "v2": [7, 3, 3],
            "v3": [35, 12, 18],
        }
    )
    assert_frame_equal(df, expected)

    # check that we raise suitable index errors
    for idx, column in (
        (10, pl.col("v1").sqrt().alias("v1_sqrt")),
        (-10, pl.Series("foo", [1, 2, 3])),
    ):
        with pytest.raises(
            IndexError,
            match=rf"column index {idx} is out of range \(frame has 5 columns\)",
        ):
            df.insert_column(idx, column)


def test_replace_column() -> None:
    df = (
        pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
        .replace_column(0, pl.Series("a", [4, 5, 6]))
        .replace_column(-2, pl.Series("b", [5, 6, 7]))
        .replace_column(-1, pl.Series("c", [6, 7, 8]))
    )
    expected_df = pl.DataFrame({"a": [4, 5, 6], "b": [5, 6, 7], "c": [6, 7, 8]})
    assert_frame_equal(expected_df, df)


def test_to_series() -> None:
    df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})

    assert_series_equal(df.to_series(), df["x"])
    assert_series_equal(df.to_series(0), df["x"])
    assert_series_equal(df.to_series(-3), df["x"])

    assert_series_equal(df.to_series(1), df["y"])
    assert_series_equal(df.to_series(-2), df["y"])

    assert_series_equal(df.to_series(2), df["z"])
    assert_series_equal(df.to_series(-1), df["z"])


def test_to_series_bad_inputs() -> None:
    df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})

    with pytest.raises(IndexError, match="index 5 is out of bounds"):
        df.to_series(5)

    with pytest.raises(IndexError, match="index -100 is out of bounds"):
        df.to_series(-100)

    with pytest.raises(
        TypeError, match="'str' object cannot be interpreted as an integer"
    ):
        df.to_series("x")  # type: ignore[arg-type]


def test_gather_every() -> None:
    df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})
    expected_df = pl.DataFrame({"a": [1, 3], "b": ["w", "y"]})
    assert_frame_equal(expected_df, df.gather_every(2))

    expected_df = pl.DataFrame({"a": [2, 4], "b": ["x", "z"]})
    assert_frame_equal(expected_df, df.gather_every(2, offset=1))


def test_gather_every_agg() -> None:
    df = pl.DataFrame(
        {
            "g": [1, 1, 1, 2, 2, 2],
            "a": ["a", "b", "c", "d", "e", "f"],
        }
    )
    out = df.group_by(pl.col("g")).agg(pl.col("a").gather_every(2)).sort("g")
    expected = pl.DataFrame(
        {
            "g": [1, 2],
            "a": [["a", "c"], ["d", "f"]],
        }
    )
    assert_frame_equal(out, expected)


def test_take_misc(fruits_cars: pl.DataFrame) -> None:
    df = fruits_cars

    # Out of bounds error.
    with pytest.raises(OutOfBoundsError):
        df.sort("fruits").select(
            pl.col("B").reverse().gather([1, 2]).implode().over("fruits"),
            "fruits",
        )

    # Null indices.
    assert_frame_equal(
        df.select(pl.col("fruits").gather(pl.Series([0, None]))),
        pl.DataFrame({"fruits": ["banana", None]}),
    )

    for index in [[0, 1], pl.Series([0, 1]), np.array([0, 1])]:
        out = df.sort("fruits").select(
            pl.col("B")
            .reverse()
            .gather(index)  # type: ignore[arg-type]
            .over("fruits", mapping_strategy="join"),
            "fruits",
        )

        assert out[0, "B"].to_list() == [2, 3]
        assert out[4, "B"].to_list() == [1, 4]

    out = df.sort("fruits").select(
        pl.col("B").reverse().get(pl.lit(1)).over("fruits"),
        "fruits",
    )
    assert out[0, "B"] == 3
    assert out[4, "B"] == 4


def test_pipe() -> None:
    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, None, 8]})

    def _multiply(data: pl.DataFrame, mul: int) -> pl.DataFrame:
        return data * mul

    result = df.pipe(_multiply, mul=3)

    assert_frame_equal(result, df * 3)


def test_explode() -> None:
    df = pl.DataFrame({"letters": ["c", "a"], "nrs": [[1, 2], [1, 3]]})
    out = df.explode("nrs")
    assert out["letters"].to_list() == ["c", "c", "a", "a"]
    assert out["nrs"].to_list() == [1, 2, 1, 3]


@pytest.mark.parametrize(
    ("stack", "exp_shape", "exp_columns"),
    [
        ([pl.Series("stacked", [-1, -1, -1])], (3, 3), ["a", "b", "stacked"]),
        (
            [pl.Series("stacked2", [-1, -1, -1]), pl.Series("stacked3", [-1, -1, -1])],
            (3, 4),
            ["a", "b", "stacked2", "stacked3"],
        ),
    ],
)
@pytest.mark.parametrize("in_place", [True, False])
def test_hstack_list_of_series(
    stack: list[pl.Series],
    exp_shape: tuple[int, int],
    exp_columns: list[str],
    in_place: bool,
) -> None:
    df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
    if in_place:
        df.hstack(stack, in_place=True)
        assert df.shape == exp_shape
        assert df.columns == exp_columns
    else:
        df_out = df.hstack(stack, in_place=False)
        assert df_out.shape == exp_shape
        assert df_out.columns == exp_columns


@pytest.mark.parametrize("in_place", [True, False])
def test_hstack_dataframe(in_place: bool) -> None:
    df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
    df2 = pl.DataFrame({"c": [2, 1, 3], "d": ["a", "b", "c"]})
    expected = pl.DataFrame(
        {"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [2, 1, 3], "d": ["a", "b", "c"]}
    )
    if in_place:
        df.hstack(df2, in_place=True)
        assert_frame_equal(df, expected)
    else:
        df_out = df.hstack(df2, in_place=False)
        assert_frame_equal(df_out, expected)


@pytest.mark.may_fail_cloud
def test_file_buffer() -> None:
    f = BytesIO()
    f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12")
    f.seek(0)
    df = pl.read_csv(f, has_header=False)
    assert df.shape == (2, 6)

    f = BytesIO()
    f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12")
    f.seek(0)
    # check if not fails on TryClone and Length impl in file.rs
    with pytest.raises(ComputeError):
        pl.read_parquet(f)


def test_shift() -> None:
    df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})
    a = df.shift(1)
    b = pl.DataFrame(
        {"A": [None, "a", "b"], "B": [None, 1, 3]},
    )
    assert_frame_equal(a, b)


def test_multiple_columns_drop() -> None:
    df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
    # List input
    out = df.drop(["a", "b"])
    assert out.columns == ["c"]
    # Positional input
    out = df.drop("b", "c")
    assert out.columns == ["a"]


def test_arg_where() -> None:
    s = pl.Series([True, False, True, False])
    assert_series_equal(
        pl.arg_where(s, eager=True).cast(int),
        pl.Series([0, 2]),
    )


def test_to_dummies() -> None:
    df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})
    dummies = df.to_dummies()

    assert dummies["A_a"].to_list() == [1, 0, 0]
    assert dummies["A_b"].to_list() == [0, 1, 0]
    assert dummies["A_c"].to_list() == [0, 0, 1]

    df = pl.DataFrame({"a": [1, 2, 3]})
    res = df.to_dummies()

    expected = pl.DataFrame(
        {"a_1": [1, 0, 0], "a_2": [0, 1, 0], "a_3": [0, 0, 1]}
    ).with_columns(pl.all().cast(pl.UInt8))
    assert_frame_equal(res, expected)

    df = pl.DataFrame(
        {
            "i": [1, 2, 3],
            "category": ["dog", "cat", "cat"],
        },
        schema={"i": pl.Int32, "category": pl.Categorical("lexical")},
    )
    expected = pl.DataFrame(
        {
            "i": [1, 2, 3],
            "category|cat": [0, 1, 1],
            "category|dog": [1, 0, 0],
        },
        schema={"i": pl.Int32, "category|cat": pl.UInt8, "category|dog": pl.UInt8},
    )
    for _cols in ("category", cs.string()):
        result = df.to_dummies(columns=["category"], separator="|")
        assert_frame_equal(result, expected)

    # test sorted fast path
    result = pl.DataFrame({"x": pl.arange(0, 3, eager=True)}).to_dummies("x")
    expected = pl.DataFrame(
        {"x_0": [1, 0, 0], "x_1": [0, 1, 0], "x_2": [0, 0, 1]}
    ).with_columns(pl.all().cast(pl.UInt8))
    assert_frame_equal(result, expected)


def test_to_dummies_drop_first() -> None:
    df = pl.DataFrame(
        {
            "foo": [0, 1, 2],
            "bar": [3, 4, 5],
            "baz": ["x", "y", "z"],
        }
    )
    dm = df.to_dummies()
    dd = df.to_dummies(drop_first=True)

    assert dd.columns == ["foo_1", "foo_2", "bar_4", "bar_5", "baz_y", "baz_z"]
    assert set(dm.columns) - set(dd.columns) == {"foo_0", "bar_3", "baz_x"}
    assert_frame_equal(dm.select(dd.columns), dd)
    assert dd.rows() == [
        (0, 0, 0, 0, 0, 0),
        (1, 0, 1, 0, 1, 0),
        (0, 1, 0, 1, 0, 1),
    ]


def test_to_dummies_drop_nulls() -> None:
    df = pl.DataFrame(
        {
            "foo": [0, 1, None],
            "bar": [3, None, 5],
            "baz": [None, "y", "z"],
        }
    )

    dm = df.to_dummies(drop_nulls=True)

    expected = pl.DataFrame(
        {
            "foo_0": [1, 0, 0],
            "foo_1": [0, 1, 0],
            "bar_3": [1, 0, 0],
            "bar_5": [0, 0, 1],
            "baz_y": [0, 1, 0],
            "baz_z": [0, 0, 1],
        },
        schema={
            "foo_0": pl.UInt8,
            "foo_1": pl.UInt8,
            "bar_3": pl.UInt8,
            "bar_5": pl.UInt8,
            "baz_y": pl.UInt8,
            "baz_z": pl.UInt8,
        },
    )
    assert_frame_equal(dm, expected)


def test_to_pandas(df: pl.DataFrame) -> None:
    # pyarrow cannot deal with unsigned dictionary integer yet.
    # pyarrow cannot convert a time64 w/ non-zero nanoseconds
    df = df.drop(["cat", "time", "enum"])
    df.to_arrow()
    df.to_pandas()
    # test shifted df
    df.shift(2).to_pandas()
    df = pl.DataFrame({"col": pl.Series([True, False, True])})
    df.shift(2).to_pandas()


def test_from_arrow_table() -> None:
    data = {"a": [1, 2], "b": [1, 2]}
    tbl = pa.table(data)

    df = cast("pl.DataFrame", pl.from_arrow(tbl))
    assert_frame_equal(df, pl.DataFrame(data))


def test_df_stats(df: pl.DataFrame) -> None:
    df.var()
    df.std()
    df.min()
    df.max()
    df.sum()
    df.mean()
    df.median()
    df.quantile(0.4, "nearest")


def test_df_fold() -> None:
    df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})

    assert_series_equal(
        df.fold(lambda s1, s2: s1 + s2), pl.Series("a", [4.0, 5.0, 9.0])
    )
    assert_series_equal(
        df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)),
        pl.Series("a", [1.0, 1.0, 3.0]),
    )

    df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
    out = df.fold(lambda s1, s2: s1 + s2)
    assert_series_equal(out, pl.Series("a", ["foo11.0", "bar22.0", "233.0"]))

    df = pl.DataFrame({"a": [3, 2, 1], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
    # just check dispatch. values are tested on rust side.
    assert len(df.sum_horizontal()) == 3
    assert len(df.mean_horizontal()) == 3
    assert len(df.min_horizontal()) == 3
    assert len(df.max_horizontal()) == 3

    df_width_one = df[["a"]]
    assert_series_equal(df_width_one.fold(lambda s1, s2: s1), df["a"])


@pytest.mark.may_fail_cloud  # TODO: make pickleable
def test_fold_filter() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})

    out = df.filter(
        pl.fold(
            acc=pl.lit(True),
            function=lambda a, b: a & b,
            exprs=[pl.col(c) > 1 for c in df.columns],
        )
    )

    assert out.shape == (1, 2)
    assert out.rows() == [(3, 2)]

    out = df.filter(
        pl.fold(
            acc=pl.lit(True),
            function=lambda a, b: a | b,
            exprs=[pl.col(c) > 1 for c in df.columns],
        )
    )

    assert out.shape == (3, 2)
    assert out.rows() == [(1, 0), (2, 1), (3, 2)]


def test_column_names() -> None:
    tbl = pa.table(
        {
            "a": pa.array([1, 2, 3, 4, 5], pa.decimal128(38, 2)),
            "b": pa.array([1, 2, 3, 4, 5], pa.int64()),
        }
    )
    for a in (tbl, tbl[:0]):
        df = cast("pl.DataFrame", pl.from_arrow(a))
        assert df.columns == ["a", "b"]


def test_init_series_edge_cases() -> None:
    # confirm that we don't modify the name of the input series in-place
    s1 = pl.Series("X", [1, 2, 3])
    df1 = pl.DataFrame({"A": s1}, schema_overrides={"A": pl.UInt8})
    assert s1.name == "X"
    assert df1["A"].name == "A"

    # init same series object under different names
    df2 = pl.DataFrame({"A": s1, "B": s1})
    assert df2.rows(named=True) == [
        {"A": 1, "B": 1},
        {"A": 2, "B": 2},
        {"A": 3, "B": 3},
    ]

    # empty series names should not be overwritten
    s2 = pl.Series([1, 2, 3])
    s3 = pl.Series([2, 3, 4])
    df3 = pl.DataFrame([s2, s3])
    assert s2.name == s3.name == ""
    assert df3.columns == ["column_0", "column_1"]


def test_head_group_by() -> None:
    commodity_prices = {
        "commodity": [
            "Wheat",
            "Wheat",
            "Wheat",
            "Wheat",
            "Corn",
            "Corn",
            "Corn",
            "Corn",
            "Corn",
        ],
        "location": [
            "StPaul",
            "StPaul",
            "StPaul",
            "Chicago",
            "Chicago",
            "Chicago",
            "Chicago",
            "Chicago",
            "Chicago",
        ],
        "seller": [
            "Bob",
            "Charlie",
            "Susan",
            "Paul",
            "Ed",
            "Mary",
            "Paul",
            "Charlie",
            "Norman",
        ],
        "price": [1.0, 0.7, 0.8, 0.55, 2.0, 3.0, 2.4, 1.8, 2.1],
    }
    df = pl.DataFrame(commodity_prices)

    # this query flexes the wildcard exclusion quite a bit.
    keys = ["commodity", "location"]
    out = (
        df.sort(by="price", descending=True)
        .group_by(keys, maintain_order=True)
        .agg([pl.col("*").exclude(keys).head(2).name.keep()])
        .explode(cs.all().exclude(keys))
    )

    assert out.shape == (5, 4)
    assert out.rows() == [
        ("Corn", "Chicago", "Mary", 3.0),
        ("Corn", "Chicago", "Paul", 2.4),
        ("Wheat", "StPaul", "Bob", 1.0),
        ("Wheat", "StPaul", "Susan", 0.8),
        ("Wheat", "Chicago", "Paul", 0.55),
    ]

    df = pl.DataFrame(
        {"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]}
    )
    out = df.group_by("letters").tail(2).sort("letters")
    assert_frame_equal(
        out,
        pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]}),
    )
    out = df.group_by("letters").head(2).sort("letters")
    assert_frame_equal(
        out,
        pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]}),
    )


def test_is_null_is_not_null() -> None:
    df = pl.DataFrame({"nrs": [1, 2, None]})
    assert df.select(pl.col("nrs").is_null())["nrs"].to_list() == [False, False, True]
    assert df.select(pl.col("nrs").is_not_null())["nrs"].to_list() == [
        True,
        True,
        False,
    ]


def test_is_nan_is_not_nan() -> None:
    df = pl.DataFrame({"nrs": np.array([1, 2, np.nan])})
    assert df.select(pl.col("nrs").is_nan())["nrs"].to_list() == [False, False, True]
    assert df.select(pl.col("nrs").is_not_nan())["nrs"].to_list() == [True, True, False]


def test_is_finite_is_infinite() -> None:
    df = pl.DataFrame({"nrs": np.array([1, 2, np.inf])})
    assert df.select(pl.col("nrs").is_infinite())["nrs"].to_list() == [
        False,
        False,
        True,
    ]
    assert df.select(pl.col("nrs").is_finite())["nrs"].to_list() == [True, True, False]


def test_is_finite_is_infinite_null_series() -> None:
    df = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Null)})
    result = df.select(
        pl.col("a").is_finite().alias("finite"),
        pl.col("a").is_infinite().alias("infinite"),
    )
    expected = pl.DataFrame(
        {
            "finite": pl.Series([None, None, None], dtype=pl.Boolean),
            "infinite": pl.Series([None, None, None], dtype=pl.Boolean),
        }
    )
    assert_frame_equal(result, expected)


def test_is_nan_null_series() -> None:
    df = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Null)})
    result = df.select(pl.col("a").is_nan())
    expected = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Boolean)})
    assert_frame_equal(result, expected)


def test_len() -> None:
    df = pl.DataFrame({"nrs": [1, 2, 3]})
    assert cast("int", df.select(pl.col("nrs").len()).item()) == 3
    assert len(pl.DataFrame()) == 0


def test_multiple_column_sort() -> None:
    df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [2, 2, 3], "c": [1.0, 2.0, 3.0]})
    out = df.sort([pl.col("b"), pl.col("c").reverse()])
    assert list(out["c"]) == [2.0, 1.0, 3.0]
    assert list(out["b"]) == [2, 2, 3]

    # Explicitly specify numpy dtype because of different defaults on Windows
    df = pl.DataFrame({"a": np.arange(1, 4, dtype=np.int64), "b": ["a", "a", "b"]})

    assert_frame_equal(
        df.sort("a", descending=True),
        pl.DataFrame({"a": [3, 2, 1], "b": ["b", "a", "a"]}),
    )
    assert_frame_equal(
        df.sort("b", descending=True, maintain_order=True),
        pl.DataFrame({"a": [3, 1, 2], "b": ["b", "a", "a"]}),
    )
    assert_frame_equal(
        df.sort(["b", "a"], descending=[False, True]),
        pl.DataFrame({"a": [2, 1, 3], "b": ["a", "a", "b"]}),
    )


def test_cast_frame() -> None:
    df = pl.DataFrame(
        {
            "a": [1.0, 2.5, 3.0],
            "b": [4, 5, None],
            "c": [True, False, True],
            "d": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)],
        }
    )

    # cast via col:dtype map
    assert df.cast(
        dtypes={"b": pl.Float32, "c": pl.String, "d": pl.Datetime("ms")},
    ).schema == {
        "a": pl.Float64,
        "b": pl.Float32,
        "c": pl.String,
        "d": pl.Datetime("ms"),
    }

    # cast via col:pytype map
    assert df.cast(
        dtypes={"b": float, "c": str, "d": datetime},
    ).schema == {
        "a": pl.Float64,
        "b": pl.Float64,
        "c": pl.String,
        "d": pl.Datetime("us"),
    }

    # cast via selector:dtype map
    assert df.cast(
        {
            cs.numeric(): pl.UInt8,
            cs.temporal(): pl.String,
        }
    ).rows() == [
        (1, 4, True, "2020-01-02"),
        (2, 5, False, "2021-03-04"),
        (3, None, True, "2022-05-06"),
    ]

    # cast all fields to a single type
    assert df.cast(pl.String).to_dict(as_series=False) == {
        "a": ["1.0", "2.5", "3.0"],
        "b": ["4", "5", None],
        "c": ["true", "false", "true"],
        "d": ["2020-01-02", "2021-03-04", "2022-05-06"],
    }


def test_duration_arithmetic() -> None:
    df = pl.DataFrame(
        {"a": [datetime(2022, 1, 1, 0, 0, 0), datetime(2022, 1, 2, 0, 0, 0)]}
    )
    d1 = pl.duration(days=3, microseconds=987000)
    d2 = pl.duration(days=6, milliseconds=987)

    assert_frame_equal(
        df.with_columns(
            b=(df["a"] + d1),
            c=(pl.col("a") + d2),
        ),
        pl.DataFrame(
            {
                "a": [
                    datetime(2022, 1, 1, 0, 0, 0),
                    datetime(2022, 1, 2, 0, 0, 0),
                ],
                "b": [
                    datetime(2022, 1, 4, 0, 0, 0, 987000),
                    datetime(2022, 1, 5, 0, 0, 0, 987000),
                ],
                "c": [
                    datetime(2022, 1, 7, 0, 0, 0, 987000),
                    datetime(2022, 1, 8, 0, 0, 0, 987000),
                ],
            }
        ),
    )


def test_assign() -> None:
    # check if can assign in case of a single column
    df = pl.DataFrame({"a": [1, 2, 3]})
    # test if we can assign in case of single column
    df = df.with_columns(pl.col("a") * 2)
    assert list(df["a"]) == [2, 4, 6]


def test_arg_sort_by(df: pl.DataFrame) -> None:
    idx_df = df.select(
        pl.arg_sort_by(["int_nulls", "floats"], descending=[False, True]).alias("idx")
    )
    assert (idx_df["idx"] == [1, 0, 2]).all()

    idx_df = df.select(
        pl.arg_sort_by(["int_nulls", "floats"], descending=False).alias("idx")
    )
    assert (idx_df["idx"] == [1, 0, 2]).all()

    df = pl.DataFrame({"x": [0, 0, 0, 1, 1, 2], "y": [9, 9, 8, 7, 6, 6]})
    for expr, expected in (
        (pl.arg_sort_by(["x", "y"]), [2, 0, 1, 4, 3, 5]),
        (pl.arg_sort_by(["x", "y"], descending=[True, True]), [5, 3, 4, 0, 1, 2]),
        (pl.arg_sort_by(["x", "y"], descending=[True, False]), [5, 4, 3, 2, 0, 1]),
        (pl.arg_sort_by(["x", "y"], descending=[False, True]), [0, 1, 2, 3, 4, 5]),
    ):
        assert (df.select(expr.alias("idx"))["idx"] == expected).all()


def test_literal_series() -> None:
    df = pl.DataFrame(
        {
            "a": np.array([21.7, 21.8, 21], dtype=np.float32),
            "b": np.array([1, 3, 2], dtype=np.int8),
            "c": ["reg1", "reg2", "reg3"],
            "d": np.array(
                [datetime(2022, 8, 16), datetime(2022, 8, 17), datetime(2022, 8, 18)],
                dtype="<M8[ns]",
            ),
        },
        schema_overrides={"a": pl.Float64},
    )
    out = (
        df.lazy()
        .with_columns(pl.Series("e", [2, 1, 3], pl.Int32))
        .with_columns(pl.col("e").cast(pl.Float32))
        .collect()
    )
    expected_schema = {
        "a": pl.Float64,
        "b": pl.Int8,
        "c": pl.String,
        "d": pl.Datetime("ns"),
        "e": pl.Float32,
    }
    assert_frame_equal(
        pl.DataFrame(
            [
                (21.7, 1, "reg1", datetime(2022, 8, 16, 0), 2),
                (21.8, 3, "reg2", datetime(2022, 8, 17, 0), 1),
                (21.0, 2, "reg3", datetime(2022, 8, 18, 0), 3),
            ],
            schema=expected_schema,  # type: ignore[arg-type]
            orient="row",
        ),
        out,
        abs_tol=0.00001,
    )


def test_write_csv() -> None:
    df = pl.DataFrame(
        {
            "foo": [1, 2, 3, 4, 5],
            "bar": [6, 7, 8, 9, 10],
            "ham": ["a", "b", "c", "d", "e"],
        }
    )
    expected = "foo,bar,ham\n1,6,a\n2,7,b\n3,8,c\n4,9,d\n5,10,e\n"

    # if no file argument is supplied, write_csv() will return the string
    s = df.write_csv()
    assert s == expected

    # otherwise it will write to the file/iobuffer
    file = BytesIO()
    df.write_csv(file)
    file.seek(0)
    s = file.read().decode("utf8")
    assert s == expected


def test_from_generator_or_iterable() -> None:
    # generator function
    def gen(n: int, *, strkey: bool = True) -> Iterator[Any]:
        for i in range(n):
            yield (str(i) if strkey else i), 1 * i, 2**i, 3**i

    # iterable object
    class Rows:
        def __init__(self, n: int, *, strkey: bool = True) -> None:
            self._n = n
            self._strkey = strkey

        def __iter__(self) -> Iterator[Any]:
            yield from gen(self._n, strkey=self._strkey)

    # check init from column-oriented generator
    assert_frame_equal(
        pl.DataFrame(data=gen(4, strkey=False), orient="col"),
        pl.DataFrame(
            data=[(0, 0, 1, 1), (1, 1, 2, 3), (2, 2, 4, 9), (3, 3, 8, 27)], orient="col"
        ),
    )
    # check init from row-oriented generators (more common)
    expected = pl.DataFrame(
        data=list(gen(4)), schema=["a", "b", "c", "d"], orient="row"
    )
    for generated_frame in (
        pl.DataFrame(data=gen(4), schema=["a", "b", "c", "d"]),
        pl.DataFrame(data=Rows(4), schema=["a", "b", "c", "d"]),
        pl.DataFrame(data=(x for x in Rows(4)), schema=["a", "b", "c", "d"]),
    ):
        assert_frame_equal(expected, generated_frame)
        assert generated_frame.schema == {
            "a": pl.String,
            "b": pl.Int64,
            "c": pl.Int64,
            "d": pl.Int64,
        }

    # test 'iterable_to_pydf' directly to validate 'chunk_size' behaviour
    cols = ["a", "b", ("c", pl.Int8), "d"]

    expected_data = [("0", 0, 1, 1), ("1", 1, 2, 3), ("2", 2, 4, 9), ("3", 3, 8, 27)]
    expected_schema = [
        ("a", pl.String),
        ("b", pl.Int64),
        ("c", pl.Int8),
        ("d", pl.Int64),
    ]

    for params in (
        {"data": Rows(4)},
        {"data": gen(4), "chunk_size": 2},
        {"data": Rows(4), "chunk_size": 3},
        {"data": gen(4), "infer_schema_length": None},
        {"data": Rows(4), "infer_schema_length": 1},
        {"data": gen(4), "chunk_size": 2},
        {"data": Rows(4), "infer_schema_length": 5},
        {"data": gen(4), "infer_schema_length": 3, "chunk_size": 2},
        {"data": gen(4), "infer_schema_length": None, "chunk_size": 3},
    ):
        d = iterable_to_pydf(schema=cols, **params)  # type: ignore[arg-type]
        assert expected_data == d.row_tuples()
        assert expected_schema == list(zip(d.columns(), d.dtypes()))

    # ref: issue #6489 (initial chunk_size cannot be smaller than 'infer_schema_length')
    df = pl.DataFrame(
        data=iter(([{"col": None}] * 1000) + [{"col": ["a", "b", "c"]}]),
        infer_schema_length=1001,
    )
    assert df.schema == {"col": pl.List(pl.String)}
    assert df[-2:]["col"].to_list() == [None, ["a", "b", "c"]]

    # empty iterator
    assert_frame_equal(
        pl.DataFrame(data=gen(0), schema=["a", "b", "c", "d"]),
        pl.DataFrame(schema=["a", "b", "c", "d"]),
    )


def test_from_rows() -> None:
    df = pl.from_records([[1, 2, "foo"], [2, 3, "bar"]], orient="row")
    assert_frame_equal(
        df,
        pl.DataFrame(
            {"column_0": [1, 2], "column_1": [2, 3], "column_2": ["foo", "bar"]}
        ),
    )
    df = pl.from_records(
        [[1, datetime.fromtimestamp(100)], [2, datetime.fromtimestamp(2398754908)]],
        schema_overrides={"column_0": pl.UInt32},
        orient="row",
    )
    assert df.dtypes == [pl.UInt32, pl.Datetime]

    # auto-inference with same num rows/cols
    data = [(1, 2, "foo"), (2, 3, "bar"), (3, 4, "baz")]
    df = pl.from_records(data, orient="row")
    assert data == df.rows()
"_meta": "a"},1374{"id": 2, "value": 101, "_meta": "b"},1375],1376[1377{"id": 1, "value": 100, "_meta": "a"},1378{"id": 2, "value": 101, "_meta": "b"},1379None,1380],1381[1382MappingObject(id=1, value=100, _meta="a"),1383MappingObject(id=2, value=101, _meta="b"),1384],1385[1386None,1387MappingObject(id=1, value=100, _meta="a"),1388MappingObject(id=2, value=101, _meta="b"),1389],1390[1391MappingObject(id=1, value=100, _meta="a"),1392MappingObject(id=2, value=101, _meta="b"),1393None,1394],1395],1396)1397def test_from_rows_of_dicts(records: list[dict[str, Any]]) -> None:1398for df_init in (pl.from_dicts, pl.DataFrame):1399df1 = df_init(records).remove(pl.col("id").is_null())1400assert df1.rows() == [(1, 100, "a"), (2, 101, "b")]14011402overrides = {1403"id": pl.Int16,1404"value": pl.Int32,1405}1406df2 = df_init(records, schema_overrides=overrides).remove(1407pl.col("id").is_null()1408)1409assert df2.rows() == [(1, 100, "a"), (2, 101, "b")]1410assert df2.schema == {"id": pl.Int16, "value": pl.Int32, "_meta": pl.String}14111412df3 = df_init(records, schema=overrides).remove(pl.col("id").is_null())1413assert df3.rows() == [(1, 100), (2, 101)]1414assert df3.schema == {"id": pl.Int16, "value": pl.Int32}14151416# explicitly check "anyvalue" conversion for dict/mapping dtypes1417py_s = PySeries.new_from_any_values("s", records, True)1418assert py_s.dtype() == pl.Struct(1419{1420"id": pl.Int64,1421"value": pl.Int64,1422"_meta": pl.String,1423}1424)142514261427def test_from_records_with_schema_overrides_12032() -> None:1428# the 'id' fields contains an int value that exceeds Int64 and doesn't have an exact1429# Float64 representation; confirm that the override is applied *during* inference,1430# not as a post-inference cast, so we maintain the accuracy of the original value.1431rec = [1432{"id": 9187643043065364490, "x": 333, "y": None},1433{"id": 9223671840084328467, "x": 666.5, "y": 1698177261953686},1434{"id": 9187643043065364505, "x": 999, "y": 
9223372036854775807},1435]1436df = pl.from_records(rec, schema_overrides={"x": pl.Float32, "id": pl.UInt64})1437assert df.schema == OrderedDict(1438[1439("id", pl.UInt64),1440("x", pl.Float32),1441("y", pl.Int64),1442]1443)1444assert rec == df.rows(named=True)144514461447def test_from_large_uint64_misc() -> None:1448uint_data = [[9187643043065364490, 9223671840084328467, 9187643043065364505]]14491450df = pl.DataFrame(uint_data, orient="col", schema_overrides={"column_0": pl.UInt64})1451assert df["column_0"].dtype == pl.UInt641452assert df["column_0"].to_list() == uint_data[0]14531454for overrides in ({}, {"column_1": pl.UInt64}):1455df = pl.DataFrame(1456uint_data,1457orient="row",1458schema_overrides=overrides,1459)1460assert df.schema == OrderedDict(1461[1462("column_0", pl.Int64),1463("column_1", pl.Int128 if overrides == {} else pl.UInt64),1464("column_2", pl.Int64),1465]1466)1467assert df.row(0) == tuple(uint_data[0])146814691470def test_repeat_by_unequal_lengths_panic() -> None:1471df = pl.DataFrame(1472{1473"a": ["x", "y", "z"],1474}1475)1476with pytest.raises(ShapeError):1477df.select(pl.col("a").repeat_by(pl.Series([2, 2])))147814791480@pytest.mark.parametrize(1481("value", "values_expect"),1482[1483(1.2, [[1.2], [1.2, 1.2], [1.2, 1.2, 1.2]]),1484(True, [[True], [True, True], [True, True, True]]),1485("x", [["x"], ["x", "x"], ["x", "x", "x"]]),1486(b"a", [[b"a"], [b"a", b"a"], [b"a", b"a", b"a"]]),1487],1488)1489def test_repeat_by_broadcast_left(1490value: float | bool | str, values_expect: list[list[float | bool | str]]1491) -> None:1492df = pl.DataFrame(1493{1494"n": [1, 2, 3],1495}1496)1497expected = pl.DataFrame({"values": values_expect})1498result = df.select(pl.lit(value).repeat_by(pl.col("n")).alias("values"))1499assert_frame_equal(result, expected)150015011502@pytest.mark.parametrize(1503("a", "a_expected"),1504[1505([1.2, 2.2, 3.3], [[1.2, 1.2, 1.2], [2.2, 2.2, 2.2], [3.3, 3.3, 3.3]]),1506([True, False], [[True, True, True], [False, False, 
False]]),1507(["x", "y", "z"], [["x", "x", "x"], ["y", "y", "y"], ["z", "z", "z"]]),1508(1509[b"a", b"b", b"c"],1510[[b"a", b"a", b"a"], [b"b", b"b", b"b"], [b"c", b"c", b"c"]],1511),1512],1513)1514def test_repeat_by_broadcast_right(1515a: list[float | bool | str], a_expected: list[list[float | bool | str]]1516) -> None:1517df = pl.DataFrame(1518{1519"a": a,1520}1521)1522expected = pl.DataFrame({"a": a_expected})1523result = df.select(pl.col("a").repeat_by(3))1524assert_frame_equal(result, expected)1525result = df.select(pl.col("a").repeat_by(pl.lit(3)))1526assert_frame_equal(result, expected)152715281529@pytest.mark.parametrize(1530("a", "a_expected"),1531[1532(["foo", "bar"], [["foo", "foo"], ["bar", "bar", "bar"]]),1533([1, 2], [[1, 1], [2, 2, 2]]),1534([True, False], [[True, True], [False, False, False]]),1535(1536[b"a", b"b"],1537[[b"a", b"a"], [b"b", b"b", b"b"]],1538),1539],1540)1541def test_repeat_by(1542a: list[float | bool | str], a_expected: list[list[float | bool | str]]1543) -> None:1544df = pl.DataFrame({"a": a, "n": [2, 3]})1545expected = pl.DataFrame({"a": a_expected})1546result = df.select(pl.col("a").repeat_by("n"))1547assert_frame_equal(result, expected)154815491550def test_join_dates() -> None:1551dts_in = pl.datetime_range(1552datetime(2021, 6, 24),1553datetime(2021, 6, 24, 10, 0, 0),1554interval=timedelta(hours=1),1555closed="left",1556eager=True,1557)1558dts = (1559dts_in.cast(int)1560.map_elements(lambda x: x + np.random.randint(1_000 * 60, 60_000 * 60))1561.cast(pl.Datetime)1562)15631564# some df with sensor id, (randomish) datetime and some value1565df = pl.DataFrame(1566{1567"sensor": ["a"] * 5 + ["b"] * 5,1568"datetime": dts,1569"value": [2, 3, 4, 1, 2, 3, 5, 1, 2, 3],1570}1571)1572out = df.join(df, on="datetime")1573assert out.height == df.height157415751576def test_asof_cross_join() -> None:1577left = pl.DataFrame({"a": [-10, 5, 10], "left_val": ["a", "b", "c"]}).with_columns(1578pl.col("a").set_sorted()1579)1580right = 
pl.DataFrame(1581{"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}1582).with_columns(pl.col("a").set_sorted())15831584# only test dispatch of asof join1585out = left.join_asof(right, on="a")1586assert out.shape == (3, 3)15871588left.lazy().join_asof(right.lazy(), on="a").collect()1589assert out.shape == (3, 3)15901591# only test dispatch of cross join1592out = left.join(right, how="cross")1593assert out.shape == (15, 4)15941595left.lazy().join(right.lazy(), how="cross").collect()1596assert out.shape == (15, 4)159715981599def test_join_bad_input_type() -> None:1600left = pl.DataFrame({"a": [1, 2, 3]})1601right = pl.DataFrame({"a": [1, 2, 3]})16021603with pytest.raises(1604TypeError,1605match="expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",1606):1607left.join(right.lazy(), on="a") # type: ignore[arg-type]16081609with pytest.raises(1610TypeError,1611match="expected `other` .*to be a 'DataFrame'.* not 'Series'",1612):1613left.join(pl.Series([1, 2, 3]), on="a") # type: ignore[arg-type]16141615class DummyDataFrameSubclass(pl.DataFrame):1616pass16171618right = DummyDataFrameSubclass(right)16191620left.join(right, on="a")162116221623def test_join_where() -> None:1624east = pl.DataFrame(1625{1626"id": [100, 101, 102],1627"dur": [120, 140, 160],1628"rev": [12, 14, 16],1629"cores": [2, 8, 4],1630}1631)1632west = pl.DataFrame(1633{1634"t_id": [404, 498, 676, 742],1635"time": [90, 130, 150, 170],1636"cost": [9, 13, 15, 16],1637"cores": [4, 2, 1, 4],1638}1639)1640out = east.join_where(1641west,1642pl.col("dur") < pl.col("time"),1643pl.col("rev") < pl.col("cost"),1644)16451646expected = pl.DataFrame(1647{1648"id": [100, 100, 100, 101, 101],1649"dur": [120, 120, 120, 140, 140],1650"rev": [12, 12, 12, 14, 14],1651"cores": [2, 2, 2, 8, 8],1652"t_id": [498, 676, 742, 676, 742],1653"time": [130, 150, 170, 150, 170],1654"cost": [13, 15, 16, 15, 16],1655"cores_right": [2, 1, 4, 1, 4],1656}1657)16581659assert_frame_equal(out, expected)166016611662def 
test_join_where_bad_input_type() -> None:1663east = pl.DataFrame(1664{1665"id": [100, 101, 102],1666"dur": [120, 140, 160],1667"rev": [12, 14, 16],1668"cores": [2, 8, 4],1669}1670)1671west = pl.DataFrame(1672{1673"t_id": [404, 498, 676, 742],1674"time": [90, 130, 150, 170],1675"cost": [9, 13, 15, 16],1676"cores": [4, 2, 1, 4],1677}1678)1679with pytest.raises(1680TypeError,1681match="expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",1682):1683east.join_where(1684west.lazy(), # type: ignore[arg-type]1685pl.col("dur") < pl.col("time"),1686pl.col("rev") < pl.col("cost"),1687)16881689with pytest.raises(1690TypeError,1691match="expected `other` .*to be a 'DataFrame'.* not 'Series'",1692):1693east.join_where(1694pl.Series(west), # type: ignore[arg-type]1695pl.col("dur") < pl.col("time"),1696pl.col("rev") < pl.col("cost"),1697)16981699class DummyDataFrameSubclass(pl.DataFrame):1700pass17011702west = DummyDataFrameSubclass(west)17031704east.join_where(1705west,1706pl.col("dur") < pl.col("time"),1707pl.col("rev") < pl.col("cost"),1708)170917101711def test_str_concat() -> None:1712df = pl.DataFrame(1713{1714"nrs": [1, 2, 3, 4],1715"name": ["ham", "spam", "foo", None],1716}1717)1718out = df.with_columns((pl.lit("Dr. ") + pl.col("name")).alias("graduated_name"))1719assert out["graduated_name"][0] == "Dr. ham"1720assert out["graduated_name"][1] == "Dr. 
spam"172117221723def test_dot_product() -> None:1724df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [2, 2, 2, 2]})17251726assert df["a"].dot(df["b"]) == 201727assert typing.cast("int", df.select([pl.col("a").dot("b")])[0, "a"]) == 2017281729result = pl.Series([1, 2, 3]) @ pl.Series([4, 5, 6])1730assert isinstance(result, int)1731assert result == 3217321733result = pl.Series([1, 2, 3]) @ pl.Series([4.0, 5.0, 6.0])1734assert isinstance(result, float)1735assert result == 32.017361737result = pl.Series([1.0, 2.0, 3.0]) @ pl.Series([4.0, 5.0, 6.0])1738assert isinstance(result, float)1739assert result == 32.017401741with pytest.raises(1742InvalidOperationError, match="`dot` operation not supported for dtype `bool`"1743):1744pl.Series([True, False, False, True]) @ pl.Series([4, 5, 6, 7])17451746with pytest.raises(1747InvalidOperationError, match="`dot` operation not supported for dtype `str`"1748):1749pl.Series([1, 2, 3, 4]) @ pl.Series(["True", "False", "False", "True"])175017511752def test_hash_rows() -> None:1753df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [2, 2, 2, 2]})1754assert df.hash_rows().dtype == pl.UInt641755assert df["a"].hash().dtype == pl.UInt641756assert df.select([pl.col("a").hash().alias("foo")])["foo"].dtype == pl.UInt64175717581759def test_reproducible_hash_with_seeds() -> None:1760"""1761Test the reproducibility of DataFrame.hash_rows, Series.hash, and Expr.hash.17621763cf. 
issue #3966, hashes must always be reproducible across sessions when using1764the same seeds.1765"""1766df = pl.DataFrame({"s": [1234, None, 5678]})1767seeds = (11, 22, 33, 44)1768expected = pl.Series(1769"s",1770[10832467230526607564, 3044502640115867787, 17228373233104406792],1771dtype=pl.UInt64,1772)1773result = df.hash_rows(*seeds)1774assert_series_equal(expected, result, check_names=False, check_exact=True)1775result = df["s"].hash(*seeds)1776assert_series_equal(expected, result, check_names=False, check_exact=True)1777result = df.select([pl.col("s").hash(*seeds)])["s"]1778assert_series_equal(expected, result, check_names=False, check_exact=True)177917801781@pytest.mark.slow1782@pytest.mark.parametrize(1783"e",1784[1785pl.int_range(1_000_000),1786# Test code path for null_count > 01787pl.when(pl.int_range(1_000_000) != 0).then(pl.int_range(1_000_000)),1788],1789)1790def test_hash_collision_multiple_columns_equal_values_15390(e: pl.Expr) -> None:1791df = pl.select(e.alias("a"))17921793for n_columns in (1, 2, 3, 4):1794s = df.select(pl.col("a").alias(f"x{i}") for i in range(n_columns)).hash_rows()17951796vc = s.sort().value_counts(sort=True)1797max_bucket_size = vc["count"][0]17981799assert max_bucket_size == 1180018011802@pytest.mark.may_fail_auto_streaming # Python objects not yet supported in row encoding1803@pytest.mark.may_fail_cloud1804def test_hashing_on_python_objects() -> None:1805# see if we can do a group_by, drop_duplicates on a DataFrame with objects.1806# this requires that the hashing and aggregations are done on python objects18071808df = pl.DataFrame({"a": [1, 1, 3, 4], "b": [1, 1, 2, 2]})18091810class Foo:1811def __hash__(self) -> int:1812return 018131814def __eq__(self, other: object) -> bool:1815return True18161817df = df.with_columns(pl.col("a").map_elements(lambda x: Foo()).alias("foo"))1818assert df.group_by(["foo"]).first().shape == (1, 3)1819assert df.unique().shape == (3, 3)182018211822def test_unique_unit_rows() -> None:1823df = 
pl.DataFrame({"a": [1], "b": [None]}, schema={"a": pl.Int64, "b": pl.Float32})18241825# 'unique' one-row frame should be equal to the original frame1826assert_frame_equal(df, df.unique(subset="a"))1827for col in df.columns:1828assert df.n_unique(subset=[col]) == 1182918301831def test_panic() -> None:1832# may contain some tests that yielded a panic in polars or pl_arrow1833# https://github.com/pola-rs/polars/issues/11101834a = pl.DataFrame(1835{1836"col1": ["a"] * 500 + ["b"] * 500,1837}1838)1839a.filter(pl.col("col1") != "b")184018411842def test_horizontal_agg() -> None:1843df = pl.DataFrame({"a": [1, None, 3], "b": [1, 2, 3]})18441845assert_series_equal(df.sum_horizontal(), pl.Series("sum", [2, 2, 6]))1846assert_series_equal(1847df.sum_horizontal(ignore_nulls=False), pl.Series("sum", [2, None, 6])1848)1849assert_series_equal(1850df.mean_horizontal(ignore_nulls=False), pl.Series("mean", [1.0, None, 3.0])1851)185218531854def test_slicing() -> None:1855# https://github.com/pola-rs/polars/issues/13221856n = 2018571858df = pl.DataFrame(1859{1860"d": ["u", "u", "d", "c", "c", "d", "d"] * n,1861"v1": [None, "help", None, None, None, None, None] * n,1862}1863)18641865assert (df.filter(pl.col("d") != "d").select([pl.col("v1").unique()])).shape == (18662,18671,1868)186918701871def test_group_by_cat_list() -> None:1872grouped = (1873pl.DataFrame(1874[1875pl.Series("str_column", ["a", "b", "b", "a", "b"]),1876pl.Series("int_column", [1, 1, 2, 2, 3]),1877]1878)1879.with_columns(pl.col("str_column").cast(pl.Categorical).alias("cat_column"))1880.group_by("int_column", maintain_order=True)1881.agg([pl.col("cat_column")])["cat_column"]1882)18831884out = grouped.explode()1885assert out.dtype == pl.Categorical1886assert out[0] == "a"188718881889def test_group_by_agg_n_unique_floats() -> None:1890# tests proper dispatch1891df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})18921893for dtype in [pl.Float32, pl.Float64]:1894out = df.group_by("a", 
maintain_order=True).agg(1895[pl.col("b").cast(dtype).n_unique()]1896)1897assert out["b"].to_list() == [2, 1]189818991900def test_group_by_agg_n_unique_empty_group_idx_path() -> None:1901df = pl.DataFrame(1902{1903"key": [1, 1, 1, 2, 2, 2],1904"value": [1, 2, 3, 4, 5, 6],1905"filt": [True, True, True, False, False, False],1906}1907)1908out = df.group_by("key", maintain_order=True).agg(1909pl.col("value").filter("filt").n_unique().alias("n_unique")1910)1911expected = pl.DataFrame(1912{1913"key": [1, 2],1914"n_unique": pl.Series([3, 0], dtype=pl.UInt32),1915}1916)1917assert_frame_equal(out, expected)191819191920def test_group_by_agg_n_unique_empty_group_slice_path() -> None:1921df = pl.DataFrame(1922{1923"key": [1, 1, 1, 2, 2, 2],1924"value": [1, 2, 3, 4, 5, 6],1925"filt": [False, False, False, False, False, False],1926}1927)1928out = df.group_by("key", maintain_order=True).agg(1929pl.col("value").filter("filt").n_unique().alias("n_unique")1930)1931expected = pl.DataFrame(1932{1933"key": [1, 2],1934"n_unique": pl.Series([0, 0], dtype=pl.UInt32),1935}1936)1937assert_frame_equal(out, expected)193819391940def test_select_by_dtype(df: pl.DataFrame) -> None:1941out = df.select(pl.col(pl.String))1942assert out.columns == ["strings", "strings_nulls"]1943out = df.select(pl.col([pl.String, pl.Boolean]))1944assert out.columns == ["bools", "bools_nulls", "strings", "strings_nulls"]1945out = df.select(pl.col(INTEGER_DTYPES))1946assert out.columns == ["int", "int_nulls"]19471948out = df.select(ints=pl.struct(pl.col(INTEGER_DTYPES)))1949assert out.schema == {1950"ints": pl.Struct([pl.Field("int", pl.Int64), pl.Field("int_nulls", pl.Int64)])1951}195219531954def test_with_row_index() -> None:1955df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})19561957out = df.with_row_index()1958assert out["index"].to_list() == [0, 1, 2]19591960out = df.lazy().with_row_index().collect()1961assert out["index"].to_list() == [0, 1, 2]196219631964def test_with_row_index_bad_offset() -> 
None:1965df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})19661967with pytest.raises(ValueError, match="cannot be negative"):1968df.with_row_index(offset=-1)1969with pytest.raises(1970ValueError, match="cannot be greater than the maximum index value"1971):1972df.with_row_index(offset=2**32)197319741975def test_with_row_index_bad_offset_lazy() -> None:1976lf = pl.LazyFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})19771978with pytest.raises(ValueError, match="cannot be negative"):1979lf.with_row_index(offset=-1)1980with pytest.raises(1981ValueError, match="cannot be greater than the maximum index value"1982):1983lf.with_row_index(offset=2**32)198419851986def test_with_row_count_deprecated() -> None:1987df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})19881989with pytest.deprecated_call():1990out = df.with_row_count()1991assert out["row_nr"].to_list() == [0, 1, 2]19921993with pytest.deprecated_call():1994out = df.lazy().with_row_count().collect()1995assert out["row_nr"].to_list() == [0, 1, 2]199619971998@pytest.mark.may_fail_cloud1999def test_filter_with_all_expansion() -> None:2000df = pl.DataFrame(2001{2002"b": [1, 2, None],2003"c": [1, 2, None],2004"a": [None, None, None],2005}2006)2007out = df.filter(~pl.fold(True, lambda acc, s: acc & s.is_null(), pl.all()))2008assert out.shape == (2, 3)200920102011# TODO: investigate this discrepancy in auto streaming2012@pytest.mark.may_fail_auto_streaming2013@pytest.mark.may_fail_cloud2014def test_extension() -> None:2015class Foo:2016def __init__(self, value: Any) -> None:2017self.value = value20182019def __repr__(self) -> str:2020return f"foo({self.value})"20212022foos = [Foo(1), Foo(2), Foo(3)]20232024# foos and sys.getrefcount both have a reference.2025base_count = 220262027# We compute the refcount on a separate line otherwise pytest's assert magic2028# might add reference counts.2029rc = sys.getrefcount(foos[0])2030assert rc == base_count20312032df = pl.DataFrame({"groups": [1, 1, 2], "a": foos})2033rc = 
sys.getrefcount(foos[0])2034assert rc == base_count + 12035del df2036rc = sys.getrefcount(foos[0])2037assert rc == base_count20382039df = pl.DataFrame({"groups": [1, 1, 2], "a": foos})2040rc = sys.getrefcount(foos[0])2041assert rc == base_count + 120422043out = df.group_by("groups", maintain_order=True).agg(pl.col("a").alias("a"))2044rc = sys.getrefcount(foos[0])2045assert rc == base_count + 22046s = out["a"].list.explode()2047rc = sys.getrefcount(foos[0])2048assert rc == base_count + 32049del s2050rc = sys.getrefcount(foos[0])2051assert rc == base_count + 220522053assert out["a"].list.explode().to_list() == foos2054rc = sys.getrefcount(foos[0])2055assert rc == base_count + 22056del out2057rc = sys.getrefcount(foos[0])2058assert rc == base_count + 12059del df2060rc = sys.getrefcount(foos[0])2061assert rc == base_count206220632064@pytest.mark.parametrize("name", [None, "n", ""])2065def test_group_by_order_dispatch(name: str | None) -> None:2066df = pl.DataFrame({"x": list("bab"), "y": range(3)})2067lf = df.lazy()20682069result = df.group_by("x", maintain_order=True).len(name=name)2070lazy_result = lf.group_by("x").len(name=name).sort(by="x", descending=True)20712072name = "len" if name is None else name2073expected = pl.DataFrame(2074data={"x": ["b", "a"], name: [2, 1]},2075schema_overrides={name: pl.UInt32},2076)2077assert_frame_equal(result, expected)2078assert_frame_equal(lazy_result.collect(), expected)20792080result = df.group_by("x", maintain_order=True).all()2081expected = pl.DataFrame({"x": ["b", "a"], "y": [[0, 2], [1]]})2082assert_frame_equal(result, expected)208320842085def test_partitioned_group_by_order() -> None:2086# check if group ordering is maintained.2087# we only have 30 groups, so this triggers a partitioned group by2088df = pl.DataFrame({"x": [chr(v) for v in range(33, 63)], "y": range(30)})2089out = df.group_by("x", maintain_order=True).agg(pl.all().implode())2090assert_series_equal(out["x"], df["x"])209120922093def test_schema() -> 
None:2094df = pl.DataFrame(2095{"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}2096)2097expected = {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String}2098assert df.schema == expected209921002101def test_schema_equality() -> None:2102lf = pl.LazyFrame({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]})2103lf_rev = lf.select("bar", "foo")21042105assert lf.collect_schema() != lf_rev.collect_schema()2106assert lf.collect().schema != lf_rev.collect().schema210721082109def test_df_schema_unique() -> None:2110df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})2111with pytest.raises(DuplicateError):2112df.columns = ["a", "a"]21132114with pytest.raises(DuplicateError):2115df.rename({"b": "a"})211621172118def test_empty_projection() -> None:2119empty_df = pl.DataFrame({"a": [1, 2], "b": [3, 4]}).select([])2120assert empty_df.rows() == []2121assert empty_df.schema == {}2122assert empty_df.shape == (0, 0)212321242125def test_fill_null() -> None:2126df = pl.DataFrame({"a": [1, 2], "b": [3, None]})2127assert_frame_equal(df.fill_null(4), pl.DataFrame({"a": [1, 2], "b": [3, 4]}))2128assert_frame_equal(2129df.fill_null(strategy="max"), pl.DataFrame({"a": [1, 2], "b": [3, 3]})2130)21312132# string and list data2133# string goes via binary2134df = pl.DataFrame(2135{2136"c": [2137["Apple", "Orange"],2138["Apple", "Orange"],2139None,2140["Carrot"],2141None,2142None,2143],2144"b": ["Apple", "Orange", None, "Carrot", None, None],2145}2146)21472148assert df.select(2149pl.all().fill_null(strategy="forward").name.suffix("_forward"),2150pl.all().fill_null(strategy="backward").name.suffix("_backward"),2151).to_dict(as_series=False) == {2152"c_forward": [2153["Apple", "Orange"],2154["Apple", "Orange"],2155["Apple", "Orange"],2156["Carrot"],2157["Carrot"],2158["Carrot"],2159],2160"b_forward": ["Apple", "Orange", "Orange", "Carrot", "Carrot", "Carrot"],2161"c_backward": [2162["Apple", "Orange"],2163["Apple", "Orange"],2164["Carrot"],2165["Carrot"],2166None,2167None,2168],2169"b_backward": 
["Apple", "Orange", "Carrot", "Carrot", None, None],2170}2171# categoricals2172df = pl.DataFrame(pl.Series("cat", ["a", None], dtype=pl.Categorical))2173s = df.select(pl.col("cat").fill_null(strategy="forward"))["cat"]2174assert s.dtype == pl.Categorical2175assert s.to_list() == ["a", "a"]217621772178def test_fill_nan() -> None:2179df = pl.DataFrame({"a": [1, 2], "b": [3.0, float("nan")]})2180assert_frame_equal(2181df.fill_nan(4),2182pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}),2183)2184assert_frame_equal(2185df.fill_nan(None),2186pl.DataFrame({"a": [1, 2], "b": [3.0, None]}),2187)2188assert df["b"].fill_nan(5.0).to_list() == [3.0, 5.0]2189df = pl.DataFrame(2190{2191"a": [1.0, np.nan, 3.0],2192"b": [datetime(1, 2, 2), datetime(2, 2, 2), datetime(3, 2, 2)],2193}2194)2195assert df.fill_nan(2.0).dtypes == [pl.Float64, pl.Datetime]219621972198#2199def test_forward_fill() -> None:2200df = pl.DataFrame({"a": [1.0, None, 3.0]})2201fill = df.select(pl.col("a").forward_fill())["a"]2202assert_series_equal(fill, pl.Series("a", [1, 1, 3]).cast(pl.Float64))22032204df = pl.DataFrame({"a": [None, 1, None]})2205fill = df.select(pl.col("a").forward_fill())["a"]2206assert_series_equal(fill, pl.Series("a", [None, 1, 1]).cast(pl.Int64))220722082209def test_backward_fill() -> None:2210df = pl.DataFrame({"a": [1.0, None, 3.0]})2211fill = df.select(pl.col("a").backward_fill())["a"]2212assert_series_equal(fill, pl.Series("a", [1, 3, 3]).cast(pl.Float64))22132214df = pl.DataFrame({"a": [None, 1, None]})2215fill = df.select(pl.col("a").backward_fill())["a"]2216assert_series_equal(fill, pl.Series("a", [1, 1, None]).cast(pl.Int64))221722182219def test_shrink_to_fit() -> None:2220df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]})22212222assert df.shrink_to_fit(in_place=True) is df2223assert df.shrink_to_fit(in_place=False) is not df2224assert_frame_equal(df.shrink_to_fit(in_place=False), df)222522262227def test_add_string() -> None:2228df = pl.DataFrame({"a": 
["hi", "there"], "b": ["hello", "world"]})2229expected = pl.DataFrame(2230{"a": ["hi hello", "there hello"], "b": ["hello hello", "world hello"]}2231)2232assert_frame_equal((df + " hello"), expected)22332234expected = pl.DataFrame(2235{"a": ["hello hi", "hello there"], "b": ["hello hello", "hello world"]}2236)2237assert_frame_equal(("hello " + df), expected)223822392240def test_df_broadcast() -> None:2241df = pl.DataFrame({"a": [1, 2, 3]}, schema_overrides={"a": pl.UInt8})2242out = df.with_columns(pl.lit(pl.Series("s", [[1, 2]])).first())2243assert out.shape == (3, 2)2244assert out.schema == {"a": pl.UInt8, "s": pl.List(pl.Int64)}2245assert out.rows() == [(1, [1, 2]), (2, [1, 2]), (3, [1, 2])]224622472248@pytest.mark.may_fail_cloud # not a lazyframe method2249def test_product() -> None:2250df = pl.DataFrame(2251{2252"int": [1, 2, 3],2253"flt": [-1.0, 12.0, 9.0],2254"bool_0": [True, False, True],2255"bool_1": [True, True, True],2256"str": ["a", "b", "c"],2257},2258schema_overrides={2259"int": pl.UInt16,2260"flt": pl.Float32,2261},2262)2263out = df.product()2264expected = pl.DataFrame(2265{"int": [6], "flt": [-108.0], "bool_0": [0], "bool_1": [1], "str": [None]}2266)2267assert_frame_not_equal(out, expected, check_dtypes=True)2268assert_frame_equal(out, expected, check_dtypes=False)226922702271def test_first_last_nth_expressions(fruits_cars: pl.DataFrame) -> None:2272df = fruits_cars2273out = df.select(pl.first())2274assert out.columns == ["A"]22752276out = df.select(pl.last())2277assert out.columns == ["cars"]22782279out = df.select(pl.nth(0))2280assert out.columns == ["A"]22812282out = df.select(pl.nth(1))2283assert out.columns == ["fruits"]22842285out = df.select(pl.nth(-2))2286assert out.columns == ["B"]228722882289def test_is_between(fruits_cars: pl.DataFrame) -> None:2290result = fruits_cars.select(pl.col("A").is_between(2, 4)).to_series()2291assert_series_equal(result, pl.Series("A", [False, True, True, True, False]))22922293result = 
fruits_cars.select(pl.col("A").is_between(2, 4, closed="none")).to_series()2294assert_series_equal(result, pl.Series("A", [False, False, True, False, False]))22952296result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="both")).to_series()2297assert_series_equal(result, pl.Series("A", [False, True, True, True, False]))22982299result = fruits_cars.select(2300pl.col("A").is_between(2, 4, closed="right")2301).to_series()2302assert_series_equal(result, pl.Series("A", [False, False, True, True, False]))23032304result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="left")).to_series()2305assert_series_equal(result, pl.Series("A", [False, True, True, False, False]))230623072308def test_is_between_data_types() -> None:2309df = pl.DataFrame(2310{2311"flt": [1.4, 1.2, 2.5],2312"int": [2, 3, 4],2313"str": ["xyz", "str", "abc"],2314"date": [date(2020, 1, 1), date(2020, 2, 2), date(2020, 3, 3)],2315"datetime": [2316datetime(2020, 1, 1, 0, 0, 0),2317datetime(2020, 1, 1, 10, 0, 0),2318datetime(2020, 1, 1, 12, 0, 0),2319],2320"tm": [time(10, 30), time(0, 45), time(15, 15)],2321}2322)23232324# on purpose, for float and int, we pass in a mixture of bound data types2325assert_series_equal(2326df.select(pl.col("flt").is_between(1, 2.3))[:, 0],2327pl.Series("flt", [True, True, False]),2328)2329assert_series_equal(2330df.select(pl.col("int").is_between(1.5, 3))[:, 0],2331pl.Series("int", [True, True, False]),2332)2333assert_series_equal(2334df.select(pl.col("date").is_between(date(2019, 1, 1), date(2020, 2, 5)))[:, 0],2335pl.Series("date", [True, True, False]),2336)2337assert_series_equal(2338df.select(2339pl.col("datetime").is_between(2340datetime(2020, 1, 1, 5, 0, 0), datetime(2020, 1, 1, 11, 0, 0)2341)2342)[:, 0],2343pl.Series("datetime", [False, True, False]),2344)2345assert_series_equal(2346df.select(2347pl.col("str").is_between(pl.lit("str"), pl.lit("zzz"), closed="left")2348)[:, 0],2349pl.Series("str", [True, True, 
False]),2350)2351assert_series_equal(2352df.select(2353pl.col("tm")2354.is_between(time(0, 45), time(10, 30), closed="right")2355.alias("tm_between")2356)[:, 0],2357pl.Series("tm_between", [True, False, False]),2358)235923602361def test_empty_is_in() -> None:2362df_empty_isin = pl.DataFrame({"foo": ["a", "b", "c", "d"]}).filter(2363pl.col("foo").is_in([])2364)2365assert df_empty_isin.shape == (0, 1)2366assert df_empty_isin.rows() == []2367assert df_empty_isin.schema == {"foo": pl.String}236823692370def test_group_by_slice_expression_args() -> None:2371df = pl.DataFrame({"groups": ["a"] * 10 + ["b"] * 20, "vals": range(30)})23722373out = (2374df.group_by("groups", maintain_order=True)2375.agg([pl.col("vals").slice((pl.len() * 0.1).cast(int), (pl.len() // 5))])2376.explode("vals")2377)23782379expected = pl.DataFrame(2380{"groups": ["a", "a", "b", "b", "b", "b"], "vals": [1, 2, 12, 13, 14, 15]}2381)2382assert_frame_equal(out, expected)238323842385def test_join_suffixes() -> None:2386df_a = pl.DataFrame({"A": [1], "B": [1]})2387df_b = pl.DataFrame({"A": [1], "B": [1]})23882389join_strategies: list[JoinStrategy] = ["left", "inner", "full", "cross"]2390for how in join_strategies:2391# no need for an assert, we error if wrong2392df_a.join(df_b, on="A" if how != "cross" else None, suffix="_y", how=how)["B_y"]23932394df_a.join_asof(df_b, on=pl.col("A").set_sorted(), suffix="_y")["B_y"]239523962397def test_explode_empty() -> None:2398df = (2399pl.DataFrame({"x": ["a", "a", "b", "b"], "y": [1, 1, 2, 2]})2400.group_by("x", maintain_order=True)2401.agg(pl.col("y").gather([]))2402)2403assert df.explode("y").to_dict(as_series=False) == {2404"x": ["a", "b"],2405"y": [None, None],2406}24072408df = pl.DataFrame({"x": ["1", "2", "4"], "y": [["a", "b", "c"], ["d"], []]})2409assert_frame_equal(2410df.explode("y"),2411pl.DataFrame({"x": ["1", "1", "1", "2", "4"], "y": ["a", "b", "c", "d", None]}),2412)24132414df = pl.DataFrame(2415{2416"letters": ["a"],2417"numbers": 
[[]],2418}2419)2420assert df.explode("numbers").to_dict(as_series=False) == {2421"letters": ["a"],2422"numbers": [None],2423}242424252426def test_asof_by_multiple_keys() -> None:2427lhs = pl.DataFrame(2428{2429"a": [-20, -19, 8, 12, 14],2430"by": [1, 1, 2, 2, 2],2431"by2": [1, 1, 2, 2, 2],2432}2433)24342435rhs = pl.DataFrame(2436{2437"a": [-19, -15, 3, 5, 13],2438"by": [1, 1, 2, 2, 2],2439"by2": [1, 1, 2, 2, 2],2440}2441)24422443result = lhs.join_asof(2444rhs, on=pl.col("a").set_sorted(), by=["by", "by2"], strategy="backward"2445).select(["a", "by"])2446expected = pl.DataFrame({"a": [-20, -19, 8, 12, 14], "by": [1, 1, 2, 2, 2]})2447assert_frame_equal(2448result.group_by("by").agg("a"),2449expected.group_by("by").agg("a"),2450check_row_order=False,2451)245224532454def test_asof_bad_input_type() -> None:2455lhs = pl.DataFrame({"a": [1, 2, 3]})2456rhs = pl.DataFrame({"a": [1, 2, 3]})24572458with pytest.raises(2459TypeError,2460match="expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",2461):2462lhs.join_asof(rhs.lazy(), on="a") # type: ignore[arg-type]24632464with pytest.raises(2465TypeError,2466match="expected `other` .*to be a 'DataFrame'.* not 'Series'",2467):2468lhs.join_asof(pl.Series([1, 2, 3]), on="a") # type: ignore[arg-type]24692470class DummyDataFrameSubclass(pl.DataFrame):2471pass24722473rhs = DummyDataFrameSubclass(rhs)24742475lhs.join_asof(rhs, on="a")247624772478def test_list_of_list_of_struct() -> None:2479expected = [{"list_of_list_of_struct": [[{"a": 1}, {"a": 2}]]}]2480pa_df = pa.Table.from_pylist(expected)24812482df = pl.from_arrow(pa_df)2483assert df.rows() == [([[{"a": 1}, {"a": 2}]],)] # type: ignore[union-attr]2484assert df.to_dicts() == expected # type: ignore[union-attr]24852486df = pl.from_arrow(pa_df[:0])2487assert df.to_dicts() == [] # type: ignore[union-attr]248824892490def test_fill_null_limits() -> None:2491assert pl.DataFrame(2492{2493"a": [1, None, None, None, 5, 6, None, None, None, 10],2494"b": ["a", None, None, None, "b", "c", 
None, None, None, "d"],2495"c": [True, None, None, None, False, True, None, None, None, False],2496}2497).select(2498pl.all().fill_null(strategy="forward", limit=2),2499pl.all().fill_null(strategy="backward", limit=2).name.suffix("_backward"),2500).to_dict(as_series=False) == {2501"a": [1, 1, 1, None, 5, 6, 6, 6, None, 10],2502"b": ["a", "a", "a", None, "b", "c", "c", "c", None, "d"],2503"c": [True, True, True, None, False, True, True, True, None, False],2504"a_backward": [1, None, 5, 5, 5, 6, None, 10, 10, 10],2505"b_backward": ["a", None, "b", "b", "b", "c", None, "d", "d", "d"],2506"c_backward": [2507True,2508None,2509False,2510False,2511False,2512True,2513None,2514False,2515False,2516False,2517],2518}251925202521def test_lower_bound_upper_bound(fruits_cars: pl.DataFrame) -> None:2522res_expr = fruits_cars.select(pl.col("A").lower_bound())2523assert res_expr.item() == -922337203685477580825242525res_expr = fruits_cars.select(pl.col("B").upper_bound())2526assert res_expr.item() == 922337203685477580725272528with pytest.raises(ComputeError):2529fruits_cars.select(pl.col("fruits").upper_bound())253025312532def test_selection_misc() -> None:2533df = pl.DataFrame({"x": "abc"}, schema={"x": pl.String})25342535# literal values (as scalar/list)2536for zero in (0, [0]):2537assert df.select(zero)["literal"].to_list() == [0]2538assert df.select(literal=0)["literal"].to_list() == [0]25392540# expect string values to be interpreted as cols2541for x in ("x", ["x"], pl.col("x")):2542assert df.select(x).rows() == [("abc",)]25432544# string col + lit2545assert df.with_columns(["x", 0]).to_dicts() == [{"x": "abc", "literal": 0}]254625472548def test_selection_regex_and_multicol() -> None:2549test_df = pl.DataFrame(2550{2551"a": [1, 2, 3, 4],2552"b": [5, 6, 7, 8],2553"c": [9, 10, 11, 12],2554"foo": [13, 14, 15, 16],2555},2556schema_overrides={"foo": pl.UInt8},2557)25582559# Selection only2560test_df.select(2561pl.col(["a", "b", 
"c"]).name.suffix("_list"),2562pl.all().exclude("foo").name.suffix("_wild"),2563pl.col("^\\w$").name.suffix("_regex"),2564)25652566# Multi * Single2567assert test_df.select(pl.col(["a", "b", "c"]) * pl.col("foo")).to_dict(2568as_series=False2569) == {2570"a": [13, 28, 45, 64],2571"b": [65, 84, 105, 128],2572"c": [117, 140, 165, 192],2573}2574assert test_df.select(pl.all().exclude("foo") * pl.col("foo")).to_dict(2575as_series=False2576) == {2577"a": [13, 28, 45, 64],2578"b": [65, 84, 105, 128],2579"c": [117, 140, 165, 192],2580}25812582assert test_df.select(pl.col("^\\w$") * pl.col("foo")).to_dict(as_series=False) == {2583"a": [13, 28, 45, 64],2584"b": [65, 84, 105, 128],2585"c": [117, 140, 165, 192],2586}25872588# Multi * Multi2589result = test_df.select(pl.col(["a", "b", "c"]) * pl.col(["a", "b", "c"]))2590expected = {"a": [1, 4, 9, 16], "b": [25, 36, 49, 64], "c": [81, 100, 121, 144]}25912592assert result.to_dict(as_series=False) == expected2593assert test_df.select(pl.exclude("foo") * pl.exclude("foo")).to_dict(2594as_series=False2595) == {2596"a": [1, 4, 9, 16],2597"b": [25, 36, 49, 64],2598"c": [81, 100, 121, 144],2599}2600assert test_df.select(pl.col("^\\w$") * pl.col("^\\w$")).to_dict(2601as_series=False2602) == {2603"a": [1, 4, 9, 16],2604"b": [25, 36, 49, 64],2605"c": [81, 100, 121, 144],2606}26072608df = test_df.select(2609re=pl.struct(pl.col("^\\w$")),2610odd=pl.struct((pl.col(INTEGER_DTYPES) % 2).name.suffix("_is_odd")),2611maxes=pl.struct(pl.all().max().name.suffix("_max")),2612).head(2)2613# ┌───────────┬───────────┬─────────────┐2614# │ re ┆ odd ┆ maxes │2615# │ --- ┆ --- ┆ --- │2616# │ struct[3] ┆ struct[4] ┆ struct[4] │2617# ╞═══════════╪═══════════╪═════════════╡2618# │ {1,5,9} ┆ {1,1,1,1} ┆ {4,8,12,16} │2619# │ {2,6,10} ┆ {0,0,0,0} ┆ {4,8,12,16} │2620# └───────────┴───────────┴─────────────┘2621assert df.rows() == [2622(2623{"a": 1, "b": 5, "c": 9},2624{"a_is_odd": 1, "b_is_odd": 1, "c_is_odd": 1, "foo_is_odd": 1},2625{"a_max": 4, "b_max": 8, 
"c_max": 12, "foo_max": 16},2626),2627(2628{"a": 2, "b": 6, "c": 10},2629{"a_is_odd": 0, "b_is_odd": 0, "c_is_odd": 0, "foo_is_odd": 0},2630{"a_max": 4, "b_max": 8, "c_max": 12, "foo_max": 16},2631),2632]263326342635@pytest.mark.parametrize("subset", ["a", cs.starts_with("x", "a")])2636@pytest.mark.may_fail_auto_streaming # Flaky in CI, see https://github.com/pola-rs/polars/issues/209432637@pytest.mark.may_fail_cloud2638def test_unique_on_sorted(subset: Any) -> None:2639df = pl.DataFrame(data={"a": [1, 1, 3], "b": [1, 2, 3]})26402641result = df.with_columns([pl.col("a").set_sorted()]).unique(2642subset=subset,2643keep="last",2644)26452646expected = pl.DataFrame({"a": [1, 3], "b": [2, 3]})2647assert_frame_equal(result, expected)264826492650def test_len_compute(df: pl.DataFrame) -> None:2651df = df.with_columns(pl.struct(["list_bool", "cat"]).alias("struct"))2652filtered = df.filter(pl.col("bools"))2653for col in filtered.columns:2654assert len(filtered[col]) == 126552656taken = df[[1, 2], :]2657for col in taken.columns:2658assert len(taken[col]) == 2265926602661def test_filter_sequence() -> None:2662df = pl.DataFrame({"a": [1, 2, 3]})2663assert df.filter([True, False, True])["a"].to_list() == [1, 3]2664assert df.filter(np.array([True, False, True]))["a"].to_list() == [1, 3]266526662667def test_filter_multiple_predicates() -> None:2668df = pl.DataFrame(2669{2670"a": [1, 1, 1, 2, 2],2671"b": [1, 1, 2, 2, 2],2672"c": [1, 1, 2, 3, 4],2673}2674)26752676# multiple predicates2677expected = pl.DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "c": [1, 1, 2]})2678for out in (2679df.filter(pl.col("a") == 1, pl.col("b") <= 2), # positional/splat2680df.filter([pl.col("a") == 1, pl.col("b") <= 2]), # as list2681):2682assert_frame_equal(out, expected)26832684# multiple kwargs2685assert_frame_equal(2686df.filter(a=1, b=2),2687pl.DataFrame({"a": [1], "b": [2], "c": [2]}),2688)26892690# both positional and keyword args2691assert_frame_equal(2692pl.DataFrame({"a": [2], "b": [2], "c": 
[3]}),2693df.filter(pl.col("c") < 4, a=2, b=2),2694)26952696# boolean mask2697out = df.filter([True, False, False, False, True])2698expected = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 4]})2699assert_frame_equal(out, expected)27002701# multiple boolean masks2702out = df.filter(2703np.array([True, True, False, True, False]),2704np.array([True, False, True, True, False]),2705)2706expected = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 3]})2707assert_frame_equal(out, expected)270827092710def test_indexing_set() -> None:2711df = pl.DataFrame({"bool": [True, True], "str": ["N/A", "N/A"], "nr": [1, 2]})27122713df[0, "bool"] = False2714df[0, "nr"] = 1002715df[0, "str"] = "foo"27162717assert df.to_dict(as_series=False) == {2718"bool": [False, True],2719"str": ["foo", "N/A"],2720"nr": [100, 2],2721}272227232724def test_set() -> None:2725# Setting a dataframe using indices is deprecated.2726# We keep these tests because we only generate a warning.2727np.random.seed(1)2728df = pl.DataFrame(2729{"foo": np.random.rand(10), "bar": np.arange(10), "ham": ["h"] * 10}2730)2731with pytest.raises(2732TypeError,2733match=r"DataFrame object does not support `Series` assignment by index"2734r"\n\nUse `DataFrame.with_columns`.",2735):2736df["new"] = np.random.rand(10)27372738with pytest.raises(2739TypeError,2740match=r"not allowed to set DataFrame by boolean mask in the row position"2741r"\n\nConsider using `DataFrame.with_columns`.",2742):2743df[df["ham"] > 0.5, "ham"] = "a"2744with pytest.raises(2745TypeError,2746match=r"not allowed to set DataFrame by boolean mask in the row position"2747r"\n\nConsider using `DataFrame.with_columns`.",2748):2749df[[True, False], "ham"] = "a"27502751# set 2D2752df = pl.DataFrame({"b": [0, 0]})2753df[["A", "B"]] = [[1, 2], [1, 2]]27542755with pytest.raises(ValueError):2756df[["C", "D"]] = 12757with pytest.raises(ValueError):2758df[["C", "D"]] = [1, 1]2759with pytest.raises(ValueError):2760df[["C", "D"]] = [[1, 2, 3], [1, 2, 3]]27612762# set 
tuple2763df = pl.DataFrame({"b": [0, 0]})2764df[0, "b"] = 12765assert df[0, "b"] == 127662767df[0, 0] = 22768assert df[0, "b"] == 227692770# row and col selection have to be int or str2771with pytest.raises(TypeError):2772df[:, [1]] = 1 # type: ignore[index]2773with pytest.raises(TypeError):2774df[True, :] = 1 # type: ignore[index]27752776# needs to be a 2 element tuple2777with pytest.raises(ValueError):2778df[1, 2, 3] = 127792780# we cannot index with any type, such as bool2781with pytest.raises(TypeError):2782df[True] = 1 # type: ignore[index]278327842785def test_series_iter_over_frame() -> None:2786df = pl.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]})27872788expected = {27890: pl.Series("a", [1, 2, 3]),27901: pl.Series("b", [2, 3, 4]),27912: pl.Series("c", [3, 4, 5]),2792}2793for idx, s in enumerate(df):2794assert_series_equal(s, expected[idx])27952796expected = {27970: pl.Series("c", [3, 4, 5]),27981: pl.Series("b", [2, 3, 4]),27992: pl.Series("a", [1, 2, 3]),2800}2801for idx, s in enumerate(reversed(df)):2802assert_series_equal(s, expected[idx])280328042805def test_union_with_aliases_4770() -> None:2806lf = pl.DataFrame(2807{2808"a": [1, None],2809"b": [3, 4],2810}2811).lazy()28122813lf = pl.concat(2814[2815lf.select([pl.col("a").alias("x")]),2816lf.select([pl.col("b").alias("x")]),2817]2818).filter(pl.col("x").is_not_null())28192820assert lf.collect()["x"].to_list() == [1, 3, 4]282128222823def test_init_datetimes_with_timezone() -> None:2824tz_us = "America/New_York"2825tz_europe = "Europe/Amsterdam"28262827dtm = datetime(2022, 10, 12, 12, 30)2828for time_unit in DTYPE_TEMPORAL_UNITS:2829for type_overrides in (2830{2831"schema": [2832("d1", pl.Datetime(time_unit, tz_us)),2833("d2", pl.Datetime(time_unit, tz_europe)),2834]2835},2836{2837"schema_overrides": {2838"d1": pl.Datetime(time_unit, tz_us),2839"d2": pl.Datetime(time_unit, tz_europe),2840}2841},2842):2843result = pl.DataFrame(2844data={2845"d1": 
[dtm.replace(tzinfo=ZoneInfo(tz_us))],2846"d2": [dtm.replace(tzinfo=ZoneInfo(tz_europe))],2847},2848**type_overrides,2849)2850expected = pl.DataFrame(2851{"d1": ["2022-10-12 12:30"], "d2": ["2022-10-12 12:30"]}2852).with_columns(2853pl.col("d1").str.to_datetime(time_unit=time_unit, time_zone=tz_us),2854pl.col("d2").str.to_datetime(time_unit=time_unit, time_zone=tz_europe),2855)2856assert_frame_equal(result, expected)285728582859@pytest.mark.parametrize(2860(2861"tzinfo",2862"offset",2863"dtype_time_zone",2864"expected_time_zone",2865"expected_item",2866),2867[2868(None, "", None, None, datetime(2020, 1, 1)),2869(2870timezone(timedelta(hours=-8)),2871"-08:00",2872"UTC",2873"UTC",2874datetime(2020, 1, 1, 8, tzinfo=timezone.utc),2875),2876(2877timezone(timedelta(hours=-8)),2878"-08:00",2879None,2880"UTC",2881datetime(2020, 1, 1, 8, tzinfo=timezone.utc),2882),2883],2884)2885@pytest.mark.may_fail_cloud2886def test_init_vs_strptime_consistency(2887tzinfo: timezone | None,2888offset: str,2889dtype_time_zone: str | None,2890expected_time_zone: str,2891expected_item: datetime,2892) -> None:2893result_init = pl.Series(2894[datetime(2020, 1, 1, tzinfo=tzinfo)],2895dtype=pl.Datetime("us", dtype_time_zone),2896)2897result_strptime = pl.Series([f"2020-01-01 00:00{offset}"]).str.strptime(2898pl.Datetime("us", dtype_time_zone)2899)2900assert result_init.dtype == pl.Datetime("us", expected_time_zone)2901assert result_init.item() == expected_item2902assert_series_equal(result_init, result_strptime)290329042905def test_init_vs_strptime_consistency_converts() -> None:2906result = pl.Series(2907[datetime(2020, 1, 1, tzinfo=timezone(timedelta(hours=-8)))],2908dtype=pl.Datetime("us", "US/Pacific"),2909).item()2910assert result == datetime(2020, 1, 1, 0, 0, tzinfo=ZoneInfo(key="US/Pacific"))2911result = (2912pl.Series(["2020-01-01 00:00-08:00"])2913.str.strptime(pl.Datetime("us", "US/Pacific"))2914.item()2915)2916assert result == datetime(2020, 1, 1, 0, 0, 
tzinfo=ZoneInfo(key="US/Pacific"))291729182919def test_init_physical_with_timezone() -> None:2920tz_uae = "Asia/Dubai"2921tz_asia = "Asia/Tokyo"29222923dtm_us = 16655778000000002924for time_unit in DTYPE_TEMPORAL_UNITS:2925dtm = {"ms": dtm_us // 1_000, "ns": dtm_us * 1_000}.get(str(time_unit), dtm_us)2926df = pl.DataFrame(2927data={"d1": [dtm], "d2": [dtm]},2928schema=[2929("d1", pl.Datetime(time_unit, tz_uae)),2930("d2", pl.Datetime(time_unit, tz_asia)),2931],2932)2933assert (df["d1"].to_physical() == df["d2"].to_physical()).all()2934assert df.rows() == [2935(2936datetime(2022, 10, 12, 16, 30, tzinfo=ZoneInfo(tz_uae)),2937datetime(2022, 10, 12, 21, 30, tzinfo=ZoneInfo(tz_asia)),2938)2939]294029412942@pytest.mark.parametrize("divop", [floordiv, truediv])2943def test_floordiv_truediv(divop: Callable[..., Any]) -> None:2944# validate truediv/floordiv dataframe ops against python2945df1 = pl.DataFrame(2946data={2947"x": [0, -1, -2, -3],2948"y": [-0.0, -3.0, 5.0, -7.0],2949"z": [10, 3, -5, 7],2950}2951)29522953# scalar2954for df in [df1, df1.slice(0, 0)]:2955for n in (3, 3.0, -3, -3.0):2956py_div = [tuple(divop(elem, n) for elem in row) for row in df.rows()]2957df_div = divop(df, n).rows()2958assert py_div == df_div29592960# series2961xdf, s = df1["x"].to_frame(), pl.Series([2] * 4)2962assert list(divop(xdf, s)["x"]) == [divop(x, 2) for x in list(df1["x"])]29632964# frame2965df2 = pl.DataFrame(2966data={2967"x": [2, -2, 2, 3],2968"y": [4, 4, -4, 8],2969"z": [0.5, 2.0, -2.0, -3],2970}2971)2972df_div = divop(df1, df2).rows()2973for i, (row1, row2) in enumerate(zip(df1.rows(), df2.rows())):2974for j, (elem1, elem2) in enumerate(zip(row1, row2)):2975assert divop(elem1, elem2) == df_div[i][j]297629772978@pytest.mark.parametrize(2979("subset", "keep", "expected_mask"),2980[2981(None, "first", [True, True, True, False]),2982("a", "first", [True, True, False, False]),2983(["a", "b"], "first", [True, True, False, False]),2984(("a", "b"), "last", [True, False, False, 
True]),2985(("a", "b"), "none", [True, False, False, False]),2986],2987)2988def test_unique(2989subset: str | Sequence[str], keep: UniqueKeepStrategy, expected_mask: list[bool]2990) -> None:2991df = pl.DataFrame({"a": [1, 2, 2, 2], "b": [3, 4, 4, 4], "c": [5, 6, 7, 7]})29922993result = df.unique(maintain_order=True, subset=subset, keep=keep).sort(pl.all())2994expected = df.filter(expected_mask).sort(pl.all())2995assert_frame_equal(result, expected)299629972998def test_iter_slices() -> None:2999df = pl.DataFrame(3000{3001"a": range(95),3002"b": date(2023, 1, 1),3003"c": "klmnopqrstuvwxyz",3004}3005)3006batches = list(df.iter_slices(n_rows=50))30073008assert len(batches[0]) == 503009assert len(batches[1]) == 453010assert batches[1].rows() == df[50:].rows()301130123013def test_format_empty_df() -> None:3014df = pl.DataFrame(3015[3016pl.Series("val1", [], dtype=pl.Categorical),3017pl.Series("val2", [], dtype=pl.Categorical),3018]3019).select(3020pl.format("{}:{}", pl.col("val1"), pl.col("val2")).alias("cat"),3021)3022assert df.shape == (0, 1)3023assert df.dtypes == [pl.String]302430253026def test_deadlocks_3409() -> None:3027assert (3028pl.DataFrame({"col1": [[1, 2, 3]]})3029.with_columns(3030pl.col("col1").list.eval(3031pl.element().map_elements(lambda x: x, return_dtype=pl.Int64)3032)3033)3034.to_dict(as_series=False)3035) == {"col1": [[1, 2, 3]]}30363037assert (3038pl.DataFrame({"col1": [1, 2, 3]})3039.with_columns(3040pl.col("col1").cumulative_eval(3041pl.element().map_batches(lambda x: 0, pl.Int64, returns_scalar=True)3042)3043)3044.to_dict(as_series=False)3045) == {"col1": [0, 0, 0]}304630473048def test_ceil() -> None:3049df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})3050result = df.select(pl.col("a").ceil())3051assert_frame_equal(result, pl.DataFrame({"a": [2.0, 2.0, 3.0]}))30523053df = pl.DataFrame({"a": [1, 2, 3]})3054result = df.select(pl.col("a").ceil())3055assert_frame_equal(df, result)305630573058def test_floor() -> None:3059df = pl.DataFrame({"a": [1.8, 1.2, 
3.0]})3060result = df.select(pl.col("a").floor())3061assert_frame_equal(result, pl.DataFrame({"a": [1.0, 1.0, 3.0]}))30623063df = pl.DataFrame({"a": [1, 2, 3]})3064result = df.select(pl.col("a").floor())3065assert_frame_equal(df, result)306630673068def test_floor_divide() -> None:3069x = 10.43070step = 0.53071df = pl.DataFrame({"x": [x]})3072assert df.with_columns(pl.col("x") // step)[0, 0] == x // step307330743075def test_round() -> None:3076df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})3077col_a_rounded = df.select(pl.col("a").round(decimals=0))["a"]3078assert_series_equal(col_a_rounded, pl.Series("a", [2, 1, 3]).cast(pl.Float64))307930803081def test_dot() -> None:3082df = pl.DataFrame({"a": [1.8, 1.2, 3.0], "b": [3.2, 1, 2]})3083assert df.select(pl.col("a").dot(pl.col("b"))).item() == 12.96308430853086def test_unstack() -> None:3087from string import ascii_uppercase30883089df = pl.DataFrame(3090{3091"col1": list(ascii_uppercase[0:9]),3092"col2": pl.int_range(0, 9, eager=True),3093"col3": pl.int_range(-9, 0, eager=True),3094}3095)3096assert df.unstack(step=3, how="vertical").to_dict(as_series=False) == {3097"col1_0": ["A", "B", "C"],3098"col1_1": ["D", "E", "F"],3099"col1_2": ["G", "H", "I"],3100"col2_0": [0, 1, 2],3101"col2_1": [3, 4, 5],3102"col2_2": [6, 7, 8],3103"col3_0": [-9, -8, -7],3104"col3_1": [-6, -5, -4],3105"col3_2": [-3, -2, -1],3106}31073108assert df.unstack(step=3, how="horizontal").to_dict(as_series=False) == {3109"col1_0": ["A", "D", "G"],3110"col1_1": ["B", "E", "H"],3111"col1_2": ["C", "F", "I"],3112"col2_0": [0, 3, 6],3113"col2_1": [1, 4, 7],3114"col2_2": [2, 5, 8],3115"col3_0": [-9, -6, -3],3116"col3_1": [-8, -5, -2],3117"col3_2": [-7, -4, -1],3118}31193120for column_subset in (("col2", "col3"), cs.integer()):3121assert df.unstack(3122step=3,3123how="horizontal",3124columns=column_subset,3125).to_dict(as_series=False) == {3126"col2_0": [0, 3, 6],3127"col2_1": [1, 4, 7],3128"col2_2": [2, 5, 8],3129"col3_0": [-9, -6, -3],3130"col3_1": [-8, -5, 
-2],3131"col3_2": [-7, -4, -1],3132}313331343135def test_window_deadlock() -> None:3136np.random.seed(12)31373138df = pl.DataFrame(3139{3140"nrs": [1, 2, 3, None, 5],3141"names": ["foo", "ham", "spam", "egg", None],3142"random": np.random.rand(5),3143"groups": ["A", "A", "B", "C", "B"],3144}3145)31463147_df = df.select(3148pl.col("*"), # select all3149pl.col("random").sum().over("groups").alias("sum[random]/groups"),3150pl.col("random").implode().over("names").alias("random/name"),3151)315231533154def test_sum_empty_column_names() -> None:3155df = pl.DataFrame({"x": [], "y": []}, schema={"x": pl.Boolean, "y": pl.Boolean})3156expected = pl.DataFrame(3157{"x": [0], "y": [0]}, schema={"x": pl.UInt32, "y": pl.UInt32}3158)3159assert_frame_equal(df.sum(), expected)316031613162def test_flags() -> None:3163df = pl.DataFrame({"a": [1, 2, 3], "b": [9, 5, 6]})3164assert df.flags == {3165"a": {"SORTED_ASC": False, "SORTED_DESC": False},3166"b": {"SORTED_ASC": False, "SORTED_DESC": False},3167}3168assert df.set_sorted("a").flags == {3169"a": {"SORTED_ASC": True, "SORTED_DESC": False},3170"b": {"SORTED_ASC": False, "SORTED_DESC": False},3171}317231733174def test_interchange() -> None:3175df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["foo", "bar"]})3176dfi = df.__dataframe__()31773178# Testing some random properties to make sure conversion happened correctly3179assert dfi.num_rows() == 23180assert dfi.get_column(0).dtype[1] == 643181assert dfi.get_column_by_name("c").get_buffers()["data"][0].bufsize == 6318231833184def test_from_dicts_undeclared_column_dtype() -> None:3185data = [{"a": 1, "b": 2}]3186result = pl.from_dicts(data, schema=["x"])3187assert result.schema == {"x": pl.Null}318831893190def test_from_dicts_with_override() -> None:3191data = [3192{"a": "1", "b": str(2**64 - 1), "c": "1"},3193{"a": "1", "b": "1", "c": "-5.0"},3194]3195override = {"a": pl.Int32, "b": pl.UInt64, "c": pl.Float32}3196result = pl.from_dicts(data, 
schema_overrides=override)3197assert_frame_equal(3198result,3199pl.DataFrame(3200{3201"a": pl.Series([1, 1], dtype=pl.Int32),3202"b": pl.Series([2**64 - 1, 1], dtype=pl.UInt64),3203"c": pl.Series([1.0, -5.0], dtype=pl.Float32),3204}3205),3206)320732083209def test_from_records_u64_12329() -> None:3210s = pl.from_records([{"a": 9908227375760408577}])3211assert s.dtypes == [pl.Int128]3212assert s["a"][0] == 9908227375760408577321332143215def test_negative_slice_12642() -> None:3216df = pl.DataFrame({"x": range(5)})3217assert_frame_equal(df.slice(-2, 1), df.tail(2).head(1))321832193220def test_iter_columns() -> None:3221df = pl.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]})3222iter_columns = df.iter_columns()3223assert_series_equal(next(iter_columns), pl.Series("a", [1, 1, 2]))3224assert_series_equal(next(iter_columns), pl.Series("b", [4, 5, 6]))322532263227def test_get_column_index() -> None:3228df = pl.DataFrame({"actual": [1001], "expected": [1000]})32293230assert df.get_column_index("actual") == 03231assert df.get_column_index("expected") == 132323233with pytest.raises(ColumnNotFoundError, match="missing"):3234df.get_column_index("missing")323532363237def test_dataframe_creation_with_different_series_lengths_19795() -> None:3238with pytest.raises(3239ShapeError,3240match=r"could not create a new DataFrame: height of column 'b' \(1\) does not match height of column 'a' \(2\)",3241):3242pl.DataFrame({"a": [1, 2], "b": [1]})324332443245def test_get_column_after_drop_20119() -> None:3246df = pl.DataFrame({"a": ["A"], "b": ["B"], "c": ["C"]})3247df.drop_in_place("a")3248c = df.get_column("c")3249assert_series_equal(c, pl.Series("c", ["C"]))325032513252def test_select_oob_row_20775() -> None:3253df = pl.DataFrame({"a": [1, 2, 3]})3254with pytest.raises(3255IndexError,3256match="index 99 is out of bounds for DataFrame of height 3",3257):3258df[99]325932603261@pytest.mark.parametrize("idx", [3, 99, -4, -99])3262def test_select_oob_element_20775_too_large(idx: int) -> 
None:3263df = pl.DataFrame({"a": [1, 2, 3]})3264with pytest.raises(3265IndexError,3266match=f"index {idx} is out of bounds for sequence of length 3",3267):3268df[idx, "a"]326932703271def test_nan_to_null() -> None:3272a = np.array([np.nan, 1])32733274df1 = pl.DataFrame(a, nan_to_null=True)3275df2 = pl.DataFrame(3276(a,),3277nan_to_null=True,3278)32793280assert_frame_equal(df1, df2)328132823283# Below 3 tests for https://github.com/pola-rs/polars/issues/17879328432853286def test_with_columns_dict_direct_typeerror() -> None:3287data = {"a": pl.col("a") * 2}3288df = pl.select(a=1)3289with pytest.raises(3290TypeError, match="Cannot pass a dictionary as a single positional argument"3291):3292df.with_columns(data)329332943295def test_with_columns_dict_unpacking() -> None:3296data = {"a": pl.col("a") * 2}3297df = pl.select(a=1).with_columns(**data)3298expected = pl.DataFrame({"a": [2]})3299assert df.equals(expected)330033013302def test_with_columns_generator_alias() -> None:3303data = {"a": pl.col("a") * 2}3304df = pl.select(a=1).with_columns(expr.alias(name) for name, expr in data.items())3305expected = pl.DataFrame({"a": [2]})3306assert df.equals(expected)330733083309