Path: blob/main/py-polars/tests/unit/dataframe/test_df.py
8406 views
from __future__ import annotations12import sys3import typing4from collections import OrderedDict5from collections.abc import Iterator, Mapping6from datetime import date, datetime, time, timedelta, timezone7from decimal import Decimal8from io import BytesIO9from itertools import chain, repeat10from operator import floordiv, truediv11from typing import TYPE_CHECKING, Any, cast12from zoneinfo import ZoneInfo1314import numpy as np15import pyarrow as pa16import pytest1718import polars as pl19import polars.selectors as cs20from polars._plr import PySeries21from polars._utils.construction import iterable_to_pydf22from polars.datatypes import DTYPE_TEMPORAL_UNITS23from polars.exceptions import (24ColumnNotFoundError,25ComputeError,26DuplicateError,27InvalidOperationError,28OutOfBoundsError,29ShapeError,30)31from polars.testing import (32assert_frame_equal,33assert_frame_not_equal,34assert_series_equal,35)36from tests.unit.conftest import FLOAT_DTYPES, INTEGER_DTYPES3738if TYPE_CHECKING:39from collections.abc import Callable, Iterator, Sequence4041from polars import Expr42from polars._typing import JoinStrategy, UniqueKeepStrategy43from tests.conftest import PlMonkeyPatch444546class MappingObject(Mapping[str, Any]): # noqa: D10147def __init__(self, **values: Any) -> None:48self._data = {**values}4950def __getitem__(self, key: str) -> Any:51return self._data[key]5253def __iter__(self) -> Iterator[str]:54yield from self._data5556def __len__(self) -> int:57return len(self._data)585960def test_version() -> None:61isinstance(pl.__version__, str)626364def test_null_count() -> None:65df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", None]})66assert df.null_count().shape == (1, 2)67assert df.null_count().row(0) == (0, 1)68assert df.null_count().row(np.int64(0)) == (0, 1) # type: ignore[call-overload]697071@pytest.mark.parametrize("input", [None, (), [], {}, pa.Table.from_arrays([])])72def test_init_empty(input: Any) -> None:73# test various flavours of empty init74df = 
pl.DataFrame(input)75assert df.shape == (0, 0)76assert df.is_empty()777879def test_df_bool_ambiguous() -> None:80empty_df = pl.DataFrame()81with pytest.raises(TypeError, match="ambiguous"):82not empty_df838485def test_special_char_colname_init() -> None:86from string import punctuation8788cols = [(c, pl.Int8) for c in punctuation]89df = pl.DataFrame(schema=cols)9091assert len(cols) == df.width92assert len(df.rows()) == 093assert df.is_empty()949596def test_comparisons() -> None:97df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})9899# Constants100assert_frame_equal(df == 2, pl.DataFrame({"a": [False, True], "b": [False, False]}))101assert_frame_equal(df != 2, pl.DataFrame({"a": [True, False], "b": [True, True]}))102assert_frame_equal(df < 3.0, pl.DataFrame({"a": [True, True], "b": [False, False]}))103assert_frame_equal(df >= 2, pl.DataFrame({"a": [False, True], "b": [True, True]}))104assert_frame_equal(df <= 2, pl.DataFrame({"a": [True, True], "b": [False, False]}))105106with pytest.raises(ComputeError):107df > "2" # noqa: B015108109# Series110s = pl.Series([3, 1])111assert_frame_equal(df >= s, pl.DataFrame({"a": [False, True], "b": [True, True]}))112113# DataFrame114other = pl.DataFrame({"a": [1, 2], "b": [2, 3]})115assert_frame_equal(116df == other, pl.DataFrame({"a": [True, True], "b": [False, False]})117)118assert_frame_equal(119df != other, pl.DataFrame({"a": [False, False], "b": [True, True]})120)121assert_frame_equal(122df > other, pl.DataFrame({"a": [False, False], "b": [True, True]})123)124assert_frame_equal(125df < other, pl.DataFrame({"a": [False, False], "b": [False, False]})126)127assert_frame_equal(128df >= other, pl.DataFrame({"a": [True, True], "b": [True, True]})129)130assert_frame_equal(131df <= other, pl.DataFrame({"a": [True, True], "b": [False, False]})132)133134# DataFrame columns mismatch135with pytest.raises(ValueError):136df == pl.DataFrame({"a": [1, 2], "c": [3, 4]}) # noqa: B015137with pytest.raises(ValueError):138df == pl.DataFrame({"b": [3, 
4], "a": [1, 2]}) # noqa: B015139140# DataFrame shape mismatch141with pytest.raises(ValueError):142df == pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) # noqa: B015143144# Type mismatch145with pytest.raises(ComputeError):146df == pl.DataFrame({"a": [1, 2], "b": ["x", "y"]}) # noqa: B015147148149def test_column_selection() -> None:150df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})151152# get column by name153b = pl.Series("b", [1.0, 2.0, 3.0])154assert_series_equal(df["b"], b)155assert_series_equal(df.get_column("b"), b)156157with pytest.raises(ColumnNotFoundError, match="x"):158df.get_column("x")159160default_series = pl.Series("x", ["?", "?", "?"])161assert_series_equal(df.get_column("x", default=default_series), default_series)162163assert df.get_column("x", default=None) is None164165# get column by index166assert_series_equal(df.to_series(1), pl.Series("b", [1.0, 2.0, 3.0]))167assert_series_equal(df.to_series(-1), pl.Series("c", ["a", "b", "c"]))168169170def test_mixed_sequence_selection() -> None:171df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})172result = df.select(["a", pl.col("b"), pl.lit("c")])173expected = pl.DataFrame({"a": [1, 2], "b": [3, 4], "literal": ["c", "c"]})174assert_frame_equal(result, expected)175176177def test_from_arrow(plmonkeypatch: PlMonkeyPatch) -> None:178tbl = pa.table(179{180"a": pa.array([1, 2], pa.timestamp("s")),181"b": pa.array([1, 2], pa.timestamp("ms")),182"c": pa.array([1, 2], pa.timestamp("us")),183"d": pa.array([1, 2], pa.timestamp("ns")),184"e": pa.array([1, 2], pa.int32()),185"decimal1": pa.array([1, 2], pa.decimal128(2, 1)),186"struct": pa.array(187[{"a": 1}, {"a": 2}], pa.struct([pa.field("a", pa.int32())])188),189}190)191record_batches = tbl.to_batches(max_chunksize=1)192expected_schema = {193"a": pl.Datetime("ms"),194"b": pl.Datetime("ms"),195"c": pl.Datetime("us"),196"d": pl.Datetime("ns"),197"e": pl.Int32,198"decimal1": pl.Decimal(2, 1),199"struct": pl.Struct({"a": 
pl.Int32()}),200}201expected_data = [202(203datetime(1970, 1, 1, 0, 0, 1),204datetime(1970, 1, 1, 0, 0, 0, 1000),205datetime(1970, 1, 1, 0, 0, 0, 1),206datetime(1970, 1, 1, 0, 0),2071,208Decimal("1.0"),209{"a": 1},210),211(212datetime(1970, 1, 1, 0, 0, 2),213datetime(1970, 1, 1, 0, 0, 0, 2000),214datetime(1970, 1, 1, 0, 0, 0, 2),215datetime(1970, 1, 1, 0, 0),2162,217Decimal("2.0"),218{"a": 2},219),220]221for arrow_data in (tbl, record_batches, (rb for rb in record_batches)):222df = cast("pl.DataFrame", pl.from_arrow(arrow_data))223assert df.schema == expected_schema224assert df.rows() == expected_data225226# record batches (inc. empty)227for b, n_expected in (228(record_batches[0], 1),229(record_batches[0][:0], 0),230):231df = cast("pl.DataFrame", pl.from_arrow(b))232assert df.schema == expected_schema233assert df.rows() == expected_data[:n_expected]234235empty_tbl = tbl[:0] # no rows236df = cast("pl.DataFrame", pl.from_arrow(empty_tbl))237assert df.schema == expected_schema238assert df.rows() == []239240# try a single column dtype override241for t in (tbl, empty_tbl):242df = pl.DataFrame(t, schema_overrides={"e": pl.Int8})243override_schema = expected_schema.copy()244override_schema["e"] = pl.Int8245assert df.schema == override_schema246assert df.rows() == expected_data[: (df.height)]247248# init from record batches with overrides249df = pl.DataFrame(250{251"id": ["a123", "b345", "c567", "d789", "e101"],252"points": [99, 45, 50, 85, 35],253}254)255tbl = df.to_arrow()256batches = tbl.to_batches(max_chunksize=3)257258df0: pl.DataFrame = pl.from_arrow(batches) # type: ignore[assignment]259df1: pl.DataFrame = pl.from_arrow( # type: ignore[assignment]260data=batches,261schema=["x", "y"],262schema_overrides={"y": pl.Int32},263)264df2: pl.DataFrame = pl.from_arrow( # type: ignore[assignment]265data=batches[0],266schema=["x", "y"],267schema_overrides={"y": pl.Int32},268)269270assert df0.rows() == df.rows()271assert df1.rows() == df.rows()272assert df2.rows() == 
df.rows()[:3]273274assert df0.schema == {"id": pl.String, "points": pl.Int64}275print(df1.schema)276assert df1.schema == {"x": pl.String, "y": pl.Int32}277assert df2.schema == {"x": pl.String, "y": pl.Int32}278279with pytest.raises(TypeError, match="Cannot convert str"):280pl.from_arrow(data="xyz")281282with pytest.raises(TypeError, match="Cannot convert int"):283pl.from_arrow(data=(x for x in (1, 2, 3)))284285286@pytest.mark.parametrize(287"data",288[289pa.Table.from_pydict(290{291"struct": pa.array(292[{"a": 1}, {"a": 2}], pa.struct([pa.field("a", pa.int32())])293),294}295),296pa.Table.from_pydict(297{298"struct": pa.chunked_array(299[[{"a": 1}], [{"a": 2}]], pa.struct([pa.field("a", pa.int32())])300),301}302),303],304)305def test_from_arrow_struct_column(data: pa.Table) -> None:306df = cast("pl.DataFrame", pl.from_arrow(data=data))307expected_schema = pl.Schema({"struct": pl.Struct({"a": pl.Int32()})})308expected_data = [({"a": 1},), ({"a": 2},)]309assert df.schema == expected_schema310assert df.rows() == expected_data311312313def test_dataframe_membership_operator() -> None:314# cf. 
issue #4032315df = pl.DataFrame({"name": ["Jane", "John"], "age": [20, 30]})316assert "name" in df317assert "phone" not in df318assert df._ipython_key_completions_() == ["name", "age"]319320321def test_sort() -> None:322df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})323expected = pl.DataFrame({"a": [1, 2, 3], "b": [2, 1, 3]})324assert_frame_equal(df.sort("a"), expected)325assert_frame_equal(df.sort(["a", "b"]), expected)326327328def test_sort_multi_output_exprs_01() -> None:329df = pl.DataFrame(330{331"dts": [date(2077, 10, 3), date(2077, 10, 2), date(2077, 10, 2)],332"strs": ["abc", "def", "ghi"],333"vals": [10.5, 20.3, 15.7],334}335)336337expected = pl.DataFrame(338{339"dts": [date(2077, 10, 2), date(2077, 10, 2), date(2077, 10, 3)],340"strs": ["ghi", "def", "abc"],341"vals": [15.7, 20.3, 10.5],342}343)344assert_frame_equal(expected, df.sort(pl.col("^(d|v).*$")))345assert_frame_equal(expected, df.sort(cs.temporal() | cs.numeric()))346assert_frame_equal(expected, df.sort(cs.temporal(), cs.numeric(), cs.binary()))347348expected = pl.DataFrame(349{350"dts": [date(2077, 10, 3), date(2077, 10, 2), date(2077, 10, 2)],351"strs": ["abc", "def", "ghi"],352"vals": [10.5, 20.3, 15.7],353}354)355assert_frame_equal(356expected,357df.sort(pl.col("^(d|v).*$"), descending=[True]),358)359assert_frame_equal(360expected,361df.sort(cs.temporal() | cs.numeric(), descending=[True]),362)363assert_frame_equal(364expected,365df.sort(cs.temporal(), cs.numeric(), descending=[True, True]),366)367368with pytest.raises(369ValueError,370match=r"the length of `descending` \(2\) does not match the length of `by` \(1\)",371):372df.sort(by=[cs.temporal()], descending=[True, False])373374with pytest.raises(375ValueError,376match=r"the length of `nulls_last` \(3\) does not match the length of `by` \(2\)",377):378df.sort("dts", "strs", nulls_last=[True, False, True])379380# No columns selected - return original input.381assert_frame_equal(df, 
df.sort(pl.col("^xxx$")))382383384@pytest.mark.parametrize(385("by_explicit", "desc_explicit", "by_multi", "desc_multi"),386[387(388["w", "x", "y", "z"],389[False, False, True, True],390[cs.integer(), cs.string()],391[False, True],392),393(394["w", "y", "z"],395[True, True, False],396[pl.col("^(w|y)$"), pl.col("^z.*$")],397[True, False],398),399(400["z", "w", "x"],401[True, False, False],402[pl.col("z"), cs.numeric()],403[True, False],404),405],406)407def test_sort_multi_output_exprs_02(408by_explicit: list[str],409desc_explicit: list[bool],410by_multi: list[Expr],411desc_multi: list[bool],412) -> None:413df = pl.DataFrame(414{415"w": [100, 100, 100, 100, 200, 200, 200, 200],416"x": [888, 888, 444, 444, 888, 888, 444, 888],417"y": ["b", "b", "a", "a", "b", "b", "a", "a"],418"z": ["x", "y", "x", "y", "x", "y", "x", "y"],419}420)421res1 = df.sort(*by_explicit, descending=desc_explicit)422res2 = df.sort(*by_multi, descending=desc_multi)423assert_frame_equal(res1, res2)424425426def test_sort_maintain_order() -> None:427l1 = (428pl.LazyFrame({"A": [1] * 4, "B": ["A", "B", "C", "D"]})429.sort("A", maintain_order=True)430.slice(0, 3)431.collect()["B"]432.to_list()433)434l2 = (435pl.LazyFrame({"A": [1] * 4, "B": ["A", "B", "C", "D"]})436.sort("A")437.collect()438.slice(0, 3)["B"]439.to_list()440)441assert l1 == l2 == ["A", "B", "C"]442443444@pytest.mark.parametrize("nulls_last", [False, True], ids=["nulls_first", "nulls_last"])445def test_sort_maintain_order_descending_repeated_nulls(nulls_last: bool) -> None:446got = (447pl.LazyFrame({"A": [None, -1, 1, 1, None], "B": [1, 2, 3, 4, 5]})448.sort("A", descending=True, maintain_order=True, nulls_last=nulls_last)449.collect()450)451if nulls_last:452expect = pl.DataFrame({"A": [1, 1, -1, None, None], "B": [3, 4, 2, 1, 5]})453else:454expect = pl.DataFrame({"A": [None, None, 1, 1, -1], "B": [1, 5, 3, 4, 2]})455assert_frame_equal(got, expect)456457458def test_replace() -> None:459df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 
3]})460s = pl.Series("c", [True, False, True])461df._replace("a", s)462assert_frame_equal(df, pl.DataFrame({"a": [True, False, True], "b": [1, 2, 3]}))463464465def test_assignment() -> None:466df = pl.DataFrame({"foo": [1, 2, 3], "bar": [2, 3, 4]})467df = df.with_columns(pl.col("foo").alias("foo"))468# make sure that assignment does not change column order469assert df.columns == ["foo", "bar"]470df = df.with_columns(471pl.when(pl.col("foo") > 1).then(9).otherwise(pl.col("foo")).alias("foo")472)473assert df["foo"].to_list() == [1, 9, 9]474475476def test_insert_column() -> None:477# insert series478df = (479pl.DataFrame({"z": [3, 4, 5]})480.insert_column(0, pl.Series("x", [1, 2, 3]))481.insert_column(-1, pl.Series("y", [2, 3, 4]))482)483expected_df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})484assert_frame_equal(expected_df, df)485486# insert expressions487df = pl.DataFrame(488{489"id": ["xx", "yy", "zz"],490"v1": [5, 4, 6],491"v2": [7, 3, 3],492}493)494df.insert_column(3, (pl.col("v1") * pl.col("v2")).alias("v3"))495df.insert_column(1, (pl.col("v2") - pl.col("v1")).alias("v0"))496497expected = pl.DataFrame(498{499"id": ["xx", "yy", "zz"],500"v0": [2, -1, -3],501"v1": [5, 4, 6],502"v2": [7, 3, 3],503"v3": [35, 12, 18],504}505)506assert_frame_equal(df, expected)507508# check that we raise suitable index errors509for idx, column in (510(10, pl.col("v1").sqrt().alias("v1_sqrt")),511(-10, pl.Series("foo", [1, 2, 3])),512):513with pytest.raises(514IndexError,515match=rf"column index {idx} is out of range \(frame has 5 columns\)",516):517df.insert_column(idx, column)518519520def test_replace_column() -> None:521df = (522pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})523.replace_column(0, pl.Series("a", [4, 5, 6]))524.replace_column(-2, pl.Series("b", [5, 6, 7]))525.replace_column(-1, pl.Series("c", [6, 7, 8]))526)527expected_df = pl.DataFrame({"a": [4, 5, 6], "b": [5, 6, 7], "c": [6, 7, 8]})528assert_frame_equal(expected_df, 
df)529530531def test_to_series() -> None:532df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})533534assert_series_equal(df.to_series(), df["x"])535assert_series_equal(df.to_series(0), df["x"])536assert_series_equal(df.to_series(-3), df["x"])537538assert_series_equal(df.to_series(1), df["y"])539assert_series_equal(df.to_series(-2), df["y"])540541assert_series_equal(df.to_series(2), df["z"])542assert_series_equal(df.to_series(-1), df["z"])543544545def test_to_series_bad_inputs() -> None:546df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})547548with pytest.raises(IndexError, match="index 5 is out of bounds"):549df.to_series(5)550551with pytest.raises(IndexError, match="index -100 is out of bounds"):552df.to_series(-100)553554with pytest.raises(555TypeError, match="'str' object cannot be interpreted as an integer"556):557df.to_series("x") # type: ignore[arg-type]558559560def test_gather_every() -> None:561df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})562expected_df = pl.DataFrame({"a": [1, 3], "b": ["w", "y"]})563assert_frame_equal(expected_df, df.gather_every(2))564565expected_df = pl.DataFrame({"a": [2, 4], "b": ["x", "z"]})566assert_frame_equal(expected_df, df.gather_every(2, offset=1))567568569def test_gather_every_agg() -> None:570df = pl.DataFrame(571{572"g": [1, 1, 1, 2, 2, 2],573"a": ["a", "b", "c", "d", "e", "f"],574}575)576out = df.group_by(pl.col("g")).agg(pl.col("a").gather_every(2)).sort("g")577expected = pl.DataFrame(578{579"g": [1, 2],580"a": [["a", "c"], ["d", "f"]],581}582)583assert_frame_equal(out, expected)584585586def test_take_misc(fruits_cars: pl.DataFrame) -> None:587df = fruits_cars588589# Out of bounds error.590with pytest.raises(OutOfBoundsError):591df.sort("fruits").select(592pl.col("B").reverse().gather([1, 2]).implode().over("fruits"),593"fruits",594)595596# Null indices.597assert_frame_equal(598df.select(pl.col("fruits").gather(pl.Series([0, None]))),599pl.DataFrame({"fruits": 
["banana", None]}),600)601602for index in [[0, 1], pl.Series([0, 1]), np.array([0, 1])]:603out = df.sort("fruits").select(604pl.col("B")605.reverse()606.gather(index) # type: ignore[arg-type]607.over("fruits", mapping_strategy="join"),608"fruits",609)610611assert out[0, "B"].to_list() == [2, 3]612assert out[4, "B"].to_list() == [1, 4]613614out = df.sort("fruits").select(615pl.col("B").reverse().get(pl.lit(1)).over("fruits"),616"fruits",617)618assert out[0, "B"] == 3619assert out[4, "B"] == 4620621622def test_pipe() -> None:623df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, None, 8]})624625def _multiply(data: pl.DataFrame, mul: int) -> pl.DataFrame:626return data * mul627628result = df.pipe(_multiply, mul=3)629630assert_frame_equal(result, df * 3)631632633def test_explode() -> None:634df = pl.DataFrame({"letters": ["c", "a"], "nrs": [[1, 2], [1, 3]]})635out = df.explode("nrs")636assert out["letters"].to_list() == ["c", "c", "a", "a"]637assert out["nrs"].to_list() == [1, 2, 1, 3]638639640@pytest.mark.parametrize(641("stack", "exp_shape", "exp_columns"),642[643([pl.Series("stacked", [-1, -1, -1])], (3, 3), ["a", "b", "stacked"]),644(645[pl.Series("stacked2", [-1, -1, -1]), pl.Series("stacked3", [-1, -1, -1])],646(3, 4),647["a", "b", "stacked2", "stacked3"],648),649],650)651@pytest.mark.parametrize("in_place", [True, False])652def test_hstack_list_of_series(653stack: list[pl.Series],654exp_shape: tuple[int, int],655exp_columns: list[str],656in_place: bool,657) -> None:658df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})659if in_place:660df.hstack(stack, in_place=True)661assert df.shape == exp_shape662assert df.columns == exp_columns663else:664df_out = df.hstack(stack, in_place=False)665assert df_out.shape == exp_shape666assert df_out.columns == exp_columns667668669@pytest.mark.parametrize("in_place", [True, False])670def test_hstack_dataframe(in_place: bool) -> None:671df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})672df2 = pl.DataFrame({"c": [2, 1, 
3], "d": ["a", "b", "c"]})673expected = pl.DataFrame(674{"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [2, 1, 3], "d": ["a", "b", "c"]}675)676if in_place:677df.hstack(df2, in_place=True)678assert_frame_equal(df, expected)679else:680df_out = df.hstack(df2, in_place=False)681assert_frame_equal(df_out, expected)682683684@pytest.mark.may_fail_cloud685def test_file_buffer() -> None:686f = BytesIO()687f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12")688f.seek(0)689df = pl.read_csv(f, has_header=False)690assert df.shape == (2, 6)691692f = BytesIO()693f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12")694f.seek(0)695# check if not fails on TryClone and Length impl in file.rs696with pytest.raises(ComputeError):697pl.read_parquet(f)698699700def test_shift() -> None:701df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})702a = df.shift(1)703b = pl.DataFrame(704{"A": [None, "a", "b"], "B": [None, 1, 3]},705)706assert_frame_equal(a, b)707708709def test_multiple_columns_drop() -> None:710df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1, 2, 3]})711# List input712out = df.drop(["a", "b"])713assert out.columns == ["c"]714# Positional input715out = df.drop("b", "c")716assert out.columns == ["a"]717718719def test_arg_where() -> None:720s = pl.Series([True, False, True, False])721assert_series_equal(722pl.arg_where(s, eager=True).cast(int),723pl.Series([0, 2]),724)725726727def test_to_dummies() -> None:728df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})729dummies = df.to_dummies()730731assert dummies["A_a"].to_list() == [1, 0, 0]732assert dummies["A_b"].to_list() == [0, 1, 0]733assert dummies["A_c"].to_list() == [0, 0, 1]734735df = pl.DataFrame({"a": [1, 2, 3]})736res = df.to_dummies()737738expected = pl.DataFrame(739{"a_1": [1, 0, 0], "a_2": [0, 1, 0], "a_3": [0, 0, 1]}740).with_columns(pl.all().cast(pl.UInt8))741assert_frame_equal(res, expected)742743df = pl.DataFrame(744{745"i": [1, 2, 3],746"category": ["dog", "cat", "cat"],747},748schema={"i": pl.Int32, "category": 
pl.Categorical()},749)750expected = pl.DataFrame(751{752"i": [1, 2, 3],753"category|cat": [0, 1, 1],754"category|dog": [1, 0, 0],755},756schema={"i": pl.Int32, "category|cat": pl.UInt8, "category|dog": pl.UInt8},757)758for _cols in ("category", cs.string()):759result = df.to_dummies(columns=["category"], separator="|")760assert_frame_equal(result, expected)761762# test sorted fast path763result = pl.DataFrame({"x": pl.arange(0, 3, eager=True)}).to_dummies("x")764expected = pl.DataFrame(765{"x_0": [1, 0, 0], "x_1": [0, 1, 0], "x_2": [0, 0, 1]}766).with_columns(pl.all().cast(pl.UInt8))767assert_frame_equal(result, expected)768769770def test_to_dummies_drop_first() -> None:771df = pl.DataFrame(772{773"foo": [0, 1, 2],774"bar": [3, 4, 5],775"baz": ["x", "y", "z"],776}777)778dm = df.to_dummies()779dd = df.to_dummies(drop_first=True)780781assert dd.columns == ["foo_1", "foo_2", "bar_4", "bar_5", "baz_y", "baz_z"]782assert set(dm.columns) - set(dd.columns) == {"foo_0", "bar_3", "baz_x"}783assert_frame_equal(dm.select(dd.columns), dd)784assert dd.rows() == [785(0, 0, 0, 0, 0, 0),786(1, 0, 1, 0, 1, 0),787(0, 1, 0, 1, 0, 1),788]789790791def test_to_dummies_drop_nulls() -> None:792df = pl.DataFrame(793{794"foo": [0, 1, None],795"bar": [3, None, 5],796"baz": [None, "y", "z"],797}798)799800dm = df.to_dummies(drop_nulls=True)801802expected = pl.DataFrame(803{804"foo_0": [1, 0, 0],805"foo_1": [0, 1, 0],806"bar_3": [1, 0, 0],807"bar_5": [0, 0, 1],808"baz_y": [0, 1, 0],809"baz_z": [0, 0, 1],810},811schema={812"foo_0": pl.UInt8,813"foo_1": pl.UInt8,814"bar_3": pl.UInt8,815"bar_5": pl.UInt8,816"baz_y": pl.UInt8,817"baz_z": pl.UInt8,818},819)820assert_frame_equal(dm, expected)821822823def test_to_pandas(df: pl.DataFrame) -> None:824# pyarrow cannot deal with unsigned dictionary integer yet.825# pyarrow cannot convert a time64 w/ non-zero nanoseconds826df = df.drop(["cat", "time", "enum"])827df.to_arrow()828df.to_pandas()829# test shifted df830df.shift(2).to_pandas()831df = 
pl.DataFrame({"col": pl.Series([True, False, True])})832df.shift(2).to_pandas()833834835def test_from_arrow_table() -> None:836data = {"a": [1, 2], "b": [1, 2]}837tbl = pa.table(data)838839df = cast("pl.DataFrame", pl.from_arrow(tbl))840assert_frame_equal(df, pl.DataFrame(data))841842843def test_df_stats(df: pl.DataFrame) -> None:844df.var()845df.std()846df.min()847df.max()848df.sum()849df.mean()850df.median()851df.quantile(0.4, "nearest")852853854def test_df_fold() -> None:855df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})856857assert_series_equal(858df.fold(lambda s1, s2: s1 + s2), pl.Series("a", [4.0, 5.0, 9.0])859)860assert_series_equal(861df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)),862pl.Series("a", [1.0, 1.0, 3.0]),863)864865df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})866out = df.fold(lambda s1, s2: s1 + s2)867assert_series_equal(out, pl.Series("a", ["foo11.0", "bar22.0", "233.0"]))868869df = pl.DataFrame({"a": [3, 2, 1], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})870# just check dispatch. 
values are tested on rust side.871assert len(df.sum_horizontal()) == 3872assert len(df.mean_horizontal()) == 3873assert len(df.min_horizontal()) == 3874assert len(df.max_horizontal()) == 3875876df_width_one = df[["a"]]877assert_series_equal(df_width_one.fold(lambda s1, s2: s1), df["a"])878879880@pytest.mark.may_fail_cloud # TODO: make pickleable881def test_fold_filter() -> None:882df = pl.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})883884out = df.filter(885pl.fold(886acc=pl.lit(True),887function=lambda a, b: a & b,888exprs=[pl.col(c) > 1 for c in df.columns],889)890)891892assert out.shape == (1, 2)893assert out.rows() == [(3, 2)]894895out = df.filter(896pl.fold(897acc=pl.lit(True),898function=lambda a, b: a | b,899exprs=[pl.col(c) > 1 for c in df.columns],900)901)902903assert out.shape == (3, 2)904assert out.rows() == [(1, 0), (2, 1), (3, 2)]905906907def test_column_names() -> None:908tbl = pa.table(909{910"a": pa.array([1, 2, 3, 4, 5], pa.decimal128(38, 2)),911"b": pa.array([1, 2, 3, 4, 5], pa.int64()),912}913)914for a in (tbl, tbl[:0]):915df = cast("pl.DataFrame", pl.from_arrow(a))916assert df.columns == ["a", "b"]917918919def test_init_series_edge_cases() -> None:920# confirm that we don't modify the name of the input series in-place921s1 = pl.Series("X", [1, 2, 3])922df1 = pl.DataFrame({"A": s1}, schema_overrides={"A": pl.UInt8})923assert s1.name == "X"924assert df1["A"].name == "A"925926# init same series object under different names927df2 = pl.DataFrame({"A": s1, "B": s1})928assert df2.rows(named=True) == [929{"A": 1, "B": 1},930{"A": 2, "B": 2},931{"A": 3, "B": 3},932]933934# empty series names should not be overwritten935s2 = pl.Series([1, 2, 3])936s3 = pl.Series([2, 3, 4])937df3 = pl.DataFrame([s2, s3])938assert s2.name == s3.name == ""939assert df3.columns == ["column_0", "column_1"]940941942def test_head_group_by() -> None:943commodity_prices = {944"commodity": 
[945"Wheat",946"Wheat",947"Wheat",948"Wheat",949"Corn",950"Corn",951"Corn",952"Corn",953"Corn",954],955"location": [956"StPaul",957"StPaul",958"StPaul",959"Chicago",960"Chicago",961"Chicago",962"Chicago",963"Chicago",964"Chicago",965],966"seller": [967"Bob",968"Charlie",969"Susan",970"Paul",971"Ed",972"Mary",973"Paul",974"Charlie",975"Norman",976],977"price": [1.0, 0.7, 0.8, 0.55, 2.0, 3.0, 2.4, 1.8, 2.1],978}979df = pl.DataFrame(commodity_prices)980981# this query flexes the wildcard exclusion quite a bit.982keys = ["commodity", "location"]983out = (984df.sort(by="price", descending=True)985.group_by(keys, maintain_order=True)986.agg([pl.col("*").exclude(keys).head(2).name.keep()])987.explode(cs.all().exclude(keys))988)989990assert out.shape == (5, 4)991assert out.rows() == [992("Corn", "Chicago", "Mary", 3.0),993("Corn", "Chicago", "Paul", 2.4),994("Wheat", "StPaul", "Bob", 1.0),995("Wheat", "StPaul", "Susan", 0.8),996("Wheat", "Chicago", "Paul", 0.55),997]998999df = pl.DataFrame(1000{"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]}1001)1002out = df.group_by("letters").tail(2).sort("letters")1003assert_frame_equal(1004out,1005pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]}),1006)1007out = df.group_by("letters").head(2).sort("letters")1008assert_frame_equal(1009out,1010pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]}),1011)101210131014def test_is_null_is_not_null() -> None:1015df = pl.DataFrame({"nrs": [1, 2, None]})1016assert df.select(pl.col("nrs").is_null())["nrs"].to_list() == [False, False, True]1017assert df.select(pl.col("nrs").is_not_null())["nrs"].to_list() == [1018True,1019True,1020False,1021]102210231024def test_is_nan_is_not_nan() -> None:1025df = pl.DataFrame({"nrs": np.array([1, 2, np.nan])})1026assert df.select(pl.col("nrs").is_nan())["nrs"].to_list() == [False, False, True]1027assert df.select(pl.col("nrs").is_not_nan())["nrs"].to_list() == [True, True, 
False]102810291030def test_is_finite_is_infinite() -> None:1031df = pl.DataFrame({"nrs": np.array([1, 2, np.inf])})1032assert df.select(pl.col("nrs").is_infinite())["nrs"].to_list() == [1033False,1034False,1035True,1036]1037assert df.select(pl.col("nrs").is_finite())["nrs"].to_list() == [True, True, False]103810391040def test_is_finite_is_infinite_null_series() -> None:1041df = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Null)})1042result = df.select(1043pl.col("a").is_finite().alias("finite"),1044pl.col("a").is_infinite().alias("infinite"),1045)1046expected = pl.DataFrame(1047{1048"finite": pl.Series([None, None, None], dtype=pl.Boolean),1049"infinite": pl.Series([None, None, None], dtype=pl.Boolean),1050}1051)1052assert_frame_equal(result, expected)105310541055def test_is_nan_null_series() -> None:1056df = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Null)})1057result = df.select(pl.col("a").is_nan())1058expected = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Boolean)})1059assert_frame_equal(result, expected)106010611062def test_len() -> None:1063df = pl.DataFrame({"nrs": [1, 2, 3]})1064assert cast("int", df.select(pl.col("nrs").len()).item()) == 31065assert len(pl.DataFrame()) == 0106610671068def test_multiple_column_sort() -> None:1069df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [2, 2, 3], "c": [1.0, 2.0, 3.0]})1070out = df.sort([pl.col("b"), pl.col("c").reverse()])1071assert list(out["c"]) == [2.0, 1.0, 3.0]1072assert list(out["b"]) == [2, 2, 3]10731074# Explicitly specify numpy dtype because of different defaults on Windows1075df = pl.DataFrame({"a": np.arange(1, 4, dtype=np.int64), "b": ["a", "a", "b"]})10761077assert_frame_equal(1078df.sort("a", descending=True),1079pl.DataFrame({"a": [3, 2, 1], "b": ["b", "a", "a"]}),1080)1081assert_frame_equal(1082df.sort("b", descending=True, maintain_order=True),1083pl.DataFrame({"a": [3, 1, 2], "b": ["b", "a", "a"]}),1084)1085assert_frame_equal(1086df.sort(["b", "a"], 
descending=[False, True]),1087pl.DataFrame({"a": [2, 1, 3], "b": ["a", "a", "b"]}),1088)108910901091def test_cast_frame() -> None:1092df = pl.DataFrame(1093{1094"a": [1.0, 2.5, 3.0],1095"b": [4, 5, None],1096"c": [True, False, True],1097"d": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)],1098}1099)11001101# cast via col:dtype map1102assert df.cast(1103dtypes={"b": pl.Float32, "c": pl.String, "d": pl.Datetime("ms")},1104).schema == {1105"a": pl.Float64,1106"b": pl.Float32,1107"c": pl.String,1108"d": pl.Datetime("ms"),1109}11101111# cast via col:pytype map1112assert df.cast(1113dtypes={"b": float, "c": str, "d": datetime},1114).schema == {1115"a": pl.Float64,1116"b": pl.Float64,1117"c": pl.String,1118"d": pl.Datetime("us"),1119}11201121# cast via selector:dtype map1122assert df.cast(1123{1124cs.numeric(): pl.UInt8,1125cs.temporal(): pl.String,1126}1127).rows() == [1128(1, 4, True, "2020-01-02"),1129(2, 5, False, "2021-03-04"),1130(3, None, True, "2022-05-06"),1131]11321133# cast all fields to a single type1134assert df.cast(pl.String).to_dict(as_series=False) == {1135"a": ["1.0", "2.5", "3.0"],1136"b": ["4", "5", None],1137"c": ["true", "false", "true"],1138"d": ["2020-01-02", "2021-03-04", "2022-05-06"],1139}114011411142def test_duration_arithmetic() -> None:1143df = pl.DataFrame(1144{"a": [datetime(2022, 1, 1, 0, 0, 0), datetime(2022, 1, 2, 0, 0, 0)]}1145)1146d1 = pl.duration(days=3, microseconds=987000)1147d2 = pl.duration(days=6, milliseconds=987)11481149assert_frame_equal(1150df.with_columns(1151b=(df["a"] + d1),1152c=(pl.col("a") + d2),1153),1154pl.DataFrame(1155{1156"a": [1157datetime(2022, 1, 1, 0, 0, 0),1158datetime(2022, 1, 2, 0, 0, 0),1159],1160"b": [1161datetime(2022, 1, 4, 0, 0, 0, 987000),1162datetime(2022, 1, 5, 0, 0, 0, 987000),1163],1164"c": [1165datetime(2022, 1, 7, 0, 0, 0, 987000),1166datetime(2022, 1, 8, 0, 0, 0, 987000),1167],1168}1169),1170)117111721173def test_assign() -> None:1174# check if can assign in case of a single column1175df 
# NOTE(review): this chunk was recovered from a mangled paste in which all
# newlines were stripped and the original line numbers were fused into the
# code text. The reconstruction below restores the original statements and
# conventional (black-style) formatting; no runtime behavior was changed.
# The first and last statements are continuations of code outside the
# visible chunk and are reconstructed on a best-effort basis.
df = pl.DataFrame({"a": [1, 2, 3]})

# test if we can assign in case of single column
df = df.with_columns(pl.col("a") * 2)
assert list(df["a"]) == [2, 4, 6]


def test_arg_sort_by(df: pl.DataFrame) -> None:
    idx_df = df.select(
        pl.arg_sort_by(["int_nulls", "floats"], descending=[False, True]).alias("idx")
    )
    assert (idx_df["idx"] == [1, 0, 2]).all()

    idx_df = df.select(
        pl.arg_sort_by(["int_nulls", "floats"], descending=False).alias("idx")
    )
    assert (idx_df["idx"] == [1, 0, 2]).all()

    df = pl.DataFrame({"x": [0, 0, 0, 1, 1, 2], "y": [9, 9, 8, 7, 6, 6]})
    for expr, expected in (
        (pl.arg_sort_by(["x", "y"]), [2, 0, 1, 4, 3, 5]),
        (pl.arg_sort_by(["x", "y"], descending=[True, True]), [5, 3, 4, 0, 1, 2]),
        (pl.arg_sort_by(["x", "y"], descending=[True, False]), [5, 4, 3, 2, 0, 1]),
        (pl.arg_sort_by(["x", "y"], descending=[False, True]), [0, 1, 2, 3, 4, 5]),
    ):
        assert (df.select(expr.alias("idx"))["idx"] == expected).all()


def test_literal_series() -> None:
    df = pl.DataFrame(
        {
            "a": np.array([21.7, 21.8, 21], dtype=np.float32),
            "b": np.array([1, 3, 2], dtype=np.int8),
            "c": ["reg1", "reg2", "reg3"],
            "d": np.array(
                [datetime(2022, 8, 16), datetime(2022, 8, 17), datetime(2022, 8, 18)],
                dtype="<M8[ns]",
            ),
        },
        schema_overrides={"a": pl.Float64},
    )
    out = (
        df.lazy()
        .with_columns(pl.Series("e", [2, 1, 3], pl.Int32))
        .with_columns(pl.col("e").cast(pl.Float32))
        .collect()
    )
    expected_schema = {
        "a": pl.Float64,
        "b": pl.Int8,
        "c": pl.String,
        "d": pl.Datetime("ns"),
        "e": pl.Float32,
    }
    assert_frame_equal(
        pl.DataFrame(
            [
                (21.7, 1, "reg1", datetime(2022, 8, 16, 0), 2),
                (21.8, 3, "reg2", datetime(2022, 8, 17, 0), 1),
                (21.0, 2, "reg3", datetime(2022, 8, 18, 0), 3),
            ],
            schema=expected_schema,  # type: ignore[arg-type]
            orient="row",
        ),
        out,
        abs_tol=0.00001,
    )


def test_write_csv() -> None:
    df = pl.DataFrame(
        {
            "foo": [1, 2, 3, 4, 5],
            "bar": [6, 7, 8, 9, 10],
            "ham": ["a", "b", "c", "d", "e"],
        }
    )
    expected = "foo,bar,ham\n1,6,a\n2,7,b\n3,8,c\n4,9,d\n5,10,e\n"

    # if no file argument is supplied, write_csv() will return the string
    s = df.write_csv()
    assert s == expected

    # otherwise it will write to the file/iobuffer
    file = BytesIO()
    df.write_csv(file)
    file.seek(0)
    s = file.read().decode("utf8")
    assert s == expected


def test_from_generator_or_iterable() -> None:
    # generator function
    def gen(n: int, *, strkey: bool = True) -> Iterator[Any]:
        for i in range(n):
            yield (str(i) if strkey else i), 1 * i, 2**i, 3**i

    def gen_named(n: int, *, strkey: bool = True) -> Iterator[Any]:
        for i in range(n):
            yield {"a": (str(i) if strkey else i), "b": 1 * i, "c": 2**i, "d": 3**i}

    # iterable object
    class Rows:
        def __init__(self, n: int, *, strkey: bool = True) -> None:
            self._n = n
            self._strkey = strkey

        def __iter__(self) -> Iterator[Any]:
            yield from gen(self._n, strkey=self._strkey)

    # check init from column-oriented generator
    assert_frame_equal(
        pl.DataFrame(data=gen(4, strkey=False), orient="col"),
        pl.DataFrame(
            data=[(0, 0, 1, 1), (1, 1, 2, 3), (2, 2, 4, 9), (3, 3, 8, 27)], orient="col"
        ),
    )
    # check init from row-oriented generators (more common)
    expected = pl.DataFrame(
        data=list(gen(4)), schema=["a", "b", "c", "d"], orient="row"
    )
    for generated_frame in (
        pl.DataFrame(data=gen(4), schema=["a", "b", "c", "d"]),
        pl.DataFrame(data=Rows(4), schema=["a", "b", "c", "d"]),
        pl.DataFrame(data=(x for x in Rows(4)), schema=["a", "b", "c", "d"]),
    ):
        assert_frame_equal(expected, generated_frame)
        assert generated_frame.schema == {
            "a": pl.String,
            "b": pl.Int64,
            "c": pl.Int64,
            "d": pl.Int64,
        }

    # test 'iterable_to_pydf' directly to validate 'chunk_size' behaviour
    cols = ["a", "b", ("c", pl.Int8), "d"]

    expected_data = [("0", 0, 1, 1), ("1", 1, 2, 3), ("2", 2, 4, 9), ("3", 3, 8, 27)]
    expected_schema = [
        ("a", pl.String),
        ("b", pl.Int64),
        ("c", pl.Int8),
        ("d", pl.Int64),
    ]

    for params in (
        {"data": Rows(4)},
        {"data": gen(4), "chunk_size": 2},
        {"data": Rows(4), "chunk_size": 3},
        {"data": gen(4), "infer_schema_length": None},
        {"data": Rows(4), "infer_schema_length": 1},
        {"data": gen(4), "chunk_size": 2},
        {"data": Rows(4), "infer_schema_length": 5},
        {"data": gen(4), "infer_schema_length": 3, "chunk_size": 2},
        {"data": gen(4), "infer_schema_length": None, "chunk_size": 3},
    ):
        d = iterable_to_pydf(schema=cols, **params)  # type: ignore[arg-type]
        assert expected_data == d.row_tuples()
        assert expected_schema == list(zip(d.columns(), d.dtypes(), strict=True))

    # ref: issue #6489 (initial chunk_size cannot be smaller than 'infer_schema_length')
    df = pl.DataFrame(
        data=iter(([{"col": None}] * 1000) + [{"col": ["a", "b", "c"]}]),
        infer_schema_length=1001,
    )
    assert df.schema == {"col": pl.List(pl.String)}
    assert df[-2:]["col"].to_list() == [None, ["a", "b", "c"]]

    # ref: issue #23404 (infer_schema_length=None should always scan all data)
    d = iterable_to_pydf(
        data=chain(repeat({"col": 1}, length_minus_1 := 100), repeat({"col": 1.1}, 1)),
        infer_schema_length=None,
        chunk_size=length_minus_1,
    )
    assert d.dtypes() == [pl.Float64()]

    # empty iterator
    assert_frame_equal(
        pl.DataFrame(data=gen(0), schema=["a", "b", "c", "d"]),
        pl.DataFrame(schema=["a", "b", "c", "d"]),
    )

    # schema overrides
    assert_frame_equal(
        pl.DataFrame(
            data=gen_named(1),
            schema_overrides={"a": pl.Float64(), "c": pl.Float64()},
        ),
        pl.DataFrame([{"a": 0.0, "b": 0, "c": 1.0, "d": 1}]),
    )


def test_from_rows() -> None:
    df = pl.from_records([[1, 2, "foo"], [2, 3, "bar"]], orient="row")
    assert_frame_equal(
        df,
        pl.DataFrame(
            {"column_0": [1, 2], "column_1": [2, 3], "column_2": ["foo", "bar"]}
        ),
    )
    df = pl.from_records(
        [[1, datetime.fromtimestamp(100)], [2, datetime.fromtimestamp(2398754908)]],
        schema_overrides={"column_0": pl.UInt32},
        orient="row",
    )
    assert df.dtypes == [pl.UInt32, pl.Datetime]

    # auto-inference with same num rows/cols
    data = [(1, 2, "foo"), (2, 3, "bar"), (3, 4, "baz")]
    df = pl.from_records(data, orient="row")
    assert data == df.rows()


@pytest.mark.parametrize(
    "records",
    [
        [
            {"id": 1, "value": 100, "_meta": "a"},
            {"id": 2, "value": 101, "_meta": "b"},
        ],
        [
            None,
            {"id": 1, "value": 100, "_meta": "a"},
            {"id": 2, "value": 101, "_meta": "b"},
        ],
        [
            {"id": 1, "value": 100, "_meta": "a"},
            {"id": 2, "value": 101, "_meta": "b"},
            None,
        ],
        [
            MappingObject(id=1, value=100, _meta="a"),
            MappingObject(id=2, value=101, _meta="b"),
        ],
        [
            None,
            MappingObject(id=1, value=100, _meta="a"),
            MappingObject(id=2, value=101, _meta="b"),
        ],
        [
            MappingObject(id=1, value=100, _meta="a"),
            MappingObject(id=2, value=101, _meta="b"),
            None,
        ],
    ],
)
def test_from_rows_of_dicts(records: Sequence[Mapping[str, Any]]) -> None:
    for df_init in (pl.from_dicts, pl.DataFrame):
        df1 = df_init(records).remove(pl.col("id").is_null())
        assert df1.rows() == [(1, 100, "a"), (2, 101, "b")]

        overrides = {
            "id": pl.Int16,
            "value": pl.Int32,
        }
        df2 = df_init(records, schema_overrides=overrides).remove(
            pl.col("id").is_null()
        )
        assert df2.rows() == [(1, 100, "a"), (2, 101, "b")]
        assert df2.schema == {"id": pl.Int16, "value": pl.Int32, "_meta": pl.String}

        df3 = df_init(records, schema=overrides).remove(pl.col("id").is_null())
        assert df3.rows() == [(1, 100), (2, 101)]
        assert df3.schema == {"id": pl.Int16, "value": pl.Int32}

    # explicitly check "anyvalue" conversion for dict/mapping dtypes
    py_s = PySeries.new_from_any_values("s", records, True)
    assert py_s.dtype() == pl.Struct(
        {
            "id": pl.Int64,
            "value": pl.Int64,
            "_meta": pl.String,
        }
    )


def test_from_records_with_schema_overrides_12032() -> None:
    # the 'id' fields contains an int value that exceeds Int64 and doesn't have an exact
    # Float64 representation; confirm that the override is applied *during* inference,
    # not as a post-inference cast, so we maintain the accuracy of the original value.
    rec = [
        {"id": 9187643043065364490, "x": 333, "y": None},
        {"id": 9223671840084328467, "x": 666.5, "y": 1698177261953686},
        {"id": 9187643043065364505, "x": 999, "y": 9223372036854775807},
    ]
    df = pl.from_records(rec, schema_overrides={"x": pl.Float32, "id": pl.UInt64})
    assert df.schema == OrderedDict(
        [
            ("id", pl.UInt64),
            ("x", pl.Float32),
            ("y", pl.Int64),
        ]
    )
    assert rec == df.rows(named=True)


def test_from_large_uint64_misc() -> None:
    uint_data = [[9187643043065364490, 9223671840084328467, 9187643043065364505]]

    df = pl.DataFrame(uint_data, orient="col", schema_overrides={"column_0": pl.UInt64})
    assert df["column_0"].dtype == pl.UInt64
    assert df["column_0"].to_list() == uint_data[0]

    for overrides in ({}, {"column_1": pl.UInt64}):
        df = pl.DataFrame(
            uint_data,
            orient="row",
            schema_overrides=overrides,
        )
        assert df.schema == OrderedDict(
            [
                ("column_0", pl.Int64),
                ("column_1", pl.Int128 if overrides == {} else pl.UInt64),
                ("column_2", pl.Int64),
            ]
        )
        assert df.row(0) == tuple(uint_data[0])


def test_repeat_by_unequal_lengths_panic() -> None:
    df = pl.DataFrame(
        {
            "a": ["x", "y", "z"],
        }
    )
    with pytest.raises(ShapeError):
        df.select(pl.col("a").repeat_by(pl.Series([2, 2])))


@pytest.mark.parametrize(
    ("value", "values_expect"),
    [
        (1.2, [[1.2], [1.2, 1.2], [1.2, 1.2, 1.2]]),
        (True, [[True], [True, True], [True, True, True]]),
        ("x", [["x"], ["x", "x"], ["x", "x", "x"]]),
        (b"a", [[b"a"], [b"a", b"a"], [b"a", b"a", b"a"]]),
    ],
)
def test_repeat_by_broadcast_left(
    value: float | bool | str, values_expect: list[list[float | bool | str]]
) -> None:
    df = pl.DataFrame(
        {
            "n": [1, 2, 3],
        }
    )
    expected = pl.DataFrame({"values": values_expect})
    result = df.select(pl.lit(value).repeat_by(pl.col("n")).alias("values"))
    assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    ("a", "a_expected"),
    [
        ([1.2, 2.2, 3.3], [[1.2, 1.2, 1.2], [2.2, 2.2, 2.2], [3.3, 3.3, 3.3]]),
        ([True, False], [[True, True, True], [False, False, False]]),
        (["x", "y", "z"], [["x", "x", "x"], ["y", "y", "y"], ["z", "z", "z"]]),
        (
            [b"a", b"b", b"c"],
            [[b"a", b"a", b"a"], [b"b", b"b", b"b"], [b"c", b"c", b"c"]],
        ),
    ],
)
def test_repeat_by_broadcast_right(
    a: list[float | bool | str], a_expected: list[list[float | bool | str]]
) -> None:
    df = pl.DataFrame(
        {
            "a": a,
        }
    )
    expected = pl.DataFrame({"a": a_expected})
    result = df.select(pl.col("a").repeat_by(3))
    assert_frame_equal(result, expected)
    result = df.select(pl.col("a").repeat_by(pl.lit(3)))
    assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    ("a", "a_expected"),
    [
        (["foo", "bar"], [["foo", "foo"], ["bar", "bar", "bar"]]),
        ([1, 2], [[1, 1], [2, 2, 2]]),
        ([True, False], [[True, True], [False, False, False]]),
        (
            [b"a", b"b"],
            [[b"a", b"a"], [b"b", b"b", b"b"]],
        ),
    ],
)
def test_repeat_by(
    a: list[float | bool | str], a_expected: list[list[float | bool | str]]
) -> None:
    df = pl.DataFrame({"a": a, "n": [2, 3]})
    expected = pl.DataFrame({"a": a_expected})
    result = df.select(pl.col("a").repeat_by("n"))
    assert_frame_equal(result, expected)


def test_join_dates() -> None:
    dts_in = pl.datetime_range(
        datetime(2021, 6, 24),
        datetime(2021, 6, 24, 10, 0, 0),
        interval=timedelta(hours=1),
        closed="left",
        eager=True,
    )
    dts = (
        dts_in.cast(int)
        .map_elements(lambda x: x + np.random.randint(1_000 * 60, 60_000 * 60))
        .cast(pl.Datetime)
    )

    # some df with sensor id, (randomish) datetime and some value
    df = pl.DataFrame(
        {
            "sensor": ["a"] * 5 + ["b"] * 5,
            "datetime": dts,
            "value": [2, 3, 4, 1, 2, 3, 5, 1, 2, 3],
        }
    )
    out = df.join(df, on="datetime")
    assert out.height == df.height


def test_asof_cross_join() -> None:
    left = pl.DataFrame({"a": [-10, 5, 10], "left_val": ["a", "b", "c"]}).with_columns(
        pl.col("a").set_sorted()
    )
    right = pl.DataFrame(
        {"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}
    ).with_columns(pl.col("a").set_sorted())

    # only test dispatch of asof join
    out = left.join_asof(right, on="a")
    assert out.shape == (3, 3)

    left.lazy().join_asof(right.lazy(), on="a").collect()
    assert out.shape == (3, 3)

    # only test dispatch of cross join
    out = left.join(right, how="cross")
    assert out.shape == (15, 4)

    left.lazy().join(right.lazy(), how="cross").collect()
    assert out.shape == (15, 4)


def test_join_bad_input_type() -> None:
    left = pl.DataFrame({"a": [1, 2, 3]})
    right = pl.DataFrame({"a": [1, 2, 3]})

    with pytest.raises(
        TypeError,
        match=r"expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",
    ):
        left.join(right.lazy(), on="a")  # type: ignore[arg-type]

    with pytest.raises(
        TypeError,
        match=r"expected `other` .*to be a 'DataFrame'.* not 'Series'",
    ):
        left.join(pl.Series([1, 2, 3]), on="a")  # type: ignore[arg-type]

    class DummyDataFrameSubclass(pl.DataFrame):
        pass

    right = DummyDataFrameSubclass(right)

    left.join(right, on="a")


def test_join_where() -> None:
    east = pl.DataFrame(
        {
            "id": [100, 101, 102],
            "dur": [120, 140, 160],
            "rev": [12, 14, 16],
            "cores": [2, 8, 4],
        }
    )
    west = pl.DataFrame(
        {
            "t_id": [404, 498, 676, 742],
            "time": [90, 130, 150, 170],
            "cost": [9, 13, 15, 16],
            "cores": [4, 2, 1, 4],
        }
    )
    out = east.join_where(
        west,
        pl.col("dur") < pl.col("time"),
        pl.col("rev") < pl.col("cost"),
    )

    expected = pl.DataFrame(
        {
            "id": [100, 100, 100, 101, 101],
            "dur": [120, 120, 120, 140, 140],
            "rev": [12, 12, 12, 14, 14],
            "cores": [2, 2, 2, 8, 8],
            "t_id": [498, 676, 742, 676, 742],
            "time": [130, 150, 170, 150, 170],
            "cost": [13, 15, 16, 15, 16],
            "cores_right": [2, 1, 4, 1, 4],
        }
    )

    assert_frame_equal(out, expected)


def test_join_where_bad_input_type() -> None:
    east = pl.DataFrame(
        {
            "id": [100, 101, 102],
            "dur": [120, 140, 160],
            "rev": [12, 14, 16],
            "cores": [2, 8, 4],
        }
    )
    west = pl.DataFrame(
        {
            "t_id": [404, 498, 676, 742],
            "time": [90, 130, 150, 170],
            "cost": [9, 13, 15, 16],
            "cores": [4, 2, 1, 4],
        }
    )
    with pytest.raises(
        TypeError,
        match=r"expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",
    ):
        east.join_where(
            west.lazy(),  # type: ignore[arg-type]
            pl.col("dur") < pl.col("time"),
            pl.col("rev") < pl.col("cost"),
        )

    with pytest.raises(
        TypeError,
        match=r"expected `other` .*to be a 'DataFrame'.* not 'Series'",
    ):
        east.join_where(
            pl.Series(west),  # type: ignore[arg-type]
            pl.col("dur") < pl.col("time"),
            pl.col("rev") < pl.col("cost"),
        )

    class DummyDataFrameSubclass(pl.DataFrame):
        pass

    west = DummyDataFrameSubclass(west)

    east.join_where(
        west,
        pl.col("dur") < pl.col("time"),
        pl.col("rev") < pl.col("cost"),
    )


def test_str_concat() -> None:
    df = pl.DataFrame(
        {
            "nrs": [1, 2, 3, 4],
            "name": ["ham", "spam", "foo", None],
        }
    )
    out = df.with_columns((pl.lit("Dr. ") + pl.col("name")).alias("graduated_name"))
    assert out["graduated_name"][0] == "Dr. ham"
    assert out["graduated_name"][1] == "Dr. spam"


def test_dot_product() -> None:
    df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [2, 2, 2, 2]})

    assert df["a"].dot(df["b"]) == 20
    assert typing.cast("int", df.select([pl.col("a").dot("b")])[0, "a"]) == 20

    result = pl.Series([1, 2, 3]) @ pl.Series([4, 5, 6])
    assert isinstance(result, int)
    assert result == 32

    result = pl.Series([1, 2, 3]) @ pl.Series([4.0, 5.0, 6.0])
    assert isinstance(result, float)
    assert result == 32.0

    result = pl.Series([1.0, 2.0, 3.0]) @ pl.Series([4.0, 5.0, 6.0])
    assert isinstance(result, float)
    assert result == 32.0

    with pytest.raises(
        InvalidOperationError, match="`dot` operation not supported for dtype `bool`"
    ):
        pl.Series([True, False, False, True]) @ pl.Series([4, 5, 6, 7])

    with pytest.raises(
        InvalidOperationError, match="`dot` operation not supported for dtype `str`"
    ):
        pl.Series([1, 2, 3, 4]) @ pl.Series(["True", "False", "False", "True"])


def test_hash_rows() -> None:
    df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [2, 2, 2, 2]})
    assert df.hash_rows().dtype == pl.UInt64
    assert df["a"].hash().dtype == pl.UInt64
    assert df.select([pl.col("a").hash().alias("foo")])["foo"].dtype == pl.UInt64


def test_reproducible_hash_with_seeds() -> None:
    """
    Test the reproducibility of DataFrame.hash_rows, Series.hash, and Expr.hash.

    cf. issue #3966, hashes must always be reproducible across sessions when using
    the same seeds.
    """
    df = pl.DataFrame({"s": [1234, None, 5678]})
    seeds = (11, 22, 33, 44)
    expected = pl.Series(
        "s",
        [7829205897147972687, 10151361788274345728, 17508017346787321581],
        dtype=pl.UInt64,
    )
    result = df.hash_rows(*seeds)
    assert_series_equal(expected, result, check_names=False, check_exact=True)
    result = df["s"].hash(*seeds)
    assert_series_equal(expected, result, check_names=False, check_exact=True)
    result = df.select([pl.col("s").hash(*seeds)])["s"]
    assert_series_equal(expected, result, check_names=False, check_exact=True)


@pytest.mark.slow
@pytest.mark.parametrize(
    "e",
    [
        pl.int_range(1_000_000),
        # Test code path for null_count > 0
        pl.when(pl.int_range(1_000_000) != 0).then(pl.int_range(1_000_000)),
    ],
)
def test_hash_collision_multiple_columns_equal_values_15390(e: pl.Expr) -> None:
    df = pl.select(e.alias("a"))

    for n_columns in (1, 2, 3, 4):
        s = df.select(pl.col("a").alias(f"x{i}") for i in range(n_columns)).hash_rows()

        vc = s.sort().value_counts(sort=True)
        max_bucket_size = vc["count"][0]

        assert max_bucket_size == 1


@pytest.mark.may_fail_auto_streaming  # Python objects not yet supported in row encoding
@pytest.mark.may_fail_cloud
def test_hashing_on_python_objects() -> None:
    # see if we can do a group_by, drop_duplicates on a DataFrame with objects.
    # this requires that the hashing and aggregations are done on python objects

    df = pl.DataFrame({"a": [1, 1, 3, 4], "b": [1, 1, 2, 2]})

    class Foo:
        def __hash__(self) -> int:
            return 0

        def __eq__(self, other: object) -> bool:
            return True

    df = df.with_columns(pl.col("a").map_elements(lambda x: Foo()).alias("foo"))
    assert df.group_by(["foo"]).first().shape == (1, 3)
    assert df.unique().shape == (3, 3)


def test_unique_unit_rows() -> None:
    df = pl.DataFrame({"a": [1], "b": [None]}, schema={"a": pl.Int64, "b": pl.Float32})

    # 'unique' one-row frame should be equal to the original frame
    assert_frame_equal(df, df.unique(subset="a"))
    for col in df.columns:
        assert df.n_unique(subset=[col]) == 1


def test_panic() -> None:
    # may contain some tests that yielded a panic in polars or pl_arrow
    # https://github.com/pola-rs/polars/issues/1110
    a = pl.DataFrame(
        {
            "col1": ["a"] * 500 + ["b"] * 500,
        }
    )
    a.filter(pl.col("col1") != "b")


def test_horizontal_agg() -> None:
    df = pl.DataFrame({"a": [1, None, 3], "b": [1, 2, 3]})

    assert_series_equal(df.sum_horizontal(), pl.Series("sum", [2, 2, 6]))
    assert_series_equal(
        df.sum_horizontal(ignore_nulls=False), pl.Series("sum", [2, None, 6])
    )
    assert_series_equal(
        df.mean_horizontal(ignore_nulls=False), pl.Series("mean", [1.0, None, 3.0])
    )


def test_slicing() -> None:
    # https://github.com/pola-rs/polars/issues/1322
    n = 20

    df = pl.DataFrame(
        {
            "d": ["u", "u", "d", "c", "c", "d", "d"] * n,
            "v1": [None, "help", None, None, None, None, None] * n,
        }
    )

    assert (df.filter(pl.col("d") != "d").select([pl.col("v1").unique()])).shape == (
        2,
        1,
    )


def test_group_by_cat_list() -> None:
    grouped = (
        pl.DataFrame(
            [
                pl.Series("str_column", ["a", "b", "b", "a", "b"]),
                pl.Series("int_column", [1, 1, 2, 2, 3]),
            ]
        )
        .with_columns(pl.col("str_column").cast(pl.Categorical).alias("cat_column"))
        .group_by("int_column", maintain_order=True)
        .agg([pl.col("cat_column")])["cat_column"]
    )

    out = grouped.explode()
    assert out.dtype == pl.Categorical
    assert out[0] == "a"


@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
def test_group_by_agg_n_unique_floats(dtype: pl.DataType) -> None:
    # tests proper dispatch
    df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})

    out = df.group_by("a", maintain_order=True).agg(
        [pl.col("b").cast(dtype).n_unique()]
    )
    assert out["b"].to_list() == [2, 1]


def test_group_by_agg_n_unique_empty_group_idx_path() -> None:
    df = pl.DataFrame(
        {
            "key": [1, 1, 1, 2, 2, 2],
            "value": [1, 2, 3, 4, 5, 6],
            "filt": [True, True, True, False, False, False],
        }
    )
    out = df.group_by("key", maintain_order=True).agg(
        pl.col("value").filter("filt").n_unique().alias("n_unique")
    )
    expected = pl.DataFrame(
        {
            "key": [1, 2],
            "n_unique": pl.Series([3, 0], dtype=pl.get_index_type()),
        }
    )
    assert_frame_equal(out, expected)


def test_group_by_agg_n_unique_empty_group_slice_path() -> None:
    df = pl.DataFrame(
        {
            "key": [1, 1, 1, 2, 2, 2],
            "value": [1, 2, 3, 4, 5, 6],
            "filt": [False, False, False, False, False, False],
        }
    )
    out = df.group_by("key", maintain_order=True).agg(
        pl.col("value").filter("filt").n_unique().alias("n_unique")
    )
    expected = pl.DataFrame(
        {
            "key": [1, 2],
            "n_unique": pl.Series([0, 0], dtype=pl.get_index_type()),
        }
    )
    assert_frame_equal(out, expected)


def test_select_by_dtype(df: pl.DataFrame) -> None:
    out = df.select(pl.col(pl.String))
    assert out.columns == ["strings", "strings_nulls"]
    out = df.select(pl.col([pl.String, pl.Boolean]))
    assert out.columns == ["bools", "bools_nulls", "strings", "strings_nulls"]
    out = df.select(pl.col(INTEGER_DTYPES))
    assert out.columns == ["int", "int_nulls"]

    out = df.select(ints=pl.struct(pl.col(INTEGER_DTYPES)))
    assert out.schema == {
        "ints": pl.Struct([pl.Field("int", pl.Int64), pl.Field("int_nulls", pl.Int64)])
    }


def test_with_row_index() -> None:
    df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})

    out = df.with_row_index()
    assert out["index"].to_list() == [0, 1, 2]

    out = df.lazy().with_row_index().collect()
    assert out["index"].to_list() == [0, 1, 2]


def test_with_row_index_bad_offset() -> None:
    df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})

    with pytest.raises(ValueError, match="cannot be negative"):
        df.with_row_index(offset=-1)
    with pytest.raises(
        ValueError, match="cannot be greater than the maximum index value"
    ):
        df.with_row_index(offset=2**64)


def test_with_row_index_bad_offset_lazy() -> None:
    lf = pl.LazyFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})

    with pytest.raises(ValueError, match="cannot be negative"):
        lf.with_row_index(offset=-1)
    with pytest.raises(
        ValueError, match="cannot be greater than the maximum index value"
    ):
        lf.with_row_index(offset=2**64)


def test_with_row_count_deprecated() -> None:
    df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})

    with pytest.deprecated_call():
        out = df.with_row_count()
    assert out["row_nr"].to_list() == [0, 1, 2]

    with pytest.deprecated_call():
        out = df.lazy().with_row_count().collect()
    assert out["row_nr"].to_list() == [0, 1, 2]


@pytest.mark.may_fail_cloud
def test_filter_with_all_expansion() -> None:
    df = pl.DataFrame(
        {
            "b": [1, 2, None],
            "c": [1, 2, None],
            "a": [None, None, None],
        }
    )
    out = df.filter(~pl.fold(True, lambda acc, s: acc & s.is_null(), pl.all()))
    assert out.shape == (2, 3)


# TODO: investigate this discrepancy in auto streaming
@pytest.mark.may_fail_auto_streaming
@pytest.mark.may_fail_cloud
def test_extension() -> None:
    class Foo:
        def __init__(self, value: Any) -> None:
            self.value = value

        def __repr__(self) -> str:
            return f"foo({self.value})"

    foos = [Foo(1), Foo(2), Foo(3)]

    # foos and sys.getrefcount both have a reference.
    base_count = 2

    # We compute the refcount on a separate line otherwise pytest's assert magic
    # might add reference counts.
    rc = sys.getrefcount(foos[0])
    assert rc == base_count

    df = pl.DataFrame({"groups": [1, 1, 2], "a": foos})
    rc = sys.getrefcount(foos[0])
    assert rc == base_count + 1
    del df
    rc = sys.getrefcount(foos[0])
    assert rc == base_count

    df = pl.DataFrame({"groups": [1, 1, 2], "a": foos})
    rc = sys.getrefcount(foos[0])
    assert rc == base_count + 1

    out = df.group_by("groups", maintain_order=True).agg(pl.col("a").alias("a"))
    rc = sys.getrefcount(foos[0])
    assert rc == base_count + 2
    s = out["a"].list.explode()
    rc = sys.getrefcount(foos[0])
    assert rc == base_count + 3
    del s
    rc = sys.getrefcount(foos[0])
    assert rc == base_count + 2

    assert out["a"].list.explode().to_list() == foos
    rc = sys.getrefcount(foos[0])
    assert rc == base_count + 2
    del out
    rc = sys.getrefcount(foos[0])
    assert rc == base_count + 1
    del df
    rc = sys.getrefcount(foos[0])
    assert rc == base_count


@pytest.mark.parametrize("name", [None, "n", ""])
def test_group_by_order_dispatch(name: str | None) -> None:
    df = pl.DataFrame({"x": list("bab"), "y": range(3)})
    lf = df.lazy()

    result = df.group_by("x", maintain_order=True).len(name=name)
    lazy_result = lf.group_by("x").len(name=name).sort(by="x", descending=True)

    name = "len" if name is None else name
    expected = pl.DataFrame(
        data={"x": ["b", "a"], name: [2, 1]},
        schema_overrides={name: pl.get_index_type()},
    )
    assert_frame_equal(result, expected)
    assert_frame_equal(lazy_result.collect(), expected)

    result = df.group_by("x", maintain_order=True).all()
    expected = pl.DataFrame({"x": ["b", "a"], "y": [[0, 2], [1]]})
    assert_frame_equal(result, expected)


def test_partitioned_group_by_order() -> None:
    # check if group ordering is maintained.
    # we only have 30 groups, so this triggers a partitioned group by
    df = pl.DataFrame({"x": [chr(v) for v in range(33, 63)], "y": range(30)})
    out = df.group_by("x", maintain_order=True).agg(pl.all().implode())
    assert_series_equal(out["x"], df["x"])


def test_schema() -> None:
    df = pl.DataFrame(
        {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
    )
    expected = {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String}
    assert df.schema == expected


def test_schema_equality() -> None:
    lf = pl.LazyFrame({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]})
    lf_rev = lf.select("bar", "foo")

    assert lf.collect_schema() != lf_rev.collect_schema()
    assert lf.collect().schema != lf_rev.collect().schema


def test_df_schema_unique() -> None:
    df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
    with pytest.raises(DuplicateError):
        df.columns = ["a", "a"]

    with pytest.raises(DuplicateError):
        df.rename({"b": "a"})


def test_empty_projection() -> None:
    empty_df = pl.DataFrame({"a": [1, 2], "b": [3, 4]}).select([])
    assert empty_df.rows() == []
    assert empty_df.schema == {}
    assert empty_df.shape == (0, 0)


def test_fill_null() -> None:
    df = pl.DataFrame({"a": [1, 2], "b": [3, None]})
    assert_frame_equal(df.fill_null(4), pl.DataFrame({"a": [1, 2], "b": [3, 4]}))
    assert_frame_equal(
        df.fill_null(strategy="max"), pl.DataFrame({"a": [1, 2], "b": [3, 3]})
    )

    # string and list data
    # string goes via binary
    df = pl.DataFrame(
        {
            "c": [
                ["Apple", "Orange"],
                ["Apple", "Orange"],
                None,
                ["Carrot"],
                None,
                None,
            ],
            "b": ["Apple", "Orange", None, "Carrot", None, None],
        }
    )

    assert df.select(
        pl.all().fill_null(strategy="forward").name.suffix("_forward"),
        pl.all().fill_null(strategy="backward").name.suffix("_backward"),
    ).to_dict(as_series=False) == {
        "c_forward": [
            ["Apple", "Orange"],
            ["Apple", "Orange"],
            ["Apple", "Orange"],
            ["Carrot"],
            ["Carrot"],
            ["Carrot"],
        ],
        "b_forward": ["Apple", "Orange", "Orange", "Carrot", "Carrot", "Carrot"],
        "c_backward": [
            ["Apple", "Orange"],
            ["Apple", "Orange"],
            ["Carrot"],
            ["Carrot"],
            None,
            None,
        ],
        "b_backward": ["Apple", "Orange", "Carrot", "Carrot", None, None],
    }
    # categoricals
    df = pl.DataFrame(pl.Series("cat", ["a", None], dtype=pl.Categorical))
    s = df.select(pl.col("cat").fill_null(strategy="forward"))["cat"]
    assert s.dtype == pl.Categorical
    assert s.to_list() == ["a", "a"]


def test_fill_nan() -> None:
    df = pl.DataFrame({"a": [1, 2], "b": [3.0, float("nan")]})
    assert_frame_equal(
        df.fill_nan(4),
        pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}),
    )
    assert_frame_equal(
        df.fill_nan(None),
        pl.DataFrame({"a": [1, 2], "b": [3.0, None]}),
    )
    assert df["b"].fill_nan(5.0).to_list() == [3.0, 5.0]
    df = pl.DataFrame(
        {
            "a": [1.0, np.nan, 3.0],
            "b": [datetime(1, 2, 2), datetime(2, 2, 2), datetime(3, 2, 2)],
        }
    )
    assert df.fill_nan(2.0).dtypes == [pl.Float64, pl.Datetime]


#
def test_forward_fill() -> None:
    df = pl.DataFrame({"a": [1.0, None, 3.0]})
    fill = df.select(pl.col("a").forward_fill())["a"]
    assert_series_equal(fill, pl.Series("a", [1, 1, 3]).cast(pl.Float64))

    df = pl.DataFrame({"a": [None, 1, None]})
    fill = df.select(pl.col("a").forward_fill())["a"]
    assert_series_equal(fill, pl.Series("a", [None, 1, 1]).cast(pl.Int64))


def test_backward_fill() -> None:
    df = pl.DataFrame({"a": [1.0, None, 3.0]})
    fill = df.select(pl.col("a").backward_fill())["a"]
    assert_series_equal(fill, pl.Series("a", [1, 3, 3]).cast(pl.Float64))

    df = pl.DataFrame({"a": [None, 1, None]})
    fill = df.select(pl.col("a").backward_fill())["a"]
    assert_series_equal(fill, pl.Series("a", [1, 1, None]).cast(pl.Int64))


def test_shrink_to_fit() -> None:
    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]})

    assert df.shrink_to_fit(in_place=True) is df
    assert df.shrink_to_fit(in_place=False) is not df
    assert_frame_equal(df.shrink_to_fit(in_place=False), df)


def test_add_string() -> None:
    df = pl.DataFrame({"a": ["hi", "there"], "b": ["hello", "world"]})
    expected = pl.DataFrame(
        {"a": ["hi hello", "there hello"], "b": ["hello hello", "world hello"]}
    )
    assert_frame_equal((df + " hello"), expected)

    expected = pl.DataFrame(
        {"a": ["hello hi", "hello there"], "b": ["hello hello", "hello world"]}
    )
    assert_frame_equal(("hello " + df), expected)


def test_df_broadcast() -> None:
    df = pl.DataFrame({"a": [1, 2, 3]}, schema_overrides={"a": pl.UInt8})
    out = df.with_columns(pl.lit(pl.Series("s", [[1, 2]])).first())
    assert out.shape == (3, 2)
    assert out.schema == {"a": pl.UInt8, "s": pl.List(pl.Int64)}
    assert out.rows() == [(1, [1, 2]), (2, [1, 2]), (3, [1, 2])]


@pytest.mark.may_fail_cloud  # not a lazyframe method
def test_product() -> None:
    df = pl.DataFrame(
        {
            "int": [1, 2, 3],
            "flt": [-1.0, 12.0, 9.0],
            "bool_0": [True, False, True],
            "bool_1": [True, True, True],
            "str": ["a", "b", "c"],
        },
        schema_overrides={
            "int": pl.UInt16,
            "flt": pl.Float32,
        },
    )
    out = df.product()
    expected = pl.DataFrame(
        {"int": [6], "flt": [-108.0], "bool_0": [0], "bool_1": [1], "str": [None]}
    )
    assert_frame_not_equal(out, expected, check_dtypes=True)
    assert_frame_equal(out, expected, check_dtypes=False)


def test_first_last_nth_expressions(fruits_cars: pl.DataFrame) -> None:
    df = fruits_cars
    out = df.select(pl.first())
    assert out.columns == ["A"]

    out = df.select(pl.last())
    assert out.columns == ["cars"]

    out = df.select(pl.nth(0))
    assert out.columns == ["A"]

    out = df.select(pl.nth(1))
    assert out.columns == ["fruits"]

    out = df.select(pl.nth(-2))
    assert out.columns == ["B"]


def test_is_between(fruits_cars: pl.DataFrame) -> None:
    result = fruits_cars.select(pl.col("A").is_between(2, 4)).to_series()
    assert_series_equal(result, pl.Series("A", [False, True, True, True, False]))

    result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="none")).to_series()
    assert_series_equal(result, pl.Series("A", [False, False, True, False, False]))

    result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="both")).to_series()
    assert_series_equal(result, pl.Series("A", [False, True, True, True, False]))

    result = fruits_cars.select(
        pl.col("A").is_between(2, 4, closed="right")
    ).to_series()
    assert_series_equal(result, pl.Series("A", [False, False, True, True, False]))

    result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="left")).to_series()
    assert_series_equal(result, pl.Series("A", [False, True, True, False, False]))


def test_is_between_data_types() -> None:
    df = pl.DataFrame(
        {
            "flt": [1.4, 1.2, 2.5],
            "int": [2, 3, 4],
            "str": ["xyz", "str", "abc"],
            "date": [date(2020, 1, 1), date(2020, 2, 2), date(2020, 3, 3)],
            "datetime": [
                datetime(2020, 1, 1, 0, 0, 0),
                datetime(2020, 1, 1, 10, 0, 0),
                datetime(2020, 1, 1, 12, 0, 0),
            ],
            "tm": [time(10, 30), time(0, 45), time(15, 15)],
        }
    )

    # on purpose, for float and int, we pass in a mixture of bound data types
    assert_series_equal(
        df.select(pl.col("flt").is_between(1, 2.3))[:, 0],
        pl.Series("flt", [True, True, False]),
    )
    assert_series_equal(
        df.select(pl.col("int").is_between(1.5, 3))[:, 0],
        pl.Series("int", [True, True, False]),
    )
    assert_series_equal(
        df.select(pl.col("date").is_between(date(2019, 1, 1), date(2020, 2, 5)))[:, 0],
        pl.Series("date", [True, True, False]),
    )
    assert_series_equal(
        df.select(
            pl.col("datetime").is_between(
                datetime(2020, 1, 1, 5, 0, 0), datetime(2020, 1, 1, 11, 0, 0)
            )
        )[:, 0],
        pl.Series("datetime", [False, True, False]),
    )
    assert_series_equal(
        df.select(
            pl.col("str").is_between(pl.lit("str"), pl.lit("zzz"), closed="left")
        )[:, 0],
        pl.Series("str", [True, True, False]),
    )
    assert_series_equal(
        df.select(
            pl.col("tm")
            .is_between(time(0, 45), time(10, 30), closed="right")
            .alias("tm_between")
        )[:, 0],
        pl.Series("tm_between", [True, False, False]),
    )


def test_empty_is_in() -> None:
    df_empty_isin = pl.DataFrame({"foo": ["a", "b", "c", "d"]}).filter(
        pl.col("foo").is_in([])
    )
    assert df_empty_isin.shape == (0, 1)
    assert df_empty_isin.rows() == []
    assert df_empty_isin.schema == {"foo": pl.String}


def test_group_by_slice_expression_args() -> None:
    df = pl.DataFrame({"groups": ["a"] * 10 + ["b"] * 20, "vals": range(30)})

    out = (
        df.group_by("groups", maintain_order=True)
        .agg([pl.col("vals").slice((pl.len() * 0.1).cast(int), (pl.len() // 5))])
        .explode("vals")
    )

    expected = pl.DataFrame(
        {"groups": ["a", "a", "b", "b", "b", "b"], "vals": [1, 2, 12, 13, 14, 15]}
    )
    assert_frame_equal(out, expected)


def test_join_suffixes() -> None:
    df_a = pl.DataFrame({"A": [1], "B": [1]})
    df_b = pl.DataFrame({"A": [1], "B": [1]})

    join_strategies: list[JoinStrategy] = ["left", "inner", "full", "cross"]
    for how in join_strategies:
        # no need for an assert, we error if wrong
        df_a.join(df_b, on="A" if how != "cross" else None, suffix="_y", how=how)["B_y"]

    df_a.join_asof(df_b, on=pl.col("A").set_sorted(), suffix="_y")["B_y"]


def test_explode_empty() -> None:
    df = (
        pl.DataFrame({"x": ["a", "a", "b", "b"], "y": [1, 1, 2, 2]})
        .group_by("x", maintain_order=True)
        .agg(pl.col("y").gather([]))
    )
    assert df.explode("y").to_dict(as_series=False) == {
        "x": ["a", "b"],
        "y": [None, None],
    }

    df = pl.DataFrame({"x": ["1", "2", "4"], "y": [["a", "b", "c"], ["d"], []]})
    assert_frame_equal(
        df.explode("y"),
        pl.DataFrame({"x": ["1", "1", "1", "2", "4"], "y": ["a", "b", "c", "d", None]}),
    )


# NOTE(review): continuation of a statement that runs past the visible chunk.
df
= pl.DataFrame(2438{2439"letters": ["a"],2440"numbers": [[]],2441}2442)2443assert df.explode("numbers").to_dict(as_series=False) == {2444"letters": ["a"],2445"numbers": [None],2446}244724482449def test_asof_by_multiple_keys() -> None:2450lhs = pl.DataFrame(2451{2452"a": [-20, -19, 8, 12, 14],2453"by": [1, 1, 2, 2, 2],2454"by2": [1, 1, 2, 2, 2],2455}2456)24572458rhs = pl.DataFrame(2459{2460"a": [-19, -15, 3, 5, 13],2461"by": [1, 1, 2, 2, 2],2462"by2": [1, 1, 2, 2, 2],2463}2464)24652466result = lhs.join_asof(2467rhs, on=pl.col("a").set_sorted(), by=["by", "by2"], strategy="backward"2468).select(["a", "by"])2469expected = pl.DataFrame({"a": [-20, -19, 8, 12, 14], "by": [1, 1, 2, 2, 2]})2470assert_frame_equal(2471result.group_by("by").agg("a"),2472expected.group_by("by").agg("a"),2473check_row_order=False,2474)247524762477def test_asof_bad_input_type() -> None:2478lhs = pl.DataFrame({"a": [1, 2, 3]})2479rhs = pl.DataFrame({"a": [1, 2, 3]})24802481with pytest.raises(2482TypeError,2483match=r"expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",2484):2485lhs.join_asof(rhs.lazy(), on="a") # type: ignore[arg-type]24862487with pytest.raises(2488TypeError,2489match=r"expected `other` .*to be a 'DataFrame'.* not 'Series'",2490):2491lhs.join_asof(pl.Series([1, 2, 3]), on="a") # type: ignore[arg-type]24922493class DummyDataFrameSubclass(pl.DataFrame):2494pass24952496rhs = DummyDataFrameSubclass(rhs)24972498lhs.join_asof(rhs, on="a")249925002501def test_list_of_list_of_struct() -> None:2502expected = [{"list_of_list_of_struct": [[{"a": 1}, {"a": 2}]]}]2503pa_df = pa.Table.from_pylist(expected)25042505df = pl.from_arrow(pa_df)2506assert df.rows() == [([[{"a": 1}, {"a": 2}]],)] # type: ignore[union-attr]2507assert df.to_dicts() == expected # type: ignore[union-attr]25082509df = pl.from_arrow(pa_df[:0])2510assert df.to_dicts() == [] # type: ignore[union-attr]251125122513def test_fill_null_limits() -> None:2514assert pl.DataFrame(2515{2516"a": [1, None, None, None, 5, 6, None, 
None, None, 10],2517"b": ["a", None, None, None, "b", "c", None, None, None, "d"],2518"c": [True, None, None, None, False, True, None, None, None, False],2519}2520).select(2521pl.all().fill_null(strategy="forward", limit=2),2522pl.all().fill_null(strategy="backward", limit=2).name.suffix("_backward"),2523).to_dict(as_series=False) == {2524"a": [1, 1, 1, None, 5, 6, 6, 6, None, 10],2525"b": ["a", "a", "a", None, "b", "c", "c", "c", None, "d"],2526"c": [True, True, True, None, False, True, True, True, None, False],2527"a_backward": [1, None, 5, 5, 5, 6, None, 10, 10, 10],2528"b_backward": ["a", None, "b", "b", "b", "c", None, "d", "d", "d"],2529"c_backward": [2530True,2531None,2532False,2533False,2534False,2535True,2536None,2537False,2538False,2539False,2540],2541}254225432544def test_lower_bound_upper_bound(fruits_cars: pl.DataFrame) -> None:2545res_expr = fruits_cars.select(pl.col("A").lower_bound())2546assert res_expr.item() == -922337203685477580825472548res_expr = fruits_cars.select(pl.col("B").upper_bound())2549assert res_expr.item() == 922337203685477580725502551with pytest.raises(ComputeError):2552fruits_cars.select(pl.col("fruits").upper_bound())255325542555def test_selection_misc() -> None:2556df = pl.DataFrame({"x": "abc"}, schema={"x": pl.String})25572558# literal values (as scalar/list)2559for zero in (0, [0]):2560assert df.select(zero)["literal"].to_list() == [0]2561assert df.select(literal=0)["literal"].to_list() == [0]25622563# expect string values to be interpreted as cols2564for x in ("x", ["x"], pl.col("x")):2565assert df.select(x).rows() == [("abc",)]25662567# string col + lit2568assert df.with_columns(["x", 0]).to_dicts() == [{"x": "abc", "literal": 0}]256925702571def test_selection_regex_and_multicol() -> None:2572test_df = pl.DataFrame(2573{2574"a": [1, 2, 3, 4],2575"b": [5, 6, 7, 8],2576"c": [9, 10, 11, 12],2577"foo": [13, 14, 15, 16],2578},2579schema_overrides={"foo": pl.UInt8},2580)25812582# Selection only2583test_df.select(2584pl.col(["a", 
"b", "c"]).name.suffix("_list"),2585pl.all().exclude("foo").name.suffix("_wild"),2586pl.col("^\\w$").name.suffix("_regex"),2587)25882589# Multi * Single2590assert test_df.select(pl.col(["a", "b", "c"]) * pl.col("foo")).to_dict(2591as_series=False2592) == {2593"a": [13, 28, 45, 64],2594"b": [65, 84, 105, 128],2595"c": [117, 140, 165, 192],2596}2597assert test_df.select(pl.all().exclude("foo") * pl.col("foo")).to_dict(2598as_series=False2599) == {2600"a": [13, 28, 45, 64],2601"b": [65, 84, 105, 128],2602"c": [117, 140, 165, 192],2603}26042605assert test_df.select(pl.col("^\\w$") * pl.col("foo")).to_dict(as_series=False) == {2606"a": [13, 28, 45, 64],2607"b": [65, 84, 105, 128],2608"c": [117, 140, 165, 192],2609}26102611# Multi * Multi2612result = test_df.select(pl.col(["a", "b", "c"]) * pl.col(["a", "b", "c"]))2613expected = {"a": [1, 4, 9, 16], "b": [25, 36, 49, 64], "c": [81, 100, 121, 144]}26142615assert result.to_dict(as_series=False) == expected2616assert test_df.select(pl.exclude("foo") * pl.exclude("foo")).to_dict(2617as_series=False2618) == {2619"a": [1, 4, 9, 16],2620"b": [25, 36, 49, 64],2621"c": [81, 100, 121, 144],2622}2623assert test_df.select(pl.col("^\\w$") * pl.col("^\\w$")).to_dict(2624as_series=False2625) == {2626"a": [1, 4, 9, 16],2627"b": [25, 36, 49, 64],2628"c": [81, 100, 121, 144],2629}26302631df = test_df.select(2632re=pl.struct(pl.col("^\\w$")),2633odd=pl.struct((pl.col(INTEGER_DTYPES) % 2).name.suffix("_is_odd")),2634maxes=pl.struct(pl.all().max().name.suffix("_max")),2635).head(2)2636# ┌───────────┬───────────┬─────────────┐2637# │ re ┆ odd ┆ maxes │2638# │ --- ┆ --- ┆ --- │2639# │ struct[3] ┆ struct[4] ┆ struct[4] │2640# ╞═══════════╪═══════════╪═════════════╡2641# │ {1,5,9} ┆ {1,1,1,1} ┆ {4,8,12,16} │2642# │ {2,6,10} ┆ {0,0,0,0} ┆ {4,8,12,16} │2643# └───────────┴───────────┴─────────────┘2644assert df.rows() == [2645(2646{"a": 1, "b": 5, "c": 9},2647{"a_is_odd": 1, "b_is_odd": 1, "c_is_odd": 1, "foo_is_odd": 1},2648{"a_max": 4, "b_max": 
8, "c_max": 12, "foo_max": 16},2649),2650(2651{"a": 2, "b": 6, "c": 10},2652{"a_is_odd": 0, "b_is_odd": 0, "c_is_odd": 0, "foo_is_odd": 0},2653{"a_max": 4, "b_max": 8, "c_max": 12, "foo_max": 16},2654),2655]265626572658@pytest.mark.parametrize("subset", ["a", cs.starts_with("x", "a")])2659@pytest.mark.may_fail_auto_streaming # Flaky in CI, see https://github.com/pola-rs/polars/issues/209432660@pytest.mark.may_fail_cloud2661def test_unique_on_sorted(subset: Any) -> None:2662df = pl.DataFrame(data={"a": [1, 1, 3], "b": [1, 2, 3]})26632664result = df.with_columns([pl.col("a").set_sorted()]).unique(2665subset=subset,2666keep="last",2667)26682669expected = pl.DataFrame({"a": [1, 3], "b": [2, 3]})2670assert_frame_equal(result, expected)267126722673def test_len_compute(df: pl.DataFrame) -> None:2674df = df.with_columns(pl.struct(["list_bool", "cat"]).alias("struct"))2675filtered = df.filter(pl.col("bools"))2676for col in filtered.columns:2677assert len(filtered[col]) == 126782679taken = df[[1, 2], :]2680for col in taken.columns:2681assert len(taken[col]) == 2268226832684def test_filter_sequence() -> None:2685df = pl.DataFrame({"a": [1, 2, 3]})2686assert df.filter([True, False, True])["a"].to_list() == [1, 3]2687assert df.filter(np.array([True, False, True]))["a"].to_list() == [1, 3]268826892690def test_filter_multiple_predicates() -> None:2691df = pl.DataFrame(2692{2693"a": [1, 1, 1, 2, 2],2694"b": [1, 1, 2, 2, 2],2695"c": [1, 1, 2, 3, 4],2696}2697)26982699# multiple predicates2700expected = pl.DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "c": [1, 1, 2]})2701for out in (2702df.filter(pl.col("a") == 1, pl.col("b") <= 2), # positional/splat2703df.filter([pl.col("a") == 1, pl.col("b") <= 2]), # as list2704):2705assert_frame_equal(out, expected)27062707# multiple kwargs2708assert_frame_equal(2709df.filter(a=1, b=2),2710pl.DataFrame({"a": [1], "b": [2], "c": [2]}),2711)27122713# both positional and keyword args2714assert_frame_equal(2715pl.DataFrame({"a": [2], "b": [2], "c": 
[3]}),2716df.filter(pl.col("c") < 4, a=2, b=2),2717)27182719# boolean mask2720out = df.filter([True, False, False, False, True])2721expected = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 4]})2722assert_frame_equal(out, expected)27232724# multiple boolean masks2725out = df.filter(2726np.array([True, True, False, True, False]),2727np.array([True, False, True, True, False]),2728)2729expected = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 3]})2730assert_frame_equal(out, expected)273127322733def test_indexing_set() -> None:2734df = pl.DataFrame({"bool": [True, True], "str": ["N/A", "N/A"], "nr": [1, 2]})27352736df[0, "bool"] = False2737df[0, "nr"] = 1002738df[0, "str"] = "foo"27392740assert df.to_dict(as_series=False) == {2741"bool": [False, True],2742"str": ["foo", "N/A"],2743"nr": [100, 2],2744}274527462747def test_set() -> None:2748# Setting a dataframe using indices is deprecated.2749# We keep these tests because we only generate a warning.2750np.random.seed(1)2751df = pl.DataFrame(2752{"foo": np.random.rand(10), "bar": np.arange(10), "ham": ["h"] * 10}2753)2754with pytest.raises(2755TypeError,2756match=r"DataFrame object does not support `Series` assignment by index"2757r"\n\nUse `DataFrame.with_columns`.",2758):2759df["new"] = np.random.rand(10)27602761with pytest.raises(2762TypeError,2763match=r"not allowed to set DataFrame by boolean mask in the row position"2764r"\n\nConsider using `DataFrame.with_columns`.",2765):2766df[df["ham"] > 0.5, "ham"] = "a"2767with pytest.raises(2768TypeError,2769match=r"not allowed to set DataFrame by boolean mask in the row position"2770r"\n\nConsider using `DataFrame.with_columns`.",2771):2772df[[True, False], "ham"] = "a"27732774# set 2D2775df = pl.DataFrame({"b": [0, 0]})2776df[["A", "B"]] = [[1, 2], [1, 2]]27772778with pytest.raises(ValueError):2779df[["C", "D"]] = 12780with pytest.raises(ValueError):2781df[["C", "D"]] = [1, 1]2782with pytest.raises(ValueError):2783df[["C", "D"]] = [[1, 2, 3], [1, 2, 3]]27842785# set 
tuple2786df = pl.DataFrame({"b": [0, 0]})2787df[0, "b"] = 12788assert df[0, "b"] == 127892790df[0, 0] = 22791assert df[0, "b"] == 227922793# row and col selection have to be int or str2794with pytest.raises(TypeError):2795df[:, [1]] = 1 # type: ignore[index]2796with pytest.raises(TypeError):2797df[True, :] = 1 # type: ignore[index]27982799# needs to be a 2 element tuple2800with pytest.raises(ValueError):2801df[1, 2, 3] = 128022803# we cannot index with any type, such as bool2804with pytest.raises(TypeError):2805df[True] = 1 # type: ignore[index]280628072808def test_series_iter_over_frame() -> None:2809df = pl.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]})28102811expected = {28120: pl.Series("a", [1, 2, 3]),28131: pl.Series("b", [2, 3, 4]),28142: pl.Series("c", [3, 4, 5]),2815}2816for idx, s in enumerate(df):2817assert_series_equal(s, expected[idx])28182819expected = {28200: pl.Series("c", [3, 4, 5]),28211: pl.Series("b", [2, 3, 4]),28222: pl.Series("a", [1, 2, 3]),2823}2824for idx, s in enumerate(reversed(df)):2825assert_series_equal(s, expected[idx])282628272828def test_union_with_aliases_4770() -> None:2829lf = pl.DataFrame(2830{2831"a": [1, None],2832"b": [3, 4],2833}2834).lazy()28352836lf = pl.concat(2837[2838lf.select([pl.col("a").alias("x")]),2839lf.select([pl.col("b").alias("x")]),2840]2841).filter(pl.col("x").is_not_null())28422843assert lf.collect()["x"].to_list() == [1, 3, 4]284428452846def test_init_datetimes_with_timezone() -> None:2847tz_us = "America/New_York"2848tz_europe = "Europe/Amsterdam"28492850dtm = datetime(2022, 10, 12, 12, 30)2851for time_unit in DTYPE_TEMPORAL_UNITS:2852for type_overrides in (2853{2854"schema": [2855("d1", pl.Datetime(time_unit, tz_us)),2856("d2", pl.Datetime(time_unit, tz_europe)),2857]2858},2859{2860"schema_overrides": {2861"d1": pl.Datetime(time_unit, tz_us),2862"d2": pl.Datetime(time_unit, tz_europe),2863}2864},2865):2866result = pl.DataFrame(2867data={2868"d1": 
[dtm.replace(tzinfo=ZoneInfo(tz_us))],2869"d2": [dtm.replace(tzinfo=ZoneInfo(tz_europe))],2870},2871**type_overrides,2872)2873expected = pl.DataFrame(2874{"d1": ["2022-10-12 12:30"], "d2": ["2022-10-12 12:30"]}2875).with_columns(2876pl.col("d1").str.to_datetime(time_unit=time_unit, time_zone=tz_us),2877pl.col("d2").str.to_datetime(time_unit=time_unit, time_zone=tz_europe),2878)2879assert_frame_equal(result, expected)288028812882@pytest.mark.parametrize(2883(2884"tzinfo",2885"offset",2886"dtype_time_zone",2887"expected_time_zone",2888"expected_item",2889),2890[2891(None, "", None, None, datetime(2020, 1, 1)),2892(2893timezone(timedelta(hours=-8)),2894"-08:00",2895"UTC",2896"UTC",2897datetime(2020, 1, 1, 8, tzinfo=timezone.utc),2898),2899(2900timezone(timedelta(hours=-8)),2901"-08:00",2902None,2903"UTC",2904datetime(2020, 1, 1, 8, tzinfo=timezone.utc),2905),2906],2907)2908@pytest.mark.may_fail_cloud2909def test_init_vs_strptime_consistency(2910tzinfo: timezone | None,2911offset: str,2912dtype_time_zone: str | None,2913expected_time_zone: str,2914expected_item: datetime,2915) -> None:2916result_init = pl.Series(2917[datetime(2020, 1, 1, tzinfo=tzinfo)],2918dtype=pl.Datetime("us", dtype_time_zone),2919)2920result_strptime = pl.Series([f"2020-01-01 00:00{offset}"]).str.strptime(2921pl.Datetime("us", dtype_time_zone)2922)2923assert result_init.dtype == pl.Datetime("us", expected_time_zone)2924assert result_init.item() == expected_item2925assert_series_equal(result_init, result_strptime)292629272928def test_init_vs_strptime_consistency_converts() -> None:2929result = pl.Series(2930[datetime(2020, 1, 1, tzinfo=timezone(timedelta(hours=-8)))],2931dtype=pl.Datetime("us", "America/Los_Angeles"),2932).item()2933assert result == datetime(29342020, 1, 1, 0, 0, tzinfo=ZoneInfo(key="America/Los_Angeles")2935)2936result = (2937pl.Series(["2020-01-01 00:00-08:00"])2938.str.strptime(pl.Datetime("us", "America/Los_Angeles"))2939.item()2940)2941assert result == datetime(29422020, 1, 1, 
0, 0, tzinfo=ZoneInfo(key="America/Los_Angeles")2943)294429452946def test_init_physical_with_timezone() -> None:2947tz_uae = "Asia/Dubai"2948tz_asia = "Asia/Tokyo"29492950dtm_us = 16655778000000002951for time_unit in DTYPE_TEMPORAL_UNITS:2952dtm = {"ms": dtm_us // 1_000, "ns": dtm_us * 1_000}.get(str(time_unit), dtm_us)2953df = pl.DataFrame(2954data={"d1": [dtm], "d2": [dtm]},2955schema=[2956("d1", pl.Datetime(time_unit, tz_uae)),2957("d2", pl.Datetime(time_unit, tz_asia)),2958],2959)2960assert (df["d1"].to_physical() == df["d2"].to_physical()).all()2961assert df.rows() == [2962(2963datetime(2022, 10, 12, 16, 30, tzinfo=ZoneInfo(tz_uae)),2964datetime(2022, 10, 12, 21, 30, tzinfo=ZoneInfo(tz_asia)),2965)2966]296729682969@pytest.mark.parametrize("divop", [floordiv, truediv])2970def test_floordiv_truediv(divop: Callable[..., Any]) -> None:2971# validate truediv/floordiv dataframe ops against python2972df1 = pl.DataFrame(2973data={2974"x": [0, -1, -2, -3],2975"y": [-0.0, -3.0, 5.0, -7.0],2976"z": [10, 3, -5, 7],2977}2978)29792980# scalar2981for df in [df1, df1.slice(0, 0)]:2982for n in (3, 3.0, -3, -3.0):2983py_div = [tuple(divop(elem, n) for elem in row) for row in df.rows()]2984df_div = divop(df, n).rows()2985assert py_div == df_div29862987# series2988xdf, s = df1["x"].to_frame(), pl.Series([2] * 4)2989assert list(divop(xdf, s)["x"]) == [divop(x, 2) for x in list(df1["x"])]29902991# frame2992df2 = pl.DataFrame(2993data={2994"x": [2, -2, 2, 3],2995"y": [4, 4, -4, 8],2996"z": [0.5, 2.0, -2.0, -3],2997}2998)2999df_div = divop(df1, df2).rows()3000for i, (row1, row2) in enumerate(zip(df1.rows(), df2.rows(), strict=True)):3001for j, (elem1, elem2) in enumerate(zip(row1, row2, strict=True)):3002assert divop(elem1, elem2) == df_div[i][j]300330043005@pytest.mark.parametrize(3006("subset", "keep", "expected_mask"),3007[3008(None, "first", [True, True, True, False]),3009("a", "first", [True, True, False, False]),3010(["a", "b"], "first", [True, True, False, False]),3011(("a", 
"b"), "last", [True, False, False, True]),3012(("a", "b"), "none", [True, False, False, False]),3013],3014)3015def test_unique(3016subset: str | Sequence[str], keep: UniqueKeepStrategy, expected_mask: list[bool]3017) -> None:3018df = pl.DataFrame({"a": [1, 2, 2, 2], "b": [3, 4, 4, 4], "c": [5, 6, 7, 7]})30193020result = df.unique(maintain_order=True, subset=subset, keep=keep).sort(pl.all())3021expected = df.filter(expected_mask).sort(pl.all())3022assert_frame_equal(result, expected)302330243025def test_iter_slices() -> None:3026df = pl.DataFrame(3027{3028"a": range(95),3029"b": date(2023, 1, 1),3030"c": "klmnopqrstuvwxyz",3031}3032)3033batches = list(df.iter_slices(n_rows=50))30343035assert len(batches[0]) == 503036assert len(batches[1]) == 453037assert batches[1].rows() == df[50:].rows()303830393040def test_format_empty_df() -> None:3041df = pl.DataFrame(3042[3043pl.Series("val1", [], dtype=pl.Categorical),3044pl.Series("val2", [], dtype=pl.Categorical),3045]3046).select(3047pl.format("{}:{}", pl.col("val1"), pl.col("val2")).alias("cat"),3048)3049assert df.shape == (0, 1)3050assert df.dtypes == [pl.String]305130523053def test_deadlocks_3409() -> None:3054assert (3055pl.DataFrame({"col1": [[1, 2, 3]]})3056.with_columns(3057pl.col("col1").list.eval(3058pl.element().map_elements(lambda x: x, return_dtype=pl.Int64)3059)3060)3061.to_dict(as_series=False)3062) == {"col1": [[1, 2, 3]]}30633064assert (3065pl.DataFrame({"col1": [1, 2, 3]})3066.with_columns(3067pl.col("col1").cumulative_eval(3068pl.element().map_batches(lambda x: 0, pl.Int64, returns_scalar=True)3069)3070)3071.to_dict(as_series=False)3072) == {"col1": [0, 0, 0]}307330743075def test_ceil() -> None:3076df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})3077result = df.select(pl.col("a").ceil())3078assert_frame_equal(result, pl.DataFrame({"a": [2.0, 2.0, 3.0]}))30793080df = pl.DataFrame({"a": [1, 2, 3]})3081result = df.select(pl.col("a").ceil())3082assert_frame_equal(df, result)308330843085def test_floor() -> 
None:3086df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})3087result = df.select(pl.col("a").floor())3088assert_frame_equal(result, pl.DataFrame({"a": [1.0, 1.0, 3.0]}))30893090df = pl.DataFrame({"a": [1, 2, 3]})3091result = df.select(pl.col("a").floor())3092assert_frame_equal(df, result)309330943095def test_floor_divide() -> None:3096x = 10.43097step = 0.53098df = pl.DataFrame({"x": [x]})3099assert df.with_columns(pl.col("x") // step)[0, 0] == x // step310031013102def test_round() -> None:3103df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})3104col_a_rounded = df.select(pl.col("a").round(decimals=0))["a"]3105assert_series_equal(col_a_rounded, pl.Series("a", [2, 1, 3]).cast(pl.Float64))310631073108def test_dot() -> None:3109df = pl.DataFrame({"a": [1.8, 1.2, 3.0], "b": [3.2, 1, 2]})3110assert df.select(pl.col("a").dot(pl.col("b"))).item() == 12.96311131123113def test_unstack() -> None:3114from string import ascii_uppercase31153116df = pl.DataFrame(3117{3118"col1": list(ascii_uppercase[0:9]),3119"col2": pl.int_range(0, 9, eager=True),3120"col3": pl.int_range(-9, 0, eager=True),3121}3122)3123assert df.unstack(step=3, how="vertical").to_dict(as_series=False) == {3124"col1_0": ["A", "B", "C"],3125"col1_1": ["D", "E", "F"],3126"col1_2": ["G", "H", "I"],3127"col2_0": [0, 1, 2],3128"col2_1": [3, 4, 5],3129"col2_2": [6, 7, 8],3130"col3_0": [-9, -8, -7],3131"col3_1": [-6, -5, -4],3132"col3_2": [-3, -2, -1],3133}31343135assert df.unstack(step=3, how="horizontal").to_dict(as_series=False) == {3136"col1_0": ["A", "D", "G"],3137"col1_1": ["B", "E", "H"],3138"col1_2": ["C", "F", "I"],3139"col2_0": [0, 3, 6],3140"col2_1": [1, 4, 7],3141"col2_2": [2, 5, 8],3142"col3_0": [-9, -6, -3],3143"col3_1": [-8, -5, -2],3144"col3_2": [-7, -4, -1],3145}31463147for column_subset in (("col2", "col3"), cs.integer()):3148assert df.unstack(3149step=3,3150how="horizontal",3151columns=column_subset,3152).to_dict(as_series=False) == {3153"col2_0": [0, 3, 6],3154"col2_1": [1, 4, 7],3155"col2_2": [2, 5, 
8],3156"col3_0": [-9, -6, -3],3157"col3_1": [-8, -5, -2],3158"col3_2": [-7, -4, -1],3159}316031613162def test_window_deadlock() -> None:3163np.random.seed(12)31643165df = pl.DataFrame(3166{3167"nrs": [1, 2, 3, None, 5],3168"names": ["foo", "ham", "spam", "egg", None],3169"random": np.random.rand(5),3170"groups": ["A", "A", "B", "C", "B"],3171}3172)31733174_df = df.select(3175pl.col("*"), # select all3176pl.col("random").sum().over("groups").alias("sum[random]/groups"),3177pl.col("random").implode().over("names").alias("random/name"),3178)317931803181def test_sum_empty_column_names() -> None:3182df = pl.DataFrame({"x": [], "y": []}, schema={"x": pl.Boolean, "y": pl.Boolean})3183expected = pl.DataFrame(3184{"x": [0], "y": [0]},3185schema={"x": pl.get_index_type(), "y": pl.get_index_type()},3186)3187assert_frame_equal(df.sum(), expected)318831893190def test_flags() -> None:3191df = pl.DataFrame({"a": [1, 2, 3], "b": [9, 5, 6]})3192assert df.flags == {3193"a": {"SORTED_ASC": False, "SORTED_DESC": False},3194"b": {"SORTED_ASC": False, "SORTED_DESC": False},3195}3196assert df.set_sorted("a").flags == {3197"a": {"SORTED_ASC": True, "SORTED_DESC": False},3198"b": {"SORTED_ASC": False, "SORTED_DESC": False},3199}320032013202def test_interchange() -> None:3203df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["foo", "bar"]})3204dfi = df.__dataframe__()32053206# Testing some random properties to make sure conversion happened correctly3207assert dfi.num_rows() == 23208assert dfi.get_column(0).dtype[1] == 643209assert dfi.get_column_by_name("c").get_buffers()["data"][0].bufsize == 6321032113212def test_from_dicts_undeclared_column_dtype() -> None:3213data = [{"a": 1, "b": 2}]3214result = pl.from_dicts(data, schema=["x"])3215assert result.schema == {"x": pl.Null}321632173218def test_from_dicts_with_override() -> None:3219data = [3220{"a": "1", "b": str(2**64 - 1), "c": "1"},3221{"a": "1", "b": "1", "c": "-5.0"},3222]3223override = {"a": pl.Int32, "b": pl.UInt64, "c": 
pl.Float32}3224result = pl.from_dicts(data, schema_overrides=override)3225assert_frame_equal(3226result,3227pl.DataFrame(3228{3229"a": pl.Series([1, 1], dtype=pl.Int32),3230"b": pl.Series([2**64 - 1, 1], dtype=pl.UInt64),3231"c": pl.Series([1.0, -5.0], dtype=pl.Float32),3232}3233),3234)323532363237def test_from_records_u64_12329() -> None:3238s = pl.from_records([{"a": 9908227375760408577}])3239assert s.dtypes == [pl.Int128]3240assert s["a"][0] == 9908227375760408577324132423243def test_negative_slice_12642() -> None:3244df = pl.DataFrame({"x": range(5)})3245assert_frame_equal(df.slice(-2, 1), df.tail(2).head(1))324632473248def test_iter_columns() -> None:3249df = pl.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]})3250iter_columns = df.iter_columns()3251assert_series_equal(next(iter_columns), pl.Series("a", [1, 1, 2]))3252assert_series_equal(next(iter_columns), pl.Series("b", [4, 5, 6]))325332543255def test_get_column_index() -> None:3256df = pl.DataFrame({"actual": [1001], "expected": [1000]})32573258assert df.get_column_index("actual") == 03259assert df.get_column_index("expected") == 132603261with pytest.raises(ColumnNotFoundError, match="missing"):3262df.get_column_index("missing")326332643265def test_dataframe_creation_with_different_series_lengths_19795() -> None:3266with pytest.raises(3267ShapeError,3268match=r"could not create a new DataFrame: height of column 'b' \(1\) does not match height of column 'a' \(2\)",3269):3270pl.DataFrame({"a": [1, 2], "b": [1]})327132723273def test_get_column_after_drop_20119() -> None:3274df = pl.DataFrame({"a": ["A"], "b": ["B"], "c": ["C"]})3275df.drop_in_place("a")3276c = df.get_column("c")3277assert_series_equal(c, pl.Series("c", ["C"]))327832793280def test_select_oob_row_20775() -> None:3281df = pl.DataFrame({"a": [1, 2, 3]})3282with pytest.raises(3283IndexError,3284match="index 99 is out of bounds for DataFrame of height 3",3285):3286df[99]328732883289@pytest.mark.parametrize("idx", [3, 99, -4, -99])3290def 
test_select_oob_element_20775_too_large(idx: int) -> None:3291df = pl.DataFrame({"a": [1, 2, 3]})3292with pytest.raises(3293IndexError,3294match=f"index {idx} is out of bounds for sequence of length 3",3295):3296df[idx, "a"]329732983299def test_nan_to_null() -> None:3300a = np.array([np.nan, 1])33013302df1 = pl.DataFrame(a, nan_to_null=True)3303df2 = pl.DataFrame(3304(a,),3305nan_to_null=True,3306)33073308assert_frame_equal(df1, df2)330933103311# Below 3 tests for https://github.com/pola-rs/polars/issues/17879331233133314def test_with_columns_dict_direct_typeerror() -> None:3315data = {"a": pl.col("a") * 2}3316df = pl.select(a=1)3317with pytest.raises(3318TypeError, match="Cannot pass a dictionary as a single positional argument"3319):3320df.with_columns(data)332133223323def test_with_columns_dict_unpacking() -> None:3324data = {"a": pl.col("a") * 2}3325df = pl.select(a=1).with_columns(**data)3326expected = pl.DataFrame({"a": [2]})3327assert df.equals(expected)332833293330def test_with_columns_generator_alias() -> None:3331data = {"a": pl.col("a") * 2}3332df = pl.select(a=1).with_columns(expr.alias(name) for name, expr in data.items())3333expected = pl.DataFrame({"a": [2]})3334assert df.equals(expected)333533363337