Path: blob/main/py-polars/tests/unit/lazyframe/test_lazyframe.py
8446 views
from __future__ import annotations12import re3from datetime import date, datetime4from functools import reduce5from inspect import signature6from operator import add7from string import ascii_letters8from typing import TYPE_CHECKING, Any, NoReturn, cast910import numpy as np11import pytest1213import polars as pl14import polars.selectors as cs15from polars import lit, when16from polars.exceptions import (17InvalidOperationError,18PerformanceWarning,19PolarsInefficientMapWarning,20)21from polars.testing import assert_frame_equal, assert_series_equal22from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES2324if TYPE_CHECKING:25from collections.abc import Callable2627from _pytest.capture import CaptureFixture2829from polars._typing import MapElementsStrategy, PolarsDataType30from tests.conftest import PlMonkeyPatch313233def test_init_signature_match() -> None:34# eager/lazy init signatures are expected to match; if this test fails, it35# means a parameter was added to one but not the other, and that should be36# fixed (or an explicit exemption should be made here, with an explanation)37assert signature(pl.DataFrame.__init__) == signature(pl.LazyFrame.__init__)383940def test_lazy_misc() -> None:41ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})42_ = ldf.with_columns(pl.lit(1).alias("foo")).select([pl.col("a"), pl.col("foo")])4344# test if it executes45_ = ldf.with_columns(46when(pl.col("a") > pl.lit(2)).then(pl.lit(10)).otherwise(pl.lit(1)).alias("new")47).collect()484950def test_implode() -> None:51ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})52eager = (53ldf.group_by(pl.col("a").alias("grp"), maintain_order=True)54.agg(pl.implode("a", "b").name.suffix("_imp"))55.collect()56)57assert_frame_equal(58eager,59pl.DataFrame(60{61"grp": [1, 2, 3],62"a_imp": [[1], [2], [3]],63"b_imp": [[1.0], [2.0], [3.0]],64}65),66)676869def test_lazyframe_membership_operator() -> None:70ldf = pl.LazyFrame({"name": ["Jane", "John"], "age": [20, 30]})71assert "name" 
in ldf72assert "phone" not in ldf7374# note: cannot use lazyframe in boolean context75with pytest.raises(TypeError, match="ambiguous"):76not ldf777879def test_apply() -> None:80ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})81new = ldf.with_columns_seq(82pl.col("a").map_batches(lambda s: s * 2, return_dtype=pl.Int64).alias("foo")83)84expected = ldf.clone().with_columns((pl.col("a") * 2).alias("foo"))85assert_frame_equal(new, expected)86assert_frame_equal(new.collect(), expected.collect())8788ldf = pl.LazyFrame({"a": [1, 2, 3] * 20, "b": [1.0, 2.0, 3.0] * 20})89strategy: MapElementsStrategy90for strategy in ("thread_local", "threading"):91with pytest.warns(92PolarsInefficientMapWarning,93match="with this one instead",94):95df_new = ldf.with_columns(96pl.col("a")97.map_elements(lambda s: s * 2, strategy=strategy, return_dtype=pl.Int64)98.alias("foo")99).collect()100101df_expected = ldf.clone().with_columns((pl.col("a") * 2).alias("foo")).collect()102assert_frame_equal(df_new, df_expected)103104105def test_add_eager_column() -> None:106lf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})107assert lf.collect_schema().len() == 2108109out = lf.with_columns(pl.lit(pl.Series("c", [1, 2, 3]))).collect()110assert out["c"].sum() == 6111assert out.collect_schema().len() == 3112113114def test_set_null() -> None:115ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})116out = ldf.with_columns(117when(pl.col("a") > 1).then(lit(None)).otherwise(100).alias("foo")118).collect()119s = out["foo"]120assert s[0] == 100121assert s[1] is None122assert s[2] is None123124125def test_gather_every() -> None:126ldf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})127expected_df = pl.DataFrame({"a": [1, 3], "b": ["w", "y"]})128assert_frame_equal(expected_df, ldf.gather_every(2).collect())129expected_df = pl.DataFrame({"a": [2, 4], "b": ["x", "z"]})130assert_frame_equal(expected_df, ldf.gather_every(2, offset=1).collect())131132133def test_agg() -> None:134df = 
pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})135ldf = df.lazy().min()136res = ldf.collect()137assert res.shape == (1, 2)138assert res.row(0) == (1, 1.0)139140141def test_count_suffix_10783() -> None:142df = pl.DataFrame(143{144"a": [["a", "c", "b"], ["a", "b", "c"], ["a", "d", "c"], ["c", "a", "b"]],145"b": [["a", "c", "b"], ["a", "b", "c"], ["a", "d", "c"], ["c", "a", "b"]],146}147)148df_with_cnt = df.with_columns(149pl.len()150.over(pl.col("a").list.sort().list.join("").hash())151.name.suffix("_suffix")152)153df_expect = df.with_columns(pl.Series("len_suffix", [3, 3, 1, 3]))154assert_frame_equal(df_with_cnt, df_expect, check_dtypes=False)155156157def test_or() -> None:158ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})159out = ldf.filter((pl.col("a") == 1) | (pl.col("b") > 2)).collect()160assert out.rows() == [(1, 1.0), (3, 3.0)]161162163def test_filter_str() -> None:164# use a str instead of a column expr165ldf = pl.LazyFrame(166{167"time": ["11:11:00", "11:12:00", "11:13:00", "11:14:00"],168"bools": [True, False, True, False],169}170)171172# last row based on a filter173result = ldf.filter(pl.col("bools")).select_seq(pl.last("*")).collect()174expected = pl.DataFrame({"time": ["11:13:00"], "bools": [True]})175assert_frame_equal(result, expected)176177# last row based on a filter178result = ldf.filter("bools").select(pl.last("*")).collect()179assert_frame_equal(result, expected)180181182def test_filter_multiple_predicates() -> None:183ldf = pl.LazyFrame(184{185"a": [1, 1, 1, 2, 2],186"b": [1, 1, 2, 2, 2],187"c": [1, 1, 2, 3, 4],188}189)190191# multiple predicates192expected = pl.DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "c": [1, 1, 2]})193for out in (194ldf.filter(pl.col("a") == 1, pl.col("b") <= 2), # positional/splat195ldf.filter([pl.col("a") == 1, pl.col("b") <= 2]), # as list196):197assert_frame_equal(out.collect(), expected)198199# multiple kwargs200assert_frame_equal(201ldf.filter(a=1, b=2).collect(),202pl.DataFrame({"a": [1], "b": [2], 
"c": [2]}),203)204205# both positional and keyword args206assert_frame_equal(207ldf.filter(pl.col("c") < 4, a=2, b=2).collect(),208pl.DataFrame({"a": [2], "b": [2], "c": [3]}),209)210211ldf = pl.LazyFrame(212{213"description": ["eq", "gt", "ge"],214"predicate": ["==", ">", ">="],215},216)217assert ldf.filter(predicate="==").select("description").collect().item() == "eq"218219220@pytest.mark.parametrize(221"predicate",222[223[pl.lit(True)],224iter([pl.lit(True)]),225[True, True, True],226iter([True, True, True]),227(p for p in (pl.col("c") < 9,)),228(p for p in (pl.col("a") > 0, pl.col("b") > 0)),229],230)231def test_filter_seq_iterable_all_true(predicate: Any) -> None:232ldf = pl.LazyFrame(233{234"a": [1, 1, 1],235"b": [1, 1, 2],236"c": [3, 1, 2],237}238)239assert_frame_equal(ldf, ldf.filter(predicate))240241242def test_apply_custom_function() -> None:243ldf = pl.LazyFrame(244{245"A": [1, 2, 3, 4, 5],246"fruits": ["banana", "banana", "apple", "apple", "banana"],247"B": [5, 4, 3, 2, 1],248"cars": ["beetle", "audi", "beetle", "beetle", "beetle"],249}250)251252# two ways to determine the length groups.253df = (254ldf.group_by("fruits")255.agg(256[257pl.col("cars")258.implode()259.map_elements(lambda groups: groups.len(), return_dtype=pl.Int64)260.alias("custom_1"),261pl.col("cars")262.implode()263.map_elements(lambda groups: groups.len(), return_dtype=pl.Int64)264.alias("custom_2"),265pl.count("cars").alias("cars_count"),266]267)268.sort("custom_1", descending=True)269).collect()270271expected = pl.DataFrame(272{273"fruits": ["banana", "apple"],274"custom_1": [3, 2],275"custom_2": [3, 2],276"cars_count": [3, 2],277}278)279expected = expected.with_columns(pl.col("cars_count").cast(pl.get_index_type()))280assert_frame_equal(df, expected)281282283def test_group_by() -> None:284ldf = pl.LazyFrame(285{286"a": [1.0, None, 3.0, 4.0],287"b": [5.0, 2.5, -3.0, 2.0],288"grp": ["a", "a", "b", "b"],289}290)291expected_a = pl.DataFrame({"grp": ["a", "b"], "a": [1.0, 
3.5]})292expected_a_b = pl.DataFrame({"grp": ["a", "b"], "a": [1.0, 3.5], "b": [3.75, -0.5]})293294for out in (295ldf.group_by("grp").agg(pl.mean("a")).collect(),296ldf.group_by(pl.col("grp")).agg(pl.mean("a")).collect(),297):298assert_frame_equal(out.sort(by="grp"), expected_a)299300out = ldf.group_by("grp").agg(pl.mean("a", "b")).collect()301assert_frame_equal(out.sort(by="grp"), expected_a_b)302303304def test_arg_unique() -> None:305ldf = pl.LazyFrame({"a": [4, 1, 4]})306col_a_unique = ldf.select(pl.col("a").arg_unique()).collect()["a"]307assert_series_equal(col_a_unique, pl.Series("a", [0, 1]).cast(pl.get_index_type()))308309310def test_arg_sort() -> None:311ldf = pl.LazyFrame({"a": [4, 1, 3]}).select(pl.col("a").arg_sort())312assert ldf.collect()["a"].to_list() == [1, 2, 0]313314315def test_window_function() -> None:316lf = pl.LazyFrame(317{318"A": [1, 2, 3, 4, 5],319"fruits": ["banana", "banana", "apple", "apple", "banana"],320"B": [5, 4, 3, 2, 1],321"cars": ["beetle", "audi", "beetle", "beetle", "beetle"],322}323)324assert lf.collect_schema().len() == 4325326q = lf.with_columns(327pl.sum("A").over("fruits").alias("fruit_sum_A"),328pl.first("B").over("fruits").alias("fruit_first_B"),329pl.max("B").over("cars").alias("cars_max_B"),330)331assert q.collect_schema().len() == 7332333assert q.collect()["cars_max_B"].to_list() == [5, 4, 5, 5, 5]334335out = lf.select([pl.first("B").over(["fruits", "cars"]).alias("B_first")])336assert out.collect()["B_first"].to_list() == [5, 4, 3, 3, 5]337338339def test_when_then_flatten() -> None:340ldf = pl.LazyFrame({"foo": [1, 2, 3], "bar": [3, 4, 5]})341342assert ldf.select(343when(pl.col("foo") > 1)344.then(pl.col("bar"))345.when(pl.col("bar") < 3)346.then(10)347.otherwise(30)348).collect()["bar"].to_list() == [30, 4, 5]349350351def test_describe_plan() -> None:352assert isinstance(pl.LazyFrame({"a": [1]}).explain(optimized=True), str)353assert isinstance(pl.LazyFrame({"a": [1]}).explain(optimized=False), 
str)354355356@pytest.mark.may_fail_cloud # reason: inspects logs357def test_inspect(capsys: CaptureFixture[str]) -> None:358ldf = pl.LazyFrame({"a": [1]})359ldf.inspect().collect()360captured = capsys.readouterr()361assert len(captured.out) > 0362363ldf.select(pl.col("a").cum_sum().inspect().alias("bar")).collect()364res = capsys.readouterr()365assert len(res.out) > 0366367368@pytest.mark.may_fail_auto_streaming369def test_fetch(fruits_cars: pl.DataFrame) -> None:370with pytest.warns(371DeprecationWarning,372match=r"use `LazyFrame\.collect` instead",373):374res = fruits_cars.lazy().select("*").fetch(2)375assert_frame_equal(res, res[:2])376377378def test_fold_filter() -> None:379lf = pl.LazyFrame({"a": [1, 2, 3], "b": [0, 1, 2]})380381out = lf.filter(382pl.fold(383acc=pl.lit(True),384function=lambda a, b: a & b,385exprs=[pl.col(c) > 1 for c in lf.collect_schema()],386)387).collect()388389assert out.shape == (1, 2)390assert out.rows() == [(3, 2)]391392out = lf.filter(393pl.fold(394acc=pl.lit(True),395function=lambda a, b: a | b,396exprs=[pl.col(c) > 1 for c in lf.collect_schema()],397)398).collect()399400assert out.rows() == [(1, 0), (2, 1), (3, 2)]401402403def test_head_group_by() -> None:404commodity_prices = {405"commodity": [406"Wheat",407"Wheat",408"Wheat",409"Wheat",410"Corn",411"Corn",412"Corn",413"Corn",414"Corn",415],416"location": [417"StPaul",418"StPaul",419"StPaul",420"Chicago",421"Chicago",422"Chicago",423"Chicago",424"Chicago",425"Chicago",426],427"seller": [428"Bob",429"Charlie",430"Susan",431"Paul",432"Ed",433"Mary",434"Paul",435"Charlie",436"Norman",437],438"price": [1.0, 0.7, 0.8, 0.55, 2.0, 3.0, 2.4, 1.8, 2.1],439}440ldf = pl.LazyFrame(commodity_prices)441442# this query flexes the wildcard exclusion quite a bit.443keys = ["commodity", "location"]444out = (445ldf.sort(by="price", descending=True)446.group_by(keys, maintain_order=True)447.agg([pl.col("*").exclude(keys).head(2).name.keep()])448.explode(cs.all().exclude(keys))449)450451assert 
out.collect().rows() == [452("Corn", "Chicago", "Mary", 3.0),453("Corn", "Chicago", "Paul", 2.4),454("Wheat", "StPaul", "Bob", 1.0),455("Wheat", "StPaul", "Susan", 0.8),456("Wheat", "Chicago", "Paul", 0.55),457]458459ldf = pl.LazyFrame(460{"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]}461)462out = ldf.group_by("letters").tail(2).sort("letters")463assert_frame_equal(464out.collect(),465pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]}),466)467out = ldf.group_by("letters").head(2).sort("letters")468assert_frame_equal(469out.collect(),470pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]}),471)472473474def test_is_null_is_not_null() -> None:475ldf = pl.LazyFrame({"nrs": [1, 2, None]}).select(476pl.col("nrs").is_null().alias("is_null"),477pl.col("nrs").is_not_null().alias("not_null"),478)479assert ldf.collect()["is_null"].to_list() == [False, False, True]480assert ldf.collect()["not_null"].to_list() == [True, True, False]481482483def test_is_nan_is_not_nan() -> None:484ldf = pl.LazyFrame({"nrs": np.array([1, 2, np.nan])}).select(485pl.col("nrs").is_nan().alias("is_nan"),486pl.col("nrs").is_not_nan().alias("not_nan"),487)488assert ldf.collect()["is_nan"].to_list() == [False, False, True]489assert ldf.collect()["not_nan"].to_list() == [True, True, False]490491492def test_is_finite_is_infinite() -> None:493ldf = pl.LazyFrame({"nrs": np.array([1, 2, np.inf])}).select(494pl.col("nrs").is_infinite().alias("is_inf"),495pl.col("nrs").is_finite().alias("not_inf"),496)497assert ldf.collect()["is_inf"].to_list() == [False, False, True]498assert ldf.collect()["not_inf"].to_list() == [True, True, False]499500501def test_len() -> None:502ldf = pl.LazyFrame({"nrs": [1, 2, 3]})503assert cast("int", ldf.select(pl.col("nrs").len()).collect().item()) == 3504505506@pytest.mark.parametrize("dtype", NUMERIC_DTYPES)507def test_cum_agg(dtype: PolarsDataType) -> None:508ldf = pl.LazyFrame({"a": [1, 2, 3, 2]}, schema={"a": 
dtype})509assert_series_equal(510ldf.select(pl.col("a").cum_min()).collect()["a"],511pl.Series("a", [1, 1, 1, 1], dtype=dtype),512)513assert_series_equal(514ldf.select(pl.col("a").cum_max()).collect()["a"],515pl.Series("a", [1, 2, 3, 3], dtype=dtype),516)517518expected_dtype = (519pl.Int64 if dtype in [pl.Int8, pl.Int16, pl.UInt8, pl.UInt16] else dtype520)521assert_series_equal(522ldf.select(pl.col("a").cum_sum()).collect()["a"],523pl.Series("a", [1, 3, 6, 8], dtype=expected_dtype),524)525526expected_dtype = (527pl.Int64528if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.UInt8, pl.UInt16, pl.UInt32]529else dtype530)531assert_series_equal(532ldf.select(pl.col("a").cum_prod()).collect()["a"],533pl.Series("a", [1, 2, 6, 12], dtype=expected_dtype),534)535536537def test_ceil() -> None:538ldf = pl.LazyFrame({"a": [1.8, 1.2, 3.0]})539result = ldf.select(pl.col("a").ceil()).collect()540assert_frame_equal(result, pl.DataFrame({"a": [2.0, 2.0, 3.0]}))541542ldf = pl.LazyFrame({"a": [1, 2, 3]})543result = ldf.select(pl.col("a").ceil()).collect()544assert_frame_equal(ldf.collect(), result)545546547def test_floor() -> None:548ldf = pl.LazyFrame({"a": [1.8, 1.2, 3.0]})549result = ldf.select(pl.col("a").floor()).collect()550assert_frame_equal(result, pl.DataFrame({"a": [1.0, 1.0, 3.0]}))551552ldf = pl.LazyFrame({"a": [1, 2, 3]})553result = ldf.select(pl.col("a").floor()).collect()554assert_frame_equal(ldf.collect(), result)555556557@pytest.mark.parametrize(558("n", "ndigits", "expected"),559[560(1.005, 2, 1.0),561(1234.00000254495, 10, 1234.000002545),562(1835.665, 2, 1835.67),563(-1835.665, 2, -1835.67),564(2.49, 0, 2.0),565(123.45678, 2, 123.46),566(1254, 2, 1254.0),567(1254, 0, 1254.0),568(123.55, 0, 124.0),569(123.55, 1, 123.6),570(-1.23456789, 6, -1.234568),571(1.0e-5, 5, 0.00001),572(1.0e-20, 20, 1e-20),573(1.0e20, 2, 100000000000000000000.0),574],575)576@pytest.mark.parametrize("dtype", FLOAT_DTYPES)577def test_round(n: float, ndigits: int, expected: float, dtype: pl.DataType) 
-> None:578ldf = pl.LazyFrame({"value": [n]}, schema_overrides={"value": dtype})579assert_series_equal(580ldf.select(pl.col("value").round(decimals=ndigits)).collect().to_series(),581pl.Series("value", [expected], dtype=dtype),582)583584585@pytest.mark.parametrize(586("n", "ndigits", "expected1", "expected2"),587[588(0.5, 0, 0.0, 1.0),589(1.5, 0, 2.0, 2.0),590(2.5, 0, 2.0, 3.0),591(-0.5, 0, -0.0, -1.0),592(-1.5, 0, -2.0, -2.0),593(2.25, 1, 2.2, 2.3),594(2.75, 1, 2.8, 2.8),595(-2.25, 1, -2.2, -2.3),596],597)598@pytest.mark.parametrize("dtype", FLOAT_DTYPES)599def test_round_mode(600n: float, ndigits: int, expected1: float, expected2: float, dtype: pl.DataType601) -> None:602ldf = pl.LazyFrame({"value": [n]}, schema_overrides={"value": dtype})603assert_series_equal(604ldf.select(pl.col("value").round(ndigits, mode="half_to_even"))605.collect()606.to_series(),607pl.Series("value", [expected1], dtype=dtype),608)609assert_series_equal(610ldf.select(pl.col("value").round(ndigits, mode="half_away_from_zero"))611.collect()612.to_series(),613pl.Series("value", [expected2], dtype=dtype),614)615616617def test_dot() -> None:618ldf = pl.LazyFrame({"a": [1.8, 1.2, 3.0], "b": [3.2, 1, 2]}).select(619pl.col("a").dot(pl.col("b"))620)621assert cast("float", ldf.collect().item()) == 12.96622623624def test_sort() -> None:625ldf = pl.LazyFrame({"a": [1, 2, 3, 2]}).select(pl.col("a").sort())626assert_series_equal(ldf.collect()["a"], pl.Series("a", [1, 2, 2, 3]))627628629def test_custom_group_by() -> None:630ldf = pl.LazyFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]})631out = (632ldf.group_by("b", maintain_order=True)633.agg(634[635pl.col("a")636.implode()637.map_elements(lambda x: x.sum(), return_dtype=pl.Int64)638]639)640.collect()641)642assert out.rows() == [("a", 1), ("b", 2), ("c", 2)]643644645def test_lazy_columns() -> None:646lf = pl.LazyFrame(647{648"a": [1],649"b": [1],650"c": [1],651}652)653assert lf.select("a", "c").collect_schema().names() == ["a", "c"]654655656def 
test_cast_frame() -> None:657lf = pl.LazyFrame(658{659"a": [1.0, 2.5, 3.0],660"b": [4, 5, None],661"c": [True, False, True],662"d": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)],663}664)665666# cast via col:dtype map667assert lf.cast(668dtypes={"b": pl.Float32, "c": pl.String, "d": pl.Datetime("ms")}669).collect_schema() == {670"a": pl.Float64,671"b": pl.Float32,672"c": pl.String,673"d": pl.Datetime("ms"),674}675676# cast via selector:dtype map677lfc = lf.cast(678{679cs.float(): pl.UInt8,680cs.integer(): pl.Int32,681cs.temporal(): pl.String,682}683)684assert lfc.collect_schema() == {685"a": pl.UInt8,686"b": pl.Int32,687"c": pl.Boolean,688"d": pl.String,689}690assert lfc.collect().rows() == [691(1, 4, True, "2020-01-02"),692(2, 5, False, "2021-03-04"),693(3, None, True, "2022-05-06"),694]695696# cast all fields to a single type697result = lf.cast(pl.String)698expected = pl.LazyFrame(699{700"a": ["1.0", "2.5", "3.0"],701"b": ["4", "5", None],702"c": ["true", "false", "true"],703"d": ["2020-01-02", "2021-03-04", "2022-05-06"],704}705)706assert_frame_equal(result, expected)707708# test 'strict' mode709lf = pl.LazyFrame({"a": [1000, 2000, 3000]})710711with pytest.raises(InvalidOperationError, match=r"conversion .* failed"):712lf.cast(pl.UInt8).collect()713714assert lf.cast(pl.UInt8, strict=False).collect().rows() == [715(None,),716(None,),717(None,),718]719720721def test_interpolate() -> None:722df = pl.DataFrame({"a": [1, None, 3]})723assert df.select(pl.col("a").interpolate())["a"].to_list() == [1, 2, 3]724assert df["a"].interpolate().to_list() == [1, 2, 3]725assert df.interpolate()["a"].to_list() == [1, 2, 3]726assert df.lazy().interpolate().collect()["a"].to_list() == [1, 2, 3]727728729def test_fill_nan() -> None:730df = pl.DataFrame({"a": [1.0, np.nan, 3.0]})731assert_series_equal(df.fill_nan(2.0)["a"], pl.Series("a", [1.0, 2.0, 3.0]))732assert_series_equal(733df.lazy().fill_nan(2.0).collect()["a"], pl.Series("a", [1.0, 2.0, 
3.0])734)735assert_series_equal(736df.lazy().fill_nan(None).collect()["a"], pl.Series("a", [1.0, None, 3.0])737)738assert_series_equal(739df.select(pl.col("a").fill_nan(2))["a"], pl.Series("a", [1.0, 2.0, 3.0])740)741# nearest742assert pl.Series([None, 1, None, None, None, -8, None, None, 10]).interpolate(743method="nearest"744).to_list() == [None, 1, 1, -8, -8, -8, -8, 10, 10]745746747def test_fill_null() -> None:748df = pl.DataFrame({"a": [1.0, None, 3.0]})749750assert df.select([pl.col("a").fill_null(strategy="min")])["a"][1] == 1.0751assert df.lazy().fill_null(2).collect()["a"].to_list() == [1.0, 2.0, 3.0]752753with pytest.raises(ValueError, match="must specify either"):754df.fill_null()755with pytest.raises(ValueError, match="cannot specify both"):756df.fill_null(value=3.0, strategy="max")757with pytest.raises(ValueError, match="can only specify `limit`"):758df.fill_null(strategy="max", limit=2)759760761def test_backward_fill() -> None:762ldf = pl.LazyFrame({"a": [1.0, None, 3.0]})763col_a_backward_fill = ldf.select(764[pl.col("a").fill_null(strategy="backward")]765).collect()["a"]766assert_series_equal(col_a_backward_fill, pl.Series("a", [1, 3, 3]).cast(pl.Float64))767768769def test_rolling(fruits_cars: pl.DataFrame) -> None:770ldf = fruits_cars.lazy()771out = ldf.select(772pl.col("A").rolling_min(3, min_samples=1).alias("1"),773pl.col("A").rolling_min(3).alias("1b"),774pl.col("A").rolling_mean(3, min_samples=1).alias("2"),775pl.col("A").rolling_mean(3).alias("2b"),776pl.col("A").rolling_max(3, min_samples=1).alias("3"),777pl.col("A").rolling_max(3).alias("3b"),778pl.col("A").rolling_sum(3, min_samples=1).alias("4"),779pl.col("A").rolling_sum(3).alias("4b"),780# below we use .round purely for the ability to do assert frame equality781pl.col("A").rolling_std(3).round(1).alias("std"),782pl.col("A").rolling_var(3).round(1).alias("var"),783)784785assert_frame_equal(786out.collect(),787pl.DataFrame(788{789"1": [1, 1, 1, 2, 3],790"1b": [None, None, 1, 2, 3],791"2": 
[1.0, 1.5, 2.0, 3.0, 4.0],792"2b": [None, None, 2.0, 3.0, 4.0],793"3": [1, 2, 3, 4, 5],794"3b": [None, None, 3, 4, 5],795"4": [1, 3, 6, 9, 12],796"4b": [None, None, 6, 9, 12],797"std": [None, None, 1.0, 1.0, 1.0],798"var": [None, None, 1.0, 1.0, 1.0],799}800),801)802803out_single_val_variance = ldf.select(804pl.col("A").rolling_std(3, min_samples=1).round(decimals=4).alias("std"),805pl.col("A").rolling_var(3, min_samples=1).round(decimals=1).alias("var"),806).collect()807808assert cast("float", out_single_val_variance[0, "std"]) is None809assert cast("float", out_single_val_variance[0, "var"]) is None810811812def test_arr_namespace(fruits_cars: pl.DataFrame) -> None:813ldf = fruits_cars.lazy()814out = ldf.select(815"fruits",816pl.col("B")817.over("fruits", mapping_strategy="join")818.list.min()819.alias("B_by_fruits_min1"),820pl.col("B")821.min()822.over("fruits", mapping_strategy="join")823.alias("B_by_fruits_min2"),824pl.col("B")825.over("fruits", mapping_strategy="join")826.list.max()827.alias("B_by_fruits_max1"),828pl.col("B")829.max()830.over("fruits", mapping_strategy="join")831.alias("B_by_fruits_max2"),832pl.col("B")833.over("fruits", mapping_strategy="join")834.list.sum()835.alias("B_by_fruits_sum1"),836pl.col("B")837.sum()838.over("fruits", mapping_strategy="join")839.alias("B_by_fruits_sum2"),840pl.col("B")841.over("fruits", mapping_strategy="join")842.list.mean()843.alias("B_by_fruits_mean1"),844pl.col("B")845.mean()846.over("fruits", mapping_strategy="join")847.alias("B_by_fruits_mean2"),848)849expected = pl.DataFrame(850{851"fruits": ["banana", "banana", "apple", "apple", "banana"],852"B_by_fruits_min1": [1, 1, 2, 2, 1],853"B_by_fruits_min2": [1, 1, 2, 2, 1],854"B_by_fruits_max1": [5, 5, 3, 3, 5],855"B_by_fruits_max2": [5, 5, 3, 3, 5],856"B_by_fruits_sum1": [10, 10, 5, 5, 10],857"B_by_fruits_sum2": [10, 10, 5, 5, 10],858"B_by_fruits_mean1": [8593.3333333333333335,8603.3333333333333335,8612.5,8622.5,8633.3333333333333335,864],865"B_by_fruits_mean2": 
[8663.3333333333333335,8673.3333333333333335,8682.5,8692.5,8703.3333333333333335,871],872}873)874assert_frame_equal(out.collect(), expected)875876877def test_arithmetic() -> None:878ldf = pl.LazyFrame({"a": [1, 2, 3]})879880out = ldf.select(881(pl.col("a") % 2).alias("1"),882(2 % pl.col("a")).alias("2"),883(1 // pl.col("a")).alias("3"),884(1 * pl.col("a")).alias("4"),885(1 + pl.col("a")).alias("5"),886(1 - pl.col("a")).alias("6"),887(pl.col("a") // 2).alias("7"),888(pl.col("a") * 2).alias("8"),889(pl.col("a") + 2).alias("9"),890(pl.col("a") - 2).alias("10"),891(-pl.col("a")).alias("11"),892)893expected = pl.DataFrame(894{895"1": [1, 0, 1],896"2": [0, 0, 2],897"3": [1, 0, 0],898"4": [1, 2, 3],899"5": [2, 3, 4],900"6": [0, -1, -2],901"7": [0, 1, 1],902"8": [2, 4, 6],903"9": [3, 4, 5],904"10": [-1, 0, 1],905"11": [-1, -2, -3],906}907)908assert_frame_equal(out.collect(), expected)909910911def test_float_floor_divide() -> None:912x = 10.4913step = 0.5914ldf = pl.LazyFrame({"x": [x]})915ldf_res = ldf.with_columns(pl.col("x") // step).collect().item()916assert ldf_res == x // step917918919def test_argminmax() -> None:920ldf = pl.LazyFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 2, 2]})921out = ldf.select(922pl.col("a").arg_min().alias("min"),923pl.col("a").arg_max().alias("max"),924).collect()925assert out["max"][0] == 4926assert out["min"][0] == 0927928out = (929ldf.group_by("b", maintain_order=True)930.agg([pl.col("a").arg_min().alias("min"), pl.col("a").arg_max().alias("max")])931.collect()932)933assert out["max"][0] == 1934assert out["min"][0] == 0935936937def test_limit(fruits_cars: pl.DataFrame) -> None:938assert_frame_equal(fruits_cars.lazy().limit(1).collect(), fruits_cars[0, :])939940941def test_head(fruits_cars: pl.DataFrame) -> None:942assert_frame_equal(fruits_cars.lazy().head(2).collect(), fruits_cars[:2, :])943944945def test_tail(fruits_cars: pl.DataFrame) -> None:946assert_frame_equal(fruits_cars.lazy().tail(2).collect(), fruits_cars[3:, :])947948949def 
test_last(fruits_cars: pl.DataFrame) -> None:950result = fruits_cars.lazy().last().collect()951expected = fruits_cars[(len(fruits_cars) - 1) :, :]952assert_frame_equal(result, expected)953954955def test_first(fruits_cars: pl.DataFrame) -> None:956assert_frame_equal(fruits_cars.lazy().first().collect(), fruits_cars[0, :])957958959def test_join_suffix() -> None:960df_left = pl.DataFrame(961{962"a": ["a", "b", "a", "z"],963"b": [1, 2, 3, 4],964"c": [6, 5, 4, 3],965}966)967df_right = pl.DataFrame(968{969"a": ["b", "c", "b", "a"],970"b": [0, 3, 9, 6],971"c": [1, 0, 2, 1],972}973)974out = df_left.join(df_right, on="a", suffix="_bar")975assert out.columns == ["a", "b", "c", "b_bar", "c_bar"]976out = df_left.lazy().join(df_right.lazy(), on="a", suffix="_bar").collect()977assert out.columns == ["a", "b", "c", "b_bar", "c_bar"]978979980@pytest.mark.may_fail_cloud # reason: no981def test_collect_unexpected_kwargs(df: pl.DataFrame) -> None:982with pytest.raises(TypeError, match="unexpected keyword argument"):983df.lazy().collect(common_subexpr_elim=False) # type: ignore[call-overload]984985986def test_spearman_corr() -> None:987ldf = pl.LazyFrame(988{989"era": [1, 1, 1, 2, 2, 2],990"prediction": [2, 4, 5, 190, 1, 4],991"target": [1, 3, 2, 1, 43, 3],992}993)994995out = (996ldf.group_by("era", maintain_order=True).agg(997pl.corr(pl.col("prediction"), pl.col("target"), method="spearman").alias(998"c"999),1000)1001).collect()["c"]1002assert np.isclose(out[0], 0.5)1003assert np.isclose(out[1], -1.0)10041005# we can also pass in column names directly1006out = (1007ldf.group_by("era", maintain_order=True).agg(1008pl.corr("prediction", "target", method="spearman").alias("c"),1009)1010).collect()["c"]1011assert np.isclose(out[0], 0.5)1012assert np.isclose(out[1], -1.0)101310141015def test_spearman_corr_ties() -> None:1016"""In Spearman correlation, ranks are computed using the average method ."""1017df = pl.DataFrame({"a": [1, 1, 1, 2, 3, 7, 4], "b": [4, 3, 2, 2, 4, 3, 
1]})10181019result = df.select(1020pl.corr("a", "b", method="spearman").alias("a1"),1021pl.corr(pl.col("a").rank("min"), pl.col("b").rank("min")).alias("a2"),1022pl.corr(pl.col("a").rank(), pl.col("b").rank()).alias("a3"),1023)1024expected = pl.DataFrame(1025[1026pl.Series("a1", [-0.19048482943986483], dtype=pl.Float64),1027pl.Series("a2", [-0.17223653586587362], dtype=pl.Float64),1028pl.Series("a3", [-0.19048482943986483], dtype=pl.Float64),1029]1030)1031assert_frame_equal(result, expected)103210331034def test_pearson_corr() -> None:1035ldf = pl.LazyFrame(1036{1037"era": [1, 1, 1, 2, 2, 2],1038"prediction": [2, 4, 5, 190, 1, 4],1039"target": [1, 3, 2, 1, 43, 3],1040}1041)10421043out = (1044ldf.group_by("era", maintain_order=True).agg(1045pl.corr(1046pl.col("prediction"),1047pl.col("target"),1048method="pearson",1049).alias("c"),1050)1051).collect()["c"]1052assert out.to_list() == pytest.approx([0.6546536707079772, -5.477514993831792e-1])10531054# we can also pass in column names directly1055out = (1056ldf.group_by("era", maintain_order=True).agg(1057pl.corr("prediction", "target", method="pearson").alias("c"),1058)1059).collect()["c"]1060assert out.to_list() == pytest.approx([0.6546536707079772, -5.477514993831792e-1])106110621063def test_null_count() -> None:1064lf = pl.LazyFrame({"a": [1, 2, None, 2], "b": [None, 3, None, 3]})1065assert lf.null_count().collect().rows() == [(1, 2)]106610671068def test_lazy_concat(df: pl.DataFrame) -> None:1069shape = df.shape1070shape = (shape[0] * 2, shape[1])10711072out = pl.concat([df.lazy(), df.lazy()]).collect()1073assert out.shape == shape1074assert_frame_equal(out, df.vstack(df))107510761077def test_self_join() -> None:1078# 27201079ldf = pl.from_dict(1080data={1081"employee_id": [100, 101, 102],1082"employee_name": ["James", "Alice", "Bob"],1083"manager_id": [None, 100, 101],1084}1085).lazy()10861087out = (1088ldf.join(other=ldf, left_on="manager_id", right_on="employee_id", 
how="left")1089.select(1090pl.col("employee_id"),1091pl.col("employee_name"),1092pl.col("employee_name_right").alias("manager_name"),1093)1094.collect()1095)1096assert set(out.rows()) == {1097(100, "James", None),1098(101, "Alice", "James"),1099(102, "Bob", "Alice"),1100}110111021103def test_group_lengths() -> None:1104ldf = pl.LazyFrame(1105{1106"group": ["A", "A", "A", "B", "B", "B", "B"],1107"id": ["1", "1", "2", "3", "4", "3", "5"],1108}1109)11101111result = ldf.group_by(["group"], maintain_order=True).agg(1112[1113(pl.col("id").unique_counts() / pl.col("id").len())1114.sum()1115.alias("unique_counts_sum"),1116pl.col("id").unique().len().alias("unique_len"),1117]1118)1119expected = pl.DataFrame(1120{1121"group": ["A", "B"],1122"unique_counts_sum": [1.0, 1.0],1123"unique_len": [2, 3],1124},1125schema_overrides={"unique_len": pl.get_index_type()},1126)1127assert_frame_equal(result.collect(), expected)112811291130def test_quantile_filtered_agg() -> None:1131assert (1132pl.LazyFrame(1133{1134"group": [0, 0, 0, 0, 1, 1, 1, 1],1135"value": [1, 2, 3, 4, 1, 2, 3, 4],1136}1137)1138.group_by("group")1139.agg(pl.col("value").filter(pl.col("value") < 2).quantile(0.5))1140.collect()["value"]1141.to_list()1142) == [1.0, 1.0]114311441145def test_predicate_count_vstack() -> None:1146l1 = pl.LazyFrame(1147{1148"k": ["x", "y"],1149"v": [3, 2],1150}1151)1152l2 = pl.LazyFrame(1153{1154"k": ["x", "y"],1155"v": [5, 7],1156}1157)1158assert pl.concat([l1, l2]).filter(pl.len().over("k") == 2).collect()[1159"v"1160].to_list() == [3, 2, 5, 7]116111621163def test_lazy_method() -> None:1164# We want to support `.lazy()` on a Lazy DataFrame to allow more generic user code.1165df = pl.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]})1166assert_frame_equal(df.lazy(), df.lazy().lazy())116711681169def test_update_schema_after_projection_pd_t4157() -> None:1170ldf = pl.LazyFrame({"c0": [], "c1": [], "c2": []}).rename({"c2": "c2_"})1171assert 
ldf.drop("c2_").select(pl.col("c0")).collect().columns == ["c0"]


def test_type_coercion_unknown_4190() -> None:
    # and-ing column "a" with its own null-filled copy must leave the data as-is
    base = pl.LazyFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
    anded = pl.col("a") & pl.col("a").fill_null(True)
    out = base.with_columns(anded).collect()

    assert out.shape == (3, 2)
    assert out.rows() == [(1, 1), (2, 2), (3, 3)]


def test_lazy_cache_same_key() -> None:
    source = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": ["x", "y", "z"]})
    summed = (pl.col("a") + pl.col("b")).alias("a")
    multiplied = (pl.col("a") * pl.col("b")).alias("a")

    # the two cached nodes share a schema but hold different values, so the
    # cache must not hand out one in place of the other
    add_node = source.select([summed, pl.col("c")]).cache()
    mult_node = source.select(multiplied, pl.col("c")).cache()

    difference = (pl.col("a") - pl.col("a_mult")).alias("a")
    joined = mult_node.join(add_node, on="c", suffix="_mult")
    result = joined.select(difference, pl.col("c"))

    expected = pl.LazyFrame({"a": [-1, 2, 7], "c": ["x", "y", "z"]})
    assert_frame_equal(result, expected, check_row_order=False)


@pytest.mark.may_fail_cloud  # reason: inspects logs
@pytest.mark.may_fail_auto_streaming
def test_lazy_cache_hit(plmonkeypatch: PlMonkeyPatch, capfd: Any) -> None:
    # verbose mode is enabled; the final assertion looks for "CACHE HIT" on stderr
    plmonkeypatch.setenv("POLARS_VERBOSE", "1")

    source = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": ["x", "y", "z"]})
    summed = (pl.col("a") + pl.col("b")).alias("a")
    add_node = source.select([summed, pl.col("c")]).cache()

    # join the cached node with itself, so both join inputs are the same cache
    self_joined = add_node.join(add_node, on="c", suffix="_mult")
    result = self_joined.select(
        (pl.col("a") - pl.col("a_mult")).alias("a"), pl.col("c")
    )
    expected = pl.LazyFrame({"a": [0, 0, 0], "c": ["x", "y", "z"]})
    assert_frame_equal(result, expected, check_row_order=False)

    captured = capfd.readouterr()
    assert "CACHE HIT" in captured.err


@pytest.mark.may_fail_cloud  # reason: impure udf
def test_lazy_cache_parallel() -> None:
    n_evaluations = 0

    def counting_identity(frame: pl.DataFrame) -> pl.DataFrame:
        # returns its input untouched, recording that it was called
        nonlocal n_evaluations
        n_evaluations += 1
        return frame

    cached = pl.LazyFrame({"a": [1]}).map_batches(counting_identity).cache()
    branches = [cached.select(pl.col("a") + offset) for offset in (1, 2, 3)]
    combined = pl.concat(branches, parallel=True)

    # nothing may run while the query is only being built
    assert n_evaluations == 0

    # three branches read the cached node, yet the udf runs a single time
    combined.collect()
    assert n_evaluations == 1


@pytest.mark.may_fail_cloud  # reason: impure udf
def test_lazy_cache_nested_parallel() -> None:
    n_inner = 0
    n_outer = 0

    def count_inner(frame: pl.DataFrame) -> pl.DataFrame:
        # identity udf feeding the inner cache; counts its invocations
        nonlocal n_inner
        n_inner += 1
        return frame

    def count_outer(frame: pl.DataFrame) -> pl.DataFrame:
        # identity udf feeding the outer cache; counts its invocations
        nonlocal n_outer
        n_outer += 1
        return frame

    inner = pl.LazyFrame({"a": [1]}).map_batches(count_inner).cache()
    outer = inner.select(pl.col("a") + 1).map_batches(count_outer).cache()

    branches = [outer.select(pl.col("a") + offset) for offset in (2, 3)]
    combined = pl.concat(branches, parallel=True)

    # nothing may run while the query is only being built
    assert n_inner == 0
    assert n_outer == 0

    # each cached level is evaluated a single time on collect
    combined.collect()
    assert n_inner == 1
    assert n_outer == 1


def test_quadratic_behavior_4736() -> None:
    # no assert; if this function does not stall our tests it has passed!
    lf = pl.LazyFrame(schema=list(ascii_letters))
    column_exprs = (pl.col(name) for name in lf.collect_schema())
    lf.select(reduce(add, column_exprs))


@pytest.mark.parametrize("input_dtype", [pl.Int64, pl.Float64])
def test_from_epoch(input_dtype: PolarsDataType) -> None:
    epoch_days = 13285
    epoch_seconds = 1147880044

    # one single-row column per supported time unit, all cast to `input_dtype`
    ldf = pl.LazyFrame(
        [
            pl.Series("timestamp_d", [epoch_days]).cast(input_dtype),
            pl.Series("timestamp_s", [epoch_seconds]).cast(input_dtype),
            pl.Series("timestamp_ms", [epoch_seconds * 1_000]).cast(input_dtype),
            pl.Series("timestamp_us", [epoch_seconds * 1_000_000]).cast(input_dtype),
            pl.Series("timestamp_ns", [epoch_seconds * 1_000_000_000]).cast(
                input_dtype
            ),
        ]
    )

    exp_dt = datetime(2006, 5, 17, 15, 34, 4)
    expected = pl.DataFrame(
        [
            # 'd' → Date, 'ns' → Datetime('ns'), otherwise → Datetime('us')
            pl.Series("timestamp_d", [date(2006, 5, 17)]),
            pl.Series("timestamp_s", [exp_dt]),
            pl.Series("timestamp_ms", [exp_dt]),
            pl.Series("timestamp_us", [exp_dt]),
            pl.Series("timestamp_ns", [exp_dt]).cast(pl.Datetime("ns")),
        ]
    )

    conversions = [
        pl.from_epoch(pl.col("timestamp_d"), time_unit="d"),
        pl.from_epoch(pl.col("timestamp_s"), time_unit="s"),
        pl.from_epoch(pl.col("timestamp_ms"), time_unit="ms"),
        pl.from_epoch(pl.col("timestamp_us"), time_unit="us"),
        pl.from_epoch(pl.col("timestamp_ns"), time_unit="ns"),
    ]
    ldf_result = ldf.select(*conversions).collect()

    assert_frame_equal(ldf_result, expected)

    # a time unit outside the accepted set is rejected with a ValueError
    seconds_col = pl.col("timestamp_s")
    with pytest.raises(ValueError):
        _ = ldf.select(pl.from_epoch(seconds_col, time_unit="s2"))  # type: ignore[call-overload]


def test_from_epoch_str() -> None:
    epoch_seconds = 1147880044
    ldf = pl.LazyFrame(
        [
            pl.Series("timestamp_ms", [epoch_seconds * 1_000]).cast(pl.String),
            pl.Series("timestamp_us", [epoch_seconds * 1_000_000]).cast(pl.String),
        ]
    )

    # from_epoch on String columns is expected to fail once the query is collected
    query = ldf.select(
        pl.from_epoch(pl.col("timestamp_ms"), time_unit="ms"),
        pl.from_epoch(pl.col("timestamp_us"), time_unit="us"),
    )
    with pytest.raises(InvalidOperationError):
        query.collect()


def test_cum_agg_types() -> None:
    ldf = pl.LazyFrame({"a": [1, 2], "b": [True, False], "c": [1.3, 2.4]})

    # cum_sum: check the planned dtypes, then that collecting agrees with the plan
    cum_sum_lf = ldf.select(
        pl.col("a").cum_sum(),
        pl.col("b").cum_sum(),
        pl.col("c").cum_sum(),
    )
    cum_sum_schema = cum_sum_lf.collect_schema()
    assert cum_sum_schema["a"] == pl.Int64
    assert cum_sum_schema["b"] == pl.UInt32
    assert cum_sum_schema["c"] == pl.Float64
    assert cum_sum_lf.collect().schema == cum_sum_schema

    # cum_prod: same checks, with column "a" cast to UInt64 first
    cum_prod_lf = ldf.select(
        pl.col("a").cast(pl.UInt64).cum_prod(),
        pl.col("b").cum_prod(),
        pl.col("c").cum_prod(),
    )
    cum_prod_schema = cum_prod_lf.collect_schema()
    assert cum_prod_schema["a"] == pl.UInt64
    assert cum_prod_schema["b"] == pl.Int64
    assert cum_prod_schema["c"] == pl.Float64
    assert cum_prod_lf.collect().schema == cum_prod_schema


def test_compare_schema_between_lazy_and_eager_6904() -> None:
    # (dtype of the empty input column, expression applied, dtype selected after)
    cases = [
        (pl.Float32, pl.col("x").sqrt(), pl.Float32),
        (pl.Float32, pl.col("x").pow(2), pl.Float32),
        (pl.Int32, pl.col("x").pow(2), pl.Float64),
        (pl.Int8, pl.col("x").diff(), pl.Int16),
    ]

    for in_dtype, expr, out_dtype in cases:
        empty_df = pl.DataFrame({"x": pl.Series(values=[], dtype=in_dtype)})
        by_dtype = pl.col(out_dtype)

        eager_result = empty_df.select(expr).select(by_dtype)
        lazy_result = empty_df.lazy().select(expr).select(by_dtype).collect()

        # selecting by dtype must keep the same columns in both engines
        assert eager_result.shape == lazy_result.shape


@pytest.mark.slow
@pytest.mark.parametrize(
    "dtype",
    [
        pl.UInt8,
        pl.UInt16,
        pl.UInt32,
        pl.UInt64,
        pl.Int8,
        pl.Int16,
        pl.Int32,
        pl.Int64,
        pl.Float32,
        pl.Float64,
    ],
)
@pytest.mark.parametrize(
    "func",
    [
        pl.col("x").arg_max(),
        pl.col("x").arg_min(),
        pl.col("x").max(),
        pl.col("x").mean(),
        pl.col("x").median(),
        pl.col("x").min(),
        pl.col("x").nan_max(),
        pl.col("x").nan_min(),
        pl.col("x").product(),
        pl.col("x").quantile(0.5),
        pl.col("x").std(),
        pl.col("x").sum(),
        pl.col("x").var(),
    ],
)
def test_compare_aggregation_between_lazy_and_eager_6904(
    dtype: PolarsDataType, func: pl.Expr
) -> None:
    values = pl.Series(values=[1, 2, 3] * 2, dtype=dtype)
    groups = pl.Series(values=["a"] * 3 + ["b"] * 3)
    df = pl.DataFrame({"x": values, "y": groups})

    windowed = func.over("y")
    result_eager = df.select(windowed).select("x")

    # the lazy output is selected by whatever dtype the eager engine produced
    dtype_eager = result_eager["x"].dtype
    result_lazy = df.lazy().select(windowed).select(pl.col(dtype_eager)).collect()

    assert_frame_equal(result_eager, result_lazy)


@pytest.mark.parametrize(
    "comparators",
    [
        ("==", pl.LazyFrame.__eq__),
        ("!=", pl.LazyFrame.__ne__),
        (">", pl.LazyFrame.__gt__),
        ("<", pl.LazyFrame.__lt__),
        (">=", pl.LazyFrame.__ge__),
        ("<=", pl.LazyFrame.__le__),
    ],
)
def test_lazy_comparison_operators(
    comparators: tuple[str, Callable[[pl.LazyFrame, Any], NoReturn]],
) -> None:
    symbol, compare = comparators
    expected_msg = f'"{symbol!r}" comparison not supported for LazyFrame objects'

    # we cannot compare lazy frames, so all should raise a TypeError
    with pytest.raises(TypeError, match=expected_msg):
        compare(pl.LazyFrame(), pl.LazyFrame())


def test_lf_properties() -> None:
    lf = pl.LazyFrame(
        {
            "foo": [1, 2, 3],
            "bar": [6.0, 7.0, 8.0],
            "ham": ["a", "b", "c"],
        }
    )
    expected_schema = {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String}

    # each of these LazyFrame properties is expected to emit a PerformanceWarning
    with pytest.warns(PerformanceWarning):
        assert lf.schema == expected_schema
    with pytest.warns(PerformanceWarning):
        assert lf.columns == list(expected_schema)
    with pytest.warns(PerformanceWarning):
        assert lf.dtypes == list(expected_schema.values())
    with pytest.warns(PerformanceWarning):
        assert lf.width == len(expected_schema)


def test_lf_unnest() -> None:
    int_list = pl.List(pl.Int64)

    # two single-row struct columns, each holding two list fields
    struct_a = pl.Series(
        "a",
        [{"ab": [1, 2, 3], "ac": [3, 4, 5]}],
        dtype=pl.Struct({"ab": int_list, "ac": int_list}),
    )
    struct_b = pl.Series(
        "b",
        [{"ba": [5, 6, 7], "bb": [7, 8, 9]}],
        dtype=pl.Struct({"ba": int_list, "bb": int_list}),
    )
    lf = pl.DataFrame([struct_a, struct_b]).lazy()

    # unnesting both structs yields one column per struct field
    expected = pl.DataFrame(
        [
            pl.Series("ab", [[1, 2, 3]], dtype=int_list),
            pl.Series("ac", [[3, 4, 5]], dtype=int_list),
            pl.Series("ba", [[5, 6, 7]], dtype=int_list),
            pl.Series("bb", [[7, 8, 9]], dtype=int_list),
        ]
    )
    assert_frame_equal(lf.unnest("a", "b").collect(), expected)


def test_type_coercion_cast_boolean_after_comparison() -> None:
    import operator

    lf = pl.LazyFrame({"a": 1, "b": 2})

    comparison_ops = [
        operator.eq,
        operator.ne,
        operator.lt,
        operator.le,
        operator.gt,
        operator.ge,
        pl.Expr.eq_missing,
        pl.Expr.ne_missing,
    ]
    for op in comparison_ops:
        # one Boolean cast after a comparison must not show up in the plan
        cast_once = op(pl.col("a"), pl.col("b")).cast(pl.Boolean).alias("o")
        assert "cast" not in lf.with_columns(cast_once).explain()

        # nor may two stacked Boolean casts
        cast_twice = (
            op(pl.col("a"), pl.col("b")).cast(pl.Boolean).cast(pl.Boolean).alias("o")
        )
        assert "cast" not in lf.with_columns(cast_twice).explain()

    bitwise_ops = [operator.and_, operator.or_, operator.xor]
    for op in bitwise_ops:
        # for and/or/xor on these columns the cast must remain in the plan
        casted = op(pl.col("a"), pl.col("b")).cast(pl.Boolean)
        assert "cast" in lf.with_columns(casted).explain()


def test_unique_length_multiple_columns() -> None:
    lf = pl.LazyFrame(
        {
            "a": [1, 1, 1, 2, 3],
            "b": [100, 100, 200, 100, 300],
        }
    )
    # only the first two rows are identical, leaving four distinct rows
    n_unique_rows = lf.unique().select(pl.len()).collect().item()
    assert n_unique_rows == 4


def test_asof_cross_join() -> None:
    sorted_key = pl.col("a").set_sorted()

    left = pl.LazyFrame(
        {"a": [-10, 5, 10], "left_val": ["a", "b", "c"]}
    ).with_columns(sorted_key)
    right = pl.LazyFrame(
        {"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}
    ).with_columns(sorted_key)

    # only the shape is checked: three left rows, key plus one value column per side
    out = left.join_asof(right, on="a").collect()
    assert out.shape == (3, 3)


def test_join_bad_input_type() -> None:
    left = pl.LazyFrame({"a": [1, 2, 3]})
    right = pl.LazyFrame({"a": [1, 2, 3]})

    # an eager DataFrame is rejected as `other`
    dataframe_msg = r"expected `other` .*to be a 'LazyFrame'.* not 'DataFrame'"
    with pytest.raises(TypeError, match=dataframe_msg):
        left.join(right.collect(), on="a")  # type: ignore[arg-type]

    # so is a Series
    series_msg = r"expected `other` .*to be a 'LazyFrame'.* not 'Series'"
    with pytest.raises(TypeError, match=series_msg):
        left.join(pl.Series([1, 2, 3]), on="a")  # type: ignore[arg-type]

    class DummyLazyFrameSubclass(pl.LazyFrame):
        pass

    # subclasses of LazyFrame must be accepted without raising
    sub_left = DummyLazyFrameSubclass(left.collect())
    sub_right = DummyLazyFrameSubclass(right.collect())

    sub_left.join(sub_right, on="a").collect()


def test_join_where() -> None:
    east = pl.LazyFrame(
        {
            "id": [100, 101, 102],
            "dur": [120, 140, 160],
            "rev": [12, 14, 16],
            "cores": [2, 8, 4],
        }
    )
    west = pl.LazyFrame(
        {
            "t_id": [404, 498, 676, 742],
            "time": [90, 130, 150, 170],
            "cost": [9, 13, 15, 16],
            "cores": [4, 2, 1, 4],
        }
    )

    # both inequality predicates are passed to a single join_where call
    predicates = (
        pl.col("dur") < pl.col("time"),
        pl.col("rev") < pl.col("cost"),
    )
    out = east.join_where(west, *predicates).collect()

    expected = pl.DataFrame(
        {
            "id": [100, 100, 100, 101, 101],
            "dur": [120, 120, 120, 140, 140],
            "rev": [12, 12, 12, 14, 14],
            "cores": [2, 2, 2, 8, 8],
            "t_id": [498, 676, 742, 676, 742],
            "time": [130, 150, 170, 150, 170],
            "cost": [13, 15, 16, 15, 16],
            "cores_right": [2, 1, 4, 1, 4],
        }
    )

    assert_frame_equal(out, expected)


def test_join_where_bad_input_type() -> None:
    east = pl.LazyFrame(
        {
            "id": [100, 101, 102],
            "dur": [120, 140, 160],
            "rev": [12, 14, 16],
            "cores": [2, 8, 4],
        }
    )
    west = pl.LazyFrame(
        {
            "t_id": [404, 498, 676, 742],
            "time": [90, 130, 150, 170],
            "cost": [9, 13, 15, 16],
            "cores": [4, 2, 1, 4],
        }
    )
    predicates = (
        pl.col("dur") < pl.col("time"),
        pl.col("rev") < pl.col("cost"),
    )

    # an eager DataFrame is rejected as `other`
    dataframe_msg = r"expected `other` .*to be a 'LazyFrame'.* not 'DataFrame'"
    with pytest.raises(TypeError, match=dataframe_msg):
        east.join_where(west.collect(), *predicates)  # type: ignore[arg-type]

    # so is a Series
    series_msg = r"expected `other` .*to be a 'LazyFrame'.* not 'Series'"
    with pytest.raises(TypeError, match=series_msg):
        east.join_where(pl.Series(west.collect()), *predicates)  # type: ignore[arg-type]

    class DummyLazyFrameSubclass(pl.LazyFrame):
        pass

    # subclasses of LazyFrame must be accepted without raising
    sub_east = DummyLazyFrameSubclass(east.collect())
    sub_west = DummyLazyFrameSubclass(west.collect())

    sub_east.join_where(sub_west, *predicates).collect()


def test_cache_hit_with_proj_and_pred_pushdown() -> None:
    cache_id_pattern = re.compile(r"CACHE\[id: (.*)\]")

    cached = pl.LazyFrame(
        {"a": [1, 2, 3], "b": [3, 4, 5], "c": ["x", "y", "z"]}
    ).cache()

    # a projection applied on top of two reads of the same cache
    projected = pl.concat([cached, cached]).select("a", "b")
    assert_frame_equal(
        projected.collect(),
        pl.DataFrame({"a": [1, 2, 3] * 2, "b": [3, 4, 5] * 2}),
    )
    cache_ids = cache_id_pattern.findall(projected.explain())

    assert len(cache_ids) == 2  # there are only 2 caches
    assert cache_ids[0] == cache_ids[1]  # all caches are the same

    # a predicate applied on top of two reads of the same cache
    filtered = pl.concat([cached, cached]).filter(pl.col.a != 0)
    assert_frame_equal(
        filtered.collect(),
        pl.DataFrame(
            {"a": [1, 2, 3] * 2, "b": [3, 4, 5] * 2, "c": ["x", "y", "z"] * 2}
        ),
    )
    cache_ids = cache_id_pattern.findall(filtered.explain())

    assert len(cache_ids) == 2  # there are only 2 caches
    assert cache_ids[0] == cache_ids[1]  # all caches are the same


def test_cache_hit_child_removal() -> None:
    df = pl.DataFrame(
        {
            "a": [1, 2, 3],
        }
    )

    sorted_cache = df.lazy().sort("a").cache()

    # two queries, each reading the cached node twice through `unique`
    queries = (
        pl.concat([sorted_cache.unique(), sorted_cache.unique()]),
        pl.concat([sorted_cache.unique(), sorted_cache.unique(keep="none")]),
    )
    plans = [query.explain() for query in queries]

    # the sort below the cache must not appear in either plan
    for plan in plans:
        assert "SORT" not in plan

    cache_id_pattern = re.compile(r"CACHE\[id: (.*)\]")
    for plan in plans:
        cache_ids = cache_id_pattern.findall(plan)
        assert len(cache_ids) == 2  # there are only 2 caches
        assert cache_ids[0] == cache_ids[1]  # all caches are the same

    # each half of the concatenated output holds the original rows, in any order
    for query in queries:
        out = query.collect()
        assert_frame_equal(out.head(3), df, check_row_order=False)
        assert_frame_equal(out.tail(3), df, check_row_order=False)