# Path: blob/main/py-polars/tests/unit/lazyframe/test_lazyframe.py
# 6939 views
from __future__ import annotations

import re
from datetime import date, datetime
from functools import reduce
from inspect import signature
from operator import add
from string import ascii_letters
from typing import TYPE_CHECKING, Any, Callable, NoReturn, cast

import numpy as np
import pytest

import polars as pl
import polars.selectors as cs
from polars import lit, when
from polars.exceptions import (
    InvalidOperationError,
    PerformanceWarning,
    PolarsInefficientMapWarning,
)
from polars.testing import assert_frame_equal, assert_series_equal
from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES

if TYPE_CHECKING:
    from _pytest.capture import CaptureFixture

    from polars._typing import PolarsDataType


def test_init_signature_match() -> None:
    """`DataFrame.__init__` and `LazyFrame.__init__` must share one signature."""
    # eager/lazy init signatures are expected to match; if this test fails, it
    # means a parameter was added to one but not the other, and that should be
    # fixed (or an explicit exemption should be made here, with an explanation)
    assert signature(pl.DataFrame.__init__) == signature(pl.LazyFrame.__init__)


def test_lazy_misc() -> None:
    """Smoke-test building and collecting simple lazy queries."""
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    _ = ldf.with_columns(pl.lit(1).alias("foo")).select([pl.col("a"), pl.col("foo")])

    # test if it executes
    _ = ldf.with_columns(
        when(pl.col("a") > pl.lit(2)).then(pl.lit(10)).otherwise(pl.lit(1)).alias("new")
    ).collect()


def test_implode() -> None:
    """`pl.implode` in a group-by aggregation wraps each group's values in a list."""
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    eager = (
        ldf.group_by(pl.col("a").alias("grp"), maintain_order=True)
        .agg(pl.implode("a", "b").name.suffix("_imp"))
        .collect()
    )
    assert_frame_equal(
        eager,
        pl.DataFrame(
            {
                "grp": [1, 2, 3],
                "a_imp": [[1], [2], [3]],
                "b_imp": [[1.0], [2.0], [3.0]],
            }
        ),
    )


def test_lazyframe_membership_operator() -> None:
    """`in` checks column names; truthiness of a LazyFrame raises `TypeError`."""
    ldf = pl.LazyFrame({"name": ["Jane", "John"], "age": [20, 30]})
    assert "name" in ldf
    assert "phone" not in ldf

    # note: cannot use lazyframe in boolean context
    with pytest.raises(TypeError, match="ambiguous"):
        not ldf


def test_apply() -> None:
    """`map_batches`/`map_elements` UDFs match the native `col * 2` expression."""
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    new = ldf.with_columns_seq(
        pl.col("a").map_batches(lambda s: s * 2, return_dtype=pl.Int64).alias("foo")
    )
    expected = ldf.clone().with_columns((pl.col("a") * 2).alias("foo"))
    assert_frame_equal(new, expected)
    assert_frame_equal(new.collect(), expected.collect())

    # the element-wise UDF is expected to trigger the "inefficient map" warning
    with pytest.warns(PolarsInefficientMapWarning, match="with this one instead"):
        for strategy in ["thread_local", "threading"]:
            ldf = pl.LazyFrame({"a": [1, 2, 3] * 20, "b": [1.0, 2.0, 3.0] * 20})
            new = ldf.with_columns(
                pl.col("a")
                .map_elements(lambda s: s * 2, strategy=strategy, return_dtype=pl.Int64)  # type: ignore[arg-type]
                .alias("foo")
            )
            expected = ldf.clone().with_columns((pl.col("a") * 2).alias("foo"))
            assert_frame_equal(new.collect(), expected.collect())


def test_add_eager_column() -> None:
    """A literal `Series` can be appended as a new column in a lazy query."""
    lf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    assert lf.collect_schema().len() == 2

    out = lf.with_columns(pl.lit(pl.Series("c", [1, 2, 3]))).collect()
    assert out["c"].sum() == 6
    assert out.collect_schema().len() == 3


def test_set_null() -> None:
    """`when(...).then(lit(None))` yields nulls where the predicate holds."""
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = ldf.with_columns(
        when(pl.col("a") > 1).then(lit(None)).otherwise(100).alias("foo")
    ).collect()
    s = out["foo"]
    assert s[0] == 100
    assert s[1] is None
    assert s[2] is None


def test_gather_every() -> None:
    """`gather_every` takes every n-th row, optionally starting at `offset`."""
    ldf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})
    expected_df = pl.DataFrame({"a": [1, 3], "b": ["w", "y"]})
    assert_frame_equal(expected_df, ldf.gather_every(2).collect())
    expected_df = pl.DataFrame({"a": [2, 4], "b": ["x", "z"]})
    assert_frame_equal(expected_df, ldf.gather_every(2, offset=1).collect())


def test_agg() -> None:
    """`LazyFrame.min` reduces every column to a single row."""
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    ldf = df.lazy().min()
    res = ldf.collect()
    assert res.shape == (1, 2)
    assert res.row(0) == (1, 1.0)


def test_count_suffix_10783() -> None:
    """`pl.len().over(...)` with `name.suffix` produces a `len_suffix` column."""
    df = pl.DataFrame(
        {
            "a": [["a", "c", "b"], ["a", "b", "c"], ["a", "d", "c"], ["c", "a", "b"]],
            "b": [["a", "c", "b"], ["a", "b", "c"], ["a", "d", "c"], ["c", "a", "b"]],
        }
    )
    df_with_cnt = df.with_columns(
        pl.len()
        .over(pl.col("a").list.sort().list.join("").hash())
        .name.suffix("_suffix")
    )
    df_expect = df.with_columns(pl.Series("len_suffix", [3, 3, 1, 3]))
    assert_frame_equal(df_with_cnt, df_expect, check_dtypes=False)


def test_or() -> None:
    """Predicates combined with `|` keep rows matching either side."""
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = ldf.filter((pl.col("a") == 1) | (pl.col("b") > 2)).collect()
    assert out.rows() == [(1, 1.0), (3, 3.0)]


def test_filter_str() -> None:
    """`filter` accepts a column name (str) as well as a column expression."""
    # use a str instead of a column expr
    ldf = pl.LazyFrame(
        {
            "time": ["11:11:00", "11:12:00", "11:13:00", "11:14:00"],
            "bools": [True, False, True, False],
        }
    )

    # last row based on a filter
    result = ldf.filter(pl.col("bools")).select_seq(pl.last("*")).collect()
    expected = pl.DataFrame({"time": ["11:13:00"], "bools": [True]})
    assert_frame_equal(result, expected)

    # last row based on a filter
    result = ldf.filter("bools").select(pl.last("*")).collect()
    assert_frame_equal(result, expected)


def test_filter_multiple_predicates() -> None:
    """`filter` ANDs together positional, list and keyword-constraint predicates."""
    ldf = pl.LazyFrame(
        {
            "a": [1, 1, 1, 2, 2],
            "b": [1, 1, 2, 2, 2],
            "c": [1, 1, 2, 3, 4],
        }
    )

    # multiple predicates
    expected = pl.DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "c": [1, 1, 2]})
    for out in (
        ldf.filter(pl.col("a") == 1, pl.col("b") <= 2),  # positional/splat
        ldf.filter([pl.col("a") == 1, pl.col("b") <= 2]),  # as list
    ):
        assert_frame_equal(out.collect(), expected)

    # multiple kwargs
    assert_frame_equal(
        ldf.filter(a=1, b=2).collect(),
        pl.DataFrame({"a": [1], "b": [2], "c": [2]}),
    )

    # both positional and keyword args
    assert_frame_equal(
        ldf.filter(pl.col("c") < 4, a=2, b=2).collect(),
        pl.DataFrame({"a": [2], "b": [2], "c": [3]}),
    )

    # a column that happens to be called "predicate" is usable as a kwarg
    ldf = pl.LazyFrame(
        {
            "description": ["eq", "gt", "ge"],
            "predicate": ["==", ">", ">="],
        },
    )
    assert ldf.filter(predicate="==").select("description").collect().item() == "eq"


@pytest.mark.parametrize(
    "predicate",
    [
        [pl.lit(True)],
        iter([pl.lit(True)]),
        [True, True, True],
        iter([True, True, True]),
        (p for p in (pl.col("c") < 9,)),
        (p for p in (pl.col("a") > 0, pl.col("b") > 0)),
    ],
)
def test_filter_seq_iterable_all_true(predicate: Any) -> None:
    """Sequences/iterables of all-true predicates leave the frame unchanged."""
    ldf = pl.LazyFrame(
        {
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "c": [3, 1, 2],
        }
    )
    assert_frame_equal(ldf, ldf.filter(predicate))


def test_apply_custom_function() -> None:
    """Group lengths via `implode().map_elements(...)` agree with `pl.count`."""
    ldf = pl.LazyFrame(
        {
            "A": [1, 2, 3, 4, 5],
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
            "B": [5, 4, 3, 2, 1],
            "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
        }
    )

    # two ways to determine the length groups.
    df = (
        ldf.group_by("fruits")
        .agg(
            [
                pl.col("cars")
                .implode()
                .map_elements(lambda groups: groups.len(), return_dtype=pl.Int64)
                .alias("custom_1"),
                pl.col("cars")
                .implode()
                .map_elements(lambda groups: groups.len(), return_dtype=pl.Int64)
                .alias("custom_2"),
                pl.count("cars").alias("cars_count"),
            ]
        )
        .sort("custom_1", descending=True)
    ).collect()

    expected = pl.DataFrame(
        {
            "fruits": ["banana", "apple"],
            "custom_1": [3, 2],
            "custom_2": [3, 2],
            "cars_count": [3, 2],
        }
    )
    expected = expected.with_columns(pl.col("cars_count").cast(pl.UInt32))
    assert_frame_equal(df, expected)


def test_group_by() -> None:
    """Group-by mean works with the key given as a name or as an expression."""
    ldf = pl.LazyFrame(
        {
            "a": [1.0, None, 3.0, 4.0],
            "b": [5.0, 2.5, -3.0, 2.0],
            "grp": ["a", "a", "b", "b"],
        }
    )
    expected_a = pl.DataFrame({"grp": ["a", "b"], "a": [1.0, 3.5]})
    expected_a_b = pl.DataFrame({"grp": ["a", "b"], "a": [1.0, 3.5], "b": [3.75, -0.5]})

    for out in (
        ldf.group_by("grp").agg(pl.mean("a")).collect(),
        ldf.group_by(pl.col("grp")).agg(pl.mean("a")).collect(),
    ):
        assert_frame_equal(out.sort(by="grp"), expected_a)

    out = ldf.group_by("grp").agg(pl.mean("a", "b")).collect()
    assert_frame_equal(out.sort(by="grp"), expected_a_b)


def test_arg_unique() -> None:
    """`arg_unique` returns the first index of each distinct value as UInt32."""
    ldf = pl.LazyFrame({"a": [4, 1, 4]})
    col_a_unique = ldf.select(pl.col("a").arg_unique()).collect()["a"]
    assert_series_equal(col_a_unique, pl.Series("a", [0, 1]).cast(pl.UInt32))


def test_arg_sort() -> None:
    """`arg_sort` returns the indices that would sort the column."""
    ldf = pl.LazyFrame({"a": [4, 1, 3]}).select(pl.col("a").arg_sort())
    assert ldf.collect()["a"].to_list() == [1, 2, 0]


def test_window_function() -> None:
    """Window expressions (`over`) add columns aligned to the original rows."""
    lf = pl.LazyFrame(
        {
            "A": [1, 2, 3, 4, 5],
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
            "B": [5, 4, 3, 2, 1],
            "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
        }
    )
    assert lf.collect_schema().len() == 4

    q = lf.with_columns(
        pl.sum("A").over("fruits").alias("fruit_sum_A"),
        pl.first("B").over("fruits").alias("fruit_first_B"),
        pl.max("B").over("cars").alias("cars_max_B"),
    )
    assert q.collect_schema().len() == 7

    assert q.collect()["cars_max_B"].to_list() == [5, 4, 5, 5, 5]

    out = lf.select([pl.first("B").over(["fruits", "cars"]).alias("B_first")])
    assert out.collect()["B_first"].to_list() == [5, 4, 3, 3, 5]


def test_when_then_flatten() -> None:
    """A chained `when/then/when/then/otherwise` resolves branches in order."""
    ldf = pl.LazyFrame({"foo": [1, 2, 3], "bar": [3, 4, 5]})

    assert ldf.select(
        when(pl.col("foo") > 1)
        .then(pl.col("bar"))
        .when(pl.col("bar") < 3)
        .then(10)
        .otherwise(30)
    ).collect()["bar"].to_list() == [30, 4, 5]


def test_describe_plan() -> None:
    """`explain` returns a string for both optimized and unoptimized plans."""
    assert isinstance(pl.LazyFrame({"a": [1]}).explain(optimized=True), str)
    assert isinstance(pl.LazyFrame({"a": [1]}).explain(optimized=False), str)


@pytest.mark.may_fail_cloud  # reason: inspects logs
def test_inspect(capsys: CaptureFixture[str]) -> None:
    """`inspect` on a frame or on an expression writes something to stdout."""
    ldf = pl.LazyFrame({"a": [1]})
    ldf.inspect().collect()
    captured = capsys.readouterr()
    assert len(captured.out) > 0

    ldf.select(pl.col("a").cum_sum().inspect().alias("bar")).collect()
    res = capsys.readouterr()
    assert len(res.out) > 0


@pytest.mark.may_fail_auto_streaming
def test_fetch(fruits_cars: pl.DataFrame) -> None:
    """`fetch` emits a `DeprecationWarning`; the result has at most 2 rows."""
    with pytest.warns(DeprecationWarning):
        res = fruits_cars.lazy().select("*").fetch(2)
    assert_frame_equal(res, res[:2])


def test_fold_filter() -> None:
    """`pl.fold` can reduce several predicates into a single filter mask."""
    lf = pl.LazyFrame({"a": [1, 2, 3], "b": [0, 1, 2]})

    # AND-fold: every column must be > 1
    out = lf.filter(
        pl.fold(
            acc=pl.lit(True),
            function=lambda a, b: a & b,
            exprs=[pl.col(c) > 1 for c in lf.collect_schema()],
        )
    ).collect()

    assert out.shape == (1, 2)
    assert out.rows() == [(3, 2)]

    # OR-fold seeded with True: the mask is always true, so all rows are kept
    out = lf.filter(
        pl.fold(
            acc=pl.lit(True),
            function=lambda a, b: a | b,
            exprs=[pl.col(c) > 1 for c in lf.collect_schema()],
        )
    ).collect()

    assert out.rows() == [(1, 0), (2, 1), (3, 2)]


def test_head_group_by() -> None:
    """Per-group `head`/`tail`, as expressions and as `group_by` methods."""
    commodity_prices = {
        "commodity": [
            "Wheat",
            "Wheat",
            "Wheat",
            "Wheat",
            "Corn",
            "Corn",
            "Corn",
            "Corn",
            "Corn",
        ],
        "location": [
            "StPaul",
            "StPaul",
            "StPaul",
            "Chicago",
            "Chicago",
            "Chicago",
            "Chicago",
            "Chicago",
            "Chicago",
        ],
        "seller": [
            "Bob",
            "Charlie",
            "Susan",
            "Paul",
            "Ed",
            "Mary",
            "Paul",
            "Charlie",
            "Norman",
        ],
        "price": [1.0, 0.7, 0.8, 0.55, 2.0, 3.0, 2.4, 1.8, 2.1],
    }
    ldf = pl.LazyFrame(commodity_prices)

    # this query flexes the wildcard exclusion quite a bit.
    keys = ["commodity", "location"]
    out = (
        ldf.sort(by="price", descending=True)
        .group_by(keys, maintain_order=True)
        .agg([pl.col("*").exclude(keys).head(2).name.keep()])
        .explode(cs.all().exclude(keys))
    )

    assert out.collect().rows() == [
        ("Corn", "Chicago", "Mary", 3.0),
        ("Corn", "Chicago", "Paul", 2.4),
        ("Wheat", "StPaul", "Bob", 1.0),
        ("Wheat", "StPaul", "Susan", 0.8),
        ("Wheat", "Chicago", "Paul", 0.55),
    ]

    ldf = pl.LazyFrame(
        {"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]}
    )
    out = ldf.group_by("letters").tail(2).sort("letters")
    assert_frame_equal(
        out.collect(),
        pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]}),
    )
    out = ldf.group_by("letters").head(2).sort("letters")
    assert_frame_equal(
        out.collect(),
        pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]}),
    )


def test_is_null_is_not_null() -> None:
    """`is_null` / `is_not_null` flag null entries."""
    ldf = pl.LazyFrame({"nrs": [1, 2, None]}).select(
        pl.col("nrs").is_null().alias("is_null"),
        pl.col("nrs").is_not_null().alias("not_null"),
    )
    assert ldf.collect()["is_null"].to_list() == [False, False, True]
    assert ldf.collect()["not_null"].to_list() == [True, True, False]


def test_is_nan_is_not_nan() -> None:
    """`is_nan` / `is_not_nan` flag NaN entries."""
    ldf = pl.LazyFrame({"nrs": np.array([1, 2, np.nan])}).select(
        pl.col("nrs").is_nan().alias("is_nan"),
        pl.col("nrs").is_not_nan().alias("not_nan"),
    )
    assert ldf.collect()["is_nan"].to_list() == [False, False, True]
    assert ldf.collect()["not_nan"].to_list() == [True, True, False]


def test_is_finite_is_infinite() -> None:
    """`is_infinite` / `is_finite` flag infinite entries."""
    ldf = pl.LazyFrame({"nrs": np.array([1, 2, np.inf])}).select(
        pl.col("nrs").is_infinite().alias("is_inf"),
        pl.col("nrs").is_finite().alias("not_inf"),
    )
    assert ldf.collect()["is_inf"].to_list() == [False, False, True]
    assert ldf.collect()["not_inf"].to_list() == [True, True, False]


def test_len() -> None:
    """`Expr.len` counts the rows of a column."""
    ldf = pl.LazyFrame({"nrs": [1, 2, 3]})
    assert cast(int, ldf.select(pl.col("nrs").len()).collect().item()) == 3


@pytest.mark.parametrize("dtype", NUMERIC_DTYPES)
def test_cum_agg(dtype: PolarsDataType) -> None:
    """Cumulative min/max keep the dtype; cum_sum/cum_prod upcast small ints."""
    ldf = pl.LazyFrame({"a": [1, 2, 3, 2]}, schema={"a": dtype})
    assert_series_equal(
        ldf.select(pl.col("a").cum_min()).collect()["a"],
        pl.Series("a", [1, 1, 1, 1], dtype=dtype),
    )
    assert_series_equal(
        ldf.select(pl.col("a").cum_max()).collect()["a"],
        pl.Series("a", [1, 2, 3, 3], dtype=dtype),
    )

    # cum_sum: 8/16-bit integers are expected to come back as Int64
    expected_dtype = (
        pl.Int64 if dtype in [pl.Int8, pl.Int16, pl.UInt8, pl.UInt16] else dtype
    )
    assert_series_equal(
        ldf.select(pl.col("a").cum_sum()).collect()["a"],
        pl.Series("a", [1, 3, 6, 8], dtype=expected_dtype),
    )

    # cum_prod: 8/16/32-bit integers are expected to come back as Int64
    expected_dtype = (
        pl.Int64
        if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.UInt8, pl.UInt16, pl.UInt32]
        else dtype
    )
    assert_series_equal(
        ldf.select(pl.col("a").cum_prod()).collect()["a"],
        pl.Series("a", [1, 2, 6, 12], dtype=expected_dtype),
    )


def test_ceil() -> None:
    """`ceil` rounds floats up and leaves an integer column unchanged."""
    ldf = pl.LazyFrame({"a": [1.8, 1.2, 3.0]})
    result = ldf.select(pl.col("a").ceil()).collect()
    assert_frame_equal(result, pl.DataFrame({"a": [2.0, 2.0, 3.0]}))

    ldf = pl.LazyFrame({"a": [1, 2, 3]})
    result = ldf.select(pl.col("a").ceil()).collect()
    assert_frame_equal(ldf.collect(), result)


def test_floor() -> None:
    """`floor` rounds floats down and leaves an integer column unchanged."""
    ldf = pl.LazyFrame({"a": [1.8, 1.2, 3.0]})
    result = ldf.select(pl.col("a").floor()).collect()
    assert_frame_equal(result, pl.DataFrame({"a": [1.0, 1.0, 3.0]}))

    ldf = pl.LazyFrame({"a": [1, 2, 3]})
    result = ldf.select(pl.col("a").floor()).collect()
    assert_frame_equal(ldf.collect(), result)


@pytest.mark.parametrize(
    ("n", "ndigits", "expected"),
    [
        (1.005, 2, 1.0),
        (1234.00000254495, 10, 1234.000002545),
        (1835.665, 2, 1835.67),
        (-1835.665, 2, -1835.67),
        (1.27499, 2, 1.27),
        (123.45678, 2, 123.46),
        (1254, 2, 1254.0),
        (1254, 0, 1254.0),
        (123.55, 0, 124.0),
        (123.55, 1, 123.6),
        (-1.23456789, 6, -1.234568),
        (1.0e-5, 5, 0.00001),
        (1.0e-20, 20, 1e-20),
        (1.0e20, 2, 100000000000000000000.0),
    ],
)
@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
def test_round(n: float, ndigits: int, expected: float, dtype: pl.DataType) -> None:
    """`round(decimals=...)` gives the expected value for each float dtype."""
    ldf = pl.LazyFrame({"value": [n]}, schema_overrides={"value": dtype})
    assert_series_equal(
        ldf.select(pl.col("value").round(decimals=ndigits)).collect().to_series(),
        pl.Series("value", [expected], dtype=dtype),
    )


def test_dot() -> None:
    """`Expr.dot` computes the dot product of two columns."""
    ldf = pl.LazyFrame({"a": [1.8, 1.2, 3.0], "b": [3.2, 1, 2]}).select(
        pl.col("a").dot(pl.col("b"))
    )
    assert cast(float, ldf.collect().item()) == 12.96


def test_sort() -> None:
    """`Expr.sort` sorts a column in ascending order."""
    ldf = pl.LazyFrame({"a": [1, 2, 3, 2]}).select(pl.col("a").sort())
    assert_series_equal(ldf.collect()["a"], pl.Series("a", [1, 2, 2, 3]))


def test_custom_group_by() -> None:
    """A Python UDF over imploded groups can compute per-group sums."""
    ldf = pl.LazyFrame({"a": [1, 2, 1, 1], "b": ["a", "b", "c", "c"]})
    out = (
        ldf.group_by("b", maintain_order=True)
        .agg(
            [
                pl.col("a")
                .implode()
                .map_elements(lambda x: x.sum(), return_dtype=pl.Int64)
            ]
        )
        .collect()
    )
    assert out.rows() == [("a", 1), ("b", 2), ("c", 2)]


def test_lazy_columns() -> None:
    """`select` keeps only the requested columns in the resolved schema."""
    lf = pl.LazyFrame(
        {
            "a": [1],
            "b": [1],
            "c": [1],
        }
    )
    assert lf.select("a", "c").collect_schema().names() == ["a", "c"]


def test_cast_frame() -> None:
    """`LazyFrame.cast` via column map, selector map, single dtype and `strict`."""
    lf = pl.LazyFrame(
        {
            "a": [1.0, 2.5, 3.0],
            "b": [4, 5, None],
            "c": [True, False, True],
            "d": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)],
        }
    )

    # cast via col:dtype map
    assert lf.cast(
        dtypes={"b": pl.Float32, "c": pl.String, "d": pl.Datetime("ms")}
    ).collect_schema() == {
        "a": pl.Float64,
        "b": pl.Float32,
        "c": pl.String,
        "d": pl.Datetime("ms"),
    }

    # cast via selector:dtype map
    lfc = lf.cast(
        {
            cs.float(): pl.UInt8,
            cs.integer(): pl.Int32,
            cs.temporal(): pl.String,
        }
    )
    assert lfc.collect_schema() == {
        "a": pl.UInt8,
        "b": pl.Int32,
        "c": pl.Boolean,
        "d": pl.String,
    }
    assert lfc.collect().rows() == [
        (1, 4, True, "2020-01-02"),
        (2, 5, False, "2021-03-04"),
        (3, None, True, "2022-05-06"),
    ]

    # cast all fields to a single type
    result = lf.cast(pl.String)
    expected = pl.LazyFrame(
        {
            "a": ["1.0", "2.5", "3.0"],
            "b": ["4", "5", None],
            "c": ["true", "false", "true"],
            "d": ["2020-01-02", "2021-03-04", "2022-05-06"],
        }
    )
    assert_frame_equal(result, expected)

    # test 'strict' mode
    lf = pl.LazyFrame({"a": [1000, 2000, 3000]})

    with pytest.raises(InvalidOperationError, match="conversion .* failed"):
        lf.cast(pl.UInt8).collect()

    assert lf.cast(pl.UInt8, strict=False).collect().rows() == [
        (None,),
        (None,),
        (None,),
    ]


def test_interpolate() -> None:
    """Interpolation fills the null on expr, Series, DataFrame and LazyFrame."""
    df = pl.DataFrame({"a": [1, None, 3]})
    assert df.select(pl.col("a").interpolate())["a"].to_list() == [1, 2, 3]
    assert df["a"].interpolate().to_list() == [1, 2, 3]
    assert df.interpolate()["a"].to_list() == [1, 2, 3]
    assert df.lazy().interpolate().collect()["a"].to_list() == [1, 2, 3]


def test_fill_nan() -> None:
    """`fill_nan` replaces NaN with a value or null; also checks nearest interpolation."""
    df = pl.DataFrame({"a": [1.0, np.nan, 3.0]})
    assert_series_equal(df.fill_nan(2.0)["a"], pl.Series("a", [1.0, 2.0, 3.0]))
    assert_series_equal(
        df.lazy().fill_nan(2.0).collect()["a"], pl.Series("a", [1.0, 2.0, 3.0])
    )
    assert_series_equal(
        df.lazy().fill_nan(None).collect()["a"], pl.Series("a", [1.0, None, 3.0])
    )
    assert_series_equal(
        df.select(pl.col("a").fill_nan(2))["a"], pl.Series("a", [1.0, 2.0, 3.0])
    )
    # nearest
    assert pl.Series([None, 1, None, None, None, -8, None, None, 10]).interpolate(
        method="nearest"
    ).to_list() == [None, 1, 1, -8, -8, -8, -8, 10, 10]


def test_fill_null() -> None:
    """`fill_null` by strategy or value, plus its argument-validation errors."""
    df = pl.DataFrame({"a": [1.0, None, 3.0]})

    assert df.select([pl.col("a").fill_null(strategy="min")])["a"][1] == 1.0
    assert df.lazy().fill_null(2).collect()["a"].to_list() == [1.0, 2.0, 3.0]

    with pytest.raises(ValueError, match="must specify either"):
        df.fill_null()
    with pytest.raises(ValueError, match="cannot specify both"):
        df.fill_null(value=3.0, strategy="max")
    with pytest.raises(ValueError, match="can only specify `limit`"):
        df.fill_null(strategy="max", limit=2)


def test_backward_fill() -> None:
    """`fill_null(strategy="backward")` fills a null with the next value."""
    ldf = pl.LazyFrame({"a": [1.0, None, 3.0]})
    col_a_backward_fill = ldf.select(
        [pl.col("a").fill_null(strategy="backward")]
    ).collect()["a"]
    assert_series_equal(col_a_backward_fill, pl.Series("a", [1, 3, 3]).cast(pl.Float64))


def test_rolling(fruits_cars: pl.DataFrame) -> None:
    """Rolling min/mean/max/sum/std/var with and without `min_samples=1`."""
    ldf = fruits_cars.lazy()
    out = ldf.select(
        pl.col("A").rolling_min(3, min_samples=1).alias("1"),
        pl.col("A").rolling_min(3).alias("1b"),
        pl.col("A").rolling_mean(3, min_samples=1).alias("2"),
        pl.col("A").rolling_mean(3).alias("2b"),
        pl.col("A").rolling_max(3, min_samples=1).alias("3"),
        pl.col("A").rolling_max(3).alias("3b"),
        pl.col("A").rolling_sum(3, min_samples=1).alias("4"),
        pl.col("A").rolling_sum(3).alias("4b"),
        # below we use .round purely for the ability to do assert frame equality
        pl.col("A").rolling_std(3).round(1).alias("std"),
        pl.col("A").rolling_var(3).round(1).alias("var"),
    )

    assert_frame_equal(
        out.collect(),
        pl.DataFrame(
            {
                "1": [1, 1, 1, 2, 3],
                "1b": [None, None, 1, 2, 3],
                "2": [1.0, 1.5, 2.0, 3.0, 4.0],
                "2b": [None, None, 2.0, 3.0, 4.0],
                "3": [1, 2, 3, 4, 5],
                "3b": [None, None, 3, 4, 5],
                "4": [1, 3, 6, 9, 12],
                "4b": [None, None, 6, 9, 12],
                "std": [None, None, 1.0, 1.0, 1.0],
                "var": [None, None, 1.0, 1.0, 1.0],
            }
        ),
    )

    out_single_val_variance = ldf.select(
        pl.col("A").rolling_std(3, min_samples=1).round(decimals=4).alias("std"),
        pl.col("A").rolling_var(3, min_samples=1).round(decimals=1).alias("var"),
    ).collect()

    # with a single sample in the window, std/var are expected to be null
    assert cast(float, out_single_val_variance[0, "std"]) is None
    assert cast(float, out_single_val_variance[0, "var"]) is None


def test_arr_namespace(fruits_cars: pl.DataFrame) -> None:
    """List aggregations over a "join" window equal the direct window aggregations."""
    ldf = fruits_cars.lazy()
    out = ldf.select(
        "fruits",
        pl.col("B")
        .over("fruits", mapping_strategy="join")
        .list.min()
        .alias("B_by_fruits_min1"),
        pl.col("B")
        .min()
        .over("fruits", mapping_strategy="join")
        .alias("B_by_fruits_min2"),
        pl.col("B")
        .over("fruits", mapping_strategy="join")
        .list.max()
        .alias("B_by_fruits_max1"),
        pl.col("B")
        .max()
        .over("fruits", mapping_strategy="join")
        .alias("B_by_fruits_max2"),
        pl.col("B")
        .over("fruits", mapping_strategy="join")
        .list.sum()
        .alias("B_by_fruits_sum1"),
        pl.col("B")
        .sum()
        .over("fruits", mapping_strategy="join")
        .alias("B_by_fruits_sum2"),
        pl.col("B")
        .over("fruits", mapping_strategy="join")
        .list.mean()
        .alias("B_by_fruits_mean1"),
        pl.col("B")
        .mean()
        .over("fruits", mapping_strategy="join")
        .alias("B_by_fruits_mean2"),
    )
    expected = pl.DataFrame(
        {
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
            "B_by_fruits_min1": [1, 1, 2, 2, 1],
            "B_by_fruits_min2": [1, 1, 2, 2, 1],
            "B_by_fruits_max1": [5, 5, 3, 3, 5],
            "B_by_fruits_max2": [5, 5, 3, 3, 5],
            "B_by_fruits_sum1": [10, 10, 5, 5, 10],
            "B_by_fruits_sum2": [10, 10, 5, 5, 10],
            "B_by_fruits_mean1": [
                3.3333333333333335,
                3.3333333333333335,
                2.5,
                2.5,
                3.3333333333333335,
            ],
            "B_by_fruits_mean2": [
                3.3333333333333335,
                3.3333333333333335,
                2.5,
                2.5,
                3.3333333333333335,
            ],
        }
    )
    assert_frame_equal(out.collect(), expected)


def test_arithmetic() -> None:
    """Arithmetic operators work with the expression on either side."""
    ldf = pl.LazyFrame({"a": [1, 2, 3]})

    out = ldf.select(
        (pl.col("a") % 2).alias("1"),
        (2 % pl.col("a")).alias("2"),
        (1 // pl.col("a")).alias("3"),
        (1 * pl.col("a")).alias("4"),
        (1 + pl.col("a")).alias("5"),
        (1 - pl.col("a")).alias("6"),
        (pl.col("a") // 2).alias("7"),
        (pl.col("a") * 2).alias("8"),
        (pl.col("a") + 2).alias("9"),
        (pl.col("a") - 2).alias("10"),
        (-pl.col("a")).alias("11"),
    )
    expected = pl.DataFrame(
        {
            "1": [1, 0, 1],
            "2": [0, 0, 2],
            "3": [1, 0, 0],
            "4": [1, 2, 3],
            "5": [2, 3, 4],
            "6": [0, -1, -2],
            "7": [0, 1, 1],
            "8": [2, 4, 6],
            "9": [3, 4, 5],
            "10": [-1, 0, 1],
            "11": [-1, -2, -3],
        }
    )
    assert_frame_equal(out.collect(), expected)


def test_float_floor_divide() -> None:
    """Float floor division matches Python's `//` for the same operands."""
    x = 10.4
    step = 0.5
    ldf = pl.LazyFrame({"x": [x]})
    ldf_res = ldf.with_columns(pl.col("x") // step).collect().item()
    assert ldf_res == x // step


def test_argminmax() -> None:
    """`arg_min` / `arg_max` work in `select` and in a group-by aggregation."""
    ldf = pl.LazyFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 2, 2]})
    out = ldf.select(
        pl.col("a").arg_min().alias("min"),
        pl.col("a").arg_max().alias("max"),
    ).collect()
    assert out["max"][0] == 4
    assert out["min"][0] == 0

    # within the first group (b == 1) the indices are relative to the group
    out = (
        ldf.group_by("b", maintain_order=True)
        .agg([pl.col("a").arg_min().alias("min"), pl.col("a").arg_max().alias("max")])
        .collect()
    )
    assert out["max"][0] == 1
    assert out["min"][0] == 0


def test_limit(fruits_cars: pl.DataFrame) -> None:
    """`limit(1)` returns the first row."""
    assert_frame_equal(fruits_cars.lazy().limit(1).collect(), fruits_cars[0, :])


def test_head(fruits_cars: pl.DataFrame) -> None:
    """`head(2)` returns the first two rows."""
    assert_frame_equal(fruits_cars.lazy().head(2).collect(), fruits_cars[:2, :])


def test_tail(fruits_cars: pl.DataFrame) -> None:
    """`tail(2)` equals `fruits_cars[3:, :]` for the fixture frame."""
    assert_frame_equal(fruits_cars.lazy().tail(2).collect(), fruits_cars[3:, :])


def test_last(fruits_cars: pl.DataFrame) -> None:
    """`last` returns the final row."""
    result = fruits_cars.lazy().last().collect()
    expected = fruits_cars[(len(fruits_cars) - 1) :, :]
    assert_frame_equal(result, expected)


def test_first(fruits_cars: pl.DataFrame) -> None:
    """`first` returns the first row."""
    assert_frame_equal(fruits_cars.lazy().first().collect(), fruits_cars[0, :])


def test_join_suffix() -> None:
    """`suffix` renames clashing right-hand columns in eager and lazy joins."""
    df_left = pl.DataFrame(
        {
            "a": ["a", "b", "a", "z"],
            "b": [1, 2, 3, 4],
            "c": [6, 5, 4, 3],
        }
    )
    df_right = pl.DataFrame(
        {
            "a": ["b", "c", "b", "a"],
            "b": [0, 3, 9, 6],
            "c": [1, 0, 2, 1],
        }
    )
    out = df_left.join(df_right, on="a", suffix="_bar")
    assert out.columns == ["a", "b", "c", "b_bar", "c_bar"]
    out = df_left.lazy().join(df_right.lazy(), on="a", suffix="_bar").collect()
    assert out.columns == ["a", "b", "c", "b_bar", "c_bar"]


@pytest.mark.may_fail_cloud  # reason: no
def test_collect_unexpected_kwargs(df: pl.DataFrame) -> None:
    """`collect` rejects the `common_subexpr_elim` keyword with a `TypeError`."""
    with pytest.raises(TypeError, match="unexpected keyword argument"):
        df.lazy().collect(common_subexpr_elim=False)  # type: ignore[call-overload]


def test_spearman_corr() -> None:
    """Per-group Spearman correlation, from expressions or column names."""
    ldf = pl.LazyFrame(
        {
            "era": [1, 1, 1, 2, 2, 2],
            "prediction": [2, 4, 5, 190, 1, 4],
            "target": [1, 3, 2, 1, 43, 3],
        }
    )

    out = (
        ldf.group_by("era", maintain_order=True).agg(
            pl.corr(pl.col("prediction"), pl.col("target"), method="spearman").alias(
                "c"
            ),
        )
    ).collect()["c"]
    assert np.isclose(out[0], 0.5)
    assert np.isclose(out[1], -1.0)

    # we can also pass in column names directly
    out = (
        ldf.group_by("era", maintain_order=True).agg(
            pl.corr("prediction", "target", method="spearman").alias("c"),
        )
    ).collect()["c"]
    assert np.isclose(out[0], 0.5)
    assert np.isclose(out[1], -1.0)


def test_spearman_corr_ties() -> None:
    """In Spearman correlation, ranks are computed using the average method."""
    df = pl.DataFrame({"a": [1, 1, 1, 2, 3, 7, 4], "b": [4, 3, 2, 2, 4, 3, 1]})

    result = df.select(
        pl.corr("a", "b", method="spearman").alias("a1"),
        pl.corr(pl.col("a").rank("min"), pl.col("b").rank("min")).alias("a2"),
        pl.corr(pl.col("a").rank(), pl.col("b").rank()).alias("a3"),
    )
    expected = pl.DataFrame(
        [
            pl.Series("a1", [-0.19048482943986483], dtype=pl.Float64),
            pl.Series("a2", [-0.17223653586587362], dtype=pl.Float64),
            pl.Series("a3", [-0.19048482943986483], dtype=pl.Float64),
        ]
    )
    assert_frame_equal(result, expected)


def test_pearson_corr() -> None:
    """Per-group Pearson correlation, from expressions or column names."""
    ldf = pl.LazyFrame(
        {
            "era": [1, 1, 1, 2, 2, 2],
            "prediction": [2, 4, 5, 190, 1, 4],
            "target": [1, 3, 2, 1, 43, 3],
        }
    )

    out = (
        ldf.group_by("era", maintain_order=True).agg(
            pl.corr(
                pl.col("prediction"),
                pl.col("target"),
                method="pearson",
            ).alias("c"),
        )
    ).collect()["c"]
    assert out.to_list() == pytest.approx([0.6546536707079772, -5.477514993831792e-1])

    # we can also pass in column names directly
    out = (
        ldf.group_by("era", maintain_order=True).agg(
            pl.corr("prediction", "target", method="pearson").alias("c"),
        )
    ).collect()["c"]
    assert out.to_list() == pytest.approx([0.6546536707079772, -5.477514993831792e-1])


def test_null_count() -> None:
    """`null_count` reports the number of nulls per column."""
    lf = pl.LazyFrame({"a": [1, 2, None, 2], "b": [None, 3, None, 3]})
    assert lf.null_count().collect().rows() == [(1, 2)]


def test_lazy_concat(df: pl.DataFrame) -> None:
    """Concatenating a lazy frame with itself equals an eager `vstack`."""
    shape = df.shape
    shape = (shape[0] * 2, shape[1])

    out = pl.concat([df.lazy(), df.lazy()]).collect()
    assert out.shape == shape
    assert_frame_equal(out, df.vstack(df))


def test_self_join() -> None:
    """A lazy frame can be left-joined with itself."""
    # 2720
    ldf = pl.from_dict(
        data={
            "employee_id": [100, 101, 102],
            "employee_name": ["James", "Alice", "Bob"],
            "manager_id": [None, 100, 101],
        }
    ).lazy()

    out = (
        ldf.join(other=ldf, left_on="manager_id", right_on="employee_id", how="left")
        .select(
            pl.col("employee_id"),
            pl.col("employee_name"),
            pl.col("employee_name_right").alias("manager_name"),
        )
        .collect()
    )
    assert set(out.rows()) == {
        (100, "James", None),
        (101, "Alice", "James"),
        (102, "Bob", "Alice"),
    }


def test_group_lengths() -> None:
    """`unique_counts` and `unique().len()` aggregate correctly per group."""
    ldf = pl.LazyFrame(
        {
            "group": ["A", "A", "A", "B", "B", "B", "B"],
            "id": ["1", "1", "2", "3", "4", "3", "5"],
        }
    )

    result = ldf.group_by(["group"], maintain_order=True).agg(
        [
            (pl.col("id").unique_counts() / pl.col("id").len())
            .sum()
            .alias("unique_counts_sum"),
            pl.col("id").unique().len().alias("unique_len"),
        ]
    )
    expected = pl.DataFrame(
        {
            "group": ["A", "B"],
            "unique_counts_sum": [1.0, 1.0],
            "unique_len": [2, 3],
        },
        schema_overrides={"unique_len": pl.UInt32},
    )
    assert_frame_equal(result.collect(), expected)


def test_quantile_filtered_agg() -> None:
    """`quantile` can be applied to a filtered column inside an aggregation."""
    assert (
        pl.LazyFrame(
            {
                "group": [0, 0, 0, 0, 1, 1, 1, 1],
                "value": [1, 2, 3, 4, 1, 2, 3, 4],
            }
        )
        .group_by("group")
        .agg(pl.col("value").filter(pl.col("value") < 2).quantile(0.5))
        .collect()["value"]
        .to_list()
    ) == [1.0, 1.0]


def test_predicate_count_vstack() -> None:
    """A window-count predicate is evaluated over the concatenated frame."""
    l1 = pl.LazyFrame(
        {
            "k": ["x", "y"],
            "v": [3, 2],
        }
    )
    l2 = pl.LazyFrame(
        {
            "k": ["x", "y"],
            "v": [5, 7],
        }
    )
    assert pl.concat([l1, l2]).filter(pl.len().over("k") == 2).collect()[
        "v"
    ].to_list() == [3, 2, 5, 7]


def test_lazy_method() -> None:
    """Calling `.lazy()` on a LazyFrame yields an equal LazyFrame."""
    # We want to support `.lazy()` on a Lazy DataFrame to allow more generic user code.
    df = pl.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]})
    assert_frame_equal(df.lazy(), df.lazy().lazy())


def test_update_schema_after_projection_pd_t4157() -> None:
    """Rename, then drop, then select still resolves to the selected column."""
    ldf = pl.LazyFrame({"c0": [], "c1": [], "c2": []}).rename({"c2": "c2_"})
    assert ldf.drop("c2_").select(pl.col("c0")).collect().columns == ["c0"]


def test_type_coercion_unknown_4190() -> None:
    """`&` between an integer column and its `fill_null(True)` collects cleanly."""
    df = (
        pl.LazyFrame({"a": [1, 2, 3], "b": [1, 2, 3]}).with_columns(
            pl.col("a") & pl.col("a").fill_null(True)
        )
    ).collect()
    assert df.shape == (3, 2)
    assert df.rows() == [(1, 1), (2, 2), (3, 3)]


def test_lazy_cache_same_key() -> None:
    """Two cached nodes with the same schema but different plans stay distinct."""
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": ["x", "y", "z"]})

    # these have the same schema, but should not be used by cache as they are different
    add_node = ldf.select([(pl.col("a") + pl.col("b")).alias("a"), pl.col("c")]).cache()
    mult_node = ldf.select((pl.col("a") * pl.col("b")).alias("a"), pl.col("c")).cache()

    result = mult_node.join(add_node, on="c", suffix="_mult").select(
        (pl.col("a") - pl.col("a_mult")).alias("a"), pl.col("c")
    )
    expected = pl.LazyFrame({"a": [-1, 2, 7], "c": ["x", "y", "z"]})
    assert_frame_equal(result, expected, check_row_order=False)


@pytest.mark.may_fail_cloud  # reason: inspects logs
@pytest.mark.may_fail_auto_streaming
def test_lazy_cache_hit(monkeypatch: Any, capfd: Any) -> None:
    """Joining a cached node with itself logs "CACHE HIT" in verbose mode."""
    monkeypatch.setenv("POLARS_VERBOSE", "1")

    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": ["x", "y", "z"]})
    add_node = ldf.select([(pl.col("a") + pl.col("b")).alias("a"), pl.col("c")]).cache()

    result = add_node.join(add_node, on="c", suffix="_mult").select(
        (pl.col("a") - pl.col("a_mult")).alias("a"), pl.col("c")
    )
    expected = pl.LazyFrame({"a": [0, 0, 0], "c": ["x", "y", "z"]})
    assert_frame_equal(result, expected, check_row_order=False)

    (_, err) = capfd.readouterr()
    assert "CACHE HIT" in err


@pytest.mark.may_fail_cloud  # reason: impure udf
def test_lazy_cache_parallel() -> None:
    """A cached node feeding three concat branches is evaluated exactly once."""
    df_evaluated = 0

    def map_df(df: pl.DataFrame) -> pl.DataFrame:
        # count how many times the cached input is actually materialized
        nonlocal df_evaluated
        df_evaluated += 1
        return df

    df = pl.LazyFrame({"a": [1]}).map_batches(map_df).cache()

    df = pl.concat(
        [
            df.select(pl.col("a") + 1),
            df.select(pl.col("a") + 2),
            df.select(pl.col("a") + 3),
        ],
        parallel=True,
    )

    # nothing runs until the query is collected
    assert df_evaluated == 0

    df.collect()
    assert df_evaluated == 1


@pytest.mark.may_fail_cloud  # reason: impure udf
def test_lazy_cache_nested_parallel() -> None:
    """Nested cached nodes are each evaluated exactly once."""
    df_inner_evaluated = 0
    df_outer_evaluated = 0

    def map_df_inner(df: pl.DataFrame) -> pl.DataFrame:
        nonlocal df_inner_evaluated
        df_inner_evaluated += 1
        return df

    def map_df_outer(df: pl.DataFrame) -> pl.DataFrame:
        nonlocal df_outer_evaluated
        df_outer_evaluated += 1
        return df

    df_inner = pl.LazyFrame({"a": [1]}).map_batches(map_df_inner).cache()
    df_outer = df_inner.select(pl.col("a") + 1).map_batches(map_df_outer).cache()

    df = pl.concat(
        [
            df_outer.select(pl.col("a") + 2),
            df_outer.select(pl.col("a") + 3),
        ],
        parallel=True,
    )

    # nothing runs until the query is collected
    assert df_inner_evaluated == 0
    assert df_outer_evaluated == 0

    df.collect()
    assert df_inner_evaluated == 1
    assert df_outer_evaluated == 1


def test_quadratic_behavior_4736() -> None:
    """Summing one expression per ASCII-letter column must not stall."""
    # no assert; if this function does not stall our tests it has passed!
    lf = pl.LazyFrame(schema=list(ascii_letters))
    lf.select(reduce(add, (pl.col(c) for c in lf.collect_schema())))


@pytest.mark.parametrize("input_dtype", [pl.Int64, pl.Float64])
def test_from_epoch(input_dtype: PolarsDataType) -> None:
    """`from_epoch` converts d/s/ms/us/ns epochs and rejects unknown units."""
    ldf = pl.LazyFrame(
        [
            pl.Series("timestamp_d", [13285]).cast(input_dtype),
            pl.Series("timestamp_s", [1147880044]).cast(input_dtype),
            pl.Series("timestamp_ms", [1147880044 * 1_000]).cast(input_dtype),
            pl.Series("timestamp_us", [1147880044 * 1_000_000]).cast(input_dtype),
            pl.Series("timestamp_ns", [1147880044 * 1_000_000_000]).cast(input_dtype),
        ]
    )

    exp_dt = datetime(2006, 5, 17, 15, 34, 4)
    expected = pl.DataFrame(
        [
            pl.Series("timestamp_d", [date(2006, 5, 17)]),
            pl.Series("timestamp_s", [exp_dt]),  # s is no Polars dtype, defaults to us
            pl.Series("timestamp_ms", [exp_dt]).cast(pl.Datetime("ms")),
            pl.Series("timestamp_us", [exp_dt]),  # us is Polars Datetime default
            pl.Series("timestamp_ns", [exp_dt]).cast(pl.Datetime("ns")),
        ]
    )

    ldf_result = ldf.select(
        pl.from_epoch(pl.col("timestamp_d"), time_unit="d"),
        pl.from_epoch(pl.col("timestamp_s"), time_unit="s"),
        pl.from_epoch(pl.col("timestamp_ms"), time_unit="ms"),
        pl.from_epoch(pl.col("timestamp_us"), time_unit="us"),
        pl.from_epoch(pl.col("timestamp_ns"), time_unit="ns"),
    ).collect()

    assert_frame_equal(ldf_result, expected)

    ts_col = pl.col("timestamp_s")
    with pytest.raises(ValueError):
        _ = ldf.select(pl.from_epoch(ts_col, time_unit="s2"))  # type: ignore[call-overload]


def test_from_epoch_str() -> None:
    """`from_epoch` on String columns raises `InvalidOperationError` on collect."""
    ldf = pl.LazyFrame(
        [
            pl.Series("timestamp_ms", [1147880044 * 1_000]).cast(pl.String),
            pl.Series("timestamp_us", [1147880044 * 1_000_000]).cast(pl.String),
        ]
    )

    with pytest.raises(InvalidOperationError):
        ldf.select(
            pl.from_epoch(pl.col("timestamp_ms"), time_unit="ms"),
            pl.from_epoch(pl.col("timestamp_us"), time_unit="us"),
        ).collect()


def test_cum_agg_types() -> None:
    """Planned `cum_sum`/`cum_prod` dtypes match the collected schema."""
    ldf = pl.LazyFrame({"a": [1, 2], "b": [True, False], "c": [1.3, 2.4]})
    cum_sum_lf = ldf.select(
        pl.col("a").cum_sum(),
        pl.col("b").cum_sum(),
        pl.col("c").cum_sum(),
    )
    assert cum_sum_lf.collect_schema()["a"] == pl.Int64
    assert cum_sum_lf.collect_schema()["b"] == pl.UInt32
    assert cum_sum_lf.collect_schema()["c"] == pl.Float64
    collected_cumsum_lf = cum_sum_lf.collect()
    assert collected_cumsum_lf.schema == cum_sum_lf.collect_schema()

    cum_prod_lf = ldf.select(
        pl.col("a").cast(pl.UInt64).cum_prod(),
        pl.col("b").cum_prod(),
        pl.col("c").cum_prod(),
    )
    assert cum_prod_lf.collect_schema()["a"] == pl.UInt64
    assert cum_prod_lf.collect_schema()["b"] == pl.Int64
    assert cum_prod_lf.collect_schema()["c"] == pl.Float64
    collected_cum_prod_lf = cum_prod_lf.collect()
    assert collected_cum_prod_lf.schema == cum_prod_lf.collect_schema()


# NOTE(review): the next definition is cut off at the end of this view (it stops
# in the middle of the final assignment). Its visible head is preserved below,
# commented out and unmodified apart from layout, so it can be re-joined with
# its continuation rather than guessed at. TODO: restore once the rest is known.
#
# def test_compare_schema_between_lazy_and_eager_6904() -> None:
#     float32_df = pl.DataFrame({"x": pl.Series(values=[], dtype=pl.Float32)})
#     eager_result = float32_df.select(pl.col("x").sqrt()).select(pl.col(pl.Float32))
#     lazy_result = (
#         float32_df.lazy()
#         .select(pl.col("x").sqrt())
#         .select(pl.col(pl.Float32))
#         .collect()
#     )
#     assert eager_result.shape == lazy_result.shape
#
#     eager_result = float32_df.select(pl.col("x").pow(2)).select(pl.col(pl.Float32))
#     lazy_result = (
#         float32_df.lazy()
#         .select(pl.col("x").pow(2))
#         .select(pl.col(pl.Float32))
#         .collect()
#     )
#     assert eager_result.shape == lazy_result.shape
#
#     int32_df = pl.DataFrame({"x": pl.Series(values=[], dtype=pl.Int32)})
#     eager_result = <truncated>
int32_df.select(pl.col("x").pow(2)).select(pl.col(pl.Float64))1339lazy_result = (1340int32_df.lazy().select(pl.col("x").pow(2)).select(pl.col(pl.Float64)).collect()1341)1342assert eager_result.shape == lazy_result.shape13431344int8_df = pl.DataFrame({"x": pl.Series(values=[], dtype=pl.Int8)})1345eager_result = int8_df.select(pl.col("x").diff()).select(pl.col(pl.Int16))1346lazy_result = (1347int8_df.lazy().select(pl.col("x").diff()).select(pl.col(pl.Int16)).collect()1348)1349assert eager_result.shape == lazy_result.shape135013511352@pytest.mark.slow1353@pytest.mark.parametrize(1354"dtype",1355[1356pl.UInt8,1357pl.UInt16,1358pl.UInt32,1359pl.UInt64,1360pl.Int8,1361pl.Int16,1362pl.Int32,1363pl.Int64,1364pl.Float32,1365pl.Float64,1366],1367)1368@pytest.mark.parametrize(1369"func",1370[1371pl.col("x").arg_max(),1372pl.col("x").arg_min(),1373pl.col("x").max(),1374pl.col("x").mean(),1375pl.col("x").median(),1376pl.col("x").min(),1377pl.col("x").nan_max(),1378pl.col("x").nan_min(),1379pl.col("x").product(),1380pl.col("x").quantile(0.5),1381pl.col("x").std(),1382pl.col("x").sum(),1383pl.col("x").var(),1384],1385)1386def test_compare_aggregation_between_lazy_and_eager_6904(1387dtype: PolarsDataType, func: pl.Expr1388) -> None:1389df = pl.DataFrame(1390{1391"x": pl.Series(values=[1, 2, 3] * 2, dtype=dtype),1392"y": pl.Series(values=["a"] * 3 + ["b"] * 3),1393}1394)1395result_eager = df.select(func.over("y")).select("x")1396dtype_eager = result_eager["x"].dtype1397result_lazy = df.lazy().select(func.over("y")).select(pl.col(dtype_eager)).collect()1398assert_frame_equal(result_eager, result_lazy)139914001401@pytest.mark.parametrize(1402"comparators",1403[1404("==", pl.LazyFrame.__eq__),1405("!=", pl.LazyFrame.__ne__),1406(">", pl.LazyFrame.__gt__),1407("<", pl.LazyFrame.__lt__),1408(">=", pl.LazyFrame.__ge__),1409("<=", pl.LazyFrame.__le__),1410],1411)1412def test_lazy_comparison_operators(1413comparators: tuple[str, Callable[[pl.LazyFrame, Any], NoReturn]],1414) -> None:1415# 
we cannot compare lazy frames, so all should raise a TypeError1416with pytest.raises(1417TypeError,1418match=f'"{comparators[0]!r}" comparison not supported for LazyFrame objects',1419):1420comparators[1](pl.LazyFrame(), pl.LazyFrame())142114221423def test_lf_properties() -> None:1424lf = pl.LazyFrame(1425{1426"foo": [1, 2, 3],1427"bar": [6.0, 7.0, 8.0],1428"ham": ["a", "b", "c"],1429}1430)1431with pytest.warns(PerformanceWarning):1432assert lf.schema == {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String}1433with pytest.warns(PerformanceWarning):1434assert lf.columns == ["foo", "bar", "ham"]1435with pytest.warns(PerformanceWarning):1436assert lf.dtypes == [pl.Int64, pl.Float64, pl.String]1437with pytest.warns(PerformanceWarning):1438assert lf.width == 3143914401441def test_lf_unnest() -> None:1442lf = pl.DataFrame(1443[1444pl.Series(1445"a",1446[{"ab": [1, 2, 3], "ac": [3, 4, 5]}],1447dtype=pl.Struct({"ab": pl.List(pl.Int64), "ac": pl.List(pl.Int64)}),1448),1449pl.Series(1450"b",1451[{"ba": [5, 6, 7], "bb": [7, 8, 9]}],1452dtype=pl.Struct({"ba": pl.List(pl.Int64), "bb": pl.List(pl.Int64)}),1453),1454]1455).lazy()14561457expected = pl.DataFrame(1458[1459pl.Series("ab", [[1, 2, 3]], dtype=pl.List(pl.Int64)),1460pl.Series("ac", [[3, 4, 5]], dtype=pl.List(pl.Int64)),1461pl.Series("ba", [[5, 6, 7]], dtype=pl.List(pl.Int64)),1462pl.Series("bb", [[7, 8, 9]], dtype=pl.List(pl.Int64)),1463]1464)1465assert_frame_equal(lf.unnest("a", "b").collect(), expected)146614671468def test_type_coercion_cast_boolean_after_comparison() -> None:1469import operator14701471lf = pl.LazyFrame({"a": 1, "b": 2})14721473for op in [1474operator.eq,1475operator.ne,1476operator.lt,1477operator.le,1478operator.gt,1479operator.ge,1480pl.Expr.eq_missing,1481pl.Expr.ne_missing,1482]:1483e = op(pl.col("a"), pl.col("b")).cast(pl.Boolean).alias("o")1484assert "cast" not in lf.with_columns(e).explain()14851486e = op(pl.col("a"), pl.col("b")).cast(pl.Boolean).cast(pl.Boolean).alias("o")1487assert "cast" 
not in lf.with_columns(e).explain()14881489for op in [operator.and_, operator.or_, operator.xor]:1490e = op(pl.col("a"), pl.col("b")).cast(pl.Boolean)1491assert "cast" in lf.with_columns(e).explain()149214931494def test_unique_length_multiple_columns() -> None:1495lf = pl.LazyFrame(1496{1497"a": [1, 1, 1, 2, 3],1498"b": [100, 100, 200, 100, 300],1499}1500)1501assert lf.unique().select(pl.len()).collect().item() == 4150215031504def test_asof_cross_join() -> None:1505left = pl.LazyFrame({"a": [-10, 5, 10], "left_val": ["a", "b", "c"]}).with_columns(1506pl.col("a").set_sorted()1507)1508right = pl.LazyFrame(1509{"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}1510).with_columns(pl.col("a").set_sorted())15111512out = left.join_asof(right, on="a").collect()1513assert out.shape == (3, 3)151415151516def test_join_bad_input_type() -> None:1517left = pl.LazyFrame({"a": [1, 2, 3]})1518right = pl.LazyFrame({"a": [1, 2, 3]})15191520with pytest.raises(1521TypeError,1522match="expected `other` .*to be a 'LazyFrame'.* not 'DataFrame'",1523):1524left.join(right.collect(), on="a") # type: ignore[arg-type]15251526with pytest.raises(1527TypeError,1528match="expected `other` .*to be a 'LazyFrame'.* not 'Series'",1529):1530left.join(pl.Series([1, 2, 3]), on="a") # type: ignore[arg-type]15311532class DummyLazyFrameSubclass(pl.LazyFrame):1533pass15341535a = DummyLazyFrameSubclass(left.collect())1536b = DummyLazyFrameSubclass(right.collect())15371538a.join(b, on="a").collect()153915401541def test_join_where() -> None:1542east = pl.LazyFrame(1543{1544"id": [100, 101, 102],1545"dur": [120, 140, 160],1546"rev": [12, 14, 16],1547"cores": [2, 8, 4],1548}1549)1550west = pl.LazyFrame(1551{1552"t_id": [404, 498, 676, 742],1553"time": [90, 130, 150, 170],1554"cost": [9, 13, 15, 16],1555"cores": [4, 2, 1, 4],1556}1557)1558out = east.join_where(1559west,1560pl.col("dur") < pl.col("time"),1561pl.col("rev") < pl.col("cost"),1562).collect()15631564expected = pl.DataFrame(1565{1566"id": [100, 100, 
100, 101, 101],1567"dur": [120, 120, 120, 140, 140],1568"rev": [12, 12, 12, 14, 14],1569"cores": [2, 2, 2, 8, 8],1570"t_id": [498, 676, 742, 676, 742],1571"time": [130, 150, 170, 150, 170],1572"cost": [13, 15, 16, 15, 16],1573"cores_right": [2, 1, 4, 1, 4],1574}1575)15761577assert_frame_equal(out, expected)157815791580def test_join_where_bad_input_type() -> None:1581east = pl.LazyFrame(1582{1583"id": [100, 101, 102],1584"dur": [120, 140, 160],1585"rev": [12, 14, 16],1586"cores": [2, 8, 4],1587}1588)1589west = pl.LazyFrame(1590{1591"t_id": [404, 498, 676, 742],1592"time": [90, 130, 150, 170],1593"cost": [9, 13, 15, 16],1594"cores": [4, 2, 1, 4],1595}1596)1597with pytest.raises(1598TypeError,1599match="expected `other` .*to be a 'LazyFrame'.* not 'DataFrame'",1600):1601east.join_where(1602west.collect(), # type: ignore[arg-type]1603pl.col("dur") < pl.col("time"),1604pl.col("rev") < pl.col("cost"),1605)16061607with pytest.raises(1608TypeError,1609match="expected `other` .*to be a 'LazyFrame'.* not 'Series'",1610):1611east.join_where(1612pl.Series(west.collect()), # type: ignore[arg-type]1613pl.col("dur") < pl.col("time"),1614pl.col("rev") < pl.col("cost"),1615)16161617class DummyLazyFrameSubclass(pl.LazyFrame):1618pass16191620a = DummyLazyFrameSubclass(east.collect())1621b = DummyLazyFrameSubclass(west.collect())16221623a.join_where(1624b,1625pl.col("dur") < pl.col("time"),1626pl.col("rev") < pl.col("cost"),1627).collect()162816291630def test_cache_hit_with_proj_and_pred_pushdown() -> None:1631rgx = re.compile(r"CACHE\[id: (.*)\]")16321633lf = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": ["x", "y", "z"]}).cache()16341635q = pl.concat([lf, lf]).select("a", "b")1636assert_frame_equal(1637q.collect(), pl.DataFrame({"a": [1, 2, 3] * 2, "b": [3, 4, 5] * 2})1638)1639e = rgx.findall(q.explain())16401641assert len(e) == 2 # there are only 2 caches1642assert e[0] == e[1] # all caches are the same16431644q = pl.concat([lf, lf]).filter(pl.col.a != 
0)1645assert_frame_equal(1646q.collect(),1647pl.DataFrame(1648{"a": [1, 2, 3] * 2, "b": [3, 4, 5] * 2, "c": ["x", "y", "z"] * 2}1649),1650)1651e = rgx.findall(q.explain())16521653assert len(e) == 2 # there are only 2 caches1654assert e[0] == e[1] # all caches are the same165516561657def test_cache_hit_child_removal() -> None:1658df = pl.DataFrame(1659{1660"a": [1, 2, 3],1661}1662)16631664q = df.lazy().sort("a").cache()16651666q1 = pl.concat([q.unique(), q.unique()])1667q2 = pl.concat([q.unique(), q.unique(keep="none")])16681669e1 = q1.explain()1670e2 = q2.explain()16711672assert "SORT" not in e11673assert "SORT" not in e216741675rgx = re.compile(r"CACHE\[id: (.*)\]")16761677e1m = rgx.findall(e1)1678e2m = rgx.findall(e2)16791680assert len(e1m) == 2 # there are only 2 caches1681assert len(e2m) == 2 # there are only 2 caches1682assert e1m[0] == e1m[1] # all caches are the same1683assert e2m[0] == e2m[1] # all caches are the same16841685df1 = q1.collect()1686df2 = q2.collect()16871688assert_frame_equal(df1.head(3), df, check_row_order=False)1689assert_frame_equal(df1.tail(3), df, check_row_order=False)1690assert_frame_equal(df2.head(3), df, check_row_order=False)1691assert_frame_equal(df2.tail(3), df, check_row_order=False)169216931694