# Path: blob/main/py-polars/tests/unit/functions/test_functions.py
# 8408 views
# Unit tests for polars top-level functions: concat, align_frames, coalesce,
# cov/corr, the aggregation helpers (first/last/count/...), escape_regex and
# row_index.
from __future__ import annotations

from typing import TYPE_CHECKING, Any

import numpy as np
import pytest

import polars as pl
from polars.exceptions import DuplicateError, InvalidOperationError
from polars.testing import assert_frame_equal, assert_series_equal
from tests.unit.conftest import NUMERIC_DTYPES, TEMPORAL_DTYPES

if TYPE_CHECKING:
    from polars._typing import ConcatMethod, CorrelationMethod, PolarsDataType


def test_concat_align() -> None:
    """Check `pl.concat` with the full, left, right and inner align strategies."""
    a = pl.DataFrame({"a": ["a", "b", "d", "e", "e"], "b": [1, 2, 4, 5, 6]})
    b = pl.DataFrame({"a": ["a", "b", "c"], "c": [5.5, 6.0, 7.5]})
    c = pl.DataFrame({"a": ["a", "b", "c", "d", "e"], "d": ["w", "x", "y", "z", None]})

    # "align" and "align_full" are expected to produce the same frame
    for align_full in ("align", "align_full"):
        result = pl.concat([a, b, c], how=align_full)
        expected = pl.DataFrame(
            {
                "a": ["a", "b", "c", "d", "e", "e"],
                "b": [1, 2, None, 4, 5, 6],
                "c": [5.5, 6.0, 7.5, None, None, None],
                "d": ["w", "x", "y", "z", None, None],
            }
        )
        assert_frame_equal(result, expected)

    result = pl.concat([a, b, c], how="align_left")
    expected = pl.DataFrame(
        {
            "a": ["a", "b", "d", "e", "e"],
            "b": [1, 2, 4, 5, 6],
            "c": [5.5, 6.0, None, None, None],
            "d": ["w", "x", "z", None, None],
        }
    )
    assert_frame_equal(result, expected)

    result = pl.concat([a, b, c], how="align_right")
    expected = pl.DataFrame(
        {
            "a": ["a", "b", "c", "d", "e"],
            "b": [1, 2, None, None, None],
            "c": [5.5, 6.0, 7.5, None, None],
            "d": ["w", "x", "y", "z", None],
        }
    )
    assert_frame_equal(result, expected)

    result = pl.concat([a, b, c], how="align_inner")
    expected = pl.DataFrame(
        {
            "a": ["a", "b"],
            "b": [1, 2],
            "c": [5.5, 6.0],
            "d": ["w", "x"],
        }
    )
    assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "strategy", ["align", "align_full", "align_left", "align_right"]
)
def test_concat_align_no_common_cols(strategy: ConcatMethod) -> None:
    """Align strategies must raise when the frames share no column."""
    df1 = pl.DataFrame({"a": [1, 2], "b": [1, 2]})
    df2 = pl.DataFrame({"c": [3, 4], "d": [3, 4]})

    with pytest.raises(
        InvalidOperationError,
        match=f"{strategy!r} strategy requires at least one common column",
    ):
        pl.concat((df1, df2), how=strategy)


@pytest.mark.parametrize(
    ("a", "b", "c", "strategy"),
    [
        (
            pl.DataFrame({"a": [1, 2]}),
            pl.DataFrame({"b": ["a", "b"], "c": [3, 4]}),
            pl.DataFrame({"a": [5, 6], "c": [5, 6], "d": [5, 6], "b": ["x", "y"]}),
            "diagonal",
        ),
        (
            pl.DataFrame(
                {"a": [1, 2]},
                schema_overrides={"a": pl.Int32},
            ),
            pl.DataFrame(
                {"b": ["a", "b"], "c": [3, 4]},
                schema_overrides={"c": pl.UInt8},
            ),
            pl.DataFrame(
                {"a": [5, 6], "c": [5, 6], "d": [5, 6], "b": ["x", "y"]},
                schema_overrides={"b": pl.Categorical},
            ),
            "diagonal_relaxed",
        ),
    ],
)
def test_concat_diagonal(
    a: pl.DataFrame, b: pl.DataFrame, c: pl.DataFrame, strategy: ConcatMethod
) -> None:
    """Diagonal concat fills missing columns with nulls (eager and lazy inputs)."""
    # the expected frame is the same for the eager and the lazy result
    expected = pl.DataFrame(
        {
            "a": [1, 2, None, None, 5, 6],
            "b": [None, None, "a", "b", "x", "y"],
            "c": [None, None, 3, 4, 5, 6],
            "d": [None, None, None, None, 5, 6],
        }
    )
    for out in [
        pl.concat([a, b, c], how=strategy),
        pl.concat([a.lazy(), b.lazy(), c.lazy()], how=strategy).collect(),
    ]:
        assert_frame_equal(out, expected)


def test_concat_diagonal_relaxed_with_empty_frame() -> None:
    """`diagonal_relaxed` concat with an empty frame yields the other frame."""
    df1 = pl.DataFrame()
    df2 = pl.DataFrame(
        {
            "a": ["a", "b"],
            "b": [1, 2],
        }
    )
    out = pl.concat((df1, df2), how="diagonal_relaxed")
    expected = df2
    assert_frame_equal(out, expected)


@pytest.mark.parametrize("lazy", [False, True])
def test_concat_horizontal(lazy: bool) -> None:
    """Horizontal concat pads the shorter frame's columns with nulls."""
    a = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})
    b = pl.DataFrame({"c": [5, 7, 8, 9], "d": [1, 2, 1, 2], "e": [1, 2, 1, 2]})

    if lazy:
        out = pl.concat([a.lazy(), b.lazy()], how="horizontal").collect()
    else:
        out = pl.concat([a, b], how="horizontal")

    expected = pl.DataFrame(
        {
            "a": ["a", "b", None, None],
            "b": [1, 2, None, None],
            "c": [5, 7, 8, 9],
            "d": [1, 2, 1, 2],
            "e": [1, 2, 1, 2],
        }
    )
    assert_frame_equal(out, expected)


@pytest.mark.parametrize("lazy", [False, True])
def test_concat_horizontal_three_dfs(lazy: bool) -> None:
    """Horizontal concat of three frames of different heights pads with nulls."""
    a = pl.DataFrame({"a1": [1, 2, 3], "a2": ["a", "b", "c"]})
    b = pl.DataFrame({"b1": [0.25, 0.5]})
    c = pl.DataFrame({"c1": [1, 2, 3, 4], "c2": [5, 6, 7, 8], "c3": [9, 10, 11, 12]})

    if lazy:
        out = pl.concat([a.lazy(), b.lazy(), c.lazy()], how="horizontal").collect()
    else:
        out = pl.concat([a, b, c], how="horizontal")

    expected = pl.DataFrame(
        {
            "a1": [1, 2, 3, None],
            "a2": ["a", "b", "c", None],
            "b1": [0.25, 0.5, None, None],
            "c1": [1, 2, 3, 4],
            "c2": [5, 6, 7, 8],
            "c3": [9, 10, 11, 12],
        }
    )
    assert_frame_equal(out, expected)


@pytest.mark.parametrize("lazy", [False, True])
def test_concat_horizontal_single_df(lazy: bool) -> None:
    """Horizontal concat of a single frame returns an equal frame."""
    a = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})

    if lazy:
        out = pl.concat([a.lazy()], how="horizontal").collect()
    else:
        out = pl.concat([a], how="horizontal")

    expected = a
    assert_frame_equal(out, expected)


def test_concat_horizontal_duplicate_col() -> None:
    """Horizontal concat raises `DuplicateError` when a column name repeats."""
    a = pl.LazyFrame({"a": ["a", "b"], "b": [1, 2]})
    b = pl.LazyFrame({"c": [5, 7, 8, 9], "d": [1, 2, 1, 2], "a": [1, 2, 1, 2]})

    with pytest.raises(DuplicateError):
        pl.concat([a, b], how="horizontal").collect()


def test_concat_vertical() -> None:
    """Vertical concat stacks the rows of both frames in order."""
    a = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})
    b = pl.DataFrame({"a": ["c", "d", "e"], "b": [3, 4, 5]})

    result = pl.concat([a, b], how="vertical")
    expected = pl.DataFrame(
        {
            "a": ["a", "b", "c", "d", "e"],
            "b": [1, 2, 3, 4, 5],
        }
    )
    assert_frame_equal(result, expected)


def test_cov() -> None:
    """`pl.cov` agrees between the expression and eager paths (default and ddof=2)."""
    s1 = pl.Series("a", [10, 37, -40])
    s2 = pl.Series("b", [70, -10, 35])

    # lazy/expression
    lf = pl.LazyFrame([s1, s2])
    res1 = lf.select(
        x=pl.cov("a", "b"),
        y=pl.cov("a", "b", ddof=2),
    ).collect()

    # eager/series
    res2 = (
        pl.cov(s1, s2, eager=True).alias("x"),
        pl.cov(s1, s2, eager=True, ddof=2).alias("y"),
    )

    # expect same result from both approaches
    for idx, (r1, r2) in enumerate(zip(res1, res2, strict=True)):
        expected_value = -645.8333333333 if idx == 0 else -1291.6666666666
        assert pytest.approx(expected_value) == r1.item()
        assert_series_equal(r1, r2)


def test_corr() -> None:
    """`pl.corr` agrees between the expression and eager paths (pearson/spearman)."""
    s1 = pl.Series("a", [10, 37, -40])
    s2 = pl.Series("b", [70, -10, 35])

    # lazy/expression
    lf = pl.LazyFrame([s1, s2])
    res1 = lf.select(
        x=pl.corr("a", "b"),
        y=pl.corr("a", "b", method="spearman"),
    ).collect()

    # eager/series
    res2 = (
        pl.corr(s1, s2, eager=True).alias("x"),
        pl.corr(s1, s2, method="spearman", eager=True).alias("y"),
    )

    # expect same result from both approaches
    for idx, (r1, r2) in enumerate(zip(res1, res2, strict=True)):
        assert pytest.approx(-0.412199756 if idx == 0 else -0.5) == r1.item()
        assert_series_equal(r1, r2)


def test_extend_ints() -> None:
    """`DataFrame.extend` raises `SchemaError` for Int64 extended with Int32."""
    a = pl.DataFrame({"a": [1 for _ in range(1)]}, schema={"a": pl.Int64})
    with pytest.raises(pl.exceptions.SchemaError):
        a.extend(a.select(pl.lit(0, dtype=pl.Int32).alias("a")))


def test_null_handling_correlation() -> None:
    """Check `pl.corr` results on inputs containing nulls and NaNs."""
    df = pl.DataFrame({"a": [1, 2, 3, None, 4], "b": [1, 2, 3, 10, 4]})

    out = df.select(
        pl.corr("a", "b").alias("pearson"),
        pl.corr("a", "b", method="spearman").alias("spearman"),
    )
    assert out["pearson"][0] == pytest.approx(1.0)
    assert out["spearman"][0] == pytest.approx(1.0)

    # see #4930
    df1 = pl.DataFrame({"a": [None, 1, 2], "b": [None, 2, 1]})
    df2 = pl.DataFrame({"a": [np.nan, 1, 2], "b": [np.nan, 2, 1]})

    assert np.isclose(df1.select(pl.corr("a", "b", method="spearman")).item(), -1.0)
    assert (
        str(
            df2.select(pl.corr("a", "b", method="spearman", propagate_nans=True)).item()
        )
        == "nan"
    )


# see #25407
def test_spearman_propagate_nans_with_all_nulls_does_not_panic() -> None:
    """Spearman corr with `propagate_nans=True` on all-null columns gives NaN."""
    df = pl.select(x=None, y=None).cast(pl.Float64)

    out = df.select(pl.corr("x", "y", method="spearman", propagate_nans=True))

    assert str(out.item()) == "nan"


def test_align_frames() -> None:
    """Compare an `align_frames`-based dot product with the pandas equivalent."""
    # pandas is imported locally so the module does not need it at import time
    import pandas as pd

    # setup some test frames
    pdf1 = pd.DataFrame(
        {
            "date": pd.date_range(start="2019-01-02", periods=9),
            "a": np.array([0, 1, 2, np.nan, 4, 5, 6, 7, 8], dtype=np.float64),
            "b": np.arange(9, 18, dtype=np.float64),
        }
    ).set_index("date")

    pdf2 = pd.DataFrame(
        {
            "date": pd.date_range(start="2019-01-04", periods=7),
            "a": np.arange(9, 16, dtype=np.float64),
            "b": np.arange(10, 17, dtype=np.float64),
        }
    ).set_index("date")

    # calculate dot-product in pandas
    pd_dot = (pdf1 * pdf2).sum(axis="columns").to_frame("dot").reset_index()

    # use "align_frames" to calculate dot-product from disjoint rows. pandas uses an
    # index to automatically infer the correct frame-alignment for the calculation;
    # we need to do it explicitly (which also makes it clearer what is happening)
    pf1, pf2 = pl.align_frames(
        pl.from_pandas(pdf1.reset_index()),
        pl.from_pandas(pdf2.reset_index()),
        on="date",
    )
    pl_dot = (
        (pf1[["a", "b"]] * pf2[["a", "b"]])
        .fill_null(0)
        .select(pl.sum_horizontal("*").alias("dot"))
        .insert_column(0, pf1["date"])
    )
    # confirm we match the same operation in pandas
    assert_frame_equal(pl_dot, pl.from_pandas(pd_dot))
    pd.testing.assert_frame_equal(pd_dot, pl_dot.to_pandas())

    # confirm alignment function works with lazy frames
    lf1, lf2 = pl.align_frames(
        pl.from_pandas(pdf1.reset_index()).lazy(),
        pl.from_pandas(pdf2.reset_index()).lazy(),
        on="date",
    )
    assert isinstance(lf1, pl.LazyFrame)
    assert_frame_equal(lf1.collect(), pf1)
    assert_frame_equal(lf2.collect(), pf2)

    # misc: no frames results in an empty list
    assert pl.align_frames(on="date") == []

    # expected error condition
    with pytest.raises(TypeError):
        pl.align_frames(  # type: ignore[type-var]
            pl.from_pandas(pdf1.reset_index()).lazy(),
            pl.from_pandas(pdf2.reset_index()),
            on="date",
        )


def test_align_frames_misc() -> None:
    """`align_frames` accepts list/generator input and supports `descending`."""
    df1 = pl.DataFrame([[3, 5, 6], [5, 8, 9]], orient="row")
    df2 = pl.DataFrame([[2, 5, 6], [3, 8, 9], [4, 2, 0]], orient="row")

    # descending result
    pf1, pf2 = pl.align_frames(
        [df1, df2],  # list input
        on="column_0",
        descending=True,
    )
    assert pf1.rows() == [(5, 8, 9), (4, None, None), (3, 5, 6), (2, None, None)]
    assert pf2.rows() == [(5, None, None), (4, 2, 0), (3, 8, 9), (2, 5, 6)]

    # handle identical frames
    pf1, pf2, pf3 = pl.align_frames(
        (df for df in (df1, df2, df2)),  # generator input
        on="column_0",
        descending=True,
    )
    assert pf1.rows() == [(5, 8, 9), (4, None, None), (3, 5, 6), (2, None, None)]
    for pf in (pf2, pf3):
        assert pf.rows() == [(5, None, None), (4, 2, 0), (3, 8, 9), (2, 5, 6)]


def test_align_frames_with_nulls() -> None:
    """`align_frames` aligns null key values, placing them first."""
    df1 = pl.DataFrame({"key": ["x", "y", None], "value": [1, 2, 0]})
    df2 = pl.DataFrame({"key": ["x", None, "z", "y"], "value": [4, 3, 6, 5]})

    a1, a2 = pl.align_frames(df1, df2, on="key")

    aligned_frame_data = a1.to_dict(as_series=False), a2.to_dict(as_series=False)
    assert aligned_frame_data == (
        {"key": [None, "x", "y", "z"], "value": [0, 1, 2, None]},
        {"key": [None, "x", "y", "z"], "value": [3, 4, 5, 6]},
    )


def test_align_frames_duplicate_key() -> None:
    """`align_frames` with duplicate keys, default and `how="left"` strategies."""
    # setup some test frames with duplicate key/alignment values
    df1 = pl.DataFrame({"x": ["a", "a", "a", "e"], "y": [1, 2, 4, 5]})
    df2 = pl.DataFrame({"y": [0, 0, -1], "z": [5.5, 6.0, 7.5], "x": ["a", "b", "b"]})

    # align rows, confirming correctness and original column order
    af1, af2 = pl.align_frames(df1, df2, on="x")

    # shape: (6, 2)   shape: (6, 3)
    # ┌─────┬──────┐  ┌──────┬──────┬─────┐
    # │ x   ┆ y    │  │ y    ┆ z    ┆ x   │
    # │ --- ┆ ---  │  │ ---  ┆ ---  ┆ --- │
    # │ str ┆ i64  │  │ i64  ┆ f64  ┆ str │
    # ╞═════╪══════╡  ╞══════╪══════╪═════╡
    # │ a   ┆ 1    │  │ 0    ┆ 5.5  ┆ a   │
    # │ a   ┆ 2    │  │ 0    ┆ 5.5  ┆ a   │
    # │ a   ┆ 4    │  │ 0    ┆ 5.5  ┆ a   │
    # │ b   ┆ null │  │ 0    ┆ 6.0  ┆ b   │
    # │ b   ┆ null │  │ -1   ┆ 7.5  ┆ b   │
    # │ e   ┆ 5    │  │ null ┆ null ┆ e   │
    # └─────┴──────┘  └──────┴──────┴─────┘
    assert af1.rows() == [
        ("a", 1),
        ("a", 2),
        ("a", 4),
        ("b", None),
        ("b", None),
        ("e", 5),
    ]
    assert af2.rows() == [
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 6.0, "b"),
        (-1, 7.5, "b"),
        (None, None, "e"),
    ]

    # align frames the other way round, using "left" alignment strategy
    af1, af2 = pl.align_frames(df2, df1, on="x", how="left")

    # shape: (5, 3)        shape: (5, 2)
    # ┌─────┬─────┬─────┐  ┌─────┬──────┐
    # │ y   ┆ z   ┆ x   │  │ x   ┆ y    │
    # │ --- ┆ --- ┆ --- │  │ --- ┆ ---  │
    # │ i64 ┆ f64 ┆ str │  │ str ┆ i64  │
    # ╞═════╪═════╪═════╡  ╞═════╪══════╡
    # │ 0   ┆ 5.5 ┆ a   │  │ a   ┆ 1    │
    # │ 0   ┆ 5.5 ┆ a   │  │ a   ┆ 2    │
    # │ 0   ┆ 5.5 ┆ a   │  │ a   ┆ 4    │
    # │ 0   ┆ 6.0 ┆ b   │  │ b   ┆ null │
    # │ -1  ┆ 7.5 ┆ b   │  │ b   ┆ null │
    # └─────┴─────┴─────┘  └─────┴──────┘
    assert af1.rows() == [
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 5.5, "a"),
        (0, 6.0, "b"),
        (-1, 7.5, "b"),
    ]
    assert af2.rows() == [
        ("a", 1),
        ("a", 2),
        ("a", 4),
        ("b", None),
        ("b", None),
    ]


def test_align_frames_single_row_20445() -> None:
    """Left-aligning two single-row frames returns both frames unchanged."""
    left = pl.DataFrame({"a": [1], "b": [2]})
    right = pl.DataFrame({"a": [1], "c": [3]})
    result = pl.align_frames(left, right, how="left", on="a")
    assert_frame_equal(result[0], left)
    assert_frame_equal(result[1], right)


def test_coalesce() -> None:
    """`pl.coalesce` picks the first non-null value (list and positional inputs)."""
    df = pl.DataFrame(
        {
            "a": [1, None, None, None],
            "b": [1, 2, None, None],
            "c": [5, None, 3, None],
        }
    )
    # list inputs
    expected = pl.Series("d", [1, 2, 3, 10]).to_frame()
    result = df.select(pl.coalesce(["a", "b", "c", 10]).alias("d"))
    assert_frame_equal(expected, result)

    # positional inputs
    expected = pl.Series("d", [1.0, 2.0, 3.0, 10.0]).to_frame()
    result = df.select(pl.coalesce(pl.col(["a", "b", "c"]), 10.0).alias("d"))
    assert_frame_equal(result, expected)


def test_coalesce_eager() -> None:
    """`pl.coalesce(..., eager=True)` on Series, with literals and error case."""
    # eager/series inputs
    s1 = pl.Series("colx", [None, 2, None])
    s2 = pl.Series("coly", [1, None, None])
    s3 = pl.Series("colz", [None, None, 3])

    res = pl.coalesce(s1, s2, s3, eager=True)
    expected = pl.Series("colx", [1, 2, 3])
    assert_series_equal(expected, res)

    for zero in (0, pl.lit(0)):
        res = pl.coalesce(s1, zero, eager=True)
        expected = pl.Series("colx", [0, 2, 0])
        assert_series_equal(expected, res)

        res = pl.coalesce(zero, s1, eager=True)
        expected = pl.Series("literal", [0, 0, 0])
        assert_series_equal(expected, res)

    with pytest.raises(
        ValueError,
        match="expected at least one Series in 'coalesce' if 'eager=True'",
    ):
        pl.coalesce("x", "y", eager=True)


def test_overflow_diff() -> None:
    """`diff` on a UInt64 column yields the signed differences."""
    df = pl.DataFrame({"a": [20, 10, 30]})
    assert df.select(pl.col("a").cast(pl.UInt64).diff()).to_dict(as_series=False) == {
        "a": [None, -10, 20]
    }


@pytest.mark.may_fail_cloud  # reason: unknown type
def test_fill_null_unknown_output_type() -> None:
    """`fill_null` works on the result of a numpy ufunc applied to an expression."""
    df = pl.DataFrame({"a": [None, 2, 3, 4, 5]})
    assert df.with_columns(
        np.exp(pl.col("a")).fill_null(pl.lit(1, pl.Float64))
    ).to_dict(as_series=False) == {
        "a": [
            1.0,
            7.38905609893065,
            20.085536923187668,
            54.598150033144236,
            148.4131591025766,
        ]
    }


@pytest.mark.parametrize(("dtype"), [*NUMERIC_DTYPES, *TEMPORAL_DTYPES])
def test_approx_n_unique(dtype: pl.DataType) -> None:
    """`approx_n_unique` of 100 distinct values falls within 92..108 per dtype."""
    df = pl.DataFrame({"a": pl.arange(100, eager=True).cast(dtype)})
    cardinality = df.select(pl.col("a").approx_n_unique()).to_series()[0]
    assert 92 <= cardinality <= 108


def test_approx_n_unique_null() -> None:
    """`approx_n_unique` of an all-null column is 1."""
    df = pl.DataFrame({"a": 100 * [None]})
    cardinality = df.select(pl.col("a").approx_n_unique()).to_series()[0]
    assert cardinality == 1


def test_lazy_functions() -> None:
    """Check the top-level aggregation functions against a frame and its Series."""
    df = pl.DataFrame(
        {
            "a": ["foo", "bar", "foo"],
            "b": [1, 2, 3],
            "c": [-1.0, 2.0, 4.0],
        }
    )

    # test function expressions against frame
    out = df.select(
        pl.var("b").name.suffix("_var"),
        pl.std("b").name.suffix("_std"),
        pl.max("a", "b").name.suffix("_max"),
        pl.min("a", "b").name.suffix("_min"),
        pl.sum("b", "c").name.suffix("_sum"),
        pl.mean("b", "c").name.suffix("_mean"),
        pl.median("c", "b").name.suffix("_median"),
        pl.n_unique("b", "a").name.suffix("_n_unique"),
        pl.first("a").name.suffix("_first"),
        pl.first("b", "c").name.suffix("_first"),
        pl.last("c", "b", "a").name.suffix("_last"),
    )
    expected: dict[str, list[Any]] = {
        "b_var": [1.0],
        "b_std": [1.0],
        "a_max": ["foo"],
        "b_max": [3],
        "a_min": ["bar"],
        "b_min": [1],
        "b_sum": [6],
        "c_sum": [5.0],
        "b_mean": [2.0],
        "c_mean": [5 / 3],
        "c_median": [2.0],
        "b_median": [2.0],
        "b_n_unique": [3],
        "a_n_unique": [2],
        "a_first": ["foo"],
        "b_first": [1],
        "c_first": [-1.0],
        "c_last": [4.0],
        "b_last": [3],
        "a_last": ["foo"],
    }
    assert_frame_equal(
        out,
        pl.DataFrame(
            data=expected,
            schema_overrides={
                "a_n_unique": pl.get_index_type(),
                "b_n_unique": pl.get_index_type(),
            },
        ),
    )

    # test function expressions against series
    for name, value in expected.items():
        # keys are "<column>_<function>"; split on the first underscore only
        col, fn = name.split("_", 1)
        if series_fn := getattr(df[col], fn, None):
            assert series_fn() == value[0]

    # regex selection
    out = df.select(
        pl.struct(pl.max("^a|b$")).alias("x"),
        pl.struct(pl.min("^.*[bc]$")).alias("y"),
        pl.struct(pl.sum("^[^a]$")).alias("z"),
    )
    assert out.rows() == [
        ({"a": "foo", "b": 3}, {"b": 1, "c": -1.0}, {"b": 6, "c": 5.0})
    ]


def test_count() -> None:
    """`pl.count` counts non-null values for one or several columns."""
    df = pl.DataFrame({"a": [1, 1, 1], "b": [None, "xx", "yy"]})
    out = df.select(pl.count("a"))
    assert list(out["a"]) == [3]

    for count_expr in (
        pl.count("b", "a"),
        [pl.count("b"), pl.count("a")],
    ):
        out = df.select(count_expr)
        assert out.rows() == [(2, 3)]


def test_head_tail(fruits_cars: pl.DataFrame) -> None:
    """`pl.head` / `pl.tail` select the first / last two values of column "A"."""
    res_expr = fruits_cars.select(pl.head("A", 2))
    expected = pl.Series("A", [1, 2])
    assert_series_equal(res_expr.to_series(), expected)

    res_expr = fruits_cars.select(pl.tail("A", 2))
    expected = pl.Series("A", [4, 5])
    assert_series_equal(res_expr.to_series(), expected)


@pytest.mark.parametrize(
    "dtype", [pl.Int32, pl.Boolean, pl.String, pl.Categorical, pl.List]
)
def test_first_last(dtype: PolarsDataType) -> None:
    """Check `first`/`last` with and without `ignore_nulls` for several dtypes.

    Covers a multi-chunk column with leading/trailing nulls, an empty column
    and a column without nulls.
    """
    # Ensure multiple chunks.
    s1 = pl.Series("a", [None, None], dtype=pl.Int32)
    s2 = pl.Series("a", [None, 3, 4, None], dtype=pl.Int32)
    s3 = pl.Series("a", [None, None], dtype=pl.Int32)
    s = s1.append(s2).append(s3)
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        s = s.cast(pl.String)
    s = s.cast(dtype)
    lf = s.to_frame().lazy()

    result = lf.select(pl.col("a").first()).collect()
    expected_value = pl.Series("a", [None])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
    expected = expected_value.cast(dtype).to_frame()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").first(ignore_nulls=True)).collect()
    expected_value = pl.Series("a", [3])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)

    expected = expected_value.cast(dtype).to_frame()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last()).collect()
    expected_value = pl.Series("a", [None])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
    expected = expected_value.cast(dtype).to_frame()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last(ignore_nulls=True)).collect()
    expected_value = pl.Series("a", [4])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
    expected = expected_value.cast(dtype).to_frame()
    assert_frame_equal(result, expected)

    # Test with empty
    lf = pl.Series("a", [], dtype=dtype).to_frame().lazy()
    expected = pl.Series("a", [None], dtype=dtype).to_frame()

    result = lf.select(pl.col("a").first()).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").first(ignore_nulls=True)).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last()).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last(ignore_nulls=True)).collect()
    assert_frame_equal(result, expected)

    # Test with no nulls
    lf = pl.Series("a", [1, 2, 3, 4, 5], dtype=pl.Int32).to_frame().lazy()
    expected_value = pl.Series("a", [1])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
        lf = lf.with_columns(pl.col("a").cast(pl.String))

    lf = lf.with_columns(pl.col("a").cast(dtype))
    expected = expected_value.cast(dtype).to_frame()

    result = lf.select(pl.col("a").first()).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").first(ignore_nulls=True)).collect()
    assert_frame_equal(result, expected)

    expected_value = pl.Series("a", [5])
    if dtype == pl.Categorical:
        # For categorical, we must go through String
        expected_value = expected_value.cast(pl.String)
    expected = expected_value.cast(dtype).to_frame()

    result = lf.select(pl.col("a").last()).collect()
    assert_frame_equal(result, expected)

    result = lf.select(pl.col("a").last(ignore_nulls=True)).collect()
    assert_frame_equal(result, expected)


def test_escape_regex() -> None:
    """`pl.escape_regex` escapes a `str` and raises `TypeError` for other inputs."""
    result = pl.escape_regex("abc(\\w+)")
    expected = "abc\\(\\\\w\\+\\)"
    assert result == expected

    df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    with pytest.raises(
        TypeError,
        match=r"escape_regex function is unsupported for `Expr`, you may want use `Expr\.str\.escape_regex` instead",
    ):
        df.with_columns(escaped=pl.escape_regex(pl.col("text")))  # type: ignore[arg-type]

    with pytest.raises(
        TypeError,
        match="escape_regex function supports only `str` type, got `int`",
    ):
        pl.escape_regex(3)  # type: ignore[arg-type]


@pytest.mark.parametrize("func", ["var", "std"])
def test_var_std_lit_23156(func: str) -> None:
    """Frame-level var/std of a literal-zero column: null for `n <= 1`, else 0.0."""
    for n in range(100):
        frame = pl.DataFrame({"x": list(range(n))}).select(pl.col("x"), pl.lit(0))
        out = getattr(frame, func)()
        if n <= 1:
            assert_series_equal(
                out["literal"], pl.Series("literal", [None], dtype=pl.Float64)
            )
        else:
            assert_series_equal(
                out["literal"], pl.Series("literal", [0.0], dtype=pl.Float64)
            )


def test_row_index_expr() -> None:
    """`pl.row_index` in `with_columns`, `group_by().agg` and `select` contexts."""
    lf = pl.LazyFrame({"x": ["A", "A", "B", "B", "B"]})

    assert_frame_equal(
        lf.with_columns(pl.row_index(), pl.row_index("another_index")).collect(),
        pl.DataFrame(
            {
                "x": ["A", "A", "B", "B", "B"],
                "index": [0, 1, 2, 3, 4],
                "another_index": [0, 1, 2, 3, 4],
            },
            schema={
                "x": pl.String,
                "index": pl.get_index_type(),
                "another_index": pl.get_index_type(),
            },
        ),
    )

    assert_frame_equal(
        (
            lf.group_by("x")
            .agg(pl.row_index(), pl.row_index("another_index"))
            .sort("x")
            .collect()
        ),
        pl.DataFrame(
            {
                "x": ["A", "B"],
                "index": [[0, 1], [0, 1, 2]],
                "another_index": [[0, 1], [0, 1, 2]],
            },
            schema={
                "x": pl.String,
                "index": pl.List(pl.get_index_type()),
                "another_index": pl.List(pl.get_index_type()),
            },
        ),
    )

    assert_frame_equal(
        lf.select(pl.row_index()).collect(),
        pl.DataFrame(
            {"index": [0, 1, 2, 3, 4]},
            schema={"index": pl.get_index_type()},
        ),
    )


@pytest.mark.parametrize("dt", [pl.Float16, pl.Float32, pl.Float64])
@pytest.mark.parametrize("method", ["pearson", "spearman"])
def test_corr_spearman_float_dtype_26335(
    dt: pl.DataType, method: CorrelationMethod
) -> None:
    """`pl.corr` keeps the input float dtype in select and group_by contexts."""
    df = pl.DataFrame(
        {
            "a": [1, 8, 3],
            "b": [4, 5, 2],
            "c": ["foo", "foo", "foo"],
        },
        schema_overrides={"a": dt, "b": dt},
    )

    q = df.lazy().select(pl.corr("a", "b", method=method))
    out = q.collect()
    assert out.schema["a"] == dt

    q = df.lazy().group_by("c").agg(pl.corr("a", "b", method=method))
    out = q.collect()
    assert out.schema["a"] == dt