# Path: blob/main/py-polars/tests/unit/meta/test_errors.py
# 8410 views
from __future__ import annotations12import io3from datetime import date, datetime, time, tzinfo4from decimal import Decimal5from typing import TYPE_CHECKING, Any67import numpy as np8import pandas as pd9import pytest1011import polars as pl12from polars.datatypes.convert import dtype_to_py_type13from polars.exceptions import (14ColumnNotFoundError,15ComputeError,16InvalidOperationError,17OutOfBoundsError,18SchemaError,19SchemaFieldNotFoundError,20ShapeError,21StructFieldNotFoundError,22)23from polars.testing import assert_frame_equal24from tests.unit.conftest import TEMPORAL_DTYPES2526if TYPE_CHECKING:27from polars._typing import ConcatMethod282930def test_error_on_empty_group_by() -> None:31with pytest.raises(32ComputeError, match="at least one key is required in a group_by operation"33):34pl.DataFrame({"x": [0, 0, 1, 1]}).group_by([]).agg(pl.len())353637def test_error_on_reducing_map() -> None:38df = pl.DataFrame(39{"id": [0, 0, 0, 1, 1, 1], "t": [2, 4, 5, 10, 11, 14], "y": [0, 1, 1, 2, 3, 4]}40)41with pytest.raises(42TypeError,43match=r"`map` with `returns_scalar=False`",44):45df.group_by("id").agg(46pl.map_batches(["t", "y"], np.mean, return_dtype=pl.Float64)47)4849df = pl.DataFrame({"x": [1, 2, 3, 4], "group": [1, 2, 1, 2]})50with pytest.raises(TypeError, match=r"`map` with `returns_scalar=False`"):51df.select(52pl.col("x")53.map_batches(54lambda x: x.cut(breaks=[1, 2, 3], include_breaks=True).struct.unnest(),55is_elementwise=True,56return_dtype=pl.Struct(57{"breakpoint": pl.Int64, "cat": pl.Categorical()}58),59)60.over("group")61)6263assert_frame_equal(64df.select(65pl.col("x")66.map_batches(67lambda x: x.cut(breaks=[1, 2, 3], include_breaks=True),68is_elementwise=True,69)70.struct.unnest()71.over("group")72),73pl.DataFrame(74{75"breakpoint": [1.0, 2.0, 3.0, float("inf")],76"category": ["(-inf, 1]", "(1, 2]", "(2, 3]", "(3, inf]"],77},78schema_overrides={"category": pl.Categorical()},79),80)818283def test_error_on_invalid_by_in_asof_join() -> None:84df1 = 
pl.DataFrame(85{86"a": ["a", "b", "a"],87"b": [1, 2, 3],88"c": ["a", "b", "a"],89}90).set_sorted("b")9192df2 = df1.with_columns(pl.col("a").cast(pl.Categorical))93with pytest.raises(ComputeError):94df1.join_asof(df2, on="b", by=["a", "c"])959697@pytest.mark.parametrize("dtype", TEMPORAL_DTYPES)98def test_error_on_invalid_series_init(dtype: pl.DataType) -> None:99py_type = dtype_to_py_type(dtype)100with pytest.raises(101TypeError,102match=f"'float' object cannot be interpreted as a {py_type.__name__!r}",103):104pl.Series([1.5, 2.0, 3.75], dtype=dtype)105106107def test_error_on_invalid_series_init2() -> None:108with pytest.raises(TypeError, match="unexpected value"):109pl.Series([1.5, 2.0, 3.75], dtype=pl.Int32)110111112def test_error_on_invalid_struct_field() -> None:113with pytest.raises(StructFieldNotFoundError):114pl.struct(115[pl.Series("a", [1, 2]), pl.Series("b", ["a", "b"])], eager=True116).struct.field("z")117118119def test_not_found_error() -> None:120csv = "a,b,c\n2,1,1"121df = pl.read_csv(io.StringIO(csv))122with pytest.raises(ColumnNotFoundError):123df.select("d")124125126def test_string_numeric_comp_err() -> None:127with pytest.raises(ComputeError, match="cannot compare string with numeric type"):128pl.DataFrame({"a": [1.1, 21, 31, 21, 51, 61, 71, 81]}).select(pl.col("a") < "9")129130131def test_panic_error() -> None:132with pytest.raises(133InvalidOperationError,134match="unit: 'k' not supported",135):136pl.datetime_range(137start=datetime(2021, 12, 16),138end=datetime(2021, 12, 16, 3),139interval="99k",140eager=True,141)142143144def test_join_lazy_on_df() -> None:145df_left = pl.DataFrame(146{147"Id": [1, 2, 3, 4],148"Names": ["A", "B", "C", "D"],149}150)151df_right = pl.DataFrame({"Id": [1, 3], "Tags": ["xxx", "yyy"]})152153with pytest.raises(154TypeError,155match=r"expected `other` .*to be a 'LazyFrame'.* not 'DataFrame'",156):157df_left.lazy().join(df_right, on="Id") # type: ignore[arg-type]158159with pytest.raises(160TypeError,161match=r"expected 
`other` .*to be a 'LazyFrame'.* not 'DataFrame'",162):163df_left.lazy().join_asof(df_right, on="Id") # type: ignore[arg-type]164165with pytest.raises(166TypeError,167match=r"expected `other` .*to be a 'LazyFrame'.* not 'pandas.*DataFrame'",168):169df_left.lazy().join_asof(df_right.to_pandas(), on="Id") # type: ignore[arg-type]170171172def test_projection_update_schema_missing_column() -> None:173with pytest.raises(174ColumnNotFoundError,175match='unable to find column "colC"',176):177(178pl.DataFrame({"colA": ["a", "b", "c"], "colB": [1, 2, 3]})179.lazy()180.filter(~pl.col("colC").is_null())181.group_by(["colA"])182.agg([pl.col("colB").sum().alias("result")])183.collect()184)185186187def test_not_found_on_rename() -> None:188df = pl.DataFrame({"exists": [1, 2, 3]})189190err_type = (SchemaFieldNotFoundError, ColumnNotFoundError)191with pytest.raises(err_type):192df.rename({"does_not_exist": "exists"})193194with pytest.raises(err_type):195df.select(pl.col("does_not_exist").alias("new_name"))196197198def test_getitem_errs() -> None:199df = pl.DataFrame({"a": [1, 2, 3]})200201with pytest.raises(202TypeError,203match=r"cannot select columns using key of type 'set': {'some'}",204):205df[{"some"}] # type: ignore[call-overload]206207with pytest.raises(208TypeError,209match=r"cannot select elements using key of type 'set': {'strange'}",210):211df["a"][{"strange"}] # type: ignore[call-overload]212213with pytest.raises(214TypeError,215match=r"cannot use `__setitem__` on DataFrame with key {'some'} of type 'set' and value 'foo' of type 'str'",216):217df[{"some"}] = "foo" # type: ignore[index]218219220def test_err_bubbling_up_to_lit() -> None:221df = pl.DataFrame({"date": [date(2020, 1, 1)], "value": [42]})222223with pytest.raises(TypeError):224df.filter(pl.col("date") == pl.Date("2020-01-01")) # type: ignore[call-arg,operator]225226227def test_filter_not_of_type_bool() -> None:228df = pl.DataFrame({"json_val": ['{"a":"hello"}', None, '{"a":"world"}']})229with 
pytest.raises(230InvalidOperationError, match="filter predicate must be of type `Boolean`, got"231):232df.filter(pl.col("json_val").str.json_path_match("$.a"))233234235def test_is_nan_on_non_boolean() -> None:236with pytest.raises(InvalidOperationError):237pl.Series(["1", "2", "3"]).fill_nan("2") # type: ignore[arg-type]238239240@pytest.mark.may_fail_cloud # reason: eager - return_dtype must be set241def test_window_expression_different_group_length() -> None:242try:243pl.DataFrame({"groups": ["a", "a", "b", "a", "b"]}).select(244pl.col("groups").map_elements(lambda _: pl.Series([1, 2])).over("groups")245)246except ShapeError as exc:247msg = str(exc)248assert (249"the length of the window expression did not match that of the group" in msg250)251assert "group:" in msg252assert "group length:" in msg253assert "output: 'shape:" in msg254255256def test_invalid_concat_type_err() -> None:257df = pl.DataFrame(258{259"foo": [1, 2],260"bar": [6, 7],261"ham": ["a", "b"],262}263)264with pytest.raises(265ValueError,266match=r"DataFrame `how` must be one of {'vertical', '.+', 'align_right'}, got 'sausage'",267):268pl.concat([df, df], how="sausage") # type: ignore[arg-type]269270271@pytest.mark.parametrize("how", ["horizontal", "diagonal"])272def test_series_concat_err(how: ConcatMethod) -> None:273s = pl.Series([1, 2, 3])274with pytest.raises(275ValueError,276match="Series only supports 'vertical' concat strategy",277):278pl.concat([s, s], how=how)279280281def test_invalid_sort_by() -> None:282df = pl.DataFrame(283{284"a": ["bill", "bob", "jen", "allie", "george"],285"b": ["M", "M", "F", "F", "M"],286"c": [32, 40, 20, 19, 39],287}288)289290# `select a where b order by c desc`291with pytest.raises(ShapeError):292df.select(pl.col("a").filter(pl.col("b") == "M").sort_by("c", descending=True))293294295def test_epoch_time_type() -> None:296with pytest.raises(297InvalidOperationError,298match="`timestamp` operation not supported for dtype `time`",299):300pl.Series([time(0, 0, 
1)]).dt.epoch("s")301302303def test_duplicate_columns_arg_csv() -> None:304f = io.BytesIO()305pl.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]}).write_csv(f)306f.seek(0)307with pytest.raises(308ValueError, match=r"`columns` arg should only have unique values"309):310pl.read_csv(f, columns=["x", "x", "y"])311312313def test_datetime_time_add_err() -> None:314with pytest.raises(SchemaError, match="failed to determine supertype"):315pl.Series([datetime(1970, 1, 1, 0, 0, 1)]) + pl.Series([time(0, 0, 2)])316317318def test_invalid_dtype() -> None:319with pytest.raises(320TypeError,321match=r"cannot parse input of type 'str' into Polars data type \(given: 'mayonnaise'\)",322):323pl.Series([1, 2], dtype="mayonnaise") # type: ignore[arg-type]324325with pytest.raises(326TypeError,327match=r"cannot parse input <class 'datetime\.tzinfo'> into Polars data type",328):329pl.Series([None], dtype=tzinfo) # type: ignore[arg-type]330331332def test_arr_eval_named_cols() -> None:333df = pl.DataFrame({"A": ["a", "b"], "B": [["a", "b"], ["c", "d"]]})334335with pytest.raises(ComputeError):336df.select(pl.col("B").list.eval(pl.element().append(pl.col("A"))))337338339def test_alias_in_join_keys() -> None:340df = pl.DataFrame({"A": ["a", "b"], "B": [["a", "b"], ["c", "d"]]})341with pytest.raises(342InvalidOperationError,343match=r"'alias' is not allowed in a join key, use 'with_columns' first",344):345df.join(df, on=pl.col("A").alias("foo"))346347348def test_sort_by_different_lengths() -> None:349df = pl.DataFrame(350{351"group": ["a"] * 3 + ["b"] * 3,352"col1": [1, 2, 3, 300, 200, 100],353"col2": [1, 2, 3, 300, 1, 1],354}355)356with pytest.raises(357ShapeError,358match=r"expressions in 'sort_by' must have matching group lengths",359):360df.group_by("group").agg(361[362pl.col("col1").sort_by(pl.col("col2").unique()),363]364)365366with pytest.raises(367ShapeError,368match=r"expressions in 'sort_by' must have matching group 
lengths",369):370df.group_by("group").agg(371[372pl.col("col1").sort_by(pl.col("col2").arg_unique()),373]374)375376with pytest.raises(377ShapeError,378match=r"expressions in 'sort_by' must have matching group lengths",379):380df.group_by("group").agg(381[382pl.col("col1").sort_by(pl.col("col2").first()),383]384)385386387def test_err_filter_no_expansion() -> None:388# df contains floats389df = pl.DataFrame(390{391"a": [0.1, 0.2],392}393)394395with pytest.raises(396ComputeError, match=r"The predicate expanded to zero expressions"397):398# we filter by ints399df.filter(pl.col(pl.Int16).min() < 0.1)400401402@pytest.mark.parametrize(403("e"),404[405pl.col("date") > "2021-11-10",406pl.col("date") < "2021-11-10",407],408)409def test_date_string_comparison(e: pl.Expr) -> None:410df = pl.DataFrame(411{412"date": [413"2022-11-01",414"2022-11-02",415"2022-11-05",416],417}418).with_columns(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"))419420with pytest.raises(421InvalidOperationError,422match=r"cannot compare 'date/datetime/time' to a string value",423):424df.select(e)425426427def test_compare_different_len() -> None:428df = pl.DataFrame(429{430"idx": list(range(5)),431}432)433434s = pl.Series([2, 5, 8])435with pytest.raises(ShapeError):436df.filter(pl.col("idx") == s)437438439def test_take_negative_index_is_oob() -> None:440df = pl.DataFrame({"value": [1, 2, 3]})441with pytest.raises(OutOfBoundsError):442df["value"].gather(-4)443444445def test_string_numeric_arithmetic_err() -> None:446df = pl.DataFrame({"s": ["x"]})447with pytest.raises(448InvalidOperationError, match=r"arithmetic on string and numeric not allowed"449):450df.select(pl.col("s") + 1)451452453def test_ambiguous_filter_err() -> None:454df = pl.DataFrame({"a": [None, "2", "3"], "b": [None, None, "z"]})455with pytest.raises(456ComputeError,457match=r"The predicate passed to 'LazyFrame.filter' expanded to multiple expressions",458):459df.filter(pl.col(["a", "b"]).is_null())460461462def 
test_with_column_duplicates() -> None:463df = pl.DataFrame({"a": [0, None, 2, 3, None], "b": [None, 1, 2, 3, None]})464with pytest.raises(465ComputeError,466match=r"the name 'same' passed to `LazyFrame.with_columns` is duplicate.*",467):468assert df.with_columns([pl.all().alias("same")]).columns == ["a", "b", "same"]469470471@pytest.mark.may_fail_cloud # reason: eager - return_dtype must be set472def test_skip_nulls_err() -> None:473df = pl.DataFrame({"foo": [None, None]})474with pytest.raises(475pl.exceptions.InvalidOperationError,476match=r"UDF called without return type, but was not able to infer the output type",477):478df.with_columns(pl.col("foo").map_elements(lambda x: x, skip_nulls=True))479480481@pytest.mark.parametrize(482("test_df", "type", "expected_message"),483[484pytest.param(485pl.DataFrame({"A": [1, 2, 3], "B": ["1", "2", "help"]}),486pl.UInt32,487"conversion .* failed",488id="Unsigned integer",489)490],491)492def test_cast_err_column_value_highlighting(493test_df: pl.DataFrame, type: pl.DataType, expected_message: str494) -> None:495with pytest.raises(InvalidOperationError, match=expected_message):496test_df.with_columns(pl.all().cast(type))497498499def test_invalid_group_by_arg() -> None:500df = pl.DataFrame({"a": [1]})501with pytest.raises(502TypeError, match="specifying aggregations as a dictionary is not supported"503):504df.group_by(1).agg({"a": "sum"})505506507def test_overflow_msg() -> None:508with pytest.raises(509ComputeError,510match=r"could not append value: 2147483648 of type: i64 to the builder",511):512pl.DataFrame([[2**31]], [("a", pl.Int32)], orient="row")513514515def test_sort_by_err_9259() -> None:516df = pl.DataFrame(517{"a": [1, 1, 1], "b": [3, 2, 1], "c": [1, 1, 2]},518schema={"a": pl.Float32, "b": pl.Float32, "c": pl.Float32},519)520with pytest.raises(ShapeError):521df.lazy().group_by("c").agg(522[pl.col("a").sort_by(pl.col("b").filter(pl.col("b") > 100)).sum()]523).collect()524525526def test_empty_inputs_error() -> 
None:527df = pl.DataFrame({"col1": [1]})528with pytest.raises(529pl.exceptions.InvalidOperationError, match="expected at least 1 input"530):531df.select(pl.sum_horizontal(pl.exclude("col1")))532533534@pytest.mark.parametrize(535("colname", "values", "expected"),536[537("a", [2], [False, True, False]),538("a", [True, False], None),539("a", ["2", "3", "4"], None),540("b", [Decimal("3.14")], None),541("c", [-2, -1, 0, 1, 2], None),542(543"d",544pl.datetime_range(545datetime.now(),546datetime.now(),547interval="2345ns",548time_unit="ns",549eager=True,550),551None,552),553("d", [time(10, 30)], None),554("e", [datetime(1999, 12, 31, 10, 30)], None),555("f", ["xx", "zz"], None),556],557)558def test_invalid_is_in_dtypes(559colname: str, values: list[Any], expected: list[Any] | None560) -> None:561df = pl.DataFrame(562{563"a": [1, 2, 3],564"b": [-2.5, 0.0, 2.5],565"c": [True, None, False],566"d": [datetime(2001, 10, 30), None, datetime(2009, 7, 5)],567"e": [date(2029, 12, 31), date(1999, 12, 31), None],568"f": [b"xx", b"yy", b"zz"],569}570)571if expected is None:572with pytest.raises(573InvalidOperationError,574match=r"'is_in' cannot check for .*? values in .*? 
data",575):576df.select(pl.col(colname).is_in(values))577else:578assert df.select(pl.col(colname).is_in(values))[colname].to_list() == expected579580581def test_sort_by_error() -> None:582df = pl.DataFrame(583{584"id": [1, 1, 1, 2, 2, 3, 3, 3],585"number": [1, 3, 2, 1, 2, 2, 1, 3],586"type": ["A", "B", "A", "B", "B", "A", "B", "C"],587"cost": [10, 25, 20, 25, 30, 30, 50, 100],588}589)590591with pytest.raises(592ShapeError,593match="expressions in 'sort_by' must have matching group lengths",594):595df.group_by("id", maintain_order=True).agg(596pl.col("cost").filter(pl.col("type") == "A").sort_by("number")597)598599600def test_non_existent_expr_inputs_in_lazy() -> None:601with pytest.raises(ColumnNotFoundError):602pl.LazyFrame().filter(pl.col("x") == 1).explain() # tests: 12074603604lf = pl.LazyFrame({"foo": [1, 1, -2, 3]})605606with pytest.raises(ColumnNotFoundError):607(608lf.select(pl.col("foo").cum_sum().alias("bar"))609.filter(pl.col("bar") == pl.col("foo"))610.explain()611)612613614def test_error_list_to_array() -> None:615with pytest.raises(ComputeError, match="not all elements have the specified width"):616pl.DataFrame(617data={"a": [[1, 2], [3, 4, 5]]}, schema={"a": pl.List(pl.Int8)}618).with_columns(array=pl.col("a").list.to_array(2))619620621def test_raise_not_found_in_simplify_14974() -> None:622df = pl.DataFrame()623with pytest.raises(ColumnNotFoundError):624df.select(1 / (1 + pl.col("a")))625626627def test_invalid_product_type() -> None:628with pytest.raises(629InvalidOperationError,630match="`product` operation not supported for dtype",631):632pl.Series([[1, 2, 3]]).product()633634635def test_fill_null_invalid_supertype() -> None:636df = pl.DataFrame({"date": [date(2022, 1, 1), None]})637with pytest.raises(InvalidOperationError, match="got invalid or ambiguous"):638df.select(pl.col("date").fill_null(1.0))639640641@pytest.mark.may_fail_cloud # reason: Object type not supported642def test_raise_invalid_arithmetic() -> None:643df = pl.Series("a", 
[object()]).to_frame()644645with pytest.raises(InvalidOperationError):646df.select(pl.col("a") - pl.col("a"))647648649def test_err_invalid_comparison() -> None:650with pytest.raises(651SchemaError,652match="could not evaluate comparison between series 'a' of dtype: Date and series 'b' of dtype: Boolean",653):654_ = pl.Series("a", [date(2020, 1, 1)]) == pl.Series("b", [True])655656with pytest.raises(657InvalidOperationError,658match="could not apply comparison on series of dtype 'object; operand names: 'a', 'b'",659):660_ = pl.Series("a", [object()]) == pl.Series("b", [object])661662663def test_no_panic_pandas_nat() -> None:664# we don't want to support pd.nat, but don't want to panic.665with pytest.raises(Exception): # noqa: B017666pl.DataFrame({"x": [pd.NaT]})667668669def test_list_to_struct_invalid_type() -> None:670with pytest.raises(pl.exceptions.InvalidOperationError):671pl.DataFrame({"a": 1}).to_series().list.to_struct(fields=["a", "b"])672673674def test_raise_invalid_agg() -> None:675with pytest.raises(pl.exceptions.ColumnNotFoundError):676(677pl.LazyFrame({"foo": [1]})678.with_row_index()679.group_by("index")680.agg(pl.col("foo").filter(pl.col("i_do_not_exist")))681).collect()682683684def test_err_mean_horizontal_lists() -> None:685df = pl.DataFrame(686{687"experiment_id": [1, 2],688"sensor1": [[1, 2, 3], [7, 8, 9]],689"sensor2": [[4, 5, 6], [10, 11, 12]],690}691)692with pytest.raises(pl.exceptions.InvalidOperationError):693df.with_columns(pl.mean_horizontal("sensor1", "sensor2").alias("avg_sensor"))694695696def test_raise_column_not_found_in_join_arg() -> None:697a = pl.DataFrame({"x": [1, 2, 3]})698b = pl.DataFrame({"y": [1, 2, 3]})699with pytest.raises(pl.exceptions.ColumnNotFoundError):700a.join(b, on="y")701702703def test_raise_on_different_results_20104() -> None:704df = pl.DataFrame({"x": [1, 2]})705706with pytest.raises(TypeError):707df.rolling("x", period="3i").agg(708result=pl.col("x")709.gather_every(2, offset=1)710.map_batches(pl.Series.min, 
return_dtype=pl.Float64)711)712713714@pytest.mark.parametrize("fill_value", [None, -1])715def test_shift_with_null_deprecated_24105(fill_value: Any) -> None:716df = pl.DataFrame({"x": [1, 2, 3]})717df_shift = None718with pytest.deprecated_call( # @2.0719match=r"shift value 'n' is null, which currently returns a column of null values. This will become an error in the future.",720):721df_shift = df.select(722pl.col.x.shift(pl.col.x.filter(pl.col.x > 3).first(), fill_value=fill_value)723)724# Check that the result is a column of nulls, even if the fill_value is different725assert_frame_equal(726df_shift,727pl.DataFrame({"x": [None, None, None]}),728check_dtypes=False,729)730731732def test_raies_on_mismatch_column_length_24500() -> None:733df = pl.DataFrame(734{735"a": [10, 10, 10, 20, 20, 20],736"b": [2, 2, 99, 3, 3, 3],737"c": [3, 3, 3, 2, 2, 99],738}739)740with pytest.raises(741ShapeError,742match="expressions must have matching group lengths",743):744df.group_by("a").agg(745pl.struct(746pl.col("b").head(pl.col("b").first()),747pl.col("c").head(pl.col("c").first()),748)749)750751752def test_raies_on_mismatch_column_length_binary_expr() -> None:753df = pl.DataFrame(754{755"a": [10, 10, 10, 20, 20, 20],756"b": [2, 0, 99, 0, 0, 0],757"c": [3, 0, 0, 2, 0, 99],758}759)760761with pytest.raises(762ShapeError,763match="expressions must have matching group lengths",764):765df.group_by("a").agg(766pl.Expr.add(767pl.col("b").head(pl.col("b").first()),768pl.col("c").head(pl.col("c").first()),769)770)771772773