Path: blob/main/py-polars/tests/unit/test_match_to_schema.py
6939 views
import pytest

import polars as pl
from polars.testing import assert_frame_equal


def test_match_to_schema() -> None:
    """Check the default behaviour of `match_to_schema`.

    A frame matched against its own schema (passed either as the schema
    object or as an equivalent dict) is returned unchanged, while a schema
    with swapped dtypes or with different column names raises `SchemaError`.
    """
    df = pl.DataFrame(
        [
            pl.Series("a", [1, 2, 3], pl.Int64),
            pl.Series("b", ["A", "B", "C"], pl.String),
        ]
    )

    result = df.lazy().match_to_schema(df.schema).collect()
    assert_frame_equal(df, result)

    result = df.lazy().match_to_schema({"a": pl.Int64(), "b": pl.String()}).collect()
    assert_frame_equal(df, result)

    # Same column names, but the dtypes are swapped.
    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema({"a": pl.String(), "b": pl.Int64()}).collect()

    # Same dtypes, but none of the column names exist in the frame.
    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema({"x": pl.Int64(), "y": pl.String()}).collect()


def test_match_to_schema_missing_columns() -> None:
    """Check that `missing_columns` inserts null columns for absent fields.

    Covers the global `"insert"` setting, the per-column mapping form, a
    schema given as a plain dict, and a frame that lacks both a leading and
    a trailing column of the target schema.
    """
    df = pl.DataFrame(
        [
            pl.Series("a", [1, 2, 3], pl.Int64),
            pl.Series("b", ["A", "B", "C"], pl.String),
        ]
    )

    expected = df.with_columns(c=pl.lit(None, dtype=pl.Datetime()))

    result = (
        df.lazy()
        .match_to_schema(
            expected.schema,
            missing_columns="insert",
        )
        .collect()
    )
    assert_frame_equal(expected, result)

    result = (
        df.lazy()
        .match_to_schema(
            expected.schema,
            missing_columns={"c": "insert"},
        )
        .collect()
    )
    # The per-column mapping form must yield the same frame as the global
    # "insert" setting; without this check the result above is discarded.
    assert_frame_equal(expected, result)

    result = (
        df.lazy()
        .match_to_schema(
            {"a": pl.Int64(), "b": pl.String(), "c": pl.Datetime()},
            missing_columns="insert",
        )
        .collect()
    )
    assert_frame_equal(df.with_columns(c=pl.lit(None, dtype=pl.Datetime())), result)

    # Only "b" is present: "a" and "c" both have to be inserted around it.
    df = pl.DataFrame(
        [
            pl.Series("b", ["A", "B", "C"], pl.String),
        ]
    )

    result = (
        df.lazy()
        .match_to_schema(
            {"a": pl.Int64(), "b": pl.String(), "c": pl.Datetime()},
            missing_columns="insert",
        )
        .collect()
    )
    assert_frame_equal(
        df.select(
            a=pl.lit(None, dtype=pl.Int64()),
            b=pl.col.b,
            c=pl.lit(None, dtype=pl.Datetime()),
        ),
        result,
    )


def test_match_to_schema_extra_columns() -> None:
    """Check that extra input columns raise unless `extra_columns="ignore"`.

    With `"ignore"`, the result contains exactly the columns of the target
    schema, whether that is a prefix, all, or a non-contiguous subset of the
    input columns.
    """
    df = pl.DataFrame(
        [
            pl.Series("a", [1, 2, 3], pl.Int64),
            pl.Series("b", ["A", "B", "C"], pl.String),
            pl.Series("c", [["A", "B"], ["C"], ["D"]], pl.List(pl.String)),
        ]
    )

    expected = df.select(["a"])

    # The error message is expected to list the offending extra columns.
    with pytest.raises(pl.exceptions.SchemaError, match=r'`match_to_schema`: "b", "c"'):
        df.lazy().match_to_schema(expected.schema).collect()

    result = (
        df.lazy().match_to_schema(expected.schema, extra_columns="ignore").collect()
    )
    assert_frame_equal(expected, result)

    expected = df.select(["a", "b", "c"])
    result = (
        df.lazy().match_to_schema(expected.schema, extra_columns="ignore").collect()
    )
    assert_frame_equal(expected, result)

    expected = df.select(["a", "c"])
    result = (
        df.lazy().match_to_schema(expected.schema, extra_columns="ignore").collect()
    )
    assert_frame_equal(expected, result)


def test_match_to_schema_missing_struct_fields() -> None:
    """Check that missing struct fields need `missing_struct_fields="insert"`.

    The default settings raise, and so does `missing_columns="insert"`: the
    column-level option does not apply to fields inside a struct.
    """
    df = pl.DataFrame(
        [
            pl.Series("a", [1, 2, 3], pl.Int64),
            pl.Series(
                "b", [{"x": "A"}, {"x": "B"}, {"x": "C"}], pl.Struct({"x": pl.String()})
            ),
            pl.Series("c", [["A", "B"], ["C"], ["D"]], pl.List(pl.String)),
        ]
    )

    # Target frame: struct column "b" gains an all-null Datetime field "y".
    expected = df.with_columns(
        pl.col.b.struct.with_fields(
            y=pl.repeat(pl.lit(None, dtype=pl.Datetime()), pl.len())
        )
    )

    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(expected.schema).collect()

    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(expected.schema, missing_columns="insert").collect()

    result = (
        df.lazy()
        .match_to_schema(expected.schema, missing_struct_fields="insert")
        .collect()
    )
    assert_frame_equal(expected, result)


def test_match_to_schema_extra_struct_fields() -> None:
    """Check that extra struct fields need `extra_struct_fields="ignore"`.

    The default settings raise, and so does `extra_columns="ignore"`: the
    column-level option does not apply to fields inside a struct.
    """
    expected = pl.DataFrame(
        [
            pl.Series("a", [1, 2, 3], pl.Int64),
            pl.Series(
                "b", [{"x": "A"}, {"x": "B"}, {"x": "C"}], pl.Struct({"x": pl.String()})
            ),
            pl.Series("c", [["A", "B"], ["C"], ["D"]], pl.List(pl.String)),
        ]
    )

    # Input frame: struct column "b" carries an additional Datetime field "y".
    df = expected.with_columns(
        pl.col.b.struct.with_fields(
            y=pl.repeat(pl.lit(None, dtype=pl.Datetime()), pl.len())
        )
    )

    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(expected.schema).collect()

    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(expected.schema, extra_columns="ignore").collect()

    result = (
        df.lazy()
        .match_to_schema(expected.schema, extra_struct_fields="ignore")
        .collect()
    )
    assert_frame_equal(expected, result)


def test_match_to_schema_int_upcast() -> None:
    """Check that Int32 -> Int64 matching requires `integer_cast="upcast"`.

    The default settings raise, and `float_cast="upcast"` does not enable
    integer upcasting.
    """
    df = pl.DataFrame(
        [
            pl.Series("a", [1, 2, 3], pl.Int32),
            pl.Series(
                "b", [{"x": "A"}, {"x": "B"}, {"x": "C"}], pl.Struct({"x": pl.String()})
            ),
            pl.Series("c", [["A", "B"], ["C"], ["D"]], pl.List(pl.String)),
        ]
    )

    expected = df.with_columns(pl.col.a.cast(pl.Int64))

    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(expected.schema).collect()

    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(expected.schema, float_cast="upcast").collect()

    result = df.lazy().match_to_schema(expected.schema, integer_cast="upcast").collect()
    assert_frame_equal(expected, result)


def test_match_to_schema_float_upcast() -> None:
    """Check that Float32 -> Float64 matching requires `float_cast="upcast"`.

    The default settings raise, and `integer_cast="upcast"` does not enable
    float upcasting.
    """
    df = pl.DataFrame(
        [
            pl.Series("a", [1.0, 2.0, 3.0], pl.Float32()),
            pl.Series(
                "b", [{"x": "A"}, {"x": "B"}, {"x": "C"}], pl.Struct({"x": pl.String()})
            ),
            pl.Series("c", [["A", "B"], ["C"], ["D"]], pl.List(pl.String)),
        ]
    )

    expected = df.with_columns(pl.col.a.cast(pl.Float64()))

    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(expected.schema).collect()

    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(expected.schema, integer_cast="upcast").collect()

    result = df.lazy().match_to_schema(expected.schema, float_cast="upcast").collect()
    assert_frame_equal(expected, result)