# Path: blob/main/py-polars/tests/unit/lazyframe/test_serde.py
# 6939 views
from __future__ import annotations12import io3from typing import TYPE_CHECKING45import pytest6from hypothesis import example, given78import polars as pl9from polars.exceptions import ComputeError10from polars.testing import assert_frame_equal11from polars.testing.parametric import dataframes1213if TYPE_CHECKING:14from pathlib import Path1516from polars._typing import SerializationFormat171819@given(20lf=dataframes(21lazy=True,22excluded_dtypes=[pl.Struct],23)24)25@example(lf=pl.LazyFrame({"foo": ["a", "b", "a"]}, schema={"foo": pl.Enum(["b", "a"])}))26def test_lf_serde_roundtrip_binary(lf: pl.LazyFrame) -> None:27serialized = lf.serialize(format="binary")28result = pl.LazyFrame.deserialize(io.BytesIO(serialized), format="binary")29assert_frame_equal(result, lf, categorical_as_str=True)303132@given(33lf=dataframes(34lazy=True,35excluded_dtypes=[36pl.Float32, # Bug, see: https://github.com/pola-rs/polars/issues/1721137pl.Float64, # Bug, see: https://github.com/pola-rs/polars/issues/1721138pl.Struct, # Outer nullability not supported39],40)41)42@pytest.mark.filterwarnings("ignore")43def test_lf_serde_roundtrip_json(lf: pl.LazyFrame) -> None:44serialized = lf.serialize(format="json")45result = pl.LazyFrame.deserialize(io.StringIO(serialized), format="json")46assert_frame_equal(result, lf, categorical_as_str=True)474849@pytest.fixture50def lf() -> pl.LazyFrame:51"""Sample LazyFrame for testing serialization/deserialization."""52return pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}).select("a").sum()535455@pytest.mark.filterwarnings("ignore")56def test_lf_serde_json_stringio(lf: pl.LazyFrame) -> None:57serialized = lf.serialize(format="json")58assert isinstance(serialized, str)59result = pl.LazyFrame.deserialize(io.StringIO(serialized), format="json")60assert_frame_equal(result, lf)616263def test_lf_serde(lf: pl.LazyFrame) -> None:64serialized = lf.serialize()65assert isinstance(serialized, bytes)66result = 
pl.LazyFrame.deserialize(io.BytesIO(serialized))67assert_frame_equal(result, lf)686970@pytest.mark.parametrize(71("format", "buf"),72[73("binary", io.BytesIO()),74("json", io.StringIO()),75("json", io.BytesIO()),76],77)78@pytest.mark.filterwarnings("ignore")79def test_lf_serde_to_from_buffer(80lf: pl.LazyFrame, format: SerializationFormat, buf: io.IOBase81) -> None:82lf.serialize(buf, format=format)83buf.seek(0)84result = pl.LazyFrame.deserialize(buf, format=format)85assert_frame_equal(lf, result)868788@pytest.mark.write_disk89def test_lf_serde_to_from_file(lf: pl.LazyFrame, tmp_path: Path) -> None:90tmp_path.mkdir(exist_ok=True)9192file_path = tmp_path / "small.bin"93lf.serialize(file_path)94result = pl.LazyFrame.deserialize(file_path)9596assert_frame_equal(lf, result)979899def test_lf_deserialize_validation() -> None:100f = io.BytesIO(b"hello world!")101with pytest.raises(ComputeError, match="expected value at line 1 column 1"):102pl.LazyFrame.deserialize(f, format="json")103104105@pytest.mark.write_disk106def test_lf_serde_scan(tmp_path: Path) -> None:107tmp_path.mkdir(exist_ok=True)108path = tmp_path / "dataset.parquet"109110df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})111df.write_parquet(path)112lf = pl.scan_parquet(path)113114ser = lf.serialize()115result = pl.LazyFrame.deserialize(io.BytesIO(ser))116assert_frame_equal(result, lf)117assert_frame_equal(result.collect(), df)118119120@pytest.mark.filterwarnings("ignore::polars.exceptions.PolarsInefficientMapWarning")121def test_lf_serde_version_specific_lambda() -> None:122lf = pl.LazyFrame({"a": [1, 2, 3]}).select(123pl.col("a").map_elements(lambda x: x + 1, return_dtype=pl.Int64)124)125ser = lf.serialize()126127result = pl.LazyFrame.deserialize(io.BytesIO(ser))128expected = pl.LazyFrame({"a": [2, 3, 4]})129assert_frame_equal(result, expected)130131132def custom_function(x: pl.Series) -> pl.Series:133return x + 1134135136@pytest.mark.may_fail_cloud # reason: cloud does not have access to this 
scope137@pytest.mark.filterwarnings("ignore::polars.exceptions.PolarsInefficientMapWarning")138def test_lf_serde_version_specific_named_function() -> None:139lf = pl.LazyFrame({"a": [1, 2, 3]}).select(140pl.col("a").map_batches(custom_function, return_dtype=pl.Int64)141)142ser = lf.serialize()143144result = pl.LazyFrame.deserialize(io.BytesIO(ser))145expected = pl.LazyFrame({"a": [2, 3, 4]})146assert_frame_equal(result, expected)147148149@pytest.mark.filterwarnings("ignore::polars.exceptions.PolarsInefficientMapWarning")150def test_lf_serde_map_batches_on_lazyframe() -> None:151lf = pl.LazyFrame({"a": [1, 2, 3]}).map_batches(lambda x: x + 1)152ser = lf.serialize()153154result = pl.LazyFrame.deserialize(io.BytesIO(ser))155expected = pl.LazyFrame({"a": [2, 3, 4]})156assert_frame_equal(result, expected)157158159