# Path: blob/main/py-polars/tests/unit/interchange/test_roundtrip.py
# 8424 views
"""Round-trip tests for the dataframe interchange protocol.

Each test moves a Polars frame through pandas or pyarrow (in one direction
via the interchange protocol) and checks that the data that comes back is
equal to the frame that went in.
"""

from __future__ import annotations

from datetime import datetime
from typing import TYPE_CHECKING

import pandas as pd
import pyarrow as pa
import pyarrow.interchange
import pytest
from hypothesis import given

import polars as pl
from polars._utils.various import parse_version
from polars.interchange.from_dataframe import (
    from_dataframe as from_dataframe_interchange_protocol,
)
from polars.testing import assert_frame_equal, assert_series_equal
from polars.testing.parametric import dataframes

# Applied to the tests that feed pandas objects into the Polars-side
# interchange reader with pyarrow-backed extension arrays.
skip_if_broken_pandas_version = pytest.mark.skipif(
    pd.__version__.startswith("2"), reason="bug. see #20316"
)

# Warning filters shared by several of the pandas-consuming tests below.
_ignore_copy_keyword_deprecation = pytest.mark.filterwarnings(
    "ignore:.*copy keyword is deprecated:Warning"
)
_ignore_pep3118_itemsize_mismatch = pytest.mark.filterwarnings(
    "ignore:.*PEP3118 format string that does not match its itemsize:RuntimeWarning"
)

if TYPE_CHECKING:
    from polars._typing import PolarsDataType

# Data types the parametric tests draw columns from.
protocol_dtypes: list[PolarsDataType] = [
    pl.Int8,
    pl.Int16,
    pl.Int32,
    pl.Int64,
    pl.UInt8,
    pl.UInt16,
    pl.UInt32,
    pl.UInt64,
    pl.Float16,
    pl.Float32,
    pl.Float64,
    pl.Boolean,
    pl.String,
    pl.Datetime,
    # This is broken for empty dataframes
    # TODO: Enable lexically ordered categoricals
    # pl.Categorical(),
    # TODO: Add Enum
    # pl.Enum,
]


@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_to_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
    """Export via ``__dataframe__``, read with pyarrow, convert back, compare."""
    arrow_table = pa.interchange.from_dataframe(df.__dataframe__())

    roundtripped: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]
    assert_frame_equal(roundtripped, df, categorical_as_str=True)


@pytest.mark.may_fail_cloud  # reason: not-lazy, likely environment related
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,
        ],
        allow_chunks=False,
    )
)
def test_to_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Same as the pyarrow export test, with ``allow_copy=False`` on both sides."""
    interchange_obj = df.__dataframe__(allow_copy=False)
    arrow_table = pa.interchange.from_dataframe(interchange_obj, allow_copy=False)

    roundtripped: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]
    assert_frame_equal(roundtripped, df, categorical_as_str=True)


@_ignore_copy_keyword_deprecation
@_ignore_pep3118_itemsize_mismatch
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[pl.Float16],  # Not yet supported by pandas
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
    """Export via ``__dataframe__``, read with pandas, convert back, compare."""
    pandas_frame = pd.api.interchange.from_dataframe(df.__dataframe__())
    roundtripped = pl.from_pandas(pandas_frame, nan_to_null=False)
    assert_frame_equal(roundtripped, df, categorical_as_str=True)


@pytest.mark.may_fail_cloud  # reason: not-lazy, likely environment related
@_ignore_pep3118_itemsize_mismatch
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,
            pl.Float16,  # Not yet supported by pandas
        ],
        allow_chunks=False,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_to_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Same as the pandas export test, with ``allow_copy=False`` on both sides."""
    interchange_obj = df.__dataframe__(allow_copy=False)
    pandas_frame = pd.api.interchange.from_dataframe(interchange_obj, allow_copy=False)
    roundtripped = pl.from_pandas(pandas_frame, nan_to_null=False)
    assert_frame_equal(roundtripped, df, categorical_as_str=True)


@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.Categorical,  # Categoricals read back as Enum types
        ],
    )
)
def test_from_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
    """Convert to a pyarrow table and read it back through the protocol reader."""
    arrow_table = df.to_arrow()
    roundtripped = from_dataframe_interchange_protocol(arrow_table)
    assert_frame_equal(roundtripped, df, categorical_as_str=True)


@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,  # Polars copies the categories to construct a mapping
            pl.Boolean,  # pyarrow exports boolean buffers as byte-packed: https://github.com/apache/arrow/issues/37991
        ],
        allow_chunks=False,
    )
)
def test_from_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Read a pyarrow table back through the protocol reader with ``allow_copy=False``."""
    arrow_table = df.to_arrow()
    roundtripped = from_dataframe_interchange_protocol(arrow_table, allow_copy=False)
    assert_frame_equal(roundtripped, df)


@skip_if_broken_pandas_version
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.Categorical,  # Categoricals come back as Enums
        ],
        allow_nan=False,  # NaN values come back as nulls
    )
)
def test_from_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
    """Convert to pandas (pyarrow extension arrays) and read back via the protocol."""
    pandas_frame = df.to_pandas(use_pyarrow_extension_array=True)
    roundtripped = from_dataframe_interchange_protocol(pandas_frame)  # type: ignore[arg-type,unused-ignore]
    assert_frame_equal(roundtripped, df, categorical_as_str=True)


@skip_if_broken_pandas_version
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,  # Categoricals come back as Enums
            pl.Boolean,  # pandas exports boolean buffers as byte-packed
        ],
        # Empty dataframes cause an error due to a bug in pandas.
        # https://github.com/pandas-dev/pandas/issues/56700
        min_size=1,
        allow_chunks=False,
        allow_nan=False,  # NaN values come back as nulls
    )
)
def test_from_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Read a pyarrow-backed pandas frame via the protocol with ``allow_copy=False``."""
    pandas_frame = df.to_pandas(use_pyarrow_extension_array=True)
    roundtripped = from_dataframe_interchange_protocol(pandas_frame, allow_copy=False)  # type: ignore[arg-type,unused-ignore]
    assert_frame_equal(roundtripped, df)


@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.Categorical,  # Categoricals come back as Enums
        ],
        # Empty string columns cause an error due to a bug in pandas.
        # https://github.com/pandas-dev/pandas/issues/56703
        min_size=1,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
        allow_nan=False,  # NaN values come back as nulls
    )
)
def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:
    """Convert with a plain ``to_pandas()`` call and read back via the protocol."""
    pandas_frame = df.to_pandas()
    roundtripped = from_dataframe_interchange_protocol(pandas_frame)  # type: ignore[arg-type,unused-ignore]
    assert_frame_equal(roundtripped, df, categorical_as_str=True)


@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,  # Categoricals come back as Enums
            pl.Boolean,  # pandas exports boolean buffers as byte-packed
        ],
        # Empty dataframes cause an error due to a bug in pandas.
        # https://github.com/pandas-dev/pandas/issues/56700
        min_size=1,
        allow_chunks=False,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
        allow_nan=False,  # NaN values come back as nulls
    )
)
def test_from_dataframe_pandas_native_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Read a plain ``to_pandas()`` frame via the protocol with ``allow_copy=False``."""
    pandas_frame = df.to_pandas()
    roundtripped = from_dataframe_interchange_protocol(pandas_frame, allow_copy=False)  # type: ignore[arg-type,unused-ignore]
    assert_frame_equal(roundtripped, df)


@_ignore_copy_keyword_deprecation
def test_to_dataframe_pandas_boolean_subchunks() -> None:
    """Round-trip a Boolean frame built from two one-row pieces, not rechunked."""
    expected = pl.Series("a", [False, False]).to_frame()
    first_row, second_row = expected[0, :], expected[1, :]
    two_chunks = pl.concat([first_row, second_row], rechunk=False)

    pandas_frame = pd.api.interchange.from_dataframe(two_chunks.__dataframe__())
    roundtripped = pl.from_pandas(pandas_frame, nan_to_null=False)

    assert_frame_equal(roundtripped, expected)


def test_to_dataframe_pyarrow_boolean() -> None:
    """Round-trip a two-value Boolean column through pyarrow."""
    expected = pl.Series("a", [True, False], dtype=pl.Boolean).to_frame()

    arrow_table = pa.interchange.from_dataframe(expected.__dataframe__())
    roundtripped: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]

    assert_frame_equal(roundtripped, expected)


def test_to_dataframe_pyarrow_boolean_midbyte_slice() -> None:
    """Round-trip a Boolean column sliced from index 3 of a nine-value series."""
    # Presumably the slice offset of 3 leaves the values starting part-way
    # through a packed byte, as the test name suggests.
    sliced = pl.Series("a", [False] * 9)[3:]
    expected = sliced.to_frame()

    arrow_table = pa.interchange.from_dataframe(expected.__dataframe__())
    roundtripped: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]

    assert_frame_equal(roundtripped, expected)


@pytest.mark.skipif(
    parse_version(pd.__version__) < (2, 2),
    reason="Pandas versions < 2.2 do not implement the required conversions",
)
def test_from_dataframe_pandas_timestamp_ns() -> None:
    """Read a nanosecond ``Datetime`` column back from a pyarrow-backed pandas frame."""
    timestamps = pl.Series("a", [datetime(2000, 1, 1)], dtype=pl.Datetime("ns"))
    expected = timestamps.to_frame()
    pandas_frame = expected.to_pandas(use_pyarrow_extension_array=True)
    roundtripped = pl.from_dataframe(pandas_frame)
    assert_frame_equal(roundtripped, expected)


def test_from_pyarrow_str_dict_with_null_values_20270() -> None:
    """Import a pyarrow dictionary column whose dictionary itself contains a null."""
    # Index 1 points at the null dictionary entry, so it must read back as
    # null just like the null index at position 2.
    dict_array = pa.DictionaryArray.from_arrays([0, 0, None, 1, 2], ["A", None, "B"])
    dict_type = pa.dictionary(pa.uint32(), pa.string())
    table = pa.table({"col1": dict_array}, schema=pa.schema({"col1": dict_type}))

    df = pl.from_arrow(table)
    assert isinstance(df, pl.DataFrame)

    actual = df.to_series()
    expected = pl.Series("col1", ["A", "A", None, None, "B"], pl.Categorical)
    assert_series_equal(actual, expected)