# Path: blob/main/py-polars/tests/unit/interchange/test_roundtrip.py
# 6939 views
"""Round-trip tests exchanging dataframes between Polars, pyarrow and pandas."""

from __future__ import annotations

from datetime import datetime
from typing import TYPE_CHECKING

import pandas as pd
import pyarrow as pa
import pyarrow.interchange
import pytest
from hypothesis import given

import polars as pl
from polars._utils.various import parse_version
from polars.interchange.from_dataframe import (
    from_dataframe as from_dataframe_interchange_protocol,
)
from polars.testing import assert_frame_equal, assert_series_equal
from polars.testing.parametric import dataframes

# Skip marker applied when the installed pandas version string starts with "2".
skip_if_broken_pandas_version = pytest.mark.skipif(
    pd.__version__.startswith("2"), reason="bug. see #20316"
)

if TYPE_CHECKING:
    from polars._typing import PolarsDataType

# Dtypes handed to the `dataframes` strategy as `allowed_dtypes` in the tests below.
protocol_dtypes: list[PolarsDataType] = [
    pl.Int8,
    pl.Int16,
    pl.Int32,
    pl.Int64,
    pl.UInt8,
    pl.UInt16,
    pl.UInt32,
    pl.UInt64,
    pl.Float32,
    pl.Float64,
    pl.Boolean,
    pl.String,
    pl.Datetime,
    # This is broken for empty dataframes
    # TODO: Enable lexically ordered categoricals
    # pl.Categorical("lexical"),
    # TODO: Add Enum
    # pl.Enum,
]


@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_to_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
    """Export a frame to pyarrow via `__dataframe__` and compare after reading back."""
    arrow_table = pa.interchange.from_dataframe(df.__dataframe__())

    restored: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]
    assert_frame_equal(restored, df, categorical_as_str=True)


@pytest.mark.may_fail_cloud  # reason: not-lazy, likely environment related
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,
        ],
        allow_chunks=False,
    )
)
def test_to_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Same as the pyarrow export test, with `allow_copy=False` on both sides."""
    exchange_obj = df.__dataframe__(allow_copy=False)
    arrow_table = pa.interchange.from_dataframe(exchange_obj, allow_copy=False)

    restored: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]
    assert_frame_equal(restored, df, categorical_as_str=True)


@pytest.mark.filterwarnings(
    "ignore:.*PEP3118 format string that does not match its itemsize:RuntimeWarning"
)
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
    """Export a frame to pandas via `__dataframe__` and compare after reading back."""
    pandas_frame = pd.api.interchange.from_dataframe(df.__dataframe__())
    restored = pl.from_pandas(pandas_frame, nan_to_null=False)
    assert_frame_equal(restored, df, categorical_as_str=True)


@pytest.mark.may_fail_cloud  # reason: not-lazy, likely environment related
@pytest.mark.filterwarnings(
    "ignore:.*PEP3118 format string that does not match its itemsize:RuntimeWarning"
)
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,
        ],
        allow_chunks=False,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_to_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Same as the pandas export test, with `allow_copy=False` on both sides."""
    exchange_obj = df.__dataframe__(allow_copy=False)
    pandas_frame = pd.api.interchange.from_dataframe(exchange_obj, allow_copy=False)
    restored = pl.from_pandas(pandas_frame, nan_to_null=False)
    assert_frame_equal(restored, df, categorical_as_str=True)


@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.Categorical,  # Categoricals read back as Enum types
        ],
    )
)
def test_from_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
    """Convert to a pyarrow table and import it back through the protocol reader."""
    restored = from_dataframe_interchange_protocol(df.to_arrow())
    assert_frame_equal(restored, df, categorical_as_str=True)


@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,  # Polars copies the categories to construct a mapping
            pl.Boolean,  # pyarrow exports boolean buffers as byte-packed: https://github.com/apache/arrow/issues/37991
        ],
        allow_chunks=False,
    )
)
def test_from_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Import a pyarrow table through the protocol reader with `allow_copy=False`."""
    arrow_table = df.to_arrow()
    restored = from_dataframe_interchange_protocol(arrow_table, allow_copy=False)
    assert_frame_equal(restored, df)


@skip_if_broken_pandas_version
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.Categorical,  # Categoricals come back as Enums
            pl.Float32,  # NaN values come back as nulls
            pl.Float64,  # NaN values come back as nulls
        ],
    )
)
def test_from_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
    """Import a pyarrow-extension-array pandas frame through the protocol reader."""
    pandas_frame = df.to_pandas(use_pyarrow_extension_array=True)
    restored = from_dataframe_interchange_protocol(pandas_frame)
    assert_frame_equal(restored, df, categorical_as_str=True)


@skip_if_broken_pandas_version
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,  # Categoricals come back as Enums
            pl.Float32,  # NaN values come back as nulls
            pl.Float64,  # NaN values come back as nulls
            pl.Boolean,  # pandas exports boolean buffers as byte-packed
        ],
        # Empty dataframes cause an error due to a bug in pandas.
        # https://github.com/pandas-dev/pandas/issues/56700
        min_size=1,
        allow_chunks=False,
    )
)
def test_from_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Import a pyarrow-extension-array pandas frame with `allow_copy=False`."""
    pandas_frame = df.to_pandas(use_pyarrow_extension_array=True)
    restored = from_dataframe_interchange_protocol(pandas_frame, allow_copy=False)
    assert_frame_equal(restored, df)


@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.Categorical,  # Categoricals come back as Enums
            pl.Float32,  # NaN values come back as nulls
            pl.Float64,  # NaN values come back as nulls
        ],
        # Empty string columns cause an error due to a bug in pandas.
        # https://github.com/pandas-dev/pandas/issues/56703
        min_size=1,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:
    """Import a default `to_pandas()` frame through the protocol reader."""
    restored = from_dataframe_interchange_protocol(df.to_pandas())
    assert_frame_equal(restored, df, categorical_as_str=True)


@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,  # Categoricals come back as Enums
            pl.Float32,  # NaN values come back as nulls
            pl.Float64,  # NaN values come back as nulls
            pl.Boolean,  # pandas exports boolean buffers as byte-packed
        ],
        # Empty dataframes cause an error due to a bug in pandas.
        # https://github.com/pandas-dev/pandas/issues/56700
        min_size=1,
        allow_chunks=False,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_from_dataframe_pandas_native_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Import a default `to_pandas()` frame with `allow_copy=False`."""
    pandas_frame = df.to_pandas()
    restored = from_dataframe_interchange_protocol(pandas_frame, allow_copy=False)
    assert_frame_equal(restored, df)


def test_to_dataframe_pandas_boolean_subchunks() -> None:
    """Export a boolean column built from two one-row chunks to pandas and back."""
    df = pl.Series("a", [False, False]).to_frame()
    # rechunk=False keeps the two single-row pieces as separate chunks.
    chunked = pl.concat([df[0, :], df[1, :]], rechunk=False)

    pandas_frame = pd.api.interchange.from_dataframe(chunked.__dataframe__())
    restored = pl.from_pandas(pandas_frame, nan_to_null=False)

    assert_frame_equal(restored, df)


def test_to_dataframe_pyarrow_boolean() -> None:
    """Export a two-value boolean column to pyarrow and back."""
    df = pl.Series("a", [True, False], dtype=pl.Boolean).to_frame()

    arrow_table = pa.interchange.from_dataframe(df.__dataframe__())
    restored: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]

    assert_frame_equal(restored, df)


def test_to_dataframe_pyarrow_boolean_midbyte_slice() -> None:
    """Export a boolean column sliced from offset 3 of nine values to pyarrow."""
    df = pl.Series("a", [False] * 9)[3:].to_frame()

    arrow_table = pa.interchange.from_dataframe(df.__dataframe__())
    restored: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]

    assert_frame_equal(restored, df)


@pytest.mark.skipif(
    parse_version(pd.__version__) < (2, 2),
    reason="Pandas versions < 2.2 do not implement the required conversions",
)
def test_from_dataframe_pandas_timestamp_ns() -> None:
    """Read a nanosecond Datetime column back from a pandas frame."""
    timestamps = pl.Series("a", [datetime(2000, 1, 1)], dtype=pl.Datetime("ns"))
    df = timestamps.to_frame()
    pandas_frame = df.to_pandas(use_pyarrow_extension_array=True)
    restored = pl.from_dataframe(pandas_frame)
    assert_frame_equal(restored, df)


def test_from_pyarrow_str_dict_with_null_values_20270() -> None:
    """Convert a string dictionary array whose dictionary itself contains a null."""
    # Index 1 points at the null dictionary entry, so it reads back as a null too.
    dictionary_column = pa.DictionaryArray.from_arrays(
        [0, 0, None, 1, 2], ["A", None, "B"]
    )
    table_schema = pa.schema({"col1": pa.dictionary(pa.uint32(), pa.string())})
    tb = pa.table({"col1": dictionary_column}, schema=table_schema)

    df = pl.from_arrow(tb)
    assert isinstance(df, pl.DataFrame)

    actual = df.to_series()
    expected = pl.Series("col1", ["A", "A", None, None, "B"], pl.Categorical)
    assert_series_equal(actual, expected)