Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interchange/test_roundtrip.py
8424 views
1
from __future__ import annotations
2
3
from datetime import datetime
4
from typing import TYPE_CHECKING
5
6
import pandas as pd
7
import pyarrow as pa
8
import pyarrow.interchange
9
import pytest
10
from hypothesis import given
11
12
import polars as pl
13
from polars._utils.various import parse_version
14
from polars.interchange.from_dataframe import (
15
from_dataframe as from_dataframe_interchange_protocol,
16
)
17
from polars.testing import assert_frame_equal, assert_series_equal
18
from polars.testing.parametric import dataframes
19
20
# Reusable marker: skip a test whenever the installed pandas version string
# starts with "2" (tracked as issue #20316).
skip_if_broken_pandas_version = pytest.mark.skipif(
    pd.__version__.startswith("2"),
    reason="bug. see #20316",
)
23
24
if TYPE_CHECKING:
25
from polars._typing import PolarsDataType
26
27
# Data types handed to the `dataframes` strategy (as `allowed_dtypes`) by the
# parametric round-trip tests in this module.
protocol_dtypes: list[PolarsDataType] = [
    # Signed integers
    pl.Int8,
    pl.Int16,
    pl.Int32,
    pl.Int64,
    # Unsigned integers
    pl.UInt8,
    pl.UInt16,
    pl.UInt32,
    pl.UInt64,
    # Floating point
    pl.Float16,
    pl.Float32,
    pl.Float64,
    # Boolean, string and temporal
    pl.Boolean,
    pl.String,
    pl.Datetime,
    # This is broken for empty dataframes
    # TODO: Enable lexically ordered categoricals
    # pl.Categorical(),
    # TODO: Add Enum
    # pl.Enum,
]
48
49
50
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_to_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
    """Round-trip a frame through pyarrow's interchange reader and compare."""
    arrow_table = pa.interchange.from_dataframe(df.__dataframe__())

    roundtripped: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
62
63
64
@pytest.mark.may_fail_cloud  # reason: not-lazy, likely environment related
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,
        ],
        allow_chunks=False,
    )
)
def test_to_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Round-trip through pyarrow with `allow_copy=False` on both sides."""
    interchange_obj = df.__dataframe__(allow_copy=False)
    arrow_table = pa.interchange.from_dataframe(interchange_obj, allow_copy=False)

    roundtripped: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
81
82
83
@pytest.mark.filterwarnings("ignore:.*copy keyword is deprecated:Warning")
@pytest.mark.filterwarnings(
    "ignore:.*PEP3118 format string that does not match its itemsize:RuntimeWarning"
)
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[pl.Float16],  # Not yet supported by pandas
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
    """Round-trip a frame through pandas' interchange reader and compare."""
    pandas_frame = pd.api.interchange.from_dataframe(df.__dataframe__())
    roundtripped = pl.from_pandas(pandas_frame, nan_to_null=False)
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
99
100
101
@pytest.mark.may_fail_cloud  # reason: not-lazy, likely environment related
@pytest.mark.filterwarnings(
    "ignore:.*PEP3118 format string that does not match its itemsize:RuntimeWarning"
)
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,
            pl.Float16,  # Not yet supported by pandas
        ],
        allow_chunks=False,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_to_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Round-trip through pandas with `allow_copy=False` on both sides."""
    interchange_obj = df.__dataframe__(allow_copy=False)
    pandas_frame = pd.api.interchange.from_dataframe(
        interchange_obj, allow_copy=False
    )
    roundtripped = pl.from_pandas(pandas_frame, nan_to_null=False)
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
122
123
124
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.Categorical,  # Categoricals read back as Enum types
        ],
    )
)
def test_from_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
    """Convert to a pyarrow table and read it back via the interchange protocol."""
    arrow_table = df.to_arrow()
    roundtripped = from_dataframe_interchange_protocol(arrow_table)
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
136
137
138
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,  # Polars copies the categories to construct a mapping
            pl.Boolean,  # pyarrow exports boolean buffers as byte-packed: https://github.com/apache/arrow/issues/37991
        ],
        allow_chunks=False,
    )
)
def test_from_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Read a pyarrow table back via the interchange protocol without copying."""
    arrow_table = df.to_arrow()
    roundtripped = from_dataframe_interchange_protocol(arrow_table, allow_copy=False)
    assert_frame_equal(roundtripped, df)
153
154
155
@skip_if_broken_pandas_version
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.Categorical,  # Categoricals come back as Enums
        ],
        allow_nan=False,  # NaN values come back as nulls
    )
)
def test_from_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
    """Read a pyarrow-backed pandas frame back via the interchange protocol."""
    pandas_frame = df.to_pandas(use_pyarrow_extension_array=True)
    roundtripped = from_dataframe_interchange_protocol(pandas_frame)  # type: ignore[arg-type,unused-ignore]
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
169
170
171
@skip_if_broken_pandas_version
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,  # Categoricals come back as Enums
            pl.Boolean,  # pandas exports boolean buffers as byte-packed
        ],
        # Empty dataframes cause an error due to a bug in pandas.
        # https://github.com/pandas-dev/pandas/issues/56700
        min_size=1,
        allow_chunks=False,
        allow_nan=False,  # NaN values come back as nulls
    )
)
def test_from_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Read a pyarrow-backed pandas frame back with `allow_copy=False`."""
    pandas_frame = df.to_pandas(use_pyarrow_extension_array=True)
    roundtripped = from_dataframe_interchange_protocol(pandas_frame, allow_copy=False)  # type: ignore[arg-type,unused-ignore]
    assert_frame_equal(roundtripped, df)
191
192
193
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.Categorical,  # Categoricals come back as Enums
        ],
        # Empty string columns cause an error due to a bug in pandas.
        # https://github.com/pandas-dev/pandas/issues/56703
        min_size=1,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
        allow_nan=False,  # NaN values come back as nulls
    )
)
def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:
    """Read a default `to_pandas()` frame back via the interchange protocol."""
    pandas_frame = df.to_pandas()
    roundtripped = from_dataframe_interchange_protocol(pandas_frame)  # type: ignore[arg-type,unused-ignore]
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
210
211
212
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,  # Categoricals come back as Enums
            pl.Boolean,  # pandas exports boolean buffers as byte-packed
        ],
        # Empty dataframes cause an error due to a bug in pandas.
        # https://github.com/pandas-dev/pandas/issues/56700
        min_size=1,
        allow_chunks=False,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
        allow_nan=False,  # NaN values come back as nulls
    )
)
def test_from_dataframe_pandas_native_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Read a default `to_pandas()` frame back with `allow_copy=False`."""
    pandas_frame = df.to_pandas()
    roundtripped = from_dataframe_interchange_protocol(pandas_frame, allow_copy=False)  # type: ignore[arg-type,unused-ignore]
    assert_frame_equal(roundtripped, df)
232
233
234
@pytest.mark.filterwarnings("ignore:.*copy keyword is deprecated:Warning")
def test_to_dataframe_pandas_boolean_subchunks() -> None:
    """Export a boolean frame concatenated without rechunking to pandas and back."""
    expected = pl.Series("a", [False, False]).to_frame()

    # Stitch the two rows back together while keeping them un-rechunked.
    row_pieces = [expected[0, :], expected[1, :]]
    pieced_frame = pl.concat(row_pieces, rechunk=False)
    interchange_obj = pieced_frame.__dataframe__()

    pandas_frame = pd.api.interchange.from_dataframe(interchange_obj)
    actual = pl.from_pandas(pandas_frame, nan_to_null=False)

    assert_frame_equal(actual, expected)
244
245
246
def test_to_dataframe_pyarrow_boolean() -> None:
    """Round-trip a two-row boolean frame through pyarrow's interchange reader."""
    expected = pl.Series("a", [True, False], dtype=pl.Boolean).to_frame()

    arrow_table = pa.interchange.from_dataframe(expected.__dataframe__())
    actual: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]

    assert_frame_equal(actual, expected)
254
255
256
def test_to_dataframe_pyarrow_boolean_midbyte_slice() -> None:
    """Round-trip a boolean series sliced from index 3 through pyarrow."""
    # Nine values sliced at offset 3: the slice start is not a multiple of 8.
    sliced = pl.Series("a", [False] * 9)[3:]
    expected = sliced.to_frame()

    arrow_table = pa.interchange.from_dataframe(expected.__dataframe__())
    actual: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]

    assert_frame_equal(actual, expected)
265
266
267
@pytest.mark.skipif(
    parse_version(pd.__version__) < (2, 2),
    reason="Pandas versions < 2.2 do not implement the required conversions",
)
def test_from_dataframe_pandas_timestamp_ns() -> None:
    """Read a nanosecond-precision datetime column back from pandas."""
    timestamps = pl.Series("a", [datetime(2000, 1, 1)], dtype=pl.Datetime("ns"))
    expected = timestamps.to_frame()

    pandas_frame = expected.to_pandas(use_pyarrow_extension_array=True)
    actual = pl.from_dataframe(pandas_frame)

    assert_frame_equal(actual, expected)
276
277
278
def test_from_pyarrow_str_dict_with_null_values_20270() -> None:
    """Import a pyarrow dictionary column with nulls in both indices and values."""
    # Index 1 points at the null dictionary entry; index None is a null index.
    dictionary_column = pa.DictionaryArray.from_arrays(
        [0, 0, None, 1, 2], ["A", None, "B"]
    )
    table_schema = pa.schema({"col1": pa.dictionary(pa.uint32(), pa.string())})
    arrow_table = pa.table({"col1": dictionary_column}, schema=table_schema)

    frame = pl.from_arrow(arrow_table)
    assert isinstance(frame, pl.DataFrame)

    expected = pl.Series("col1", ["A", "A", None, None, "B"], pl.Categorical)
    assert_series_equal(frame.to_series(), expected)
293
294