Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interchange/test_roundtrip.py
6939 views
1
from __future__ import annotations
2
3
from datetime import datetime
4
from typing import TYPE_CHECKING
5
6
import pandas as pd
7
import pyarrow as pa
8
import pyarrow.interchange
9
import pytest
10
from hypothesis import given
11
12
import polars as pl
13
from polars._utils.various import parse_version
14
from polars.interchange.from_dataframe import (
15
from_dataframe as from_dataframe_interchange_protocol,
16
)
17
from polars.testing import assert_frame_equal, assert_series_equal
18
from polars.testing.parametric import dataframes
19
20
# Reusable marker: skips a test whenever the installed pandas version string
# begins with "2" (tracked upstream as issue #20316).
skip_if_broken_pandas_version = pytest.mark.skipif(
    pd.__version__.startswith("2"),
    reason="bug. see #20316",
)
23
24
if TYPE_CHECKING:
25
from polars._typing import PolarsDataType
26
27
# Data types handed to the `dataframes` strategy (as `allowed_dtypes`) by the
# parametric round-trip tests in this module.
protocol_dtypes: list[PolarsDataType] = [
    # Signed integers
    pl.Int8,
    pl.Int16,
    pl.Int32,
    pl.Int64,
    # Unsigned integers
    pl.UInt8,
    pl.UInt16,
    pl.UInt32,
    pl.UInt64,
    # Floating point
    pl.Float32,
    pl.Float64,
    # Everything else
    pl.Boolean,
    pl.String,
    pl.Datetime,
    # This is broken for empty dataframes
    # TODO: Enable lexically ordered categoricals
    # pl.Categorical("lexical"),
    # TODO: Add Enum
    # pl.Enum,
]
47
48
49
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_to_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
    """Export via `__dataframe__`, consume with pyarrow, and compare to the input."""
    arrow_table = pa.interchange.from_dataframe(df.__dataframe__())

    roundtripped: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
61
62
63
@pytest.mark.may_fail_cloud  # reason: not-lazy, likely environment related
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,
        ],
        allow_chunks=False,
    )
)
def test_to_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Same round trip through pyarrow, with `allow_copy=False` on both sides."""
    interchange_obj = df.__dataframe__(allow_copy=False)
    arrow_table = pa.interchange.from_dataframe(interchange_obj, allow_copy=False)

    roundtripped: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
80
81
82
@pytest.mark.filterwarnings(
    "ignore:.*PEP3118 format string that does not match its itemsize:RuntimeWarning"
)
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
    """Export via `__dataframe__`, consume with pandas, and compare to the input."""
    pandas_frame = pd.api.interchange.from_dataframe(df.__dataframe__())
    roundtripped = pl.from_pandas(pandas_frame, nan_to_null=False)
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
96
97
98
@pytest.mark.may_fail_cloud  # reason: not-lazy, likely environment related
@pytest.mark.filterwarnings(
    "ignore:.*PEP3118 format string that does not match its itemsize:RuntimeWarning"
)
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,
        ],
        allow_chunks=False,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_to_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Same round trip through pandas, with `allow_copy=False` on both sides."""
    interchange_obj = df.__dataframe__(allow_copy=False)
    pandas_frame = pd.api.interchange.from_dataframe(interchange_obj, allow_copy=False)
    roundtripped = pl.from_pandas(pandas_frame, nan_to_null=False)
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
118
119
120
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.Categorical,  # Categoricals read back as Enum types
        ],
    )
)
def test_from_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
    """Convert to a pyarrow table, read it back via the interchange protocol."""
    arrow_table = df.to_arrow()
    roundtripped = from_dataframe_interchange_protocol(arrow_table)
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
132
133
134
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,  # Polars copies the categories to construct a mapping
            pl.Boolean,  # pyarrow exports boolean buffers as byte-packed: https://github.com/apache/arrow/issues/37991
        ],
        allow_chunks=False,
    )
)
def test_from_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Read a pyarrow table back via the interchange protocol with `allow_copy=False`."""
    roundtripped = from_dataframe_interchange_protocol(
        df.to_arrow(),
        allow_copy=False,
    )
    assert_frame_equal(roundtripped, df)
149
150
151
@skip_if_broken_pandas_version
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.Categorical,  # Categoricals come back as Enums
            pl.Float32,  # NaN values come back as nulls
            pl.Float64,  # NaN values come back as nulls
        ],
    )
)
def test_from_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
    """Convert to pandas (pyarrow extension arrays), read back via the protocol."""
    pandas_frame = df.to_pandas(use_pyarrow_extension_array=True)
    roundtripped = from_dataframe_interchange_protocol(pandas_frame)
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
166
167
168
@skip_if_broken_pandas_version
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,  # Categoricals come back as Enums
            pl.Float32,  # NaN values come back as nulls
            pl.Float64,  # NaN values come back as nulls
            pl.Boolean,  # pandas exports boolean buffers as byte-packed
        ],
        # Empty dataframes cause an error due to a bug in pandas.
        # https://github.com/pandas-dev/pandas/issues/56700
        min_size=1,
        allow_chunks=False,
    )
)
def test_from_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Read a pyarrow-backed pandas frame back via the protocol with `allow_copy=False`."""
    pandas_frame = df.to_pandas(use_pyarrow_extension_array=True)
    roundtripped = from_dataframe_interchange_protocol(
        pandas_frame,
        allow_copy=False,
    )
    assert_frame_equal(roundtripped, df)
189
190
191
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.Categorical,  # Categoricals come back as Enums
            pl.Float32,  # NaN values come back as nulls
            pl.Float64,  # NaN values come back as nulls
        ],
        # Empty string columns cause an error due to a bug in pandas.
        # https://github.com/pandas-dev/pandas/issues/56703
        min_size=1,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:
    """Convert with a plain `to_pandas()` call, read back via the protocol."""
    pandas_frame = df.to_pandas()
    roundtripped = from_dataframe_interchange_protocol(pandas_frame)
    assert_frame_equal(roundtripped, df, categorical_as_str=True)
209
210
211
@given(
    dataframes(
        allowed_dtypes=protocol_dtypes,
        excluded_dtypes=[
            pl.String,  # Polars String type does not match protocol spec
            pl.Categorical,  # Categoricals come back as Enums
            pl.Float32,  # NaN values come back as nulls
            pl.Float64,  # NaN values come back as nulls
            pl.Boolean,  # pandas exports boolean buffers as byte-packed
        ],
        # Empty dataframes cause an error due to a bug in pandas.
        # https://github.com/pandas-dev/pandas/issues/56700
        min_size=1,
        allow_chunks=False,
        allow_null=False,  # Bug: https://github.com/pola-rs/polars/issues/16190
    )
)
def test_from_dataframe_pandas_native_zero_copy_parametric(df: pl.DataFrame) -> None:
    """Read a plain `to_pandas()` frame back via the protocol with `allow_copy=False`."""
    roundtripped = from_dataframe_interchange_protocol(
        df.to_pandas(),
        allow_copy=False,
    )
    assert_frame_equal(roundtripped, df)
232
233
234
def test_to_dataframe_pandas_boolean_subchunks() -> None:
    """Round-trip a boolean frame built by concatenating two row slices without rechunking."""
    df = pl.Series("a", [False, False]).to_frame()

    # Concatenate the two single-row slices with rechunk=False.
    single_rows = [df[0, :], df[1, :]]
    chunked = pl.concat(single_rows, rechunk=False)

    pandas_frame = pd.api.interchange.from_dataframe(chunked.__dataframe__())
    roundtripped = pl.from_pandas(pandas_frame, nan_to_null=False)

    assert_frame_equal(roundtripped, df)
243
244
245
def test_to_dataframe_pyarrow_boolean() -> None:
    """Round-trip a two-row boolean column through pyarrow's interchange consumer."""
    bool_series = pl.Series("a", [True, False], dtype=pl.Boolean)
    df = bool_series.to_frame()

    arrow_table = pa.interchange.from_dataframe(df.__dataframe__())
    roundtripped: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]

    assert_frame_equal(roundtripped, df)
253
254
255
def test_to_dataframe_pyarrow_boolean_midbyte_slice() -> None:
    """Round-trip a boolean column sliced from offset 3 of a 9-element series."""
    # NOTE(review): presumably the [3:] slice makes the boolean data start
    # partway through a byte, as the test name suggests - confirm.
    df = pl.Series("a", [False] * 9)[3:].to_frame()

    arrow_table = pa.interchange.from_dataframe(df.__dataframe__())
    roundtripped: pl.DataFrame = pl.from_arrow(arrow_table)  # type: ignore[assignment]

    assert_frame_equal(roundtripped, df)
264
265
266
@pytest.mark.skipif(
    parse_version(pd.__version__) < (2, 2),
    reason="Pandas versions < 2.2 do not implement the required conversions",
)
def test_from_dataframe_pandas_timestamp_ns() -> None:
    """Round-trip a nanosecond-precision datetime column through pandas."""
    timestamps = pl.Series("a", [datetime(2000, 1, 1)], dtype=pl.Datetime("ns"))
    df = timestamps.to_frame()

    pandas_frame = df.to_pandas(use_pyarrow_extension_array=True)
    roundtripped = pl.from_dataframe(pandas_frame)

    assert_frame_equal(roundtripped, df)
275
276
277
def test_from_pyarrow_str_dict_with_null_values_20270() -> None:
    """Check `pl.from_arrow` on a string dictionary column containing nulls.

    The dictionary holds a null entry and the indices hold a null as well;
    the expected series has a null in both of those positions.
    """
    dict_column = pa.DictionaryArray.from_arrays(
        [0, 0, None, 1, 2],
        ["A", None, "B"],
    )
    dict_schema = pa.schema({"col1": pa.dictionary(pa.uint32(), pa.string())})
    tb = pa.table({"col1": dict_column}, schema=dict_schema)

    df = pl.from_arrow(tb)
    assert isinstance(df, pl.DataFrame)

    expected = pl.Series("col1", ["A", "A", None, None, "B"], pl.Categorical)
    assert_series_equal(df.to_series(), expected)
)
292
293