Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interop/test_to_pandas.py
8458 views
1
from __future__ import annotations
2
3
from datetime import date, datetime
4
from typing import TYPE_CHECKING, Literal
5
6
import hypothesis.strategies as st
7
import numpy as np
8
import pandas as pd
9
import pyarrow as pa
10
import pytest
11
from hypothesis import given
12
13
import polars as pl
14
from polars._utils.various import parse_version
15
16
if TYPE_CHECKING:
17
from polars._typing import PolarsDataType
18
19
20
def test_df_to_pandas_empty() -> None:
    """An empty polars DataFrame converts to an empty pandas DataFrame."""
    empty_frame = pl.DataFrame()
    pd.testing.assert_frame_equal(empty_frame.to_pandas(), pd.DataFrame())
25
26
27
def test_to_pandas() -> None:
    """Check the pandas dtypes produced by ``DataFrame.to_pandas``.

    Three conversions are exercised: the default one, ``date_as_object=True``
    and ``use_pyarrow_extension_array=True``.
    """
    columns = {
        "a": [1, 2, 3],
        "b": [6, None, 8],
        "c": [10.0, 25.0, 50.5],
        "d": [date(2023, 7, 5), None, date(1999, 12, 13)],
        "e": ["a", "b", "c"],
        "f": [None, "e", "f"],
        "g": [datetime.now(), datetime.now(), None],
    }
    base = pl.DataFrame(columns, schema_overrides={"a": pl.UInt8})
    # Columns "h" and "i" are categorical copies of the string columns.
    df = base.with_columns(
        pl.col("e").cast(pl.Categorical).alias("h"),
        pl.col("f").cast(pl.Categorical).alias("i"),
    )

    # The expected string dtype depends on the installed pandas major version.
    if parse_version(pd.__version__) >= (3,):
        string_dtype = pd.StringDtype(na_value=float("nan"))
    else:
        string_dtype = np.object_

    # Expected dtypes for columns a-c and e-g; column d (the date) varies
    # with the conversion options and is inserted per assertion.
    before_date = [np.dtype(np.uint8), np.dtype(np.float64), np.dtype(np.float64)]
    after_date = [string_dtype, string_dtype, np.dtype("datetime64[us]")]

    default_dtypes = df.to_pandas().dtypes.to_list()
    expected_default = [*before_date, np.dtype("datetime64[ms]"), *after_date]
    assert expected_default == default_dtypes[:-2]
    # The two trailing columns are the categorical ones.
    assert all(isinstance(dt, pd.CategoricalDtype) for dt in default_dtypes[-2:])

    object_date_dtypes = df.to_pandas(date_as_object=True).dtypes.to_list()
    expected_object_date = [*before_date, np.dtype("O"), *after_date]
    assert expected_object_date == object_date_dtypes[:-2]

    arrow_frame = df.to_pandas(use_pyarrow_extension_array=True)
    arrow_dtype_names = [dtype.name for dtype in arrow_frame.dtypes]
    dictionary_name = (
        "dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]"
    )
    assert arrow_dtype_names == [
        "uint8[pyarrow]",
        "int64[pyarrow]",
        "double[pyarrow]",
        "date32[day][pyarrow]",
        "large_string[pyarrow]",
        "large_string[pyarrow]",
        "timestamp[us][pyarrow]",
        dictionary_name,
        dictionary_name,
    ]
82
83
84
@pytest.mark.parametrize("dtype", [pl.Categorical, pl.Enum(["best", "test"])])
def test_cat_to_pandas(dtype: pl.DataType) -> None:
    """Categorical and Enum columns convert to pandas categorical dtypes.

    With ``use_pyarrow_extension_array=True`` the column is expected to be an
    arrow dictionary of ``int64`` indices and ``large_string`` values.
    """
    frame = pl.DataFrame({"a": ["best", "test"]}).with_columns(pl.all().cast(dtype))

    numpy_backed = frame.to_pandas()
    assert isinstance(numpy_backed["a"].dtype, pd.CategoricalDtype)

    arrow_backed = frame.to_pandas(use_pyarrow_extension_array=True)
    expected_arrow_dtype = pd.ArrowDtype(pa.dictionary(pa.int64(), pa.large_string()))
    assert arrow_backed["a"].dtype == expected_arrow_dtype

    # An empty Enum series must still expose its categories after conversion.
    empty_enum = pl.Series(dtype=pl.Enum(["A"])).to_pandas()
    assert empty_enum.dtype.categories.tolist() == ["A"]  # type: ignore[union-attr]
100
101
102
@given(
    column_type_names=st.lists(
        st.one_of(st.just("Object"), st.just("Int32")), min_size=1, max_size=8
    )
)
def test_object_to_pandas(column_type_names: list[Literal["Object", "Int32"]]) -> None:
    """
    Converting ``pl.Object`` dtype columns to Pandas is handled correctly.

    This edge case is handled with a separate code path than other data types,
    so we test it more thoroughly.
    """
    data = {}
    schema = {}
    # Build the single-row payload and the matching schema in one pass.
    for idx, type_name in enumerate(column_type_names):
        dtype = getattr(pl, type_name)
        column = f"col_{idx}"
        schema[column] = dtype
        data[column] = [object()] if dtype.is_object() else [-idx]

    df = pl.DataFrame(data, schema=schema)

    # Both backends must hand back exactly the values that went in.
    for use_arrow in (True, False):
        converted = df.to_pandas(use_pyarrow_extension_array=use_arrow)
        assert isinstance(converted, pd.DataFrame)
        assert converted.to_dict(orient="list") == data
126
127
128
def test_from_empty_pandas_with_dtypes() -> None:
    """Column dtypes of empty pandas frames are kept by ``pl.from_pandas``."""
    pandas_empty = pd.DataFrame(columns=["a", "b"])
    pandas_empty["a"] = pandas_empty["a"].astype(str)
    pandas_empty["b"] = pandas_empty["b"].astype(float)
    assert pl.from_pandas(pandas_empty).dtypes == [pl.String, pl.Float64]

    # Round trip: empty polars frame -> pandas -> polars keeps every dtype.
    schema = {
        "a": pl.Int32,
        "b": pl.Datetime,
        "c": pl.Float32,
        "d": pl.Duration,
        "e": pl.String,
    }
    via_pandas = pl.DataFrame(data=[], schema=schema).to_pandas()
    assert pl.from_pandas(via_pandas).dtypes == list(schema.values())
152
153
154
def test_to_pandas_series() -> None:
    """An integer polars Series converts to the equivalent pandas Series.

    Uses ``assert_series_equal`` so that the name and dtype are verified in
    addition to the values; an element-wise ``==`` followed by ``.all()``
    would only compare the values.
    """
    result = pl.Series("a", [1, 2, 3]).to_pandas()
    expected = pd.Series([1, 2, 3], dtype="int64", name="a")
    pd.testing.assert_series_equal(result, expected)
156
157
158
def test_to_pandas_date() -> None:
    """Date values convert to ``datetime64[ms]`` for Series and DataFrame."""
    dates = [date(1990, 1, 1), date(2024, 12, 31)]
    series = pl.Series("a", dates)
    expected = pd.Series(dates, dtype="datetime64[ms]", name="a")

    # Series conversion.
    pd.testing.assert_series_equal(series.to_pandas(), expected)
    # Same data converted as a one-column frame.
    pd.testing.assert_frame_equal(series.to_frame().to_pandas(), expected.to_frame())
169
170
171
def test_to_pandas_datetime() -> None:
    """Datetime values convert to ``datetime64[us]`` for Series and DataFrame."""
    stamps = [datetime(1990, 1, 1, 0, 0, 0), datetime(2024, 12, 31, 23, 59, 59)]
    series = pl.Series("a", stamps)
    expected = pd.Series(stamps, dtype="datetime64[us]", name="a")

    # Series conversion.
    pd.testing.assert_series_equal(series.to_pandas(), expected)
    # Same data converted as a one-column frame.
    pd.testing.assert_frame_equal(series.to_frame().to_pandas(), expected.to_frame())
182
183
184
@pytest.mark.parametrize("use_pyarrow_extension_array", [True, False])
def test_object_to_pandas_series(use_pyarrow_extension_array: bool) -> None:
    """An Object Series converts to an object-dtype pandas Series."""
    values = [object(), [1, 2, 3]]
    converted = pl.Series("a", values, dtype=pl.Object).to_pandas(
        use_pyarrow_extension_array=use_pyarrow_extension_array
    )
    expected = pd.Series(values, dtype=object, name="a")
    pd.testing.assert_series_equal(converted, expected)
193
194
195
@pytest.mark.parametrize("polars_dtype", [pl.Categorical, pl.Enum(["a", "b"])])
def test_series_to_pandas_categorical(polars_dtype: PolarsDataType) -> None:
    """Categorical and Enum Series convert to a pandas categorical Series."""
    labels = ["a", "b", "a"]
    converted = pl.Series("x", labels, dtype=polars_dtype).to_pandas()
    reference = pd.Series(["a", "b", "a"], name="x", dtype="category")

    assert isinstance(converted.dtype, pd.CategoricalDtype)
    # check_categorical=False is passed through unchanged to the comparison.
    pd.testing.assert_series_equal(converted, reference, check_categorical=False)
202
203
204
@pytest.mark.parametrize("polars_dtype", [pl.Categorical, pl.Enum(["a", "b"])])
def test_series_to_pandas_categorical_pyarrow(polars_dtype: PolarsDataType) -> None:
    """Values are preserved when converting with the pyarrow extension array."""
    series = pl.Series("x", ["a", "b", "a"], dtype=polars_dtype)
    arrow_backed = series.to_pandas(use_pyarrow_extension_array=True)
    assert series.to_list() == arrow_backed.to_list()
209
210