Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interop/test_to_pandas.py
6939 views
1
from __future__ import annotations
2
3
from datetime import date, datetime
4
from typing import TYPE_CHECKING, Literal
5
6
import hypothesis.strategies as st
7
import numpy as np
8
import pandas as pd
9
import pyarrow as pa
10
import pytest
11
from hypothesis import given
12
13
import polars as pl
14
15
if TYPE_CHECKING:
16
from polars._typing import PolarsDataType
17
18
19
def test_df_to_pandas_empty() -> None:
    """An empty polars DataFrame converts to an empty pandas DataFrame."""
    converted = pl.DataFrame().to_pandas()
    pd.testing.assert_frame_equal(converted, pd.DataFrame())
24
25
26
def test_to_pandas() -> None:
    """
    Check the pandas dtypes produced by ``DataFrame.to_pandas``.

    Three conversions are exercised: the default one, ``date_as_object=True``
    and ``use_pyarrow_extension_array=True``.
    """
    columns = {
        "a": [1, 2, 3],
        "b": [6, None, 8],
        "c": [10.0, 25.0, 50.5],
        "d": [date(2023, 7, 5), None, date(1999, 12, 13)],
        "e": ["a", "b", "c"],
        "f": [None, "e", "f"],
        "g": [datetime.now(), datetime.now(), None],
    }
    frame = pl.DataFrame(columns, schema_overrides={"a": pl.UInt8})
    # "h" and "i" are categorical copies of the two string columns.
    frame = frame.with_columns(
        pl.col("e").cast(pl.Categorical).alias("h"),
        pl.col("f").cast(pl.Categorical).alias("i"),
    )

    # Default conversion: columns "a".."g" are compared against numpy dtypes;
    # the two trailing categorical columns are checked separately.
    numpy_dtypes = [
        np.dtype(np.uint8),
        np.dtype(np.float64),
        np.dtype(np.float64),
        np.dtype("datetime64[ms]"),
        np.dtype(np.object_),
        np.dtype(np.object_),
        np.dtype("datetime64[us]"),
    ]
    default_dtypes = frame.to_pandas().dtypes.to_list()
    assert numpy_dtypes == default_dtypes[:-2]
    for categorical_dtype in default_dtypes[-2:]:
        assert isinstance(categorical_dtype, pd.CategoricalDtype)

    # With date_as_object=True only the date column (index 3) is expected
    # to change, becoming an object column.
    numpy_dtypes[3] = np.dtype("O")
    object_date_dtypes = frame.to_pandas(date_as_object=True).dtypes.to_list()
    assert numpy_dtypes == object_date_dtypes[:-2]

    # PyArrow-backed conversion: compare by dtype name, all nine columns.
    arrow_backed = frame.to_pandas(use_pyarrow_extension_array=True)
    arrow_dtype_names = [arrow_dtype.name for arrow_dtype in arrow_backed.dtypes]
    assert arrow_dtype_names == [
        "uint8[pyarrow]",
        "int64[pyarrow]",
        "double[pyarrow]",
        "date32[day][pyarrow]",
        "large_string[pyarrow]",
        "large_string[pyarrow]",
        "timestamp[us][pyarrow]",
        "dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]",
        "dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]",
    ]
77
78
79
@pytest.mark.parametrize("dtype", [pl.Categorical, pl.Enum(["best", "test"])])
def test_cat_to_pandas(dtype: pl.DataType) -> None:
    """Categorical and Enum columns keep a categorical/dictionary dtype in pandas."""
    frame = pl.DataFrame({"a": ["best", "test"]}).with_columns(pl.all().cast(dtype))

    # NumPy-backed conversion yields a pandas categorical column.
    numpy_backed = frame.to_pandas()
    assert isinstance(numpy_backed["a"].dtype, pd.CategoricalDtype)

    # PyArrow-backed conversion yields a dictionary-encoded Arrow dtype.
    expected_arrow_dtype = pd.ArrowDtype(pa.dictionary(pa.int64(), pa.large_string()))
    arrow_backed = frame.to_pandas(use_pyarrow_extension_array=True)
    assert arrow_backed["a"].dtype == expected_arrow_dtype
91
92
93
@given(
    column_type_names=st.lists(
        st.one_of(st.just("Object"), st.just("Int32")), min_size=1, max_size=8
    )
)
def test_object_to_pandas(column_type_names: list[Literal["Object", "Int32"]]) -> None:
    """
    Converting ``pl.Object`` dtype columns to Pandas is handled correctly.

    This edge case is handled with a separate code path than other data types,
    so we test it more thoroughly.
    """
    column_types = [getattr(pl, name) for name in column_type_names]
    # One single-row column per generated dtype: an opaque object for Object
    # columns, a distinct integer for Int32 columns.
    data = {
        f"col_{i}": [object()] if dtype.is_object() else [-i]
        for i, dtype in enumerate(column_types)
    }
    schema = {f"col_{i}": dtype for i, dtype in enumerate(column_types)}
    df = pl.DataFrame(data, schema=schema)
    for use_pyarrow_extension_array in (True, False):
        pandas_df = df.to_pandas(
            use_pyarrow_extension_array=use_pyarrow_extension_array
        )
        assert isinstance(pandas_df, pd.DataFrame)
        # The values must come back unchanged in both conversion modes.
        assert pandas_df.to_dict(orient="list") == data
117
118
119
def test_from_empty_pandas_with_dtypes() -> None:
    """Column dtypes of empty pandas frames are carried over by ``pl.from_pandas``."""
    # Empty pandas frame whose columns were cast explicitly.
    pandas_df = pd.DataFrame(columns=["a", "b"])
    pandas_df["a"] = pandas_df["a"].astype(str)
    pandas_df["b"] = pandas_df["b"].astype(float)
    assert pl.from_pandas(pandas_df).dtypes == [pl.String, pl.Float64]

    # Empty polars frame round-tripped through pandas keeps its schema dtypes.
    schema = {
        "a": pl.Int32,
        "b": pl.Datetime,
        "c": pl.Float32,
        "d": pl.Duration,
        "e": pl.String,
    }
    empty_pandas_df = pl.DataFrame(data=[], schema=schema).to_pandas()
    roundtripped = pl.from_pandas(empty_pandas_df)
    assert roundtripped.dtypes == list(schema.values())
143
144
145
def test_to_pandas_series() -> None:
    """A polars integer Series converts to the equivalent pandas Series."""
    result = pl.Series("a", [1, 2, 3]).to_pandas()
    expected = pd.Series([1, 2, 3], name="a")
    # assert_series_equal also verifies the name, dtype and index, which an
    # element-wise ``==`` followed by ``.all()`` would silently ignore.
    pd.testing.assert_series_equal(result, expected)
147
148
149
def test_to_pandas_date() -> None:
    """Date values are expected as ``datetime64[ms]`` for both Series and DataFrame."""
    dates = [date(1990, 1, 1), date(2024, 12, 31)]
    polars_series = pl.Series("a", dates)
    expected_series = pd.Series(dates, dtype="datetime64[ms]", name="a")

    # Series conversion.
    pd.testing.assert_series_equal(polars_series.to_pandas(), expected_series)

    # Single-column frame conversion must agree with the Series result.
    pd.testing.assert_frame_equal(
        polars_series.to_frame().to_pandas(), expected_series.to_frame()
    )
160
161
162
def test_to_pandas_datetime() -> None:
    """Datetime values are expected as ``datetime64[us]`` for both Series and DataFrame."""
    timestamps = [datetime(1990, 1, 1, 0, 0, 0), datetime(2024, 12, 31, 23, 59, 59)]
    polars_series = pl.Series("a", timestamps)
    expected_series = pd.Series(timestamps, dtype="datetime64[us]", name="a")

    # Series conversion.
    pd.testing.assert_series_equal(polars_series.to_pandas(), expected_series)

    # Single-column frame conversion must agree with the Series result.
    pd.testing.assert_frame_equal(
        polars_series.to_frame().to_pandas(), expected_series.to_frame()
    )
173
174
175
@pytest.mark.parametrize("use_pyarrow_extension_array", [True, False])
def test_object_to_pandas_series(use_pyarrow_extension_array: bool) -> None:
    """An Object Series is expected as an object-dtype pandas Series in either mode."""
    values = [object(), [1, 2, 3]]
    converted = pl.Series("a", values, dtype=pl.Object).to_pandas(
        use_pyarrow_extension_array=use_pyarrow_extension_array
    )
    expected = pd.Series(values, dtype=object, name="a")
    pd.testing.assert_series_equal(converted, expected)
184
185
186
@pytest.mark.parametrize("polars_dtype", [pl.Categorical, pl.Enum(["a", "b"])])
def test_series_to_pandas_categorical(polars_dtype: PolarsDataType) -> None:
    """Categorical and Enum Series convert to a pandas categorical Series."""
    labels = ["a", "b", "a"]
    converted = pl.Series("x", labels, dtype=polars_dtype).to_pandas()

    assert isinstance(converted.dtype, pd.CategoricalDtype)
    # Values are compared with check_categorical=False, so the categories
    # themselves are not required to match exactly.
    pd.testing.assert_series_equal(
        converted,
        pd.Series(labels, name="x", dtype="category"),
        check_categorical=False,
    )
193
194
195
@pytest.mark.parametrize("polars_dtype", [pl.Categorical, pl.Enum(["a", "b"])])
def test_series_to_pandas_categorical_pyarrow(polars_dtype: PolarsDataType) -> None:
    """Categorical and Enum Series keep their values in a PyArrow-backed conversion."""
    polars_series = pl.Series("x", ["a", "b", "a"], dtype=polars_dtype)
    arrow_backed = polars_series.to_pandas(use_pyarrow_extension_array=True)
    assert polars_series.to_list() == arrow_backed.to_list()
200
201