GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interop/test_from_pandas.py
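"""
Tests for converting pandas DataFrames, Series, and Index objects
to polars via `pl.from_pandas`.
"""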
from __future__ import annotations

from datetime import datetime, timedelta
from typing import TYPE_CHECKING, Any

import numpy as np
import pandas as pd
import pytest

import polars as pl
from polars.testing import assert_frame_equal
from polars.testing.asserts.series import assert_series_equal

if TYPE_CHECKING:
    from polars._typing import PolarsDataType
    from tests.conftest import PlMonkeyPatch


def test_index_not_silently_excluded() -> None:
    ddict = {"a": [1, 2, 3], "b": [4, 5, 6]}
    df = pd.DataFrame(ddict, index=pd.Index([7, 8, 9], name="a"))
    with pytest.raises(ValueError, match="indices and column names must not overlap"):
        pl.from_pandas(df, include_index=True)


def test_nameless_multiindex_doesnt_raise_with_include_index_false_18130() -> None:
    df = pd.DataFrame(
        range(4),
        columns=["A"],
        index=pd.MultiIndex.from_product((["C", "D"], [3, 4])),
    )
    result = pl.from_pandas(df)
    expected = pl.DataFrame({"A": [0, 1, 2, 3]})
    assert_frame_equal(result, expected)


def test_from_pandas() -> None:
    df = pd.DataFrame(
        {
            "bools": [False, True, False],
            "bools_nulls": [None, True, False],
            "int": [1, 2, 3],
            "int_nulls": [1, None, 3],
            "floats": [1.0, 2.0, 3.0],
            "floats_nulls": [1.0, None, 3.0],
            "strings": ["foo", "bar", "ham"],
            "strings_nulls": ["foo", None, "ham"],
            "strings-cat": ["foo", "bar", "ham"],
        }
    )
    df["strings-cat"] = df["strings-cat"].astype("category")

    out = pl.from_pandas(df)
    assert out.shape == (3, 9)
    assert out.schema == {
        "bools": pl.Boolean,
        "bools_nulls": pl.Boolean,
        "int": pl.Int64,
        "int_nulls": pl.Float64,
        "floats": pl.Float64,
        "floats_nulls": pl.Float64,
        "strings": pl.String,
        "strings_nulls": pl.String,
        "strings-cat": pl.Categorical(),
    }
    assert out.rows() == [
        (False, None, 1, 1.0, 1.0, 1.0, "foo", "foo", "foo"),
        (True, True, 2, None, 2.0, None, "bar", None, "bar"),
        (False, False, 3, 3.0, 3.0, 3.0, "ham", "ham", "ham"),
    ]

    # partial dtype overrides from pandas
    overrides = {"int": pl.Int8, "int_nulls": pl.Int32, "floats": pl.Float32}
    out = pl.from_pandas(df, schema_overrides=overrides)
    for col, dtype in overrides.items():
        assert out.schema[col] == dtype


@pytest.mark.parametrize(
    "nulls",
    [
        [],
        [None],
        [None, None],
        [None, None, None],
    ],
)
def test_from_pandas_nulls(nulls: list[None]) -> None:
    # empty and/or all null values, no pandas dtype
    ps = pd.Series(nulls)
    s = pl.from_pandas(ps)
    assert nulls == s.to_list()


def test_from_pandas_nan_to_null() -> None:
    df = pd.DataFrame(
        {
            "bools_nulls": [None, True, False],
            "int_nulls": [1, None, 3],
            "floats_nulls": [1.0, None, 3.0],
            "strings_nulls": ["foo", None, "ham"],
            "nulls": [None, np.nan, np.nan],
        }
    )
    out_true = pl.from_pandas(df)
    out_false = pl.from_pandas(df, nan_to_null=False)
    assert all(val is None for val in out_true["nulls"])
    assert all(np.isnan(val) for val in out_false["nulls"][1:])

    # in a float Series, pandas stores None as NaN; with nan_to_null both become null
    df = pd.Series([2, np.nan, None], name="pd")  # type: ignore[assignment]
    out_true = pl.from_pandas(df)
    out_false = pl.from_pandas(df, nan_to_null=False)
    assert all(val is None for val in out_true[1:])
    assert all(np.isnan(val) for val in out_false[1:])


def test_from_pandas_datetime() -> None:
    ts = datetime(2021, 1, 1, 20, 20, 20, 20)
    pd_s = pd.Series([ts, ts])
    tmp = pl.from_pandas(pd_s.to_frame("a"))
    s = tmp["a"]
    assert s.dt.hour()[0] == 20
    assert s.dt.minute()[0] == 20
    assert s.dt.second()[0] == 20

    date_times = pd.date_range("2021-06-24 00:00:00", "2021-06-24 09:00:00", freq="1h")
    s = pl.from_pandas(date_times)
    assert s[0] == datetime(2021, 6, 24, 0, 0)
    assert s[-1] == datetime(2021, 6, 24, 9, 0)


@pytest.mark.parametrize(
    ("index_class", "index_data", "index_params", "expected_data", "expected_dtype"),
    [
        (pd.Index, [100, 200, 300], {}, None, pl.Int64),
        (pd.Index, [1, 2, 3], {"dtype": "uint32"}, None, pl.UInt32),
        (pd.RangeIndex, 5, {}, [0, 1, 2, 3, 4], pl.Int64),
        (pd.CategoricalIndex, ["N", "E", "S", "W"], {}, None, pl.Categorical),
        (
            pd.DatetimeIndex,
            [datetime(1960, 12, 31), datetime(2077, 10, 20)],
            {"dtype": "datetime64[ms]"},
            None,
            pl.Datetime("ms"),
        ),
        (
            pd.TimedeltaIndex,
            ["24 hours", "2 days 8 hours", "3 days 42 seconds"],
            {"dtype": "timedelta64[us]"},
            [timedelta(1), timedelta(days=2, hours=8), timedelta(days=3, seconds=42)],
            pl.Duration("us"),
        ),
    ],
)
def test_from_pandas_index(
    index_class: Any,
    index_data: Any,
    index_params: dict[str, Any],
    expected_data: list[Any] | None,
    expected_dtype: PolarsDataType,
) -> None:
    if expected_data is None:
        expected_data = index_data

    s = pl.from_pandas(index_class(index_data, **index_params))
    assert s.to_list() == expected_data
    assert s.dtype == expected_dtype


def test_from_pandas_include_indexes() -> None:
    data = {
        "dtm": [datetime(2021, 1, 1), datetime(2021, 1, 2), datetime(2021, 1, 3)],
        "val": [100, 200, 300],
        "misc": ["x", "y", "z"],
    }
    pd_df = pd.DataFrame(data)

    df = pl.from_pandas(pd_df.set_index(["dtm"]))
    assert df.to_dict(as_series=False) == {
        "val": [100, 200, 300],
        "misc": ["x", "y", "z"],
    }

    df = pl.from_pandas(pd_df.set_index(["dtm", "val"]))
    assert df.to_dict(as_series=False) == {"misc": ["x", "y", "z"]}

    df = pl.from_pandas(pd_df.set_index(["dtm"]), include_index=True)
    assert df.to_dict(as_series=False) == data

    df = pl.from_pandas(pd_df.set_index(["dtm", "val"]), include_index=True)
    assert df.to_dict(as_series=False) == data


def test_from_pandas_series_include_indexes() -> None:
    # no default index
    pd_series = pd.Series({"a": 1, "b": 2}, name="number").rename_axis(["letter"])
    df = pl.from_pandas(pd_series, include_index=True)
    assert df.to_dict(as_series=False) == {"letter": ["a", "b"], "number": [1, 2]}

    # default index
    pd_series = pd.Series(range(2))
    df = pl.from_pandas(pd_series, include_index=True)
    assert df.to_dict(as_series=False) == {"index": [0, 1], "0": [0, 1]}


def test_duplicate_cols_diff_types() -> None:
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["0", 0, "1", 1])
    with pytest.raises(
        ValueError,
        match="Pandas dataframe contains non-unique indices and/or column names",
    ):
        pl.from_pandas(df)


def test_from_pandas_duplicated_columns() -> None:
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["a", "b", "c", "b"])
    with pytest.raises(
        ValueError,
        match="Pandas dataframe contains non-unique indices and/or column names",
    ):
        pl.from_pandas(df)


def test_from_pandas_null() -> None:
    # an all-null column has pandas object dtype, so pl.String is the closest match
    df = pd.DataFrame([{"a": None}, {"a": None}])
    out = pl.DataFrame(df)
    assert out.dtypes == [pl.String]
    assert out["a"][0] is None

    df = pd.DataFrame([{"a": None, "b": 1}, {"a": None, "b": 2}])
    out = pl.DataFrame(df)
    assert out.dtypes == [pl.String, pl.Int64]


def test_from_pandas_nested_list() -> None:
    # this panicked in https://github.com/pola-rs/polars/issues/1615
    pddf = pd.DataFrame(
        {"a": [1, 2, 3, 4], "b": [["x", "y"], ["x", "y", "z"], ["x"], ["x", "y"]]}
    )
    pldf = pl.from_pandas(pddf)
    assert pldf.shape == (4, 2)
    assert pldf.rows() == [
        (1, ["x", "y"]),
        (2, ["x", "y", "z"]),
        (3, ["x"]),
        (4, ["x", "y"]),
    ]


def test_from_pandas_categorical_none() -> None:
    s = pd.Series(["a", "b", "c", pd.NA], dtype="category")
    out = pl.from_pandas(s)
    assert out.dtype == pl.Categorical
    assert out.to_list() == ["a", "b", "c", None]


def test_from_pandas_dataframe() -> None:
    pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    df = pl.from_pandas(pd_df)
    assert df.shape == (2, 3)
    assert df.rows() == [(1, 2, 3), (4, 5, 6)]

    # if the input is not a pandas DataFrame, raise a TypeError
    with pytest.raises(TypeError):
        _ = pl.from_pandas([1, 2])  # type: ignore[call-overload]


def test_from_pandas_series() -> None:
    pd_series = pd.Series([1, 2, 3], name="pd")
    s = pl.from_pandas(pd_series)
    assert s.shape == (3,)
    assert list(s) == [1, 2, 3]


def test_from_empty_pandas() -> None:
    pandas_df = pd.DataFrame(
        {
            "A": [],
            "fruits": [],
        }
    )
    polars_df = pl.from_pandas(pandas_df)
    assert polars_df.columns == ["A", "fruits"]
    assert polars_df.dtypes == [pl.Float64, pl.Float64]


def test_from_null_column() -> None:
    df = pl.from_pandas(pd.DataFrame(data=[pd.NA, pd.NA], columns=["n/a"]))

    assert df.shape == (2, 1)
    assert df.columns == ["n/a"]
    assert df.dtypes[0] == pl.Null


def test_from_pandas_ns_resolution() -> None:
    df = pd.DataFrame(
        [pd.Timestamp(year=2021, month=1, day=1, hour=1, second=1, nanosecond=1)],
        columns=["date"],
    )
    assert pl.from_pandas(df)[0, 0] == datetime(2021, 1, 1, 1, 0, 1)


def test_pandas_string_none_conversion_3298() -> None:
    data: dict[str, list[str | None]] = {"col_1": ["a", "b", "c", "d"]}
    data["col_1"][0] = None
    df_pd = pd.DataFrame(data)
    df_pl = pl.DataFrame(df_pd)
    assert df_pl.to_series().to_list() == [None, "b", "c", "d"]


def test_from_pandas_null_struct_6412() -> None:
    data = [
        {
            "a": {
                "b": None,
            },
        },
        {"a": None},
    ]
    df_pandas = pd.DataFrame(data)
    assert pl.from_pandas(df_pandas).to_dict(as_series=False) == {
        "a": [{"b": None}, None]
    }


def test_untrusted_categorical_input() -> None:
    df_pd = pd.DataFrame({"x": pd.Categorical(["x"], ["x", "y"])})
    df = pl.from_pandas(df_pd)
    result = df.group_by("x").len()
    expected = pl.DataFrame(
        {"x": ["x"], "len": [1]},
        schema={"x": pl.Categorical, "len": pl.get_index_type()},
    )
    assert_frame_equal(result, expected, categorical_as_str=True)


@pytest.fixture
def _set_pyarrow_unavailable(plmonkeypatch: PlMonkeyPatch) -> None:
    plmonkeypatch.setattr(
        "polars._utils.construction.dataframe._PYARROW_AVAILABLE", False
    )
    plmonkeypatch.setattr("polars._utils.construction.series._PYARROW_AVAILABLE", False)


@pytest.mark.usefixtures("_set_pyarrow_unavailable")
def test_from_pandas_pyarrow_not_available_succeeds() -> None:
    data: dict[str, Any] = {
        "a": [1, 2],
        "b": [3, 4],
        "c": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]"),
        "d": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[us]"),
        "e": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ms]"),
        "f": np.array([1, 2], dtype="timedelta64[ns]"),
        "g": np.array([1, 2], dtype="timedelta64[us]"),
        "h": np.array([1, 2], dtype="timedelta64[ms]"),
        "i": [True, False],
    }

    # DataFrame
    result = pl.from_pandas(pd.DataFrame(data))
    expected = pl.DataFrame(data)
    assert_frame_equal(result, expected)

    # Series
    for col in data:
        s_pd = pd.Series(data[col])
        result_s = pl.from_pandas(s_pd)
        expected_s = pl.Series(data[col])
        assert_series_equal(result_s, expected_s)


@pytest.mark.usefixtures("_set_pyarrow_unavailable")
def test_from_pandas_pyarrow_not_available_fails() -> None:
    with pytest.raises(ImportError, match="pyarrow is required"):
        pl.from_pandas(pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64"))
    with pytest.raises(ImportError, match="pyarrow is required"):
        pl.from_pandas(pd.Series([1, 2, 3], dtype="Int64"))
    with pytest.raises(ImportError, match="pyarrow is required"):
        pl.from_pandas(
            pd.DataFrame({"a": pd.to_datetime(["2020-01-01T00:00+01:00"]).to_series()})
        )
    with pytest.raises(ImportError, match="pyarrow is required"):
        pl.from_pandas(pd.DataFrame({"a": [None, "foo"]}))


def test_from_pandas_nan_to_null_16453(plmonkeypatch: PlMonkeyPatch) -> None:
    plmonkeypatch.setattr(
        "polars._utils.construction.dataframe._MIN_NUMPY_SIZE_FOR_MULTITHREADING", 2
    )
    df = pd.DataFrame(
        {"a": [np.nan, 1.0, 2], "b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}
    )
    result = pl.from_pandas(df, nan_to_null=True)
    expected = pl.DataFrame(
        {"a": [None, 1.0, 2], "b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}
    )
    assert_frame_equal(result, expected)


@pytest.mark.parametrize("null", [pd.NA, np.nan, None, float("nan")])
def test_from_pandas_string_with_natype_17355(null: Any) -> None:
    # https://github.com/pola-rs/polars/issues/17355
    pd_df = pd.DataFrame({"col": ["a", null]})
    result = pl.from_pandas(pd_df)
    expected = pl.DataFrame({"col": ["a", None]})
    assert_frame_equal(result, expected)