CoCalc -- test_from

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interop/test_from_pandas.py
6939 views
1
from __future__ import annotations
2

3
from datetime import datetime, timedelta
4
from typing import TYPE_CHECKING, Any
5

6
import numpy as np
7
import pandas as pd
8
import pytest
9

10
import polars as pl
11
from polars.testing import assert_frame_equal
12
from polars.testing.asserts.series import assert_series_equal
13

14
if TYPE_CHECKING:
15
    from polars._typing import PolarsDataType
16

17

18
def test_index_not_silently_excluded() -> None:
    """An index named like an existing column must raise when it is included."""
    clashing_index = pd.Index([7, 8, 9], name="a")
    pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=clashing_index)

    with pytest.raises(ValueError, match="indices and column names must not overlap"):
        pl.from_pandas(pd_df, include_index=True)
23

24

25
def test_nameless_multiindex_doesnt_raise_with_include_index_false_18130() -> None:
    """Convert a frame carrying an unnamed MultiIndex without requesting the index."""
    unnamed_levels = pd.MultiIndex.from_product((["C", "D"], [3, 4]))
    pd_df = pd.DataFrame(range(4), columns=["A"], index=unnamed_levels)

    converted = pl.from_pandas(pd_df)

    # Only the data column is expected in the output.
    assert_frame_equal(converted, pl.DataFrame({"A": [0, 1, 2, 3]}))
34

35

36
def test_from_pandas() -> None:
    """Convert a mixed-dtype pandas frame; check shape, schema, rows and overrides."""
    pd_df = pd.DataFrame(
        {
            "bools": [False, True, False],
            "bools_nulls": [None, True, False],
            "int": [1, 2, 3],
            "int_nulls": [1, None, 3],
            "floats": [1.0, 2.0, 3.0],
            "floats_nulls": [1.0, None, 3.0],
            "strings": ["foo", "bar", "ham"],
            "strings_nulls": ["foo", None, "ham"],
            "strings-cat": ["foo", "bar", "ham"],
        }
    )
    pd_df["strings-cat"] = pd_df["strings-cat"].astype("category")

    converted = pl.from_pandas(pd_df)

    expected_schema = {
        "bools": pl.Boolean,
        "bools_nulls": pl.Boolean,
        "int": pl.Int64,
        "int_nulls": pl.Float64,
        "floats": pl.Float64,
        "floats_nulls": pl.Float64,
        "strings": pl.String,
        "strings_nulls": pl.String,
        "strings-cat": pl.Categorical(ordering="lexical"),
    }
    expected_rows = [
        (False, None, 1, 1.0, 1.0, 1.0, "foo", "foo", "foo"),
        (True, True, 2, None, 2.0, None, "bar", None, "bar"),
        (False, False, 3, 3.0, 3.0, 3.0, "ham", "ham", "ham"),
    ]
    assert converted.shape == (3, 9)
    assert converted.schema == expected_schema
    assert converted.rows() == expected_rows

    # A partial set of dtype overrides must be honoured for the named columns.
    overrides = {"int": pl.Int8, "int_nulls": pl.Int32, "floats": pl.Float32}
    overridden = pl.from_pandas(pd_df, schema_overrides=overrides)
    for column_name, wanted_dtype in overrides.items():
        assert overridden.schema[column_name] == wanted_dtype
76

77

78
@pytest.mark.parametrize(
    "nulls",
    # Lists of zero to three ``None`` values.
    [[None] * count for count in range(4)],
)
def test_from_pandas_nulls(nulls: list[None]) -> None:
    """Round-trip an empty or all-``None`` pandas Series built without a dtype."""
    converted = pl.from_pandas(pd.Series(nulls))
    assert nulls == converted.to_list()
92

93

94
def test_from_pandas_nan_to_null() -> None:
    """Check that ``nan_to_null`` decides whether NaN values come back as nulls.

    The conversion is exercised twice: once on a DataFrame and once on a
    Series, each with the default setting and with ``nan_to_null=False``.
    """
    pd_df = pd.DataFrame(
        {
            "bools_nulls": [None, True, False],
            "int_nulls": [1, None, 3],
            "floats_nulls": [1.0, None, 3.0],
            "strings_nulls": ["foo", None, "ham"],
            "nulls": [None, np.nan, np.nan],
        }
    )
    out_true = pl.from_pandas(pd_df)
    out_false = pl.from_pandas(pd_df, nan_to_null=False)
    assert all(val is None for val in out_true["nulls"])
    assert all(np.isnan(val) for val in out_false["nulls"][1:])

    pd_series = pd.Series([2, np.nan, None], name="pd")
    s_true = pl.from_pandas(pd_series)
    s_false = pl.from_pandas(pd_series, nan_to_null=False)
    # Asserting a bare list is always truthy, so reduce with ``all``.
    # Index 0 holds the real value 2; only the trailing entries are missing.
    assert all(val is None for val in s_true[1:])
    assert all(np.isnan(val) for val in s_false[1:])
114

115

116
def test_from_pandas_datetime() -> None:
    """Datetime values keep their components when converted from pandas."""
    stamp = datetime(2021, 1, 1, 20, 20, 20, 20)
    pd_frame = pd.Series([stamp, stamp]).to_frame("a")
    column = pl.from_pandas(pd_frame)["a"]
    assert column.dt.hour()[0] == 20
    assert column.dt.minute()[0] == 20
    assert column.dt.second()[0] == 20

    # An hourly date range: check the first and last converted values.
    hourly = pd.date_range("2021-06-24 00:00:00", "2021-06-24 09:00:00", freq="1h")
    converted = pl.from_pandas(hourly)
    assert converted[0] == datetime(2021, 6, 24, 0, 0)
    assert converted[-1] == datetime(2021, 6, 24, 9, 0)
129

130

131
@pytest.mark.parametrize(
    ("index_class", "index_data", "index_params", "expected_data", "expected_dtype"),
    [
        (pd.Index, [100, 200, 300], {}, None, pl.Int64),
        (pd.Index, [1, 2, 3], {"dtype": "uint32"}, None, pl.UInt32),
        (pd.RangeIndex, 5, {}, [0, 1, 2, 3, 4], pl.Int64),
        (pd.CategoricalIndex, ["N", "E", "S", "W"], {}, None, pl.Categorical),
        (
            pd.DatetimeIndex,
            [datetime(1960, 12, 31), datetime(2077, 10, 20)],
            {"dtype": "datetime64[ms]"},
            None,
            pl.Datetime("ms"),
        ),
        (
            pd.TimedeltaIndex,
            ["24 hours", "2 days 8 hours", "3 days 42 seconds"],
            {},
            [timedelta(1), timedelta(days=2, hours=8), timedelta(days=3, seconds=42)],
            pl.Duration("ns"),
        ),
    ],
)
def test_from_pandas_index(
    index_class: Any,
    index_data: Any,
    index_params: dict[str, Any],
    expected_data: list[Any] | None,
    expected_dtype: PolarsDataType,
) -> None:
    """Convert a pandas index and compare its values and dtype.

    When ``expected_data`` is ``None`` the input ``index_data`` doubles as the
    expected output values.
    """
    pd_index = index_class(index_data, **index_params)
    wanted_values = index_data if expected_data is None else expected_data

    converted = pl.from_pandas(pd_index)

    assert converted.to_list() == wanted_values
    assert converted.dtype == expected_dtype
167

168

169
def test_from_pandas_include_indexes() -> None:
    """Index columns are dropped by default and kept with ``include_index=True``."""
    data = {
        "dtm": [datetime(2021, 1, 1), datetime(2021, 1, 2), datetime(2021, 1, 3)],
        "val": [100, 200, 300],
        "misc": ["x", "y", "z"],
    }
    pd_df = pd.DataFrame(data)
    single_indexed = pd_df.set_index(["dtm"])
    multi_indexed = pd_df.set_index(["dtm", "val"])

    # Default conversion: whatever was moved into the index is absent.
    without_single = pl.from_pandas(single_indexed)
    assert without_single.to_dict(as_series=False) == {
        "val": [100, 200, 300],
        "misc": ["x", "y", "z"],
    }

    without_multi = pl.from_pandas(multi_indexed)
    assert without_multi.to_dict(as_series=False) == {"misc": ["x", "y", "z"]}

    # With the index included, the original data comes back in full.
    with_single = pl.from_pandas(single_indexed, include_index=True)
    assert with_single.to_dict(as_series=False) == data

    with_multi = pl.from_pandas(multi_indexed, include_index=True)
    assert with_multi.to_dict(as_series=False) == data
191

192

193
def test_from_pandas_series_include_indexes() -> None:
    """Converting a Series with ``include_index=True`` yields index and values."""
    # Series with an explicitly named, non-default index.
    lettered = pd.Series({"a": 1, "b": 2}, name="number").rename_axis(["letter"])
    lettered_out = pl.from_pandas(lettered, include_index=True)
    assert lettered_out.to_dict(as_series=False) == {
        "letter": ["a", "b"],
        "number": [1, 2],
    }

    # Series relying on the default index and carrying no name.
    unnamed = pd.Series(range(2))
    unnamed_out = pl.from_pandas(unnamed, include_index=True)
    assert unnamed_out.to_dict(as_series=False) == {"index": [0, 1], "0": [0, 1]}
203

204

205
def test_duplicate_cols_diff_types() -> None:
    """Column labels ``"0"``/``0`` and ``"1"``/``1`` are rejected as non-unique."""
    mixed_labels = ["0", 0, "1", 1]
    pd_df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=mixed_labels)
    expected_message = (
        "Pandas dataframe contains non-unique indices and/or column names"
    )
    with pytest.raises(ValueError, match=expected_message):
        pl.from_pandas(pd_df)
212

213

214
def test_from_pandas_duplicated_columns() -> None:
    """A pandas frame with a repeated column name ("b") is rejected."""
    repeated_labels = ["a", "b", "c", "b"]
    pd_df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=repeated_labels)
    expected_message = (
        "Pandas dataframe contains non-unique indices and/or column names"
    )
    with pytest.raises(ValueError, match=expected_message):
        pl.from_pandas(pd_df)
221

222

223
def test_from_pandas_null() -> None:
    """An all-``None`` pandas column is expected to become a ``pl.String`` column."""
    # The null column has object dtype in pandas, so String is the closest match.
    only_nulls = pd.DataFrame([{"a": None}, {"a": None}])
    converted = pl.DataFrame(only_nulls)
    assert converted.dtypes == [pl.String]
    assert converted["a"][0] is None

    # Same null column, now alongside an integer column.
    nulls_and_ints = pd.DataFrame([{"a": None, "b": 1}, {"a": None, "b": 2}])
    converted = pl.DataFrame(nulls_and_ints)
    assert converted.dtypes == [pl.String, pl.Int64]
233

234

235
def test_from_pandas_nested_list() -> None:
    """Convert a pandas frame whose column holds variable-length string lists."""
    # this panicked in https://github.com/pola-rs/polars/issues/1615
    nested = [["x", "y"], ["x", "y", "z"], ["x"], ["x", "y"]]
    pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": nested})

    converted = pl.from_pandas(pd_df)

    assert converted.shape == (4, 2)
    assert converted.rows() == [
        (1, ["x", "y"]),
        (2, ["x", "y", "z"]),
        (3, ["x"]),
        (4, ["x", "y"]),
    ]
248

249

250
def test_from_pandas_categorical_none() -> None:
    """A ``pd.NA`` entry in a categorical Series comes back as ``None``."""
    pd_series = pd.Series(["a", "b", "c", pd.NA], dtype="category")
    converted = pl.from_pandas(pd_series)
    assert converted.dtype == pl.Categorical
    assert converted.to_list() == ["a", "b", "c", None]
255

256

257
def test_from_pandas_dataframe() -> None:
    """Convert a small pandas frame and reject input that is not a pandas object."""
    pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    df = pl.from_pandas(pd_df)
    assert df.shape == (2, 3)
    assert df.rows() == [(1, 2, 3), (4, 5, 6)]

    # if not a pandas dataframe, raise a TypeError
    with pytest.raises(TypeError):
        _ = pl.from_pandas([1, 2])  # type: ignore[call-overload]
266

267

268
def test_from_pandas_series() -> None:
    """Convert a named integer pandas Series and check its shape and values."""
    converted = pl.from_pandas(pd.Series([1, 2, 3], name="pd"))
    assert converted.shape == (3,)
    assert list(converted) == [1, 2, 3]
273

274

275
def test_from_empty_pandas() -> None:
    """Empty pandas columns keep their names and are expected to be Float64."""
    empty_pd = pd.DataFrame({"A": [], "fruits": []})
    converted = pl.from_pandas(empty_pd)
    assert converted.columns == ["A", "fruits"]
    assert converted.dtypes == [pl.Float64, pl.Float64]
285

286

287
def test_from_null_column() -> None:
    """A column made up solely of ``pd.NA`` is expected to convert to ``pl.Null``."""
    all_na = pd.DataFrame(data=[pd.NA, pd.NA], columns=["n/a"])
    converted = pl.from_pandas(all_na)

    assert converted.shape == (2, 1)
    assert converted.columns == ["n/a"]
    assert converted.dtypes[0] == pl.Null
293

294

295
def test_from_pandas_ns_resolution() -> None:
    """Convert a timestamp with a nanosecond component and compare to a datetime."""
    stamp = pd.Timestamp(year=2021, month=1, day=1, hour=1, second=1, nanosecond=1)
    pd_df = pd.DataFrame([stamp], columns=["date"])

    # The expected datetime carries no sub-second part.
    first_cell = pl.from_pandas(pd_df)[0, 0]
    assert first_cell == datetime(2021, 1, 1, 1, 0, 1)
301

302

303
def test_pandas_string_none_conversion_3298() -> None:
    """A leading ``None`` in a pandas string column survives ``pl.DataFrame``."""
    data: dict[str, list[str | None]] = {"col_1": [None, "b", "c", "d"]}
    converted = pl.DataFrame(pd.DataFrame(data))
    assert converted.to_series().to_list() == [None, "b", "c", "d"]
309

310

311
def test_from_pandas_null_struct_6412() -> None:
    """Convert a dict-valued column holding a null field and a null row."""
    records = [{"a": {"b": None}}, {"a": None}]
    converted = pl.from_pandas(pd.DataFrame(records))
    assert converted.to_dict(as_series=False) == {"a": [{"b": None}, None]}
324

325

326
def test_untrusted_categorical_input() -> None:
    """Group by a converted categorical whose categories include an unused value."""
    # Categories are ["x", "y"] but only "x" appears in the data.
    pd_df = pd.DataFrame({"x": pd.Categorical(["x"], ["x", "y"])})

    counts = pl.from_pandas(pd_df).group_by("x").len()

    expected = pl.DataFrame(
        {"x": ["x"], "len": [1]}, schema={"x": pl.Categorical, "len": pl.UInt32}
    )
    assert_frame_equal(counts, expected, categorical_as_str=True)
334

335

336
@pytest.fixture
def _set_pyarrow_unavailable(monkeypatch: pytest.MonkeyPatch) -> None:
    """Set the ``_PYARROW_AVAILABLE`` flags of both construction modules to False."""
    flag_targets = (
        "polars._utils.construction.dataframe._PYARROW_AVAILABLE",
        "polars._utils.construction.series._PYARROW_AVAILABLE",
    )
    for target in flag_targets:
        monkeypatch.setattr(target, False)
342

343

344
@pytest.mark.usefixtures("_set_pyarrow_unavailable")
def test_from_pandas_pyarrow_not_available_succeeds() -> None:
    """Plain numpy-backed columns convert with the pyarrow flags patched off."""
    iso_dates = ["2020-01-01", "2020-01-02"]
    data: dict[str, Any] = {
        "a": [1, 2],
        "b": ["one", "two"],
        "c": np.array(iso_dates, dtype="datetime64[ns]"),
        "d": np.array(iso_dates, dtype="datetime64[us]"),
        "e": np.array(iso_dates, dtype="datetime64[ms]"),
        "f": np.array([1, 2], dtype="timedelta64[ns]"),
        "g": np.array([1, 2], dtype="timedelta64[us]"),
        "h": np.array([1, 2], dtype="timedelta64[ms]"),
        "i": [True, False],
    }

    # Whole frame: conversion must agree with building directly from the data.
    frame_result = pl.from_pandas(pd.DataFrame(data))
    assert_frame_equal(frame_result, pl.DataFrame(data))

    # Each column on its own, as a Series.
    for values in data.values():
        series_result = pl.from_pandas(pd.Series(values))
        assert_series_equal(series_result, pl.Series(values))
369

370

371
@pytest.mark.usefixtures("_set_pyarrow_unavailable")
def test_from_pandas_pyarrow_not_available_fails() -> None:
    """Inputs that need pyarrow raise ImportError with the flags patched off."""
    tz_aware = pd.to_datetime(["2020-01-01T00:00+01:00"]).to_series()
    needs_pyarrow = [
        pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64"),
        pd.Series([1, 2, 3], dtype="Int64"),
        pd.DataFrame({"a": tz_aware}),
        pd.DataFrame({"a": [None, "foo"]}),
    ]
    for pandas_obj in needs_pyarrow:
        with pytest.raises(ImportError, match="pyarrow is required"):
            pl.from_pandas(pandas_obj)
383

384

385
def test_from_pandas_nan_to_null_16453(monkeypatch: pytest.MonkeyPatch) -> None:
    """NaN becomes null with ``_MIN_NUMPY_SIZE_FOR_MULTITHREADING`` patched to 2."""
    # NOTE(review): presumably the patch sends this tiny frame down the
    # multithreaded numpy path -- confirm against the construction module.
    monkeypatch.setattr(
        "polars._utils.construction.dataframe._MIN_NUMPY_SIZE_FOR_MULTITHREADING", 2
    )
    untouched = {"b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}
    pd_df = pd.DataFrame({"a": [np.nan, 1.0, 2], **untouched})

    converted = pl.from_pandas(pd_df, nan_to_null=True)

    assert_frame_equal(converted, pl.DataFrame({"a": [None, 1.0, 2], **untouched}))
397

398

399
@pytest.mark.parametrize("null", [pd.NA, np.nan, None, float("nan")])
def test_from_pandas_string_with_natype_17355(null: Any) -> None:
    """Each missing-value marker in a string column converts to ``None``."""
    # https://github.com/pola-rs/polars/issues/17355
    converted = pl.from_pandas(pd.DataFrame({"col": ["a", null]}))
    assert_frame_equal(converted, pl.DataFrame({"col": ["a", None]}))
407

408
Product

Resources

Company