Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interop/test_interop.py
6939 views
1
from __future__ import annotations
2
3
from datetime import date, datetime, time, timedelta, timezone
4
from typing import Any, cast
5
6
import numpy as np
7
import pandas as pd
8
import pyarrow as pa
9
import pytest
10
11
import polars as pl
12
from polars.exceptions import ComputeError, DuplicateError, UnstableWarning
13
from polars.interchange.protocol import CompatLevel
14
from polars.testing import assert_frame_equal, assert_series_equal
15
from tests.unit.utils.pycapsule_utils import PyCapsuleStreamHolder
16
17
18
def test_arrow_list_roundtrip() -> None:
    """Round-trip a table containing a list column through polars.

    Regression test for https://github.com/pola-rs/polars/issues/1064.
    """
    source = pa.table({"a": [1], "b": [[1, 2]]})
    roundtripped = pl.from_arrow(source).to_arrow()

    assert roundtripped.shape == source.shape
    assert roundtripped.schema.names == source.schema.names
    for result_col, source_col in zip(roundtripped.columns, source.columns):
        assert result_col.to_pylist() == source_col.to_pylist()
27
28
29
def test_arrow_null_roundtrip() -> None:
    """Null-typed scalar and list columns must survive an Arrow round-trip."""
    source = pa.table({"a": [None, None], "b": [[None, None], [None, None]]})
    frame = pl.from_arrow(source)

    if isinstance(frame, pl.DataFrame):
        # arrow null -> polars Null; list<null> -> List(Null)
        assert frame.dtypes == [pl.Null, pl.List(pl.Null)]

    roundtripped = frame.to_arrow()

    assert roundtripped.shape == source.shape
    assert roundtripped.schema.names == source.schema.names
    for result_col, source_col in zip(roundtripped.columns, source.columns):
        assert result_col.to_pylist() == source_col.to_pylist()
42
43
44
def test_arrow_empty_dataframe() -> None:
    """Empty (0x0 and 0-row) frames convert to/from Arrow without loss."""
    # 0x0 dataframe
    empty_df = pl.DataFrame({})
    empty_tbl = pa.table({})
    assert empty_df.to_arrow() == empty_tbl
    roundtripped = cast(pl.DataFrame, pl.from_arrow(empty_df.to_arrow()))
    assert_frame_equal(roundtripped, empty_df)

    # 0 row dataframe: the schema must still round-trip
    typed_df = pl.DataFrame({}, schema={"a": pl.Int32})
    typed_tbl = pa.Table.from_batches([], pa.schema([pa.field("a", pa.int32())]))
    assert typed_df.to_arrow() == typed_tbl
    roundtripped = cast(pl.DataFrame, pl.from_arrow(typed_df.to_arrow()))
    assert roundtripped.schema == {"a": pl.Int32}
    assert roundtripped.shape == (0, 1)
59
60
61
def test_arrow_dict_to_polars() -> None:
    """A dictionary-encoded Arrow array decodes to a plain string Series."""
    dictionary_values = np.array(["AAA", "BBB", "CCC", "DDD"])
    encoded = pa.DictionaryArray.from_arrays(
        indices=np.array([0, 1, 2, 3, 1, 0, 2, 3, 3, 2]),
        dictionary=dictionary_values,
    ).cast(pa.large_utf8())

    expected = pl.Series(
        name="pa_dict",
        values=["AAA", "BBB", "CCC", "DDD", "BBB", "AAA", "CCC", "DDD", "DDD", "CCC"],
    )
    assert_series_equal(expected, pl.Series("pa_dict", encoded))
72
73
74
def test_arrow_list_chunked_array() -> None:
    """A chunked list array converts to a polars List series."""
    chunk = pa.array([[1, 2], [3, 4]])
    chunked = pa.chunked_array([chunk, chunk, chunk])
    series = cast(pl.Series, pl.from_arrow(chunked))
    assert series.dtype == pl.List
79
80
81
def test_arrow_array_logical() -> None:
    """Polars converts Array columns of logical types correctly to Arrow."""
    # polars emits large strings and uint8 dictionary indices, so cast the
    # pyarrow reference data to match before comparing
    enum_values = (
        pa.array(["a", "b", "c", "d"])
        .dictionary_encode()
        .cast(pa.dictionary(pa.uint8(), pa.large_string()))
    )
    expected_enum_array = pa.FixedSizeListArray.from_arrays(enum_values, 2)

    enum_series = pl.Series(
        values=[["a", "b"], ["c", "d"]],
        dtype=pl.Array(pl.Enum(["a", "b", "c", "d"]), shape=2),
    )
    assert enum_series.to_arrow() == expected_enum_array

    date_values = pa.array([date(2024, 1, 1), date(2024, 1, 2)])
    expected_date_array = pa.FixedSizeListArray.from_arrays(date_values, 1)

    date_series = pl.Series(
        values=[[date(2024, 1, 1)], [date(2024, 1, 2)]],
        dtype=pl.Array(pl.Date, shape=1),
    )
    assert date_series.to_arrow() == expected_date_array
105
106
107
def test_from_dict() -> None:
    """`pl.from_dict` builds one column per key, preserving values."""
    df = pl.from_dict({"a": [1, 2], "b": [3, 4]})
    assert df.shape == (2, 2)
    expected_columns = [pl.Series("a", [1, 2]), pl.Series("b", [3, 4])]
    for actual, expected in zip(list(df), expected_columns):
        assert_series_equal(actual, expected)
113
114
115
def test_from_dict_struct() -> None:
    """A nested dict value becomes a Struct column."""
    nested_data: dict[str, dict[str, list[int]] | list[int]] = {
        "a": {"b": [1, 3], "c": [2, 4]},
        "d": [5, 6],
    }
    df = pl.from_dict(nested_data)

    assert df.shape == (2, 2)
    # each row of "a" is a struct assembled field-wise from the inner lists
    assert df["a"][0] == {"b": 1, "c": 2}
    assert df["a"][1] == {"b": 3, "c": 4}
    assert df.schema == {"a": pl.Struct({"b": pl.Int64, "c": pl.Int64}), "d": pl.Int64}
125
126
127
def test_from_dicts() -> None:
    """`pl.from_dicts` infers the schema across rows and keeps None values."""
    rows = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": None}]
    df = pl.from_dicts(rows)  # type: ignore[arg-type]

    assert df.shape == (3, 2)
    assert df.rows() == [(1, 4), (2, 5), (3, None)]
    assert df.schema == {"a": pl.Int64, "b": pl.Int64}
133
134
135
def test_from_dict_no_inference() -> None:
    """With inference disabled, schema_overrides fully determines dtypes."""
    schema = {"a": pl.String}
    rows = [{"a": "aa"}]
    df = pl.from_dicts(rows, schema_overrides=schema, infer_schema_length=0)

    assert df.schema == schema
    assert df.to_dicts() == rows
141
142
143
def test_from_dicts_schema_override() -> None:
    """An explicit schema wins over inference for any infer_schema_length.

    Rows missing schema columns yield nulls; extra keys ("d", "e") that are
    not in the schema are dropped.
    """
    schema = {
        "a": pl.String,
        "b": pl.Int64,
        "c": pl.List(pl.Struct({"x": pl.Int64, "y": pl.String, "z": pl.Float64})),
    }

    # initial data matches the expected schema
    data1 = [
        {
            "a": "l",
            "b": i,
            "c": [{"x": (j + 2), "y": "?", "z": (j % 2)} for j in range(2)],
        }
        for i in range(5)
    ]

    # extend with a mix of fields that are/not in the schema
    data2 = [{"b": i + 5, "d": "ABC", "e": "DEF"} for i in range(5)]

    # the result must be identical regardless of how many rows inference sees
    for n_infer in (0, 3, 5, 8, 10, 100):
        df = pl.DataFrame(
            data=(data1 + data2),
            schema=schema,  # type: ignore[arg-type]
            infer_schema_length=n_infer,
        )
        assert df.schema == schema
        assert df.rows() == [
            ("l", 0, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 1, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 2, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 3, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 4, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            (None, 5, None),
            (None, 6, None),
            (None, 7, None),
            (None, 8, None),
            (None, 9, None),
        ]
182
183
184
def test_from_dicts_struct() -> None:
    """Struct columns built from dicts of dicts; struct fields unify across rows."""
    data = [{"a": {"b": 1, "c": 2}, "d": 5}, {"a": {"b": 3, "c": 4}, "d": 6}]
    df = pl.from_dicts(data)
    assert df.shape == (2, 2)
    assert df["a"][0] == {"b": 1, "c": 2}
    assert df["a"][1] == {"b": 3, "c": 4}

    # 5649: structs in lists with differing fields are unified; missing
    # fields are filled with None (note the resulting field order)
    assert pl.from_dicts([{"a": [{"x": 1}]}, {"a": [{"y": 1}]}]).to_dict(
        as_series=False
    ) == {"a": [[{"y": None, "x": 1}], [{"y": 1, "x": None}]]}
    assert pl.from_dicts([{"a": [{"x": 1}, {"y": 2}]}, {"a": [{"y": 1}]}]).to_dict(
        as_series=False
    ) == {"a": [[{"y": None, "x": 1}, {"y": 2, "x": None}], [{"y": 1, "x": None}]]}
198
199
200
def test_from_records() -> None:
    """Column-oriented records map to columns; rows are read across lists."""
    columns = [[1, 2, 3], [4, 5, 6]]
    df = pl.from_records(columns, schema=["a", "b"])

    assert df.shape == (3, 2)
    assert df.rows() == [(1, 4), (2, 5), (3, 6)]
205
206
207
# https://github.com/pola-rs/polars/issues/15195
@pytest.mark.parametrize(
    "input",
    [
        pl.Series([1, 2]),
        pl.Series([{"a": 1, "b": 2}]),
        pl.DataFrame({"a": [1, 2], "b": [3, 4]}),
    ],
)
def test_from_records_non_sequence_input(input: Any) -> None:
    """Non-sequence inputs (Series/DataFrame) must be rejected with TypeError."""
    with pytest.raises(TypeError, match="expected data of type Sequence"):
        pl.from_records(input)
219
220
221
def test_from_arrow() -> None:
    """Basic table conversion, bad-input rejection, and schema overrides."""
    table = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
    result = pl.from_arrow(table)
    assert result.shape == (3, 2)
    assert result.rows() == [(1, 4), (2, 5), (3, 6)]  # type: ignore[union-attr]

    # if not a PyArrow type, raise a TypeError
    with pytest.raises(TypeError):
        _ = pl.from_arrow([1, 2])

    overridden = pl.from_arrow(
        table, schema=["a", "b"], schema_overrides={"a": pl.UInt32, "b": pl.UInt64}
    )
    assert overridden.rows() == [(1, 4), (2, 5), (3, 6)]  # type: ignore[union-attr]
    assert overridden.schema == {"a": pl.UInt32, "b": pl.UInt64}  # type: ignore[union-attr]
236
237
238
def test_from_arrow_with_bigquery_metadata() -> None:
    """BigQuery-style ARROW:extension:name field metadata must not break conversion."""
    arrow_schema = pa.schema(
        [
            pa.field("id", pa.int64()).with_metadata(
                {"ARROW:extension:name": "google:sqlType:integer"}
            ),
            pa.field(
                "misc",
                pa.struct([("num", pa.int32()), ("val", pa.string())]),
            ).with_metadata({"ARROW:extension:name": "google:sqlType:struct"}),
        ]
    )
    arrow_tbl = pa.Table.from_pylist(
        [{"id": 1, "misc": None}, {"id": 2, "misc": None}],
        schema=arrow_schema,
    )

    # null structs unnest into all-null fields with the declared dtypes
    expected_data = {"id": [1, 2], "num": [None, None], "val": [None, None]}
    expected_schema = {"id": pl.Int64, "num": pl.Int32, "val": pl.String}
    assert_frame_equal(
        pl.DataFrame(expected_data, schema=expected_schema),
        pl.from_arrow(arrow_tbl).unnest("misc"),  # type: ignore[union-attr]
    )
261
262
263
def test_from_optional_not_available() -> None:
    """Lazy proxy modules raise a helpful ImportError on attribute use.

    Deliberately shadows the module-level np/pa/pd aliases with proxies that
    simulate the dependency being absent.
    """
    from polars.dependencies import _LazyModule

    # proxy module is created dynamically if the required module is not available
    # (see the polars.dependencies source code for additional detail/comments)

    np = _LazyModule("numpy", module_available=False)
    with pytest.raises(ImportError, match=r"np\.array requires 'numpy'"):
        pl.from_numpy(np.array([[1, 2], [3, 4]]), schema=["a", "b"])

    pa = _LazyModule("pyarrow", module_available=False)
    with pytest.raises(ImportError, match=r"pa\.table requires 'pyarrow'"):
        pl.from_arrow(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))

    pd = _LazyModule("pandas", module_available=False)
    with pytest.raises(ImportError, match=r"pd\.Series requires 'pandas'"):
        pl.from_pandas(pd.Series([1, 2, 3]))
280
281
282
def test_upcast_pyarrow_dicts() -> None:
    """Concatenating >127 int8-indexed dictionary chunks must not overflow.

    Regression test for https://github.com/pola-rs/polars/issues/1752.
    """
    dict_type = pa.dictionary(pa.int8(), pa.string())
    chunks = [
        pa.table({"col_name": pa.array([f"value_{i}"], dict_type)})
        for i in range(128)
    ]

    combined = pa.concat_tables(chunks, promote_options="default")
    result = cast(pl.DataFrame, pl.from_arrow(combined))

    assert result.shape == (128, 1)
    assert result["col_name"][0] == "value_0"
    assert result["col_name"][127] == "value_127"
300
301
302
def test_no_rechunk() -> None:
    """`rechunk=False` preserves the original chunking for tables and arrays."""
    chunked = pa.chunked_array([list("ab"), list("cd")])
    table = pa.Table.from_pydict({"x": chunked})

    # table
    assert pl.from_arrow(table, rechunk=False).n_chunks() == 2
    # chunked array
    assert pl.from_arrow(table["x"], rechunk=False).n_chunks() == 2
308
309
310
def test_from_empty_arrow() -> None:
    """Empty Arrow tables (via pandas and native) keep names and dtypes."""
    df = cast(pl.DataFrame, pl.from_arrow(pa.table(pd.DataFrame({"a": [], "b": []}))))
    assert df.columns == ["a", "b"]
    assert df.dtypes == [pl.Float64, pl.Float64]

    # 2705: the pandas index column is carried through unless dropped
    df1 = pd.DataFrame(columns=["b"], dtype=float, index=pd.Index([]))
    tbl = pa.Table.from_pandas(df1)
    out = cast(pl.DataFrame, pl.from_arrow(tbl))
    assert out.columns == ["b", "__index_level_0__"]
    assert out.dtypes == [pl.Float64, pl.Null]
    tbl = pa.Table.from_pandas(df1, preserve_index=False)
    out = cast(pl.DataFrame, pl.from_arrow(tbl))
    assert out.columns == ["b"]
    assert out.dtypes == [pl.Float64]

    # 4568: large_list element type survives even with zero rows
    tbl = pa.table({"l": []}, schema=pa.schema([("l", pa.large_list(pa.uint8()))]))

    df = cast(pl.DataFrame, pl.from_arrow(tbl))
    assert df.schema["l"] == pl.List(pl.UInt8)
331
332
333
def test_cat_int_types_3500() -> None:
    """Dictionary arrays convert to Categorical for both int8 and uint8 indices."""
    # Create an enum / categorical / dictionary typed pyarrow array
    # Most simply done by creating a pandas categorical series first
    categorical_s = pd.Series(["a", "a", "b"], dtype="category")
    pyarrow_array = pa.Array.from_pandas(categorical_s)

    # The in-memory representation of each category can either be a signed or
    # unsigned 8-bit integer. Pandas uses Int8...
    int_dict_type = pa.dictionary(index_type=pa.int8(), value_type=pa.utf8())
    # ... while DuckDB uses UInt8
    uint_dict_type = pa.dictionary(index_type=pa.uint8(), value_type=pa.utf8())

    for t in [int_dict_type, uint_dict_type]:
        s = cast(pl.Series, pl.from_arrow(pyarrow_array.cast(t)))
        assert_series_equal(
            s, pl.Series(["a", "a", "b"]).cast(pl.Categorical), check_names=False
        )
350
351
352
def test_from_pyarrow_chunked_array() -> None:
    """Chunked arrays flatten into a single Series in chunk order."""
    chunked = pa.chunked_array([[1], [2]])
    result = pl.Series("column", chunked)
    assert result.to_list() == [1, 2]
356
357
358
def test_arrow_list_null_5697() -> None:
    """list[null] columns round-trip with the correct schema (issue 5697)."""
    # Create a pyarrow table with a list[null] column.
    table = pa.table([[[None]]], names=["mycol"])
    frame = pl.from_arrow(table)
    table = frame.to_arrow()
    # back to polars to verify the schema survived the round-trip
    assert pl.from_arrow(table).schema == {"mycol": pl.List(pl.Null)}  # type: ignore[union-attr]
365
366
367
def test_from_pyarrow_map() -> None:
    """Arrow Map columns convert to List(Struct{key, value})."""
    pa_table = pa.table(
        [[1, 2], [[("a", "something")], [("a", "else"), ("b", "another key")]]],
        schema=pa.schema(
            [("idx", pa.int16()), ("mapping", pa.map_(pa.string(), pa.string()))]
        ),
    )

    # Convert from an empty table to trigger an ArrowSchema -> native schema
    # conversion (checks that ArrowDataType::Map is handled in Rust).
    pl.DataFrame(pa_table.slice(0, 0))

    result = pl.DataFrame(pa_table)
    assert result.to_dict(as_series=False) == {
        "idx": [1, 2],
        "mapping": [
            [{"key": "a", "value": "something"}],
            [{"key": "a", "value": "else"}, {"key": "b", "value": "another key"}],
        ],
    }
387
388
389
def test_from_fixed_size_binary_list() -> None:
    """Fixed-size binary list elements convert to plain (variable) Binary."""
    values = [[b"63A0B1C66575DD5708E1EB2B"]]
    fixed_binary_list = pa.array(values, type=pa.list_(pa.binary(24)))
    series = cast(pl.Series, pl.from_arrow(fixed_binary_list))

    assert series.dtype == pl.List(pl.Binary)
    assert series.to_list() == values
395
396
397
def test_dataframe_from_repr() -> None:
    """`pl.from_repr` parses DataFrame table reprs back into frames.

    Covers: full round-trip of many dtypes, empty frames (with and without
    dtype rows), blank/non-standard nulls, old-style ellipsis columns,
    commented-out reprs, and reprs embedded in console session noise.
    NOTE: the repr fixture strings below are byte-exact parser inputs —
    do not reformat them.
    """
    # round-trip various types
    frame = (
        pl.LazyFrame(
            {
                "a": [1, 2, None],
                "b": [4.5, 5.5, 6.5],
                "c": ["x", "y", "z"],
                "d": [True, False, True],
                "e": [None, "", None],
                "f": [date(2022, 7, 5), date(2023, 2, 5), date(2023, 8, 5)],
                "g": [time(0, 0, 0, 1), time(12, 30, 45), time(23, 59, 59, 999000)],
                "h": [
                    datetime(2022, 7, 5, 10, 30, 45, 4560),
                    datetime(2023, 10, 12, 20, 3, 8, 11),
                    None,
                ],
            },
        )
        .with_columns(
            pl.col("c").cast(pl.Categorical),
            pl.col("h").cast(pl.Datetime("ns")),
        )
        .collect()
    )

    assert frame.schema == {
        "a": pl.Int64,
        "b": pl.Float64,
        "c": pl.Categorical(ordering="lexical"),
        "d": pl.Boolean,
        "e": pl.String,
        "f": pl.Date,
        "g": pl.Time,
        "h": pl.Datetime("ns"),
    }
    df = cast(pl.DataFrame, pl.from_repr(repr(frame)))
    assert_frame_equal(frame, df)

    # empty frame; confirm schema is inferred
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            ┌─────┬─────┬─────┬─────┬─────┬───────┐
            │ id  ┆ q1  ┆ q2  ┆ q3  ┆ q4  ┆ total │
            │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ ---   │
            │ str ┆ i8  ┆ i16 ┆ i32 ┆ i64 ┆ f64   │
            ╞═════╪═════╪═════╪═════╪═════╪═══════╡
            └─────┴─────┴─────┴─────┴─────┴───────┘
            """
        ),
    )
    assert df.shape == (0, 6)
    assert df.rows() == []
    assert df.schema == {
        "id": pl.String,
        "q1": pl.Int8,
        "q2": pl.Int16,
        "q3": pl.Int32,
        "q4": pl.Int64,
        "total": pl.Float64,
    }

    # empty frame with no dtypes: columns default to String
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            ┌──────┬───────┐
            │ misc ┆ other │
            ╞══════╪═══════╡
            └──────┴───────┘
            """
        ),
    )
    assert_frame_equal(df, pl.DataFrame(schema={"misc": pl.String, "other": pl.String}))

    # empty frame with a non-standard/blank 'null' in numeric col
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            ┌─────┬──────┐
            │ c1  ┆ c2   │
            │ --- ┆ ---  │
            │ i32 ┆ f64  │
            ╞═════╪══════╡
            │     ┆ NULL │
            └─────┴──────┘
            """
        ),
    )
    assert_frame_equal(
        df,
        pl.DataFrame(
            data=[(None, None)],
            schema={"c1": pl.Int32, "c2": pl.Float64},
            orient="row",
        ),
    )

    # old-style "..." column ellipsis (elided cols are skipped), commented out
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            # >>> Missing cols with old-style ellipsis, nulls, commented out
            # ┌────────────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬──────┐
            # │ dt         ┆ c1  ┆ c2  ┆ c3  ┆ ... ┆ c96 ┆ c97 ┆ c98 ┆ c99  │
            # │ ---        ┆ --- ┆ --- ┆ --- ┆     ┆ --- ┆ --- ┆ --- ┆ ---  │
            # │ date       ┆ i32 ┆ i32 ┆ i32 ┆     ┆ i64 ┆ i64 ┆ i64 ┆ i64  │
            # ╞════════════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪══════╡
            # │ 2023-03-25 ┆ 1   ┆ 2   ┆ 3   ┆ ... ┆ 96  ┆ 97  ┆ 98  ┆ 99   │
            # │ 1999-12-31 ┆ 3   ┆ 6   ┆ 9   ┆ ... ┆ 288 ┆ 291 ┆ 294 ┆ null │
            # │ null       ┆ 9   ┆ 18  ┆ 27  ┆ ... ┆ 864 ┆ 873 ┆ 882 ┆ 891  │
            # └────────────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴──────┘
            """
        ),
    )
    assert df.schema == {
        "dt": pl.Date,
        "c1": pl.Int32,
        "c2": pl.Int32,
        "c3": pl.Int32,
        "c96": pl.Int64,
        "c97": pl.Int64,
        "c98": pl.Int64,
        "c99": pl.Int64,
    }
    assert df.rows() == [
        (date(2023, 3, 25), 1, 2, 3, 96, 97, 98, 99),
        (date(1999, 12, 31), 3, 6, 9, 288, 291, 294, None),
        (None, 9, 18, 27, 864, 873, 882, 891),
    ]

    # no dtype row: dtypes inferred from the values
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            # >>> no dtypes:
            # ┌────────────┬──────┐
            # │ dt         ┆ c99  │
            # ╞════════════╪══════╡
            # │ 2023-03-25 ┆ 99   │
            # │ 1999-12-31 ┆ null │
            # │ null       ┆ 891  │
            # └────────────┴──────┘
            """
        ),
    )
    assert df.schema == {"dt": pl.Date, "c99": pl.Int64}
    assert df.rows() == [
        (date(2023, 3, 25), 99),
        (date(1999, 12, 31), None),
        (None, 891),
    ]

    # repr embedded in IPython session text, with rounded corners, wrapped
    # column names, row ellipsis, and a tz-aware datetime column
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            In [2]: with pl.Config() as cfg:
               ...:     pl.Config.set_tbl_formatting("UTF8_FULL", rounded_corners=True)
               ...:     print(df)
               ...:
            shape: (1, 5)
            ╭───────────┬────────────┬───┬───────┬────────────────────────────────╮
            │ source_ac ┆ source_cha ┆ … ┆ ident ┆ timestamp                      │
            │ tor_id    ┆ nnel_id    ┆   ┆ ---   ┆ ---                            │
            │ ---       ┆ ---        ┆   ┆ str   ┆ datetime[μs, Asia/Tokyo]       │
            │ i32       ┆ i64        ┆   ┆       ┆                                │
            ╞═══════════╪════════════╪═══╪═══════╪════════════════════════════════╡
            │ 123456780 ┆ 9876543210 ┆ … ┆ a:b:c ┆ 2023-03-25 10:56:59.663053 JST │
            ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
            │ …         ┆ …          ┆ … ┆ …     ┆ …                              │
            ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
            │ 803065983 ┆ 2055938745 ┆ … ┆ x:y:z ┆ 2023-03-25 12:38:18.050545 JST │
            ╰───────────┴────────────┴───┴───────┴────────────────────────────────╯
            # "Een fluitje van een cent..." :)
            """
        ),
    )
    assert df.shape == (2, 4)
    assert df.schema == {
        "source_actor_id": pl.Int32,
        "source_channel_id": pl.Int64,
        "ident": pl.String,
        "timestamp": pl.Datetime("us", "Asia/Tokyo"),
    }
586
587
588
def test_dataframe_from_repr_24110() -> None:
    """Duration reprs parse signed offsets and both micro-sign variants.

    Covers "-2h", "0µs" (micro sign), "2h", and "+2h"; issue 24110.
    """
    df = cast(
        pl.DataFrame,
        pl.from_repr("""
            shape: (7, 1)
            ┌──────────────┐
            │ time_offset  │
            │ ---          │
            │ duration[μs] │
            ╞══════════════╡
            │ -2h          │
            │ 0µs          │
            │ 2h           │
            │ +2h          │
            └──────────────┘
        """),
    )
    expected = pl.DataFrame(
        {
            "time_offset": [
                timedelta(hours=-2),
                timedelta(),
                timedelta(hours=2),
                timedelta(hours=2),
            ]
        },
        schema={"time_offset": pl.Duration("us")},
    )
    assert_frame_equal(df, expected)
617
618
619
def test_dataframe_from_duckdb_repr() -> None:
    """DuckDB-style table reprs (dtype names like int32/varchar/int128) parse too."""
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            # misc streaming stats
            ┌────────────┬───────┬───────────────────┬───┬────────────────┬───────────────────┐
            │   As Of    │ Rank  │ Year to Date Rank │ … │ Days In Top 10 │ Streaming Seconds │
            │    date    │ int32 │      varchar      │   │     int16      │      int128       │
            ├────────────┼───────┼───────────────────┼───┼────────────────┼───────────────────┤
            │ 2025-05-09 │     1 │ 1                 │ … │             29 │  1864939402857430 │
            │ 2025-05-09 │     2 │ 2                 │ … │             15 │   658937443590045 │
            │ 2025-05-09 │     3 │ 3                 │ … │              9 │   267876522242076 │
            └────────────┴───────┴───────────────────┴───┴────────────────┴───────────────────┘
            """
        ),
    )
    expected = pl.DataFrame(
        {
            "As Of": [date(2025, 5, 9), date(2025, 5, 9), date(2025, 5, 9)],
            "Rank": [1, 2, 3],
            "Year to Date Rank": ["1", "2", "3"],
            "Days In Top 10": [29, 15, 9],
            "Streaming Seconds": [1864939402857430, 658937443590045, 267876522242076],
        },
        schema={
            "As Of": pl.Date,
            "Rank": pl.Int32,
            "Year to Date Rank": pl.String,
            "Days In Top 10": pl.Int16,
            "Streaming Seconds": pl.Int128,
        },
    )
    assert_frame_equal(expected, df)
653
654
655
def test_series_from_repr() -> None:
    """`pl.from_repr` parses Series reprs, including empty and float specials.

    NOTE: the repr fixture strings are byte-exact parser inputs — do not
    reformat them.
    """
    frame = (
        pl.LazyFrame(
            {
                "a": [1, 2, None],
                "b": [4.5, 5.5, 6.5],
                "c": ["x", "y", "z"],
                "d": [True, False, True],
                "e": [None, "", None],
                "f": [date(2022, 7, 5), date(2023, 2, 5), date(2023, 8, 5)],
                "g": [time(0, 0, 0, 1), time(12, 30, 45), time(23, 59, 59, 999000)],
                "h": [
                    datetime(2022, 7, 5, 10, 30, 45, 4560),
                    datetime(2023, 10, 12, 20, 3, 8, 11),
                    None,
                ],
            },
        )
        .with_columns(
            pl.col("c").cast(pl.Categorical),
            pl.col("h").cast(pl.Datetime("ns")),
        )
        .collect()
    )

    # every column's repr round-trips to an equal Series
    for col in frame.columns:
        s = cast(pl.Series, pl.from_repr(repr(frame[col])))
        assert_series_equal(s, frame[col])

    # repr embedded in console output; blank element line is skipped
    s = cast(
        pl.Series,
        pl.from_repr(
            """
            Out[3]:
            shape: (3,)
            Series: 's' [str]
            [
                "a"

                "c"
            ]
            """
        ),
    )
    assert_series_equal(s, pl.Series("s", ["a", "c"]))

    # empty series keeps name and dtype
    s = cast(
        pl.Series,
        pl.from_repr(
            """
            Series: 'flt' [f32]
            [
            ]
            """
        ),
    )
    assert_series_equal(s, pl.Series("flt", [], dtype=pl.Float32))

    # float specials (null/±inf/NaN); trailing prompt line is ignored
    s = cast(
        pl.Series,
        pl.from_repr(
            """
            Series: 'flt' [f64]
            [
                null
                +inf
                -inf
                inf
                0.0
                NaN
            ]
            >>> print("stuff")
            """
        ),
    )
    inf, nan = float("inf"), float("nan")
    assert_series_equal(
        s,
        pl.Series(
            name="flt",
            dtype=pl.Float64,
            values=[None, inf, -inf, inf, 0.0, nan],
        ),
    )
739
740
741
def test_dataframe_from_repr_custom_separators() -> None:
    """Reprs using '.' digit grouping and ',' decimal separators still parse."""
    # repr created with custom digit-grouping
    # and non-default group/decimal separators
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            ┌───────────┬────────────┐
            │ x         ┆ y          │
            │ ---       ┆ ---        │
            │ i32       ┆ f64        │
            ╞═══════════╪════════════╡
            │ 123.456   ┆ -10.000,55 │
            │ -9.876    ┆ 10,0       │
            │ 9.999.999 ┆ 8,5e8      │
            └───────────┴────────────┘
            """
        ),
    )
    assert_frame_equal(
        df,
        pl.DataFrame(
            {
                "x": [123456, -9876, 9999999],
                "y": [-10000.55, 10.0, 850000000.0],
            },
            schema={"x": pl.Int32, "y": pl.Float64},
        ),
    )
770
771
772
def test_sliced_struct_from_arrow() -> None:
    """Slicing a struct column before FFI must respect the slice offset."""
    # build a 3-row table with a single struct column
    struct_column = pa.StructArray.from_arrays(
        arrays=[
            pa.array([1, 2, 3], pa.int32()),
            pa.array(["foo", "bar", "baz"], pa.utf8()),
        ],
        names=["a", "b"],
    )
    tbl = pa.Table.from_arrays(arrays=[struct_column], names=["struct_col"])

    # slice away the first row; FFI must not re-read it
    sliced = cast(pl.DataFrame, pl.from_arrow(tbl.slice(1, 2)))
    assert sliced.to_dict(as_series=False) == {
        "struct_col": [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}]
    }

    single = cast(pl.DataFrame, pl.from_arrow(tbl.slice(1, 1)))
    assert single.to_dict(as_series=False) == {"struct_col": [{"a": 2, "b": "bar"}]}
796
797
798
def test_from_arrow_invalid_time_zone() -> None:
    """An unparseable timestamp time zone raises ComputeError on conversion."""
    bad_tz_array = pa.array(
        [datetime(2021, 1, 1, 0, 0, 0, 0)],
        type=pa.timestamp("ns", tz="this-is-not-a-time-zone"),
    )
    expected_msg = r"unable to parse time zone: 'this-is-not-a-time-zone'"
    with pytest.raises(ComputeError, match=expected_msg):
        pl.from_arrow(bad_tz_array)
807
808
809
@pytest.mark.parametrize(
    ("fixed_offset", "etc_tz"),
    [
        ("+10:00", "Etc/GMT-10"),
        ("10:00", "Etc/GMT-10"),
        ("-10:00", "Etc/GMT+10"),
        ("+05:00", "Etc/GMT-5"),
        ("05:00", "Etc/GMT-5"),
        ("-05:00", "Etc/GMT+5"),
    ],
)
def test_from_arrow_fixed_offset(fixed_offset: str, etc_tz: str) -> None:
    """Fixed-offset Arrow time zones map to the equivalent Etc/GMT±N zone.

    Note the inverted sign convention of the Etc/ area (Etc/GMT-10 == +10:00).
    """
    arr = pa.array(
        [datetime(2021, 1, 1, 0, 0, 0, 0)],
        type=pa.timestamp("us", tz=fixed_offset),
    )
    result = cast(pl.Series, pl.from_arrow(arr))
    expected = pl.Series(
        [datetime(2021, 1, 1, tzinfo=timezone.utc)]
    ).dt.convert_time_zone(etc_tz)
    assert_series_equal(result, expected)
830
831
832
def test_from_avro_valid_time_zone_13032() -> None:
    """A '00:00' fixed offset normalizes to UTC (issue 13032)."""
    zero_offset_array = pa.array(
        [datetime(2021, 1, 1, 0, 0, 0, 0)], type=pa.timestamp("ns", tz="00:00")
    )
    result = cast(pl.Series, pl.from_arrow(zero_offset_array))
    expected = pl.Series([datetime(2021, 1, 1)], dtype=pl.Datetime("ns", "UTC"))
    assert_series_equal(result, expected)
839
840
841
def test_from_numpy_different_resolution_15991() -> None:
    """datetime64[ns] input is losslessly converted when us is requested."""
    ns_values = np.array(["2020-01-01"], dtype="datetime64[ns]")
    result = pl.Series(ns_values, dtype=pl.Datetime("us"))

    expected = pl.Series([datetime(2020, 1, 1)], dtype=pl.Datetime("us"))
    assert_series_equal(result, expected)
847
848
849
def test_from_numpy_different_resolution_invalid() -> None:
    """Unsupported resolution conversion (seconds -> us) asks the user to cast."""
    second_values = np.array(["2020-01-01"], dtype="datetime64[s]")
    with pytest.raises(ValueError, match="Please cast"):
        pl.Series(second_values, dtype=pl.Datetime("us"))
854
855
856
def test_compat_level(monkeypatch: pytest.MonkeyPatch) -> None:
    """CompatLevel controls string/binary Arrow types and IPC output size.

    The hard-coded versions and byte sizes below are intentionally brittle:
    # change these if compat level bumped
    """
    monkeypatch.setenv("POLARS_WARN_UNSTABLE", "1")
    oldest = CompatLevel.oldest()
    assert oldest is CompatLevel.oldest()  # test singleton
    assert oldest._version == 0
    # newest() is unstable API and must warn when POLARS_WARN_UNSTABLE is set
    with pytest.warns(UnstableWarning):
        newest = CompatLevel.newest()
    assert newest is CompatLevel.newest()
    assert newest._version == 1

    str_col = pl.Series(["awd"])
    bin_col = pl.Series([b"dwa"])
    assert str_col._newest_compat_level() == newest._version
    # default to_arrow uses the oldest-compatible (large_*) types;
    # newest switches to the view types
    assert isinstance(str_col.to_arrow(), pa.LargeStringArray)
    assert isinstance(str_col.to_arrow(compat_level=oldest), pa.LargeStringArray)
    assert isinstance(str_col.to_arrow(compat_level=newest), pa.StringViewArray)
    assert isinstance(bin_col.to_arrow(), pa.LargeBinaryArray)
    assert isinstance(bin_col.to_arrow(compat_level=oldest), pa.LargeBinaryArray)
    assert isinstance(bin_col.to_arrow(compat_level=newest), pa.BinaryViewArray)

    # same behavior at the DataFrame level, observed through scalars
    df = pl.DataFrame({"str_col": str_col, "bin_col": bin_col})
    assert isinstance(df.to_arrow()["str_col"][0], pa.LargeStringScalar)
    assert isinstance(
        df.to_arrow(compat_level=oldest)["str_col"][0], pa.LargeStringScalar
    )
    assert isinstance(
        df.to_arrow(compat_level=newest)["str_col"][0], pa.StringViewScalar
    )
    assert isinstance(df.to_arrow()["bin_col"][0], pa.LargeBinaryScalar)
    assert isinstance(
        df.to_arrow(compat_level=oldest)["bin_col"][0], pa.LargeBinaryScalar
    )
    assert isinstance(
        df.to_arrow(compat_level=newest)["bin_col"][0], pa.BinaryViewScalar
    )

    # exact serialized sizes: view types (default/newest) are smaller
    assert len(df.write_ipc(None).getbuffer()) == 738
    assert len(df.write_ipc(None, compat_level=oldest).getbuffer()) == 866
    assert len(df.write_ipc(None, compat_level=newest).getbuffer()) == 738
    assert len(df.write_ipc_stream(None).getbuffer()) == 520
    assert len(df.write_ipc_stream(None, compat_level=oldest).getbuffer()) == 648
    assert len(df.write_ipc_stream(None, compat_level=newest).getbuffer()) == 520
899
900
901
def test_df_pycapsule_interface() -> None:
    """A DataFrame exposed via the Arrow C Stream PyCapsule round-trips."""
    df = pl.DataFrame(
        {
            "a": [1, 2, 3],
            "b": ["a", "b", "c"],
            # long strings so the data is not fully inlined in view types
            "c": ["fooooooooooooooooooooo", "bar", "looooooooooooooooong string"],
        }
    )

    # wrap so pyarrow/polars only see the __arrow_c_stream__ protocol
    capsule_df = PyCapsuleStreamHolder(df)
    out = pa.table(capsule_df)
    assert df.shape == out.shape
    assert df.schema.names() == out.schema.names

    schema_overrides = {"a": pl.Int128}
    expected_schema = pl.Schema([("a", pl.Int128), ("b", pl.String), ("c", pl.String)])

    # schema_overrides must apply whether reading the capsule directly or a table
    for arrow_obj in (
        pl.from_arrow(capsule_df),  # capsule
        out,  # table loaded from capsule
    ):
        df_res = pl.from_arrow(arrow_obj, schema_overrides=schema_overrides)
        assert expected_schema == df_res.schema  # type: ignore[union-attr]
        assert isinstance(df_res, pl.DataFrame)
        assert df.equals(df_res)
926
927
928
def test_misaligned_nested_arrow_19097() -> None:
    """Nested export with mismatched value/validity offsets must round-trip."""
    a = pl.Series("a", [1, 2, 3])
    a = a.slice(1, 2)  # by slicing we offset=1 the values
    a = a.replace(2, None)  # then we add a validity mask with offset=0
    a = a.reshape((2, 1))  # then we make it nested
    assert_series_equal(pl.Series("a", a.to_arrow()), a)
934
935
936
def test_arrow_roundtrip_lex_cat_20288() -> None:
    """Lexical Categorical ordering survives an Arrow round-trip (issue 20288)."""
    exported = (
        pl.Series("a", ["A", "B"], pl.Categorical(ordering="lexical"))
        .to_frame()
        .to_arrow()
    )
    reimported = pl.from_arrow(exported)
    assert isinstance(reimported, pl.DataFrame)

    dtype = reimported.schema["a"]
    assert isinstance(dtype, pl.Categorical)
    assert dtype.ordering == "lexical"
947
948
949
def test_from_arrow_20271() -> None:
    """A dictionary column inside a table converts to Categorical (issue 20271)."""
    dict_col = pa.DictionaryArray.from_arrays([0, 1], ["D", "E"])
    frame = pl.from_arrow(pa.table({"b": dict_col}))

    assert isinstance(frame, pl.DataFrame)
    assert_series_equal(frame.to_series(), pl.Series("b", ["D", "E"], pl.Categorical))
955
956
957
def test_to_arrow_empty_chunks_20627() -> None:
    """Filtering one chunk empty must not corrupt the Arrow export (issue 20627)."""
    two_chunks = pl.concat(2 * [pl.Series([1])])
    filtered = two_chunks.filter(pl.Series([False, True])).to_frame()
    assert filtered.to_arrow().shape == (1, 1)
960
961
962
def test_from_arrow_recorbatch() -> None:
    """A pyarrow RecordBatch constructs a DataFrame directly.

    NOTE(review): "recorbatch" looks like a typo for "recordbatch"; kept to
    preserve the test id referenced in history/CI selections.
    """
    leg_counts = pa.array([2, 2, 4, 4, 5, 100])
    animal_names = pa.array(
        ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]
    )
    batch = pa.RecordBatch.from_arrays(
        [leg_counts, animal_names], names=["n_legs", "animals"]
    )
    expected = pl.DataFrame({"n_legs": leg_counts, "animals": animal_names})
    assert_frame_equal(pl.DataFrame(batch), expected)
978
979
980
def test_from_arrow_map_containing_timestamp_23658() -> None:
    """Maps with timestamp values inside list<struct> convert correctly (23658).

    The Map type becomes List(Struct{key, value}); null map values and empty
    maps must be preserved.
    """
    arrow_tbl = pa.Table.from_pydict(
        {
            "column_1": [
                [
                    {
                        "field_1": [
                            {"key": 1, "value": datetime(2025, 1, 1)},
                            {"key": 2, "value": datetime(2025, 1, 2)},
                            {"key": 2, "value": None},
                        ]
                    },
                    {"field_1": []},
                    None,
                ]
            ],
        },
        schema=pa.schema(
            [
                (
                    "column_1",
                    pa.list_(
                        pa.struct(
                            [
                                ("field_1", pa.map_(pa.int32(), pa.timestamp("ms"))),
                            ]
                        )
                    ),
                )
            ]
        ),
    )

    expect = pl.DataFrame(
        {
            "column_1": [
                [
                    {
                        "field_1": [
                            {"key": 1, "value": datetime(2025, 1, 1)},
                            {"key": 2, "value": datetime(2025, 1, 2)},
                            {"key": 2, "value": None},
                        ]
                    },
                    {"field_1": []},
                    None,
                ]
            ],
        },
        schema={
            "column_1": pl.List(
                pl.Struct(
                    {
                        "field_1": pl.List(
                            pl.Struct({"key": pl.Int32, "value": pl.Datetime("ms")})
                        )
                    }
                )
            )
        },
    )

    out = pl.DataFrame(arrow_tbl)

    assert_frame_equal(out, expect)
1045
1046
1047
def test_schema_constructor_from_schema_capsule() -> None:
    """pl.Schema accepts Arrow schemas/fields and rejects duplicates."""
    arrow_schema = pa.schema(
        [pa.field("test", pa.map_(pa.int32(), pa.timestamp("ms")))]
    )

    # Map converts to List(Struct{key, value})
    assert pl.Schema(arrow_schema) == {
        "test": pl.List(pl.Struct({"key": pl.Int32, "value": pl.Datetime("ms")}))
    }

    # duplicate names inside one Arrow schema are an error
    arrow_schema = pa.schema([pa.field("a", pa.int32()), pa.field("a", pa.int32())])

    with pytest.raises(
        DuplicateError,
        match="arrow schema contained duplicate name: a",
    ):
        pl.Schema(arrow_schema)

    # a single field is not a struct-typed schema source
    with pytest.raises(
        ValueError,
        match="object passed to pl.Schema did not return struct dtype: object: pyarrow.Field<a: int32>, dtype: Int32",
    ):
        pl.Schema(pa.field("a", pa.int32()))

    # an iterable of fields works...
    assert pl.Schema([pa.field("a", pa.int32()), pa.field("b", pa.string())]) == {
        "a": pl.Int32,
        "b": pl.String,
    }

    # ...unless it contains duplicate names
    with pytest.raises(
        DuplicateError,
        match="iterable passed to pl.Schema contained duplicate name 'a'",
    ):
        pl.Schema([pa.field("a", pa.int32()), pa.field("a", pa.int64())])
1080
1081
1082
def test_to_arrow_24142() -> None:
    """Object-dtype columns must not crash to_arrow at the oldest compat level."""
    frame = pl.DataFrame({"a": object(), "b": "any string or bytes"})
    frame.to_arrow(compat_level=CompatLevel.oldest())
1085
1086
1087
def test_comprehensive_pycapsule_interface() -> None:
    """Test all data types via Arrow C Stream PyCapsule interface."""
    from datetime import date, datetime, time, timedelta
    from decimal import Decimal

    # minimal wrapper exposing ONLY __arrow_c_stream__, so conversion cannot
    # take any polars-specific fast path
    class PyCapsuleStreamWrap:
        def __init__(self, v: Any) -> None:
            self.capsule = v.__arrow_c_stream__()

        def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
            return self.capsule

    def roundtrip_series_pycapsule(s: pl.Series) -> pl.Series:
        # export the series as a capsule and re-import it
        return pl.Series(PyCapsuleStreamWrap(s))

    # one column per supported dtype, each with a null where the dtype allows
    df = pl.DataFrame(
        {
            "bool": [True, False, None],
            "int8": pl.Series([1, 2, None], dtype=pl.Int8),
            "int16": pl.Series([1, 2, None], dtype=pl.Int16),
            "int32": pl.Series([1, 2, None], dtype=pl.Int32),
            "int64": pl.Series([1, 2, None], dtype=pl.Int64),
            "uint8": pl.Series([1, 2, None], dtype=pl.UInt8),
            "uint16": pl.Series([1, 2, None], dtype=pl.UInt16),
            "uint32": pl.Series([1, 2, None], dtype=pl.UInt32),
            "uint64": pl.Series([1, 2, None], dtype=pl.UInt64),
            "float32": pl.Series([1.1, 2.2, None], dtype=pl.Float32),
            "float64": pl.Series([1.1, 2.2, None], dtype=pl.Float64),
            "string": ["hello", "world", None],
            "binary": [b"hello", b"world", None],
            "decimal": pl.Series(
                [Decimal("1.23"), Decimal("4.56"), None], dtype=pl.Decimal(10, 2)
            ),
            "date": [date(2023, 1, 1), date(2023, 1, 2), None],
            "datetime": [
                datetime(2023, 1, 1, 12, 0),
                datetime(2023, 1, 2, 13, 30),
                None,
            ],
            "time": [time(12, 0, 0), time(13, 30, 0), None],
            "duration_us": pl.Series(
                [timedelta(days=1), timedelta(hours=2), None], dtype=pl.Duration("us")
            ),
            "duration_ms": pl.Series(
                [timedelta(milliseconds=100), timedelta(microseconds=500), None],
                dtype=pl.Duration("ms"),
            ),
            "duration_ns": pl.Series(
                [timedelta(seconds=1), timedelta(microseconds=1000), None],
                dtype=pl.Duration("ns"),
            ),
            "categorical": pl.Series(
                ["apple", "banana", "apple"], dtype=pl.Categorical
            ),
            "list_duration": [
                [timedelta(days=1), timedelta(hours=2)],
                [timedelta(minutes=30)],
                None,
            ],
            "struct_with_duration": [
                {"x": timedelta(days=1), "y": 1},
                {"x": timedelta(hours=2), "y": 2},
                None,
            ],
        }
    ).cast(
        # fix the nested dtypes explicitly so the roundtrip comparison is exact
        {
            "list_duration": pl.List(pl.Duration("us")),
            "struct_with_duration": pl.Struct({"x": pl.Duration("ns"), "y": pl.Int32}),
        }
    )

    # per-column roundtrip through the capsule interface
    df_roundtrip = df.map_columns(pl.selectors.all(), roundtrip_series_pycapsule)

    assert_frame_equal(df_roundtrip, df)

    # whole-frame roundtrip through the capsule interface
    df_roundtrip_direct = pl.DataFrame(PyCapsuleStreamWrap(df))

    assert_frame_equal(df_roundtrip_direct, df)
1166
1167