Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interop/test_interop.py
6939 views
1
from __future__ import annotations
2
3
from datetime import date, datetime, time, timedelta, timezone
4
from typing import Any, cast
5
6
import numpy as np
7
import pandas as pd
8
import pyarrow as pa
9
import pytest
10
11
import polars as pl
12
from polars.exceptions import ComputeError, DuplicateError, UnstableWarning
13
from polars.interchange.protocol import CompatLevel
14
from polars.testing import assert_frame_equal, assert_series_equal
15
from tests.unit.utils.pycapsule_utils import PyCapsuleStreamHolder
16
17
18
def test_arrow_list_roundtrip() -> None:
    """Round-trip a table containing a list column through polars.

    Regression test for https://github.com/pola-rs/polars/issues/1064.
    """
    source = pa.table({"a": [1], "b": [[1, 2]]})
    roundtripped = pl.from_arrow(source).to_arrow()

    assert roundtripped.shape == source.shape
    assert roundtripped.schema.names == source.schema.names
    for result_col, source_col in zip(roundtripped.columns, source.columns):
        assert result_col.to_pylist() == source_col.to_pylist()
27
28
29
def test_arrow_null_roundtrip() -> None:
    """Null-typed scalar and list columns must survive an Arrow round-trip."""
    source = pa.table({"a": [None, None], "b": [[None, None], [None, None]]})
    frame = pl.from_arrow(source)

    if isinstance(frame, pl.DataFrame):
        # arrow null -> polars Null; list<null> -> List(Null)
        assert frame.dtypes == [pl.Null, pl.List(pl.Null)]

    roundtripped = frame.to_arrow()

    assert roundtripped.shape == source.shape
    assert roundtripped.schema.names == source.schema.names
    for result_col, source_col in zip(roundtripped.columns, source.columns):
        assert result_col.to_pylist() == source_col.to_pylist()
42
43
44
def test_arrow_empty_dataframe() -> None:
    """Empty (0x0 and 0-row) frames convert to/from Arrow without loss."""
    # 0x0 dataframe
    empty_df = pl.DataFrame({})
    empty_tbl = pa.table({})
    assert empty_df.to_arrow() == empty_tbl
    roundtripped = cast(pl.DataFrame, pl.from_arrow(empty_df.to_arrow()))
    assert_frame_equal(roundtripped, empty_df)

    # 0 row dataframe: the schema must still round-trip
    typed_df = pl.DataFrame({}, schema={"a": pl.Int32})
    typed_tbl = pa.Table.from_batches([], pa.schema([pa.field("a", pa.int32())]))
    assert typed_df.to_arrow() == typed_tbl
    roundtripped = cast(pl.DataFrame, pl.from_arrow(typed_df.to_arrow()))
    assert roundtripped.schema == {"a": pl.Int32}
    assert roundtripped.shape == (0, 1)
59
60
61
def test_arrow_dict_to_polars() -> None:
    """A dictionary-encoded Arrow array decodes to a plain string Series."""
    dictionary_values = np.array(["AAA", "BBB", "CCC", "DDD"])
    encoded = pa.DictionaryArray.from_arrays(
        indices=np.array([0, 1, 2, 3, 1, 0, 2, 3, 3, 2]),
        dictionary=dictionary_values,
    ).cast(pa.large_utf8())

    expected = pl.Series(
        name="pa_dict",
        values=["AAA", "BBB", "CCC", "DDD", "BBB", "AAA", "CCC", "DDD", "DDD", "CCC"],
    )
    assert_series_equal(expected, pl.Series("pa_dict", encoded))
72
73
74
def test_arrow_list_chunked_array() -> None:
    """A chunked list array converts to a polars List series."""
    chunk = pa.array([[1, 2], [3, 4]])
    chunked = pa.chunked_array([chunk, chunk, chunk])
    series = cast(pl.Series, pl.from_arrow(chunked))
    assert series.dtype == pl.List
79
80
81
def test_arrow_array_logical() -> None:
    """Polars converts Array columns of logical types correctly to Arrow."""
    # polars emits large strings and uint8 dictionary indices, so cast the
    # pyarrow reference data to match before comparing
    enum_values = (
        pa.array(["a", "b", "c", "d"])
        .dictionary_encode()
        .cast(pa.dictionary(pa.uint8(), pa.large_string()))
    )
    expected_enum_array = pa.FixedSizeListArray.from_arrays(enum_values, 2)

    enum_series = pl.Series(
        values=[["a", "b"], ["c", "d"]],
        dtype=pl.Array(pl.Enum(["a", "b", "c", "d"]), shape=2),
    )
    assert enum_series.to_arrow() == expected_enum_array

    date_values = pa.array([date(2024, 1, 1), date(2024, 1, 2)])
    expected_date_array = pa.FixedSizeListArray.from_arrays(date_values, 1)

    date_series = pl.Series(
        values=[[date(2024, 1, 1)], [date(2024, 1, 2)]],
        dtype=pl.Array(pl.Date, shape=1),
    )
    assert date_series.to_arrow() == expected_date_array
105
106
107
def test_from_dict() -> None:
    """`pl.from_dict` builds one column per key, preserving values."""
    df = pl.from_dict({"a": [1, 2], "b": [3, 4]})
    assert df.shape == (2, 2)
    expected_columns = [pl.Series("a", [1, 2]), pl.Series("b", [3, 4])]
    for actual, expected in zip(list(df), expected_columns):
        assert_series_equal(actual, expected)
113
114
115
def test_from_dict_struct() -> None:
    """A nested dict value becomes a Struct column."""
    nested_data: dict[str, dict[str, list[int]] | list[int]] = {
        "a": {"b": [1, 3], "c": [2, 4]},
        "d": [5, 6],
    }
    df = pl.from_dict(nested_data)

    assert df.shape == (2, 2)
    # each row of "a" is a struct assembled field-wise from the inner lists
    assert df["a"][0] == {"b": 1, "c": 2}
    assert df["a"][1] == {"b": 3, "c": 4}
    assert df.schema == {"a": pl.Struct({"b": pl.Int64, "c": pl.Int64}), "d": pl.Int64}
125
126
127
def test_from_dicts() -> None:
    """`pl.from_dicts` infers the schema across rows and keeps None values."""
    rows = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": None}]
    df = pl.from_dicts(rows)  # type: ignore[arg-type]

    assert df.shape == (3, 2)
    assert df.rows() == [(1, 4), (2, 5), (3, None)]
    assert df.schema == {"a": pl.Int64, "b": pl.Int64}
133
134
135
def test_from_dict_no_inference() -> None:
    """With inference disabled, schema_overrides fully determines dtypes."""
    schema = {"a": pl.String}
    rows = [{"a": "aa"}]
    df = pl.from_dicts(rows, schema_overrides=schema, infer_schema_length=0)

    assert df.schema == schema
    assert df.to_dicts() == rows
141
142
143
def test_from_dicts_schema_override() -> None:
    """An explicit schema wins over inference for any infer_schema_length.

    Rows missing schema columns yield nulls; extra keys ("d", "e") that are
    not in the schema are dropped.
    """
    schema = {
        "a": pl.String,
        "b": pl.Int64,
        "c": pl.List(pl.Struct({"x": pl.Int64, "y": pl.String, "z": pl.Float64})),
    }

    # initial data matches the expected schema
    data1 = [
        {
            "a": "l",
            "b": i,
            "c": [{"x": (j + 2), "y": "?", "z": (j % 2)} for j in range(2)],
        }
        for i in range(5)
    ]

    # extend with a mix of fields that are/not in the schema
    data2 = [{"b": i + 5, "d": "ABC", "e": "DEF"} for i in range(5)]

    # the result must be identical regardless of how many rows inference sees
    for n_infer in (0, 3, 5, 8, 10, 100):
        df = pl.DataFrame(
            data=(data1 + data2),
            schema=schema,  # type: ignore[arg-type]
            infer_schema_length=n_infer,
        )
        assert df.schema == schema
        assert df.rows() == [
            ("l", 0, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 1, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 2, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 3, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 4, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            (None, 5, None),
            (None, 6, None),
            (None, 7, None),
            (None, 8, None),
            (None, 9, None),
        ]
182
183
184
def test_from_dicts_struct() -> None:
    """Struct columns built from dicts of dicts; struct fields unify across rows."""
    data = [{"a": {"b": 1, "c": 2}, "d": 5}, {"a": {"b": 3, "c": 4}, "d": 6}]
    df = pl.from_dicts(data)
    assert df.shape == (2, 2)
    assert df["a"][0] == {"b": 1, "c": 2}
    assert df["a"][1] == {"b": 3, "c": 4}

    # 5649: structs in lists with differing fields are unified; missing
    # fields are filled with None (note the resulting field order)
    assert pl.from_dicts([{"a": [{"x": 1}]}, {"a": [{"y": 1}]}]).to_dict(
        as_series=False
    ) == {"a": [[{"y": None, "x": 1}], [{"y": 1, "x": None}]]}
    assert pl.from_dicts([{"a": [{"x": 1}, {"y": 2}]}, {"a": [{"y": 1}]}]).to_dict(
        as_series=False
    ) == {"a": [[{"y": None, "x": 1}, {"y": 2, "x": None}], [{"y": 1, "x": None}]]}
198
199
200
def test_from_records() -> None:
    """Column-oriented records map to columns; rows are read across lists."""
    columns = [[1, 2, 3], [4, 5, 6]]
    df = pl.from_records(columns, schema=["a", "b"])

    assert df.shape == (3, 2)
    assert df.rows() == [(1, 4), (2, 5), (3, 6)]
205
206
207
# https://github.com/pola-rs/polars/issues/15195
@pytest.mark.parametrize(
    "input",
    [
        pl.Series([1, 2]),
        pl.Series([{"a": 1, "b": 2}]),
        pl.DataFrame({"a": [1, 2], "b": [3, 4]}),
    ],
)
def test_from_records_non_sequence_input(input: Any) -> None:
    """Non-sequence inputs (Series/DataFrame) must be rejected with TypeError."""
    with pytest.raises(TypeError, match="expected data of type Sequence"):
        pl.from_records(input)
219
220
221
def test_from_arrow() -> None:
    """Basic table conversion, bad-input rejection, and schema overrides."""
    table = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
    result = pl.from_arrow(table)
    assert result.shape == (3, 2)
    assert result.rows() == [(1, 4), (2, 5), (3, 6)]  # type: ignore[union-attr]

    # if not a PyArrow type, raise a TypeError
    with pytest.raises(TypeError):
        _ = pl.from_arrow([1, 2])

    overridden = pl.from_arrow(
        table, schema=["a", "b"], schema_overrides={"a": pl.UInt32, "b": pl.UInt64}
    )
    assert overridden.rows() == [(1, 4), (2, 5), (3, 6)]  # type: ignore[union-attr]
    assert overridden.schema == {"a": pl.UInt32, "b": pl.UInt64}  # type: ignore[union-attr]
236
237
238
def test_from_arrow_with_bigquery_metadata() -> None:
    """BigQuery-style ARROW:extension:name field metadata must not break conversion."""
    arrow_schema = pa.schema(
        [
            pa.field("id", pa.int64()).with_metadata(
                {"ARROW:extension:name": "google:sqlType:integer"}
            ),
            pa.field(
                "misc",
                pa.struct([("num", pa.int32()), ("val", pa.string())]),
            ).with_metadata({"ARROW:extension:name": "google:sqlType:struct"}),
        ]
    )
    arrow_tbl = pa.Table.from_pylist(
        [{"id": 1, "misc": None}, {"id": 2, "misc": None}],
        schema=arrow_schema,
    )

    # null structs unnest into all-null fields with the declared dtypes
    expected_data = {"id": [1, 2], "num": [None, None], "val": [None, None]}
    expected_schema = {"id": pl.Int64, "num": pl.Int32, "val": pl.String}
    assert_frame_equal(
        pl.DataFrame(expected_data, schema=expected_schema),
        pl.from_arrow(arrow_tbl).unnest("misc"),  # type: ignore[union-attr]
    )
261
262
263
def test_from_optional_not_available() -> None:
    """Lazy proxy modules raise a helpful ImportError on attribute use.

    Deliberately shadows the module-level np/pa/pd aliases with proxies that
    simulate the dependency being absent.
    """
    from polars.dependencies import _LazyModule

    # proxy module is created dynamically if the required module is not available
    # (see the polars.dependencies source code for additional detail/comments)

    np = _LazyModule("numpy", module_available=False)
    with pytest.raises(ImportError, match=r"np\.array requires 'numpy'"):
        pl.from_numpy(np.array([[1, 2], [3, 4]]), schema=["a", "b"])

    pa = _LazyModule("pyarrow", module_available=False)
    with pytest.raises(ImportError, match=r"pa\.table requires 'pyarrow'"):
        pl.from_arrow(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))

    pd = _LazyModule("pandas", module_available=False)
    with pytest.raises(ImportError, match=r"pd\.Series requires 'pandas'"):
        pl.from_pandas(pd.Series([1, 2, 3]))
280
281
282
def test_upcast_pyarrow_dicts() -> None:
    """Concatenating >127 int8-indexed dictionary chunks must not overflow.

    Regression test for https://github.com/pola-rs/polars/issues/1752.
    """
    dict_type = pa.dictionary(pa.int8(), pa.string())
    chunks = [
        pa.table({"col_name": pa.array([f"value_{i}"], dict_type)})
        for i in range(128)
    ]

    combined = pa.concat_tables(chunks, promote_options="default")
    result = cast(pl.DataFrame, pl.from_arrow(combined))

    assert result.shape == (128, 1)
    assert result["col_name"][0] == "value_0"
    assert result["col_name"][127] == "value_127"
300
301
302
def test_no_rechunk() -> None:
    """`rechunk=False` preserves the original chunking for tables and arrays."""
    chunked = pa.chunked_array([list("ab"), list("cd")])
    table = pa.Table.from_pydict({"x": chunked})

    # table
    assert pl.from_arrow(table, rechunk=False).n_chunks() == 2
    # chunked array
    assert pl.from_arrow(table["x"], rechunk=False).n_chunks() == 2
308
309
310
def test_from_empty_arrow() -> None:
    """Empty Arrow tables (via pandas and native) keep names and dtypes."""
    df = cast(pl.DataFrame, pl.from_arrow(pa.table(pd.DataFrame({"a": [], "b": []}))))
    assert df.columns == ["a", "b"]
    assert df.dtypes == [pl.Float64, pl.Float64]

    # 2705: the pandas index column is carried through unless dropped
    df1 = pd.DataFrame(columns=["b"], dtype=float, index=pd.Index([]))
    tbl = pa.Table.from_pandas(df1)
    out = cast(pl.DataFrame, pl.from_arrow(tbl))
    assert out.columns == ["b", "__index_level_0__"]
    assert out.dtypes == [pl.Float64, pl.Null]
    tbl = pa.Table.from_pandas(df1, preserve_index=False)
    out = cast(pl.DataFrame, pl.from_arrow(tbl))
    assert out.columns == ["b"]
    assert out.dtypes == [pl.Float64]

    # 4568: large_list element type survives even with zero rows
    tbl = pa.table({"l": []}, schema=pa.schema([("l", pa.large_list(pa.uint8()))]))

    df = cast(pl.DataFrame, pl.from_arrow(tbl))
    assert df.schema["l"] == pl.List(pl.UInt8)
331
332
333
def test_cat_int_types_3500() -> None:
    """Dictionary arrays convert to Categorical for both int8 and uint8 indices."""
    # Create an enum / categorical / dictionary typed pyarrow array
    # Most simply done by creating a pandas categorical series first
    categorical_s = pd.Series(["a", "a", "b"], dtype="category")
    pyarrow_array = pa.Array.from_pandas(categorical_s)

    # The in-memory representation of each category can either be a signed or
    # unsigned 8-bit integer. Pandas uses Int8...
    int_dict_type = pa.dictionary(index_type=pa.int8(), value_type=pa.utf8())
    # ... while DuckDB uses UInt8
    uint_dict_type = pa.dictionary(index_type=pa.uint8(), value_type=pa.utf8())

    for t in [int_dict_type, uint_dict_type]:
        s = cast(pl.Series, pl.from_arrow(pyarrow_array.cast(t)))
        assert_series_equal(
            s, pl.Series(["a", "a", "b"]).cast(pl.Categorical), check_names=False
        )
350
351
352
def test_from_pyarrow_chunked_array() -> None:
    """Chunked arrays flatten into a single Series in chunk order."""
    chunked = pa.chunked_array([[1], [2]])
    result = pl.Series("column", chunked)
    assert result.to_list() == [1, 2]
356
357
358
def test_arrow_list_null_5697() -> None:
    """list[null] columns round-trip with the correct schema (issue 5697)."""
    # Create a pyarrow table with a list[null] column.
    table = pa.table([[[None]]], names=["mycol"])
    frame = pl.from_arrow(table)
    table = frame.to_arrow()
    # back to polars to verify the schema survived the round-trip
    assert pl.from_arrow(table).schema == {"mycol": pl.List(pl.Null)}  # type: ignore[union-attr]
365
366
367
def test_from_pyarrow_map() -> None:
    """Arrow Map columns convert to List(Struct{key, value})."""
    pa_table = pa.table(
        [[1, 2], [[("a", "something")], [("a", "else"), ("b", "another key")]]],
        schema=pa.schema(
            [("idx", pa.int16()), ("mapping", pa.map_(pa.string(), pa.string()))]
        ),
    )

    # Convert from an empty table to trigger an ArrowSchema -> native schema
    # conversion (checks that ArrowDataType::Map is handled in Rust).
    pl.DataFrame(pa_table.slice(0, 0))

    result = pl.DataFrame(pa_table)
    assert result.to_dict(as_series=False) == {
        "idx": [1, 2],
        "mapping": [
            [{"key": "a", "value": "something"}],
            [{"key": "a", "value": "else"}, {"key": "b", "value": "another key"}],
        ],
    }
387
388
389
def test_from_fixed_size_binary_list() -> None:
    """Fixed-size binary list elements convert to plain (variable) Binary."""
    values = [[b"63A0B1C66575DD5708E1EB2B"]]
    fixed_binary_list = pa.array(values, type=pa.list_(pa.binary(24)))
    series = cast(pl.Series, pl.from_arrow(fixed_binary_list))

    assert series.dtype == pl.List(pl.Binary)
    assert series.to_list() == values
395
396
397
def test_dataframe_from_repr() -> None:
    """`pl.from_repr` parses DataFrame table reprs back into frames.

    Covers: full round-trip of many dtypes, empty frames (with and without
    dtype rows), blank/non-standard nulls, old-style ellipsis columns,
    commented-out reprs, and reprs embedded in console session noise.
    NOTE: the repr fixture strings below are byte-exact parser inputs —
    do not reformat them.
    """
    # round-trip various types
    frame = (
        pl.LazyFrame(
            {
                "a": [1, 2, None],
                "b": [4.5, 5.5, 6.5],
                "c": ["x", "y", "z"],
                "d": [True, False, True],
                "e": [None, "", None],
                "f": [date(2022, 7, 5), date(2023, 2, 5), date(2023, 8, 5)],
                "g": [time(0, 0, 0, 1), time(12, 30, 45), time(23, 59, 59, 999000)],
                "h": [
                    datetime(2022, 7, 5, 10, 30, 45, 4560),
                    datetime(2023, 10, 12, 20, 3, 8, 11),
                    None,
                ],
            },
        )
        .with_columns(
            pl.col("c").cast(pl.Categorical),
            pl.col("h").cast(pl.Datetime("ns")),
        )
        .collect()
    )

    assert frame.schema == {
        "a": pl.Int64,
        "b": pl.Float64,
        "c": pl.Categorical(ordering="lexical"),
        "d": pl.Boolean,
        "e": pl.String,
        "f": pl.Date,
        "g": pl.Time,
        "h": pl.Datetime("ns"),
    }
    df = cast(pl.DataFrame, pl.from_repr(repr(frame)))
    assert_frame_equal(frame, df)

    # empty frame; confirm schema is inferred
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            ┌─────┬─────┬─────┬─────┬─────┬───────┐
            │ id  ┆ q1  ┆ q2  ┆ q3  ┆ q4  ┆ total │
            │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ ---   │
            │ str ┆ i8  ┆ i16 ┆ i32 ┆ i64 ┆ f64   │
            ╞═════╪═════╪═════╪═════╪═════╪═══════╡
            └─────┴─────┴─────┴─────┴─────┴───────┘
            """
        ),
    )
    assert df.shape == (0, 6)
    assert df.rows() == []
    assert df.schema == {
        "id": pl.String,
        "q1": pl.Int8,
        "q2": pl.Int16,
        "q3": pl.Int32,
        "q4": pl.Int64,
        "total": pl.Float64,
    }

    # empty frame with no dtypes: columns default to String
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            ┌──────┬───────┐
            │ misc ┆ other │
            ╞══════╪═══════╡
            └──────┴───────┘
            """
        ),
    )
    assert_frame_equal(df, pl.DataFrame(schema={"misc": pl.String, "other": pl.String}))

    # empty frame with a non-standard/blank 'null' in numeric col
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            ┌─────┬──────┐
            │ c1  ┆ c2   │
            │ --- ┆ ---  │
            │ i32 ┆ f64  │
            ╞═════╪══════╡
            │     ┆ NULL │
            └─────┴──────┘
            """
        ),
    )
    assert_frame_equal(
        df,
        pl.DataFrame(
            data=[(None, None)],
            schema={"c1": pl.Int32, "c2": pl.Float64},
            orient="row",
        ),
    )

    # old-style "..." column ellipsis (elided cols are skipped), commented out
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            # >>> Missing cols with old-style ellipsis, nulls, commented out
            # ┌────────────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬──────┐
            # │ dt         ┆ c1  ┆ c2  ┆ c3  ┆ ... ┆ c96 ┆ c97 ┆ c98 ┆ c99  │
            # │ ---        ┆ --- ┆ --- ┆ --- ┆     ┆ --- ┆ --- ┆ --- ┆ ---  │
            # │ date       ┆ i32 ┆ i32 ┆ i32 ┆     ┆ i64 ┆ i64 ┆ i64 ┆ i64  │
            # ╞════════════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪══════╡
            # │ 2023-03-25 ┆ 1   ┆ 2   ┆ 3   ┆ ... ┆ 96  ┆ 97  ┆ 98  ┆ 99   │
            # │ 1999-12-31 ┆ 3   ┆ 6   ┆ 9   ┆ ... ┆ 288 ┆ 291 ┆ 294 ┆ null │
            # │ null       ┆ 9   ┆ 18  ┆ 27  ┆ ... ┆ 864 ┆ 873 ┆ 882 ┆ 891  │
            # └────────────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴──────┘
            """
        ),
    )
    assert df.schema == {
        "dt": pl.Date,
        "c1": pl.Int32,
        "c2": pl.Int32,
        "c3": pl.Int32,
        "c96": pl.Int64,
        "c97": pl.Int64,
        "c98": pl.Int64,
        "c99": pl.Int64,
    }
    assert df.rows() == [
        (date(2023, 3, 25), 1, 2, 3, 96, 97, 98, 99),
        (date(1999, 12, 31), 3, 6, 9, 288, 291, 294, None),
        (None, 9, 18, 27, 864, 873, 882, 891),
    ]

    # no dtype row: dtypes inferred from the values
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            # >>> no dtypes:
            # ┌────────────┬──────┐
            # │ dt         ┆ c99  │
            # ╞════════════╪══════╡
            # │ 2023-03-25 ┆ 99   │
            # │ 1999-12-31 ┆ null │
            # │ null       ┆ 891  │
            # └────────────┴──────┘
            """
        ),
    )
    assert df.schema == {"dt": pl.Date, "c99": pl.Int64}
    assert df.rows() == [
        (date(2023, 3, 25), 99),
        (date(1999, 12, 31), None),
        (None, 891),
    ]

    # repr embedded in IPython session text, with rounded corners, wrapped
    # column names, row ellipsis, and a tz-aware datetime column
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            In [2]: with pl.Config() as cfg:
               ...:     pl.Config.set_tbl_formatting("UTF8_FULL", rounded_corners=True)
               ...:     print(df)
               ...:
            shape: (1, 5)
            ╭───────────┬────────────┬───┬───────┬────────────────────────────────╮
            │ source_ac ┆ source_cha ┆ … ┆ ident ┆ timestamp                      │
            │ tor_id    ┆ nnel_id    ┆   ┆ ---   ┆ ---                            │
            │ ---       ┆ ---        ┆   ┆ str   ┆ datetime[μs, Asia/Tokyo]       │
            │ i32       ┆ i64        ┆   ┆       ┆                                │
            ╞═══════════╪════════════╪═══╪═══════╪════════════════════════════════╡
            │ 123456780 ┆ 9876543210 ┆ … ┆ a:b:c ┆ 2023-03-25 10:56:59.663053 JST │
            ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
            │ …         ┆ …          ┆ … ┆ …     ┆ …                              │
            ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
            │ 803065983 ┆ 2055938745 ┆ … ┆ x:y:z ┆ 2023-03-25 12:38:18.050545 JST │
            ╰───────────┴────────────┴───┴───────┴────────────────────────────────╯
            # "Een fluitje van een cent..." :)
            """
        ),
    )
    assert df.shape == (2, 4)
    assert df.schema == {
        "source_actor_id": pl.Int32,
        "source_channel_id": pl.Int64,
        "ident": pl.String,
        "timestamp": pl.Datetime("us", "Asia/Tokyo"),
    }
586
587
588
def test_dataframe_from_repr_24110() -> None:
    """Duration reprs parse signed offsets and both micro-sign variants.

    Covers "-2h", "0µs" (micro sign), "2h", and "+2h"; issue 24110.
    """
    df = cast(
        pl.DataFrame,
        pl.from_repr("""
            shape: (7, 1)
            ┌──────────────┐
            │ time_offset  │
            │ ---          │
            │ duration[μs] │
            ╞══════════════╡
            │ -2h          │
            │ 0µs          │
            │ 2h           │
            │ +2h          │
            └──────────────┘
        """),
    )
    expected = pl.DataFrame(
        {
            "time_offset": [
                timedelta(hours=-2),
                timedelta(),
                timedelta(hours=2),
                timedelta(hours=2),
            ]
        },
        schema={"time_offset": pl.Duration("us")},
    )
    assert_frame_equal(df, expected)
617
618
619
def test_dataframe_from_duckdb_repr() -> None:
    """DuckDB-style table reprs (dtype names like int32/varchar/int128) parse too."""
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            # misc streaming stats
            ┌────────────┬───────┬───────────────────┬───┬────────────────┬───────────────────┐
            │   As Of    │ Rank  │ Year to Date Rank │ … │ Days In Top 10 │ Streaming Seconds │
            │    date    │ int32 │      varchar      │   │     int16      │      int128       │
            ├────────────┼───────┼───────────────────┼───┼────────────────┼───────────────────┤
            │ 2025-05-09 │     1 │ 1                 │ … │             29 │  1864939402857430 │
            │ 2025-05-09 │     2 │ 2                 │ … │             15 │   658937443590045 │
            │ 2025-05-09 │     3 │ 3                 │ … │              9 │   267876522242076 │
            └────────────┴───────┴───────────────────┴───┴────────────────┴───────────────────┘
            """
        ),
    )
    expected = pl.DataFrame(
        {
            "As Of": [date(2025, 5, 9), date(2025, 5, 9), date(2025, 5, 9)],
            "Rank": [1, 2, 3],
            "Year to Date Rank": ["1", "2", "3"],
            "Days In Top 10": [29, 15, 9],
            "Streaming Seconds": [1864939402857430, 658937443590045, 267876522242076],
        },
        schema={
            "As Of": pl.Date,
            "Rank": pl.Int32,
            "Year to Date Rank": pl.String,
            "Days In Top 10": pl.Int16,
            "Streaming Seconds": pl.Int128,
        },
    )
    assert_frame_equal(expected, df)
653
654
655
def test_series_from_repr() -> None:
    """`pl.from_repr` parses Series reprs, including empty and float specials.

    NOTE: the repr fixture strings are byte-exact parser inputs — do not
    reformat them.
    """
    frame = (
        pl.LazyFrame(
            {
                "a": [1, 2, None],
                "b": [4.5, 5.5, 6.5],
                "c": ["x", "y", "z"],
                "d": [True, False, True],
                "e": [None, "", None],
                "f": [date(2022, 7, 5), date(2023, 2, 5), date(2023, 8, 5)],
                "g": [time(0, 0, 0, 1), time(12, 30, 45), time(23, 59, 59, 999000)],
                "h": [
                    datetime(2022, 7, 5, 10, 30, 45, 4560),
                    datetime(2023, 10, 12, 20, 3, 8, 11),
                    None,
                ],
            },
        )
        .with_columns(
            pl.col("c").cast(pl.Categorical),
            pl.col("h").cast(pl.Datetime("ns")),
        )
        .collect()
    )

    # every column's repr round-trips to an equal Series
    for col in frame.columns:
        s = cast(pl.Series, pl.from_repr(repr(frame[col])))
        assert_series_equal(s, frame[col])

    # repr embedded in console output; blank element line is skipped
    s = cast(
        pl.Series,
        pl.from_repr(
            """
            Out[3]:
            shape: (3,)
            Series: 's' [str]
            [
                "a"

                "c"
            ]
            """
        ),
    )
    assert_series_equal(s, pl.Series("s", ["a", "c"]))

    # empty series keeps name and dtype
    s = cast(
        pl.Series,
        pl.from_repr(
            """
            Series: 'flt' [f32]
            [
            ]
            """
        ),
    )
    assert_series_equal(s, pl.Series("flt", [], dtype=pl.Float32))

    # float specials (null/±inf/NaN); trailing prompt line is ignored
    s = cast(
        pl.Series,
        pl.from_repr(
            """
            Series: 'flt' [f64]
            [
                null
                +inf
                -inf
                inf
                0.0
                NaN
            ]
            >>> print("stuff")
            """
        ),
    )
    inf, nan = float("inf"), float("nan")
    assert_series_equal(
        s,
        pl.Series(
            name="flt",
            dtype=pl.Float64,
            values=[None, inf, -inf, inf, 0.0, nan],
        ),
    )
739
740
741
def test_dataframe_from_repr_custom_separators() -> None:
    """Reprs using '.' digit grouping and ',' decimal separators still parse."""
    # repr created with custom digit-grouping
    # and non-default group/decimal separators
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            ┌───────────┬────────────┐
            │ x         ┆ y          │
            │ ---       ┆ ---        │
            │ i32       ┆ f64        │
            ╞═══════════╪════════════╡
            │ 123.456   ┆ -10.000,55 │
            │ -9.876    ┆ 10,0       │
            │ 9.999.999 ┆ 8,5e8      │
            └───────────┴────────────┘
            """
        ),
    )
    assert_frame_equal(
        df,
        pl.DataFrame(
            {
                "x": [123456, -9876, 9999999],
                "y": [-10000.55, 10.0, 850000000.0],
            },
            schema={"x": pl.Int32, "y": pl.Float64},
        ),
    )
770
771
772
def test_sliced_struct_from_arrow() -> None:
    """Slicing a struct column before FFI must respect the slice offset."""
    # build a 3-row table with a single struct column
    struct_column = pa.StructArray.from_arrays(
        arrays=[
            pa.array([1, 2, 3], pa.int32()),
            pa.array(["foo", "bar", "baz"], pa.utf8()),
        ],
        names=["a", "b"],
    )
    tbl = pa.Table.from_arrays(arrays=[struct_column], names=["struct_col"])

    # slice away the first row; FFI must not re-read it
    sliced = cast(pl.DataFrame, pl.from_arrow(tbl.slice(1, 2)))
    assert sliced.to_dict(as_series=False) == {
        "struct_col": [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}]
    }

    single = cast(pl.DataFrame, pl.from_arrow(tbl.slice(1, 1)))
    assert single.to_dict(as_series=False) == {"struct_col": [{"a": 2, "b": "bar"}]}
796
797
798
def test_from_arrow_invalid_time_zone() -> None:
    """An unparseable timestamp time zone raises ComputeError on conversion."""
    bad_tz_array = pa.array(
        [datetime(2021, 1, 1, 0, 0, 0, 0)],
        type=pa.timestamp("ns", tz="this-is-not-a-time-zone"),
    )
    expected_msg = r"unable to parse time zone: 'this-is-not-a-time-zone'"
    with pytest.raises(ComputeError, match=expected_msg):
        pl.from_arrow(bad_tz_array)
807
808
809
@pytest.mark.parametrize(
    ("fixed_offset", "etc_tz"),
    [
        ("+10:00", "Etc/GMT-10"),
        ("10:00", "Etc/GMT-10"),
        ("-10:00", "Etc/GMT+10"),
        ("+05:00", "Etc/GMT-5"),
        ("05:00", "Etc/GMT-5"),
        ("-05:00", "Etc/GMT+5"),
    ],
)
def test_from_arrow_fixed_offset(fixed_offset: str, etc_tz: str) -> None:
    """Fixed-offset Arrow time zones map to the equivalent Etc/GMT±N zone.

    Note the inverted sign convention of the Etc/ area (Etc/GMT-10 == +10:00).
    """
    arr = pa.array(
        [datetime(2021, 1, 1, 0, 0, 0, 0)],
        type=pa.timestamp("us", tz=fixed_offset),
    )
    result = cast(pl.Series, pl.from_arrow(arr))
    expected = pl.Series(
        [datetime(2021, 1, 1, tzinfo=timezone.utc)]
    ).dt.convert_time_zone(etc_tz)
    assert_series_equal(result, expected)
830
831
832
def test_from_avro_valid_time_zone_13032() -> None:
    """A '00:00' fixed offset normalizes to UTC (issue 13032)."""
    zero_offset_array = pa.array(
        [datetime(2021, 1, 1, 0, 0, 0, 0)], type=pa.timestamp("ns", tz="00:00")
    )
    result = cast(pl.Series, pl.from_arrow(zero_offset_array))
    expected = pl.Series([datetime(2021, 1, 1)], dtype=pl.Datetime("ns", "UTC"))
    assert_series_equal(result, expected)
839
840
841
def test_from_numpy_different_resolution_15991() -> None:
    """datetime64[ns] input is losslessly converted when us is requested."""
    ns_values = np.array(["2020-01-01"], dtype="datetime64[ns]")
    result = pl.Series(ns_values, dtype=pl.Datetime("us"))

    expected = pl.Series([datetime(2020, 1, 1)], dtype=pl.Datetime("us"))
    assert_series_equal(result, expected)
847
848
849
def test_from_numpy_different_resolution_invalid() -> None:
    """Unsupported resolution conversion (seconds -> us) asks the user to cast."""
    second_values = np.array(["2020-01-01"], dtype="datetime64[s]")
    with pytest.raises(ValueError, match="Please cast"):
        pl.Series(second_values, dtype=pl.Datetime("us"))
854
855
856
def test_compat_level(monkeypatch: pytest.MonkeyPatch) -> None:
    """CompatLevel controls string/binary Arrow types and IPC output size.

    The hard-coded versions and byte sizes below are intentionally brittle:
    # change these if compat level bumped
    """
    monkeypatch.setenv("POLARS_WARN_UNSTABLE", "1")
    oldest = CompatLevel.oldest()
    assert oldest is CompatLevel.oldest()  # test singleton
    assert oldest._version == 0
    # newest() is unstable API and must warn when POLARS_WARN_UNSTABLE is set
    with pytest.warns(UnstableWarning):
        newest = CompatLevel.newest()
    assert newest is CompatLevel.newest()
    assert newest._version == 1

    str_col = pl.Series(["awd"])
    bin_col = pl.Series([b"dwa"])
    assert str_col._newest_compat_level() == newest._version
    # default to_arrow uses the oldest-compatible (large_*) types;
    # newest switches to the view types
    assert isinstance(str_col.to_arrow(), pa.LargeStringArray)
    assert isinstance(str_col.to_arrow(compat_level=oldest), pa.LargeStringArray)
    assert isinstance(str_col.to_arrow(compat_level=newest), pa.StringViewArray)
    assert isinstance(bin_col.to_arrow(), pa.LargeBinaryArray)
    assert isinstance(bin_col.to_arrow(compat_level=oldest), pa.LargeBinaryArray)
    assert isinstance(bin_col.to_arrow(compat_level=newest), pa.BinaryViewArray)

    # same behavior at the DataFrame level, observed through scalars
    df = pl.DataFrame({"str_col": str_col, "bin_col": bin_col})
    assert isinstance(df.to_arrow()["str_col"][0], pa.LargeStringScalar)
    assert isinstance(
        df.to_arrow(compat_level=oldest)["str_col"][0], pa.LargeStringScalar
    )
    assert isinstance(
        df.to_arrow(compat_level=newest)["str_col"][0], pa.StringViewScalar
    )
    assert isinstance(df.to_arrow()["bin_col"][0], pa.LargeBinaryScalar)
    assert isinstance(
        df.to_arrow(compat_level=oldest)["bin_col"][0], pa.LargeBinaryScalar
    )
    assert isinstance(
        df.to_arrow(compat_level=newest)["bin_col"][0], pa.BinaryViewScalar
    )

    # exact serialized sizes: view types (default/newest) are smaller
    assert len(df.write_ipc(None).getbuffer()) == 738
    assert len(df.write_ipc(None, compat_level=oldest).getbuffer()) == 866
    assert len(df.write_ipc(None, compat_level=newest).getbuffer()) == 738
    assert len(df.write_ipc_stream(None).getbuffer()) == 520
    assert len(df.write_ipc_stream(None, compat_level=oldest).getbuffer()) == 648
    assert len(df.write_ipc_stream(None, compat_level=newest).getbuffer()) == 520
899
900
901
def test_df_pycapsule_interface() -> None:
    """A DataFrame exposed via the Arrow C Stream PyCapsule round-trips."""
    df = pl.DataFrame(
        {
            "a": [1, 2, 3],
            "b": ["a", "b", "c"],
            # long strings so the data is not fully inlined in view types
            "c": ["fooooooooooooooooooooo", "bar", "looooooooooooooooong string"],
        }
    )

    # wrap so pyarrow/polars only see the __arrow_c_stream__ protocol
    capsule_df = PyCapsuleStreamHolder(df)
    out = pa.table(capsule_df)
    assert df.shape == out.shape
    assert df.schema.names() == out.schema.names

    schema_overrides = {"a": pl.Int128}
    expected_schema = pl.Schema([("a", pl.Int128), ("b", pl.String), ("c", pl.String)])

    # schema_overrides must apply whether reading the capsule directly or a table
    for arrow_obj in (
        pl.from_arrow(capsule_df),  # capsule
        out,  # table loaded from capsule
    ):
        df_res = pl.from_arrow(arrow_obj, schema_overrides=schema_overrides)
        assert expected_schema == df_res.schema  # type: ignore[union-attr]
        assert isinstance(df_res, pl.DataFrame)
        assert df.equals(df_res)
926
927
928
def test_misaligned_nested_arrow_19097() -> None:
    """Nested export with mismatched value/validity offsets must round-trip."""
    a = pl.Series("a", [1, 2, 3])
    a = a.slice(1, 2)  # by slicing we offset=1 the values
    a = a.replace(2, None)  # then we add a validity mask with offset=0
    a = a.reshape((2, 1))  # then we make it nested
    assert_series_equal(pl.Series("a", a.to_arrow()), a)
934
935
936
def test_arrow_roundtrip_lex_cat_20288() -> None:
    """Lexical Categorical ordering survives an Arrow round-trip (issue 20288)."""
    exported = (
        pl.Series("a", ["A", "B"], pl.Categorical(ordering="lexical"))
        .to_frame()
        .to_arrow()
    )
    reimported = pl.from_arrow(exported)
    assert isinstance(reimported, pl.DataFrame)

    dtype = reimported.schema["a"]
    assert isinstance(dtype, pl.Categorical)
    assert dtype.ordering == "lexical"
947
948
949
def test_from_arrow_20271() -> None:
    """A dictionary column inside a table converts to Categorical (issue 20271)."""
    dict_col = pa.DictionaryArray.from_arrays([0, 1], ["D", "E"])
    frame = pl.from_arrow(pa.table({"b": dict_col}))

    assert isinstance(frame, pl.DataFrame)
    assert_series_equal(frame.to_series(), pl.Series("b", ["D", "E"], pl.Categorical))
955
956
957
def test_to_arrow_empty_chunks_20627() -> None:
    """Filtering one chunk empty must not corrupt the Arrow export (issue 20627)."""
    two_chunks = pl.concat(2 * [pl.Series([1])])
    filtered = two_chunks.filter(pl.Series([False, True])).to_frame()
    assert filtered.to_arrow().shape == (1, 1)
960
961
962
def test_from_arrow_recorbatch() -> None:
    """A pyarrow RecordBatch constructs a DataFrame directly.

    NOTE(review): "recorbatch" looks like a typo for "recordbatch"; kept to
    preserve the test id referenced in history/CI selections.
    """
    leg_counts = pa.array([2, 2, 4, 4, 5, 100])
    animal_names = pa.array(
        ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]
    )
    batch = pa.RecordBatch.from_arrays(
        [leg_counts, animal_names], names=["n_legs", "animals"]
    )
    expected = pl.DataFrame({"n_legs": leg_counts, "animals": animal_names})
    assert_frame_equal(pl.DataFrame(batch), expected)
978
979
980
def test_from_arrow_map_containing_timestamp_23658() -> None:
    """Maps with timestamp values inside list<struct> convert correctly (23658).

    The Map type becomes List(Struct{key, value}); null map values and empty
    maps must be preserved.
    """
    arrow_tbl = pa.Table.from_pydict(
        {
            "column_1": [
                [
                    {
                        "field_1": [
                            {"key": 1, "value": datetime(2025, 1, 1)},
                            {"key": 2, "value": datetime(2025, 1, 2)},
                            {"key": 2, "value": None},
                        ]
                    },
                    {"field_1": []},
                    None,
                ]
            ],
        },
        schema=pa.schema(
            [
                (
                    "column_1",
                    pa.list_(
                        pa.struct(
                            [
                                ("field_1", pa.map_(pa.int32(), pa.timestamp("ms"))),
                            ]
                        )
                    ),
                )
            ]
        ),
    )

    expect = pl.DataFrame(
        {
            "column_1": [
                [
                    {
                        "field_1": [
                            {"key": 1, "value": datetime(2025, 1, 1)},
                            {"key": 2, "value": datetime(2025, 1, 2)},
                            {"key": 2, "value": None},
                        ]
                    },
                    {"field_1": []},
                    None,
                ]
            ],
        },
        schema={
            "column_1": pl.List(
                pl.Struct(
                    {
                        "field_1": pl.List(
                            pl.Struct({"key": pl.Int32, "value": pl.Datetime("ms")})
                        )
                    }
                )
            )
        },
    )

    out = pl.DataFrame(arrow_tbl)

    assert_frame_equal(out, expect)
1045
1046
1047
def test_schema_constructor_from_schema_capsule() -> None:
    """pl.Schema accepts Arrow schemas/fields and rejects duplicates."""
    arrow_schema = pa.schema(
        [pa.field("test", pa.map_(pa.int32(), pa.timestamp("ms")))]
    )

    # Map converts to List(Struct{key, value})
    assert pl.Schema(arrow_schema) == {
        "test": pl.List(pl.Struct({"key": pl.Int32, "value": pl.Datetime("ms")}))
    }

    # duplicate names inside one Arrow schema are an error
    arrow_schema = pa.schema([pa.field("a", pa.int32()), pa.field("a", pa.int32())])

    with pytest.raises(
        DuplicateError,
        match="arrow schema contained duplicate name: a",
    ):
        pl.Schema(arrow_schema)

    # a single field is not a struct-typed schema source
    with pytest.raises(
        ValueError,
        match="object passed to pl.Schema did not return struct dtype: object: pyarrow.Field<a: int32>, dtype: Int32",
    ):
        pl.Schema(pa.field("a", pa.int32()))

    # an iterable of fields works...
    assert pl.Schema([pa.field("a", pa.int32()), pa.field("b", pa.string())]) == {
        "a": pl.Int32,
        "b": pl.String,
    }

    # ...unless it contains duplicate names
    with pytest.raises(
        DuplicateError,
        match="iterable passed to pl.Schema contained duplicate name 'a'",
    ):
        pl.Schema([pa.field("a", pa.int32()), pa.field("a", pa.int64())])
1080
1081
1082
def test_to_arrow_24142() -> None:
    """Object-dtype columns must not crash to_arrow at the oldest compat level."""
    frame = pl.DataFrame({"a": object(), "b": "any string or bytes"})
    frame.to_arrow(compat_level=CompatLevel.oldest())
1085
1086
1087
def test_comprehensive_pycapsule_interface() -> None:
    """Test all data types via Arrow C Stream PyCapsule interface."""
    from datetime import date, datetime, time, timedelta
    from decimal import Decimal

    # minimal wrapper exposing ONLY __arrow_c_stream__, so conversion cannot
    # take any polars-specific fast path
    class PyCapsuleStreamWrap:
        def __init__(self, v: Any) -> None:
            self.capsule = v.__arrow_c_stream__()

        def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
            return self.capsule

    def roundtrip_series_pycapsule(s: pl.Series) -> pl.Series:
        # export the series as a capsule and re-import it
        return pl.Series(PyCapsuleStreamWrap(s))

    # one column per supported dtype, each with a null where the dtype allows
    df = pl.DataFrame(
        {
            "bool": [True, False, None],
            "int8": pl.Series([1, 2, None], dtype=pl.Int8),
            "int16": pl.Series([1, 2, None], dtype=pl.Int16),
            "int32": pl.Series([1, 2, None], dtype=pl.Int32),
            "int64": pl.Series([1, 2, None], dtype=pl.Int64),
            "uint8": pl.Series([1, 2, None], dtype=pl.UInt8),
            "uint16": pl.Series([1, 2, None], dtype=pl.UInt16),
            "uint32": pl.Series([1, 2, None], dtype=pl.UInt32),
            "uint64": pl.Series([1, 2, None], dtype=pl.UInt64),
            "float32": pl.Series([1.1, 2.2, None], dtype=pl.Float32),
            "float64": pl.Series([1.1, 2.2, None], dtype=pl.Float64),
            "string": ["hello", "world", None],
            "binary": [b"hello", b"world", None],
            "decimal": pl.Series(
                [Decimal("1.23"), Decimal("4.56"), None], dtype=pl.Decimal(10, 2)
            ),
            "date": [date(2023, 1, 1), date(2023, 1, 2), None],
            "datetime": [
                datetime(2023, 1, 1, 12, 0),
                datetime(2023, 1, 2, 13, 30),
                None,
            ],
            "time": [time(12, 0, 0), time(13, 30, 0), None],
            "duration_us": pl.Series(
                [timedelta(days=1), timedelta(hours=2), None], dtype=pl.Duration("us")
            ),
            "duration_ms": pl.Series(
                [timedelta(milliseconds=100), timedelta(microseconds=500), None],
                dtype=pl.Duration("ms"),
            ),
            "duration_ns": pl.Series(
                [timedelta(seconds=1), timedelta(microseconds=1000), None],
                dtype=pl.Duration("ns"),
            ),
            "categorical": pl.Series(
                ["apple", "banana", "apple"], dtype=pl.Categorical
            ),
            "list_duration": [
                [timedelta(days=1), timedelta(hours=2)],
                [timedelta(minutes=30)],
                None,
            ],
            "struct_with_duration": [
                {"x": timedelta(days=1), "y": 1},
                {"x": timedelta(hours=2), "y": 2},
                None,
            ],
        }
    ).cast(
        # fix the nested dtypes explicitly so the roundtrip comparison is exact
        {
            "list_duration": pl.List(pl.Duration("us")),
            "struct_with_duration": pl.Struct({"x": pl.Duration("ns"), "y": pl.Int32}),
        }
    )

    # per-column roundtrip through the capsule interface
    df_roundtrip = df.map_columns(pl.selectors.all(), roundtrip_series_pycapsule)

    assert_frame_equal(df_roundtrip, df)

    # whole-frame roundtrip through the capsule interface
    df_roundtrip_direct = pl.DataFrame(PyCapsuleStreamWrap(df))

    assert_frame_equal(df_roundtrip_direct, df)
1166
1167