# GitHub repository: pola-rs/polars
# Path: blob/main/py-polars/tests/unit/interop/test_interop.py
from __future__ import annotations

import io
from datetime import date, datetime, time, timedelta, timezone
from typing import TYPE_CHECKING, Any, cast

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pytest

import polars as pl
from polars.exceptions import (
    ComputeError,
    DuplicateError,
    InvalidOperationError,
    PanicException,
    UnstableWarning,
)
from polars.interchange.protocol import CompatLevel
from polars.testing import assert_frame_equal, assert_series_equal
from tests.unit.utils.pycapsule_utils import PyCapsuleStreamHolder

if TYPE_CHECKING:
    from tests.conftest import PlMonkeyPatch


def test_arrow_list_roundtrip() -> None:
    # https://github.com/pola-rs/polars/issues/1064
    tbl = pa.table({"a": [1], "b": [[1, 2]]})
    arw = pl.from_arrow(tbl).to_arrow()

    assert arw.shape == tbl.shape
    assert arw.schema.names == tbl.schema.names
    for c1, c2 in zip(arw.columns, tbl.columns, strict=True):
        assert c1.to_pylist() == c2.to_pylist()


def test_arrow_null_roundtrip() -> None:
    tbl = pa.table({"a": [None, None], "b": [[None, None], [None, None]]})
    df = pl.from_arrow(tbl)

    if isinstance(df, pl.DataFrame):
        assert df.dtypes == [pl.Null, pl.List(pl.Null)]

    arw = df.to_arrow()

    assert arw.shape == tbl.shape
    assert arw.schema.names == tbl.schema.names
    for c1, c2 in zip(arw.columns, tbl.columns, strict=True):
        assert c1.to_pylist() == c2.to_pylist()
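

# Editorial note (not part of the upstream suite): the two tests above lean on
# `pl.from_arrow` / `to_arrow` preserving logical values through a round-trip.
# A minimal self-contained sketch of that contract, assuming the module-level
# `pa`/`pl` imports; the `_demo_*` name is a hypothetical addition.
def _demo_arrow_roundtrip_sketch() -> None:
    tbl = pa.table({"ints": [1, None, 3], "nested": [[1], [], None]})
    round_tripped = pl.from_arrow(tbl).to_arrow()
    assert round_tripped.to_pylist() == tbl.to_pylist()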


def test_arrow_empty_dataframe() -> None:
    # 0x0 dataframe
    df = pl.DataFrame({})
    tbl = pa.table({})
    assert df.to_arrow() == tbl
    df2 = cast("pl.DataFrame", pl.from_arrow(df.to_arrow()))
    assert_frame_equal(df2, df)

    # 0 row dataframe
    df = pl.DataFrame({}, schema={"a": pl.Int32})
    tbl = pa.Table.from_batches([], pa.schema([pa.field("a", pa.int32())]))
    assert df.to_arrow() == tbl
    df2 = cast("pl.DataFrame", pl.from_arrow(df.to_arrow()))
    assert df2.schema == {"a": pl.Int32}
    assert df2.shape == (0, 1)


def test_arrow_dict_to_polars() -> None:
    pa_dict = pa.DictionaryArray.from_arrays(
        indices=np.array([0, 1, 2, 3, 1, 0, 2, 3, 3, 2]),
        dictionary=np.array(["AAA", "BBB", "CCC", "DDD"]),
    ).cast(pa.large_utf8())

    s = pl.Series(
        name="pa_dict",
        values=["AAA", "BBB", "CCC", "DDD", "BBB", "AAA", "CCC", "DDD", "DDD", "CCC"],
    )
    assert_series_equal(s, pl.Series("pa_dict", pa_dict))


def test_arrow_list_chunked_array() -> None:
    a = pa.array([[1, 2], [3, 4]])
    ca = pa.chunked_array([a, a, a])
    s = cast("pl.Series", pl.from_arrow(ca))
    assert s.dtype == pl.List
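

# Editorial sketch (not upstream): `pl.from_arrow` on a chunked array yields a
# single Series; with `rechunk=False` the original chunk layout survives and is
# visible via `Series.n_chunks`. Assumes the module-level imports.
def _demo_chunked_array_sketch() -> None:
    ca = pa.chunked_array([[1, 2], [3, 4]])
    s = cast("pl.Series", pl.from_arrow(ca, rechunk=False))
    assert s.n_chunks() == 2
    assert s.to_list() == [1, 2, 3, 4]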


# Test that polars converts Arrays of logical types correctly to arrow
def test_arrow_array_logical() -> None:
    # cast to large string and uint8 indices because polars converts to those
    pa_data1 = (
        pa.array(["a", "b", "c", "d"])
        .dictionary_encode()
        .cast(pa.dictionary(pa.uint8(), pa.large_string()))
    )
    pa_array_logical1 = pa.FixedSizeListArray.from_arrays(pa_data1, 2)

    s1 = pl.Series(
        values=[["a", "b"], ["c", "d"]],
        dtype=pl.Array(pl.Enum(["a", "b", "c", "d"]), shape=2),
    )
    assert s1.to_arrow() == pa_array_logical1

    pa_data2 = pa.array([date(2024, 1, 1), date(2024, 1, 2)])
    pa_array_logical2 = pa.FixedSizeListArray.from_arrays(pa_data2, 1)

    s2 = pl.Series(
        values=[[date(2024, 1, 1)], [date(2024, 1, 2)]],
        dtype=pl.Array(pl.Date, shape=1),
    )
    assert s2.to_arrow() == pa_array_logical2


def test_from_dict() -> None:
    data = {"a": [1, 2], "b": [3, 4]}
    df = pl.from_dict(data)
    assert df.shape == (2, 2)
    for s1, s2 in zip(
        list(df), [pl.Series("a", [1, 2]), pl.Series("b", [3, 4])], strict=True
    ):
        assert_series_equal(s1, s2)


def test_from_dict_struct() -> None:
    data: dict[str, dict[str, list[int]] | list[int]] = {
        "a": {"b": [1, 3], "c": [2, 4]},
        "d": [5, 6],
    }
    df = pl.from_dict(data)
    assert df.shape == (2, 2)
    assert df["a"][0] == {"b": 1, "c": 2}
    assert df["a"][1] == {"b": 3, "c": 4}
    assert df.schema == {"a": pl.Struct({"b": pl.Int64, "c": pl.Int64}), "d": pl.Int64}


def test_from_dicts() -> None:
    data = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": None}]
    df = pl.from_dicts(data)  # type: ignore[arg-type]
    assert df.shape == (3, 2)
    assert df.rows() == [(1, 4), (2, 5), (3, None)]
    assert df.schema == {"a": pl.Int64, "b": pl.Int64}


def test_from_dict_no_inference() -> None:
    schema = {"a": pl.String}
    data = [{"a": "aa"}]
    df = pl.from_dicts(data, schema_overrides=schema, infer_schema_length=0)
    assert df.schema == schema
    assert df.to_dicts() == data
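

# Editorial sketch (not upstream): `schema_overrides` pins chosen dtypes while
# the remaining columns are still inferred; `infer_schema_length=0` disables
# inference entirely, as exercised above. Assumes module-level imports.
def _demo_schema_override_sketch() -> None:
    df = pl.from_dicts([{"a": 1, "b": "x"}], schema_overrides={"a": pl.UInt8})
    assert df.schema == {"a": pl.UInt8, "b": pl.String}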


def test_from_dicts_schema_override() -> None:
    schema = {
        "a": pl.String,
        "b": pl.Int64,
        "c": pl.List(pl.Struct({"x": pl.Int64, "y": pl.String, "z": pl.Float64})),
    }

    # initial data matches the expected schema
    data1 = [
        {
            "a": "l",
            "b": i,
            "c": [{"x": (j + 2), "y": "?", "z": (j % 2)} for j in range(2)],
        }
        for i in range(5)
    ]

    # extend with a mix of fields that are/are not in the schema
    data2 = [{"b": i + 5, "d": "ABC", "e": "DEF"} for i in range(5)]

    for n_infer in (0, 3, 5, 8, 10, 100):
        df = pl.DataFrame(
            data=(data1 + data2),
            schema=schema,  # type: ignore[arg-type]
            infer_schema_length=n_infer,
        )
        assert df.schema == schema
        assert df.rows() == [
            ("l", 0, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 1, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 2, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 3, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 4, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            (None, 5, None),
            (None, 6, None),
            (None, 7, None),
            (None, 8, None),
            (None, 9, None),
        ]


def test_from_dicts_struct() -> None:
    data = [{"a": {"b": 1, "c": 2}, "d": 5}, {"a": {"b": 3, "c": 4}, "d": 6}]
    df = pl.from_dicts(data)
    assert df.shape == (2, 2)
    assert df["a"][0] == {"b": 1, "c": 2}
    assert df["a"][1] == {"b": 3, "c": 4}

    # https://github.com/pola-rs/polars/issues/5649
    assert pl.from_dicts([{"a": [{"x": 1}]}, {"a": [{"y": 1}]}]).to_dict(
        as_series=False
    ) == {"a": [[{"y": None, "x": 1}], [{"y": 1, "x": None}]]}
    assert pl.from_dicts([{"a": [{"x": 1}, {"y": 2}]}, {"a": [{"y": 1}]}]).to_dict(
        as_series=False
    ) == {"a": [[{"y": None, "x": 1}, {"y": 2, "x": None}], [{"y": 1, "x": None}]]}


def test_from_records() -> None:
    data = [[1, 2, 3], [4, 5, 6]]
    df = pl.from_records(data, schema=["a", "b"])
    assert df.shape == (3, 2)
    assert df.rows() == [(1, 4), (2, 5), (3, 6)]
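

# Editorial sketch (not upstream): `from_records` reads the outer sequence as
# columns by default, hence shape (3, 2) above; `orient="row"` treats each
# inner sequence as a row instead. Assumes module-level imports.
def _demo_from_records_orient_sketch() -> None:
    rows = [[1, 2, 3], [4, 5, 6]]
    assert pl.from_records(rows, schema=["a", "b"]).shape == (3, 2)
    assert pl.from_records(rows, schema=["a", "b", "c"], orient="row").shape == (2, 3)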


# https://github.com/pola-rs/polars/issues/15195
@pytest.mark.parametrize(
    "input",
    [
        pl.Series([1, 2]),
        pl.Series([{"a": 1, "b": 2}]),
        pl.DataFrame({"a": [1, 2], "b": [3, 4]}),
    ],
)
def test_from_records_non_sequence_input(input: Any) -> None:
    with pytest.raises(TypeError, match="expected data of type Sequence"):
        pl.from_records(input)


def test_from_arrow() -> None:
    data = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
    df = pl.from_arrow(data)
    assert df.shape == (3, 2)
    assert df.rows() == [(1, 4), (2, 5), (3, 6)]  # type: ignore[union-attr]

    # if not a PyArrow type, raise a TypeError
    with pytest.raises(TypeError):
        _ = pl.from_arrow([1, 2])

    df = pl.from_arrow(
        data, schema=["a", "b"], schema_overrides={"a": pl.UInt32, "b": pl.UInt64}
    )
    assert df.rows() == [(1, 4), (2, 5), (3, 6)]  # type: ignore[union-attr]
    assert df.schema == {"a": pl.UInt32, "b": pl.UInt64}  # type: ignore[union-attr]


def test_from_arrow_with_bigquery_metadata() -> None:
    arrow_schema = pa.schema(
        [
            pa.field("id", pa.int64()).with_metadata(
                {"ARROW:extension:name": "google:sqlType:integer"}
            ),
            pa.field(
                "misc",
                pa.struct([("num", pa.int32()), ("val", pa.string())]),
            ).with_metadata({"ARROW:extension:name": "google:sqlType:struct"}),
        ]
    )
    arrow_tbl = pa.Table.from_pylist(
        [{"id": 1, "misc": None}, {"id": 2, "misc": None}],
        schema=arrow_schema,
    )

    expected_data = {"id": [1, 2], "num": [None, None], "val": [None, None]}
    expected_schema = {"id": pl.Int64, "num": pl.Int32, "val": pl.String}
    assert_frame_equal(
        pl.DataFrame(expected_data, schema=expected_schema),
        pl.from_arrow(arrow_tbl).unnest("misc"),  # type: ignore[union-attr]
    )
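

# Editorial sketch (not upstream): `unnest` is what flattens the struct column
# above into top-level fields. A minimal standalone illustration:
def _demo_unnest_sketch() -> None:
    df = pl.DataFrame({"id": [1], "misc": [{"num": 7, "val": "x"}]})
    assert df.unnest("misc").columns == ["id", "num", "val"]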


def test_from_optional_not_available() -> None:
    from polars._dependencies import _LazyModule

    # proxy module is created dynamically if the required module is not available
    # (see the polars._dependencies source code for additional detail/comments)

    np = _LazyModule("numpy", module_available=False)
    with pytest.raises(ImportError, match=r"np\.array requires 'numpy'"):
        pl.from_numpy(np.array([[1, 2], [3, 4]]), schema=["a", "b"])

    pa = _LazyModule("pyarrow", module_available=False)
    with pytest.raises(ImportError, match=r"pa\.table requires 'pyarrow'"):
        pl.from_arrow(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))

    pd = _LazyModule("pandas", module_available=False)
    with pytest.raises(ImportError, match=r"pd\.Series requires 'pandas'"):
        pl.from_pandas(pd.Series([1, 2, 3]))


def test_upcast_pyarrow_dicts() -> None:
    # https://github.com/pola-rs/polars/issues/1752
    tbls = [
        pa.table(
            {
                "col_name": pa.array(
                    [f"value_{i}"], pa.dictionary(pa.int8(), pa.string())
                )
            }
        )
        for i in range(128)
    ]

    tbl = pa.concat_tables(tbls, promote_options="default")
    out = cast("pl.DataFrame", pl.from_arrow(tbl))
    assert out.shape == (128, 1)
    assert out["col_name"][0] == "value_0"
    assert out["col_name"][127] == "value_127"


def test_no_rechunk() -> None:
    table = pa.Table.from_pydict({"x": pa.chunked_array([list("ab"), list("cd")])})
    # table
    assert pl.from_arrow(table, rechunk=False).n_chunks() == 2
    # chunked array
    assert pl.from_arrow(table["x"], rechunk=False).n_chunks() == 2
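

# Editorial sketch (not upstream): with the default `rechunk=True`, polars
# concatenates incoming chunks into contiguous memory, so the count drops to 1.
def _demo_rechunk_sketch() -> None:
    ca = pa.chunked_array([["a"], ["b"]])
    s = cast("pl.Series", pl.from_arrow(ca, rechunk=True))
    assert s.n_chunks() == 1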


def test_from_empty_arrow() -> None:
    df = cast("pl.DataFrame", pl.from_arrow(pa.table(pd.DataFrame({"a": [], "b": []}))))
    assert df.columns == ["a", "b"]
    assert df.dtypes == [pl.Float64, pl.Float64]

    # https://github.com/pola-rs/polars/issues/2705
    df1 = pd.DataFrame(columns=["b"], dtype=float, index=pd.Index([]))
    tbl = pa.Table.from_pandas(df1)
    out = cast("pl.DataFrame", pl.from_arrow(tbl))
    assert out.columns == ["b", "__index_level_0__"]
    assert out.dtypes == [pl.Float64, pl.Null]
    tbl = pa.Table.from_pandas(df1, preserve_index=False)
    out = cast("pl.DataFrame", pl.from_arrow(tbl))
    assert out.columns == ["b"]
    assert out.dtypes == [pl.Float64]

    # https://github.com/pola-rs/polars/issues/4568
    tbl = pa.table({"l": []}, schema=pa.schema([("l", pa.large_list(pa.uint8()))]))

    df = cast("pl.DataFrame", pl.from_arrow(tbl))
    assert df.schema["l"] == pl.List(pl.UInt8)


def test_cat_int_types_3500() -> None:
    # Create an enum / categorical / dictionary typed pyarrow array.
    # Most simply done by creating a pandas categorical series first.
    categorical_s = pd.Series(["a", "a", "b"], dtype="category")
    pyarrow_array = pa.Array.from_pandas(categorical_s)

    # The in-memory representation of each category can either be a signed or
    # unsigned 8-bit integer. Pandas uses Int8...
    int_dict_type = pa.dictionary(index_type=pa.int8(), value_type=pa.utf8())
    # ... while DuckDB uses UInt8
    uint_dict_type = pa.dictionary(index_type=pa.uint8(), value_type=pa.utf8())

    for t in [int_dict_type, uint_dict_type]:
        s = cast("pl.Series", pl.from_arrow(pyarrow_array.cast(t)))
        assert_series_equal(
            s, pl.Series(["a", "a", "b"]).cast(pl.Categorical), check_names=False
        )


def test_from_pyarrow_chunked_array() -> None:
    column = pa.chunked_array([[1], [2]])
    series = pl.Series("column", column)
    assert series.to_list() == [1, 2]


def test_arrow_list_null_5697() -> None:
    # Create a pyarrow table with a list[null] column.
    pa_table = pa.table([[[None]]], names=["mycol"])
    df = pl.from_arrow(pa_table)
    pa_table = df.to_arrow()
    # back to polars, to check that the schema round-trips
    assert pl.from_arrow(pa_table).schema == {"mycol": pl.List(pl.Null)}  # type: ignore[union-attr]


def test_from_pyarrow_map() -> None:
    pa_table = pa.table(
        [[1, 2], [[("a", "something")], [("a", "else"), ("b", "another key")]]],
        schema=pa.schema(
            [("idx", pa.int16()), ("mapping", pa.map_(pa.string(), pa.string()))]
        ),
    )

    # Convert from an empty table to trigger an ArrowSchema -> native schema
    # conversion (checks that ArrowDataType::Map is handled in Rust).
    pl.DataFrame(pa_table.slice(0, 0))

    result = pl.DataFrame(pa_table)
    assert result.to_dict(as_series=False) == {
        "idx": [1, 2],
        "mapping": [
            [{"key": "a", "value": "something"}],
            [{"key": "a", "value": "else"}, {"key": "b", "value": "another key"}],
        ],
    }


def test_from_fixed_size_binary_list() -> None:
    val = [[b"63A0B1C66575DD5708E1EB2B"]]
    arrow_array = pa.array(val, type=pa.list_(pa.binary(24)))
    s = cast("pl.Series", pl.from_arrow(arrow_array))
    assert s.dtype == pl.List(pl.Binary)
    assert s.to_list() == val


def test_dataframe_from_repr() -> None:
    # round-trip various types
    frame = (
        pl.LazyFrame(
            {
                "a": [1, 2, None],
                "b": [4.5, 5.23e13, -3.12e12],
                "c": ["x", "y", "z"],
                "d": [True, False, True],
                "e": [None, "", None],
                "f": [date(2022, 7, 5), date(2023, 2, 5), date(2023, 8, 5)],
                "g": [time(0, 0, 0, 1), time(12, 30, 45), time(23, 59, 59, 999000)],
                "h": [
                    datetime(2022, 7, 5, 10, 30, 45, 4560),
                    datetime(2023, 10, 12, 20, 3, 8, 11),
                    None,
                ],
            },
        )
        .with_columns(
            pl.col("c").cast(pl.Categorical),
            pl.col("h").cast(pl.Datetime("ns")),
        )
        .collect()
    )

    assert frame.schema == {
        "a": pl.Int64,
        "b": pl.Float64,
        "c": pl.Categorical(),
        "d": pl.Boolean,
        "e": pl.String,
        "f": pl.Date,
        "g": pl.Time,
        "h": pl.Datetime("ns"),
    }
    df = cast("pl.DataFrame", pl.from_repr(repr(frame)))
    assert_frame_equal(frame, df)

    # empty frame; confirm schema is inferred
    df = cast(
        "pl.DataFrame",
        pl.from_repr(
            """
            ┌─────┬─────┬─────┬─────┬─────┬───────┐
            │ id  ┆ q1  ┆ q2  ┆ q3  ┆ q4  ┆ total │
            │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ ---   │
            │ str ┆ i8  ┆ i16 ┆ i32 ┆ i64 ┆ f64   │
            ╞═════╪═════╪═════╪═════╪═════╪═══════╡
            └─────┴─────┴─────┴─────┴─────┴───────┘
            """
        ),
    )
    assert df.shape == (0, 6)
    assert df.rows() == []
    assert df.schema == {
        "id": pl.String,
        "q1": pl.Int8,
        "q2": pl.Int16,
        "q3": pl.Int32,
        "q4": pl.Int64,
        "total": pl.Float64,
    }

    # empty frame with no dtypes
    df = cast(
        "pl.DataFrame",
        pl.from_repr(
            """
            ┌──────┬───────┐
            │ misc ┆ other │
            ╞══════╪═══════╡
            └──────┴───────┘
            """
        ),
    )
    assert_frame_equal(df, pl.DataFrame(schema={"misc": pl.String, "other": pl.String}))

    # empty frame with a non-standard/blank 'null' in numeric col
    df = cast(
        "pl.DataFrame",
        pl.from_repr(
            """
            ┌─────┬──────┐
            │ c1  ┆ c2   │
            │ --- ┆ ---  │
            │ i32 ┆ f64  │
            ╞═════╪══════╡
            │     ┆ NULL │
            └─────┴──────┘
            """
        ),
    )
    assert_frame_equal(
        df,
        pl.DataFrame(
            data=[(None, None)],
            schema={"c1": pl.Int32, "c2": pl.Float64},
            orient="row",
        ),
    )

    df = cast(
        "pl.DataFrame",
        pl.from_repr(
            """
            # >>> Missing cols with old-style ellipsis, nulls, commented out
            # ┌────────────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬──────┐
            # │ dt         ┆ c1  ┆ c2  ┆ c3  ┆ ... ┆ c96 ┆ c97 ┆ c98 ┆ c99  │
            # │ ---        ┆ --- ┆ --- ┆ --- ┆     ┆ --- ┆ --- ┆ --- ┆ ---  │
            # │ date       ┆ i32 ┆ i32 ┆ i32 ┆     ┆ i64 ┆ i64 ┆ i64 ┆ i64  │
            # ╞════════════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪══════╡
            # │ 2023-03-25 ┆ 1   ┆ 2   ┆ 3   ┆ ... ┆ 96  ┆ 97  ┆ 98  ┆ 99   │
            # │ 1999-12-31 ┆ 3   ┆ 6   ┆ 9   ┆ ... ┆ 288 ┆ 291 ┆ 294 ┆ null │
            # │ null       ┆ 9   ┆ 18  ┆ 27  ┆ ... ┆ 864 ┆ 873 ┆ 882 ┆ 891  │
            # └────────────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴──────┘
            """
        ),
    )
    assert df.schema == {
        "dt": pl.Date,
        "c1": pl.Int32,
        "c2": pl.Int32,
        "c3": pl.Int32,
        "c96": pl.Int64,
        "c97": pl.Int64,
        "c98": pl.Int64,
        "c99": pl.Int64,
    }
    assert df.rows() == [
        (date(2023, 3, 25), 1, 2, 3, 96, 97, 98, 99),
        (date(1999, 12, 31), 3, 6, 9, 288, 291, 294, None),
        (None, 9, 18, 27, 864, 873, 882, 891),
    ]

    df = cast(
        "pl.DataFrame",
        pl.from_repr(
            """
            # >>> no dtypes:
            # ┌────────────┬──────┐
            # │ dt         ┆ c99  │
            # ╞════════════╪══════╡
            # │ 2023-03-25 ┆ 99   │
            # │ 1999-12-31 ┆ null │
            # │ null       ┆ 891  │
            # └────────────┴──────┘
            """
        ),
    )
    assert df.schema == {"dt": pl.Date, "c99": pl.Int64}
    assert df.rows() == [
        (date(2023, 3, 25), 99),
        (date(1999, 12, 31), None),
        (None, 891),
    ]

    df = cast(
        "pl.DataFrame",
        pl.from_repr(
            """
            In [2]: with pl.Config() as cfg:
               ...:     pl.Config.set_tbl_formatting("UTF8_FULL", rounded_corners=True)
               ...:     print(df)
               ...:
            shape: (1, 5)
            ╭───────────┬────────────┬───┬───────┬────────────────────────────────╮
            │ source_ac ┆ source_cha ┆ … ┆ ident ┆ timestamp                      │
            │ tor_id    ┆ nnel_id    ┆   ┆ ---   ┆ ---                            │
            │ ---       ┆ ---        ┆   ┆ str   ┆ datetime[μs, Asia/Tokyo]       │
            │ i32       ┆ i64        ┆   ┆       ┆                                │
            ╞═══════════╪════════════╪═══╪═══════╪════════════════════════════════╡
            │ 123456780 ┆ 9876543210 ┆ … ┆ a:b:c ┆ 2023-03-25 10:56:59.663053 JST │
            ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
            │ …         ┆ …          ┆ … ┆ …     ┆ …                              │
            ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
            │ 803065983 ┆ 2055938745 ┆ … ┆ x:y:z ┆ 2023-03-25 12:38:18.050545 JST │
            ╰───────────┴────────────┴───┴───────┴────────────────────────────────╯
            # "Een fluitje van een cent..." :)
            """
        ),
    )
    assert df.shape == (2, 4)
    assert df.schema == {
        "source_actor_id": pl.Int32,
        "source_channel_id": pl.Int64,
        "ident": pl.String,
        "timestamp": pl.Datetime("us", "Asia/Tokyo"),
    }
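

# Editorial sketch (not upstream): the essence of the test above is that a
# DataFrame's printed repr is itself parseable, i.e. `repr` round-trips.
def _demo_from_repr_sketch() -> None:
    df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    assert_frame_equal(cast("pl.DataFrame", pl.from_repr(repr(df))), df)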


def test_dataframe_from_repr_24110() -> None:
    df = cast(
        "pl.DataFrame",
        pl.from_repr("""
            shape: (7, 1)
            ┌──────────────┐
            │ time_offset  │
            │ ---          │
            │ duration[μs] │
            ╞══════════════╡
            │ -2h          │
            │ 0µs          │
            │ 2h           │
            │ +2h          │
            └──────────────┘
        """),
    )
    expected = pl.DataFrame(
        {
            "time_offset": [
                timedelta(hours=-2),
                timedelta(),
                timedelta(hours=2),
                timedelta(hours=2),
            ]
        },
        schema={"time_offset": pl.Duration("us")},
    )
    assert_frame_equal(df, expected)


def test_dataframe_from_duckdb_repr() -> None:
    df = cast(
        "pl.DataFrame",
        pl.from_repr(
            """
            # misc streaming stats
            ┌────────────┬───────┬───────────────────┬───┬────────────────┬───────────────────┐
            │   As Of    │ Rank  │ Year to Date Rank │ … │ Days In Top 10 │ Streaming Seconds │
            │    date    │ int32 │      varchar      │   │     int16      │      int128       │
            ├────────────┼───────┼───────────────────┼───┼────────────────┼───────────────────┤
            │ 2025-05-09 │     1 │ 1                 │ … │             29 │  1864939402857430 │
            │ 2025-05-09 │     2 │ 2                 │ … │             15 │   658937443590045 │
            │ 2025-05-09 │     3 │ 3                 │ … │              9 │   267876522242076 │
            └────────────┴───────┴───────────────────┴───┴────────────────┴───────────────────┘
            """
        ),
    )
    expected = pl.DataFrame(
        {
            "As Of": [date(2025, 5, 9), date(2025, 5, 9), date(2025, 5, 9)],
            "Rank": [1, 2, 3],
            "Year to Date Rank": ["1", "2", "3"],
            "Days In Top 10": [29, 15, 9],
            "Streaming Seconds": [1864939402857430, 658937443590045, 267876522242076],
        },
        schema={
            "As Of": pl.Date,
            "Rank": pl.Int32,
            "Year to Date Rank": pl.String,
            "Days In Top 10": pl.Int16,
            "Streaming Seconds": pl.Int128,
        },
    )
    assert_frame_equal(expected, df)


def test_series_from_repr() -> None:
    frame = (
        pl.LazyFrame(
            {
                "a": [1, 2, None],
                "b": [4.5, 5.5, 6.5],
                "c": ["x", "y", "z"],
                "d": [True, False, True],
                "e": [None, "", None],
                "f": [date(2022, 7, 5), date(2023, 2, 5), date(2023, 8, 5)],
                "g": [time(0, 0, 0, 1), time(12, 30, 45), time(23, 59, 59, 999000)],
                "h": [
                    datetime(2022, 7, 5, 10, 30, 45, 4560),
                    datetime(2023, 10, 12, 20, 3, 8, 11),
                    None,
                ],
            },
        )
        .with_columns(
            pl.col("c").cast(pl.Categorical),
            pl.col("h").cast(pl.Datetime("ns")),
        )
        .collect()
    )

    for col in frame.columns:
        s = cast("pl.Series", pl.from_repr(repr(frame[col])))
        assert_series_equal(s, frame[col])

    s = cast(
        "pl.Series",
        pl.from_repr(
            """
            Out[3]:
            shape: (3,)
            Series: 's' [str]
            [
                "a"

                "c"
            ]
            """
        ),
    )
    assert_series_equal(s, pl.Series("s", ["a", "c"]))

    s = cast(
        "pl.Series",
        pl.from_repr(
            """
            Series: 'flt' [f32]
            [
            ]
            """
        ),
    )
    assert_series_equal(s, pl.Series("flt", [], dtype=pl.Float32))

    s = cast(
        "pl.Series",
        pl.from_repr(
            """
            Series: 'flt' [f64]
            [
                null
                +inf
                -inf
                inf
                0.0
                NaN
            ]
            >>> print("stuff")
            """
        ),
    )
    inf, nan = float("inf"), float("nan")
    assert_series_equal(
        s,
        pl.Series(
            name="flt",
            dtype=pl.Float64,
            values=[None, inf, -inf, inf, 0.0, nan],
        ),
    )


def test_dataframe_from_repr_custom_separators() -> None:
    # repr created with custom digit-grouping
    # and non-default group/decimal separators
    df = cast(
        "pl.DataFrame",
        pl.from_repr(
            """
            ┌───────────┬────────────┐
            │ x         ┆ y          │
            │ ---       ┆ ---        │
            │ i32       ┆ f64        │
            ╞═══════════╪════════════╡
            │ 123.456   ┆ -10.000,55 │
            │ -9.876    ┆ 10,0       │
            │ 9.999.999 ┆ 8,5e8      │
            └───────────┴────────────┘
            """
        ),
    )
    assert_frame_equal(
        df,
        pl.DataFrame(
            {
                "x": [123456, -9876, 9999999],
                "y": [-10000.55, 10.0, 850000000.0],
            },
            schema={"x": pl.Int32, "y": pl.Float64},
        ),
    )


def test_sliced_struct_from_arrow() -> None:
    # Create a dataset with 3 rows
    tbl = pa.Table.from_arrays(
        arrays=[
            pa.StructArray.from_arrays(
                arrays=[
                    pa.array([1, 2, 3], pa.int32()),
                    pa.array(["foo", "bar", "baz"], pa.utf8()),
                ],
                names=["a", "b"],
            )
        ],
        names=["struct_col"],
    )

    # slice the table and check that the FFI correctly reads the sliced data
    result = cast("pl.DataFrame", pl.from_arrow(tbl.slice(1, 2)))
    assert result.to_dict(as_series=False) == {
        "struct_col": [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}]
    }

    result = cast("pl.DataFrame", pl.from_arrow(tbl.slice(1, 1)))
    assert result.to_dict(as_series=False) == {"struct_col": [{"a": 2, "b": "bar"}]}


def test_from_arrow_invalid_time_zone() -> None:
    arr = pa.array(
        [datetime(2021, 1, 1, 0, 0, 0, 0)],
        type=pa.timestamp("ns", tz="this-is-not-a-time-zone"),
    )
    with pytest.raises(
        ComputeError, match=r"unable to parse time zone: 'this-is-not-a-time-zone'"
    ):
        pl.from_arrow(arr)


@pytest.mark.parametrize(
    ("fixed_offset", "etc_tz"),
    [
        ("+10:00", "Etc/GMT-10"),
        ("10:00", "Etc/GMT-10"),
        ("-10:00", "Etc/GMT+10"),
        ("+05:00", "Etc/GMT-5"),
        ("05:00", "Etc/GMT-5"),
        ("-05:00", "Etc/GMT+5"),
    ],
)
def test_from_arrow_fixed_offset(fixed_offset: str, etc_tz: str) -> None:
    arr = pa.array(
        [datetime(2021, 1, 1, 0, 0, 0, 0)],
        type=pa.timestamp("us", tz=fixed_offset),
    )
    result = cast("pl.Series", pl.from_arrow(arr))
    expected = pl.Series(
        [datetime(2021, 1, 1, tzinfo=timezone.utc)]
    ).dt.convert_time_zone(etc_tz)
    assert_series_equal(result, expected)


def test_from_avro_valid_time_zone_13032() -> None:
    arr = pa.array(
        [datetime(2021, 1, 1, 0, 0, 0, 0)], type=pa.timestamp("ns", tz="00:00")
    )
    result = cast("pl.Series", pl.from_arrow(arr))
    expected = pl.Series([datetime(2021, 1, 1)], dtype=pl.Datetime("ns", "UTC"))
    assert_series_equal(result, expected)


def test_from_numpy_different_resolution_15991() -> None:
    result = pl.Series(
        np.array(["2020-01-01"], dtype="datetime64[ns]"), dtype=pl.Datetime("us")
    )
    expected = pl.Series([datetime(2020, 1, 1)], dtype=pl.Datetime("us"))
    assert_series_equal(result, expected)


def test_from_numpy_different_resolution_invalid() -> None:
    with pytest.raises(ValueError, match="Please cast"):
        pl.Series(
            np.array(["2020-01-01"], dtype="datetime64[s]"), dtype=pl.Datetime("us")
        )


def test_compat_level(plmonkeypatch: PlMonkeyPatch) -> None:
    # change these if the compat level is bumped
    plmonkeypatch.setenv("POLARS_WARN_UNSTABLE", "1")
    oldest = CompatLevel.oldest()
    assert oldest is CompatLevel.oldest()  # test singleton
    assert oldest._version == 0
    with pytest.warns(UnstableWarning):
        newest = CompatLevel.newest()
    with pytest.warns(UnstableWarning):
        assert newest is CompatLevel.newest()
    assert newest._version == 1

    str_col = pl.Series(["awd"])
    bin_col = pl.Series([b"dwa"])
    assert str_col._newest_compat_level() == newest._version
    assert isinstance(str_col.to_arrow(), pa.LargeStringArray)
    assert isinstance(str_col.to_arrow(compat_level=oldest), pa.LargeStringArray)
    assert isinstance(str_col.to_arrow(compat_level=newest), pa.StringViewArray)
    assert isinstance(bin_col.to_arrow(), pa.LargeBinaryArray)
    assert isinstance(bin_col.to_arrow(compat_level=oldest), pa.LargeBinaryArray)
    assert isinstance(bin_col.to_arrow(compat_level=newest), pa.BinaryViewArray)

    df = pl.DataFrame({"str_col": str_col, "bin_col": bin_col})
    assert isinstance(df.to_arrow()["str_col"][0], pa.LargeStringScalar)
    assert isinstance(
        df.to_arrow(compat_level=oldest)["str_col"][0], pa.LargeStringScalar
    )
    assert isinstance(
        df.to_arrow(compat_level=newest)["str_col"][0], pa.StringViewScalar
    )
    assert isinstance(df.to_arrow()["bin_col"][0], pa.LargeBinaryScalar)
    assert isinstance(
        df.to_arrow(compat_level=oldest)["bin_col"][0], pa.LargeBinaryScalar
    )
    assert isinstance(
        df.to_arrow(compat_level=newest)["bin_col"][0], pa.BinaryViewScalar
    )
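

# Editorial sketch (not upstream): the compat level selects the physical Arrow
# encoding (e.g. large_utf8 vs. string_view) without changing logical values.
def _demo_compat_level_sketch() -> None:
    s = pl.Series(["x"])
    old = s.to_arrow(compat_level=CompatLevel.oldest())
    assert old.to_pylist() == s.to_arrow().to_pylist() == ["x"]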


def test_df_pycapsule_interface() -> None:
    df = pl.DataFrame(
        {
            "a": [1, 2, 3],
            "b": ["a", "b", "c"],
            "c": ["fooooooooooooooooooooo", "bar", "looooooooooooooooong string"],
        }
    )

    capsule_df = PyCapsuleStreamHolder(df)
    out = pa.table(capsule_df)
    assert df.shape == out.shape
    assert df.schema.names() == out.schema.names

    schema_overrides = {"a": pl.Int128}
    expected_schema = pl.Schema([("a", pl.Int128), ("b", pl.String), ("c", pl.String)])

    for arrow_obj in (
        pl.from_arrow(capsule_df),  # capsule
        out,  # table loaded from capsule
    ):
        df_res = pl.from_arrow(arrow_obj, schema_overrides=schema_overrides)
        assert expected_schema == df_res.schema  # type: ignore[union-attr]
        assert isinstance(df_res, pl.DataFrame)
        assert df.equals(df_res)
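

# Editorial sketch (not upstream): polars DataFrames expose
# `__arrow_c_stream__` directly, so `pa.table` can consume one without any
# wrapper object. Assumes a pyarrow version with PyCapsule support (>= 14).
def _demo_c_stream_sketch() -> None:
    df = pl.DataFrame({"a": [1, 2]})
    assert pa.table(df).num_rows == 2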


def test_misaligned_nested_arrow_19097() -> None:
    a = pl.Series("a", [1, 2, 3])
    a = a.slice(1, 2)  # slicing gives the values an offset of 1
    a = a.replace(2, None)  # then we add a validity mask with offset=0
    a = a.reshape((2, 1))  # then we make it nested
    assert_series_equal(pl.Series("a", a.to_arrow()), a)


def test_arrow_roundtrip_lex_cat_20288() -> None:
    tb = pl.Series("a", ["A", "B"], pl.Categorical()).to_frame().to_arrow()
    df = pl.from_arrow(tb)
    assert isinstance(df, pl.DataFrame)
    dt = df.schema["a"]
    assert isinstance(dt, pl.Categorical)
    assert dt.ordering == "lexical"


def test_from_arrow_20271() -> None:
    df = pl.from_arrow(
        pa.table({"b": pa.DictionaryArray.from_arrays([0, 1], ["D", "E"])})
    )
    assert isinstance(df, pl.DataFrame)
    assert_series_equal(
        df.to_series(),
        pl.Series("b", ["D", "E"], pl.Categorical),
    )


def test_to_arrow_empty_chunks_20627() -> None:
    df = pl.concat(2 * [pl.Series([1])]).filter(pl.Series([False, True])).to_frame()
    assert df.to_arrow().shape == (1, 1)


def test_from_arrow_recordbatch() -> None:
    n_legs = pa.array([2, 2, 4, 4, 5, 100])
    animals = pa.array(
        ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]
    )
    names = ["n_legs", "animals"]
    record_batch = pa.RecordBatch.from_arrays([n_legs, animals], names=names)
    assert_frame_equal(
        pl.DataFrame(record_batch),
        pl.DataFrame({"n_legs": n_legs, "animals": animals}),
    )


def test_from_arrow_map_containing_timestamp_23658() -> None:
    arrow_tbl = pa.Table.from_pydict(
        {
            "column_1": [
                [
                    {
                        "field_1": [
                            {"key": 1, "value": datetime(2025, 1, 1)},
                            {"key": 2, "value": datetime(2025, 1, 2)},
                            {"key": 2, "value": None},
                        ]
                    },
                    {"field_1": []},
                    None,
                ]
            ],
        },
        schema=pa.schema(
            [
                (
                    "column_1",
                    pa.list_(
                        pa.struct(
                            [
                                ("field_1", pa.map_(pa.int32(), pa.timestamp("ms"))),
                            ]
                        )
                    ),
                )
            ]
        ),
    )

    expect = pl.DataFrame(
        {
            "column_1": [
                [
                    {
                        "field_1": [
                            {"key": 1, "value": datetime(2025, 1, 1)},
                            {"key": 2, "value": datetime(2025, 1, 2)},
                            {"key": 2, "value": None},
                        ]
                    },
                    {"field_1": []},
                    None,
                ]
            ],
        },
        schema={
            "column_1": pl.List(
                pl.Struct(
                    {
                        "field_1": pl.List(
                            pl.Struct({"key": pl.Int32, "value": pl.Datetime("ms")})
                        )
                    }
                )
            )
        },
    )

    out = pl.DataFrame(arrow_tbl)
    assert_frame_equal(out, expect)


def test_schema_constructor_from_schema_capsule() -> None:
    arrow_schema = pa.schema(
        [pa.field("test", pa.map_(pa.int32(), pa.timestamp("ms")))]
    )

    assert pl.Schema(arrow_schema) == {
        "test": pl.List(pl.Struct({"key": pl.Int32, "value": pl.Datetime("ms")}))
    }

    # Test __arrow_c_schema__ implementation on `pl.Schema`
    assert pa.schema(pl.Schema({"x": pl.Int32})) == pa.schema(
        [pa.field("x", pa.int32())]
    )

    arrow_schema = pa.schema([pa.field("a", pa.int32()), pa.field("a", pa.int32())])

    with pytest.raises(
        DuplicateError,
        match="arrow schema contained duplicate name: a",
    ):
        pl.Schema(arrow_schema)

    with pytest.raises(
        ValueError,
        match=r"object passed to pl.Schema did not return struct dtype: object: pyarrow\.Field<a: int32>, dtype: Int32",
    ):
        pl.Schema(pa.field("a", pa.int32()))

    assert pl.Schema([pa.field("a", pa.int32()), pa.field("b", pa.string())]) == {
        "a": pl.Int32,
        "b": pl.String,
    }

    with pytest.raises(
        DuplicateError,
        match=r"iterable passed to pl\.Schema contained duplicate name 'a'",
    ):
        pl.Schema([pa.field("a", pa.int32()), pa.field("a", pa.int64())])
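

# Editorial sketch (not upstream): a pyarrow schema is accepted by `pl.Schema`
# via `__arrow_c_schema__`, as the test above exercises end to end.
def _demo_schema_capsule_sketch() -> None:
    assert pl.Schema(pa.schema([pa.field("x", pa.int64())])) == {"x": pl.Int64}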


def test_to_arrow_24142() -> None:
    df = pl.DataFrame({"a": object(), "b": "any string or bytes"})
    df.to_arrow(compat_level=CompatLevel.oldest())


def test_pycapsule_stream_interface_all_types() -> None:
    """Test all data types via Arrow C Stream PyCapsule interface."""
    import datetime
    from decimal import Decimal

    df = pl.DataFrame(
        [
            pl.Series("bool", [True, False, None], dtype=pl.Boolean),
            pl.Series("int8", [1, 2, None], dtype=pl.Int8),
            pl.Series("int16", [1, 2, None], dtype=pl.Int16),
            pl.Series("int32", [1, 2, None], dtype=pl.Int32),
            pl.Series("int64", [1, 2, None], dtype=pl.Int64),
            pl.Series("uint8", [1, 2, None], dtype=pl.UInt8),
            pl.Series("uint16", [1, 2, None], dtype=pl.UInt16),
            pl.Series("uint32", [1, 2, None], dtype=pl.UInt32),
            pl.Series("uint64", [1, 2, None], dtype=pl.UInt64),
            pl.Series(
                "float32",
                [1.100000023841858, 2.200000047683716, None],
                dtype=pl.Float32,
            ),
            pl.Series("float64", [1.1, 2.2, None], dtype=pl.Float64),
            pl.Series("string", ["hello", "world", None], dtype=pl.String),
            pl.Series("binary", [b"hello", b"world", None], dtype=pl.Binary),
            pl.Series(
                "decimal",
                [Decimal("1.23"), Decimal("4.56"), None],
                dtype=pl.Decimal(precision=10, scale=2),
            ),
            pl.Series(
                "date",
                [datetime.date(2023, 1, 1), datetime.date(2023, 1, 2), None],
                dtype=pl.Date,
            ),
            pl.Series(
                "datetime",
                [
                    datetime.datetime(2023, 1, 1, 12, 0),
                    datetime.datetime(2023, 1, 2, 13, 30),
                    None,
                ],
                dtype=pl.Datetime(time_unit="us", time_zone=None),
            ),
            pl.Series(
                "time",
                [datetime.time(12, 0), datetime.time(13, 30), None],
                dtype=pl.Time,
            ),
            pl.Series(
                "duration_us",
                [datetime.timedelta(days=1), datetime.timedelta(seconds=7200), None],
                dtype=pl.Duration(time_unit="us"),
            ),
            pl.Series(
                "duration_ms",
                [datetime.timedelta(microseconds=100000), datetime.timedelta(0), None],
                dtype=pl.Duration(time_unit="ms"),
            ),
            pl.Series(
                "duration_ns",
                [
                    datetime.timedelta(seconds=1),
                    datetime.timedelta(microseconds=1000),
                    None,
                ],
                dtype=pl.Duration(time_unit="ns"),
            ),
            pl.Series(
                "categorical", ["apple", "banana", "apple"], dtype=pl.Categorical
            ),
            pl.Series(
                "categorical_named",
                ["apple", "banana", "apple"],
                dtype=pl.Categorical(pl.Categories(name="test")),
            ),
        ]
    )

    assert_frame_equal(
        df.map_columns(
            pl.selectors.all(), lambda s: pl.Series(PyCapsuleStreamHolder(s))
        ),
        df,
    )

    assert_frame_equal(
        df.map_columns(
            pl.selectors.all(),
            lambda s: (
                pl.Series(
                    PyCapsuleStreamHolder(pl.select(pl.struct(pl.lit(s))).to_series())
                )
                .struct.unnest()
                .to_series()
            ),
        ),
        df,
    )

    assert_frame_equal(
        df.map_columns(
            pl.selectors.all(),
            lambda s: pl.Series(PyCapsuleStreamHolder(s.implode())).explode(),
        ),
        df,
    )

    assert_frame_equal(
        df.map_columns(
            pl.selectors.all(),
            lambda s: pl.Series(PyCapsuleStreamHolder(s.reshape((3, 1)))).reshape((3,)),
        ),
        df,
    )

    assert_frame_equal(pl.DataFrame(PyCapsuleStreamHolder(df)), df)
    assert_frame_equal(
        pl.DataFrame(PyCapsuleStreamHolder(df.select(pl.struct("*")))).unnest("*"), df
    )
    assert_frame_equal(
        pl.DataFrame(PyCapsuleStreamHolder(df.select(pl.all().implode()))).explode("*"),
        df,
    )
    assert_frame_equal(
        pl.DataFrame(PyCapsuleStreamHolder(df.select(pl.all().reshape((3, 1))))).select(
            pl.all().reshape((3,))
        ),
        df,
    )


def pyarrow_table_to_ipc_bytes(tbl: pa.Table) -> bytes:
    f = io.BytesIO()
    batches = tbl.to_batches()

    with pa.ipc.new_file(f, batches[0].schema) as writer:
        for batch in batches:
            writer.write_batch(batch)

    return f.getvalue()
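

# Editorial sketch (not upstream): the helper above writes Arrow IPC "file"
# format bytes, which `pl.read_ipc` can read straight from a buffer.
def _demo_ipc_bytes_sketch() -> None:
    data = pyarrow_table_to_ipc_bytes(pa.table({"a": [1, 2]}))
    assert pl.read_ipc(io.BytesIO(data)).shape == (2, 1)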


@pytest.mark.write_disk
def test_month_day_nano_from_ffi_15969(plmonkeypatch: PlMonkeyPatch) -> None:
    import datetime

    def new_interval_scalar(months: int, days: int, nanoseconds: int) -> pa.Scalar:
        return pa.scalar((months, days, nanoseconds), type=pa.month_day_nano_interval())

    arrow_tbl = pa.Table.from_pydict(
        {
            "interval": [
                new_interval_scalar(1, 0, 0),
                new_interval_scalar(0, 1, 0),
                new_interval_scalar(0, 0, 1_000),
                new_interval_scalar(1, 1, 1_000_001_000),
                new_interval_scalar(-1, 0, 0),
                new_interval_scalar(0, -1, 0),
                new_interval_scalar(0, 0, -1_000),
                new_interval_scalar(-1, -1, -1_000_001_000),
                new_interval_scalar(3558, 0, 0),
                new_interval_scalar(-3558, 0, 0),
                new_interval_scalar(1, -1, 1_999_999_000),
            ]
        },
        schema=pa.schema([pa.field("interval", pa.month_day_nano_interval())]),
    )

    ipc_bytes = pyarrow_table_to_ipc_bytes(arrow_tbl)

    import_err_msg = (
        "could not import from `month_day_nano_interval` type. "
        "Hint: This can be imported by setting "
        "POLARS_IMPORT_INTERVAL_AS_STRUCT=1 in the environment. "
        "Note however that this is unstable functionality "
        "that may change at any time."
    )

    with pytest.raises(PanicException, match=import_err_msg):
        pl.scan_ipc(ipc_bytes).collect_schema()

    with pytest.raises(PanicException, match=import_err_msg):
        pl.scan_ipc(ipc_bytes).collect()

    with pytest.raises(PanicException, match=import_err_msg):
        pl.DataFrame(
            pa.Table.from_pydict(
                {"interval": pa.array([], type=pa.month_day_nano_interval())}
            )
        )

    with pytest.raises(ComputeError, match=import_err_msg):
        pl.Series(pa.array([], type=pa.month_day_nano_interval()))

    plmonkeypatch.setenv("POLARS_IMPORT_INTERVAL_AS_STRUCT", "1")

    expect = pl.DataFrame(
        [
            pl.Series(
                "interval",
                [
                    {"months": 1, "days": 0, "nanoseconds": datetime.timedelta(0)},
                    {"months": 0, "days": 1, "nanoseconds": datetime.timedelta(0)},
                    {
                        "months": 0,
                        "days": 0,
                        "nanoseconds": datetime.timedelta(microseconds=1),
                    },
                    {
                        "months": 1,
                        "days": 1,
                        "nanoseconds": datetime.timedelta(seconds=1, microseconds=1),
                    },
                    {"months": -1, "days": 0, "nanoseconds": datetime.timedelta(0)},
                    {"months": 0, "days": -1, "nanoseconds": datetime.timedelta(0)},
                    {
                        "months": 0,
                        "days": 0,
                        "nanoseconds": datetime.timedelta(
                            days=-1, seconds=86399, microseconds=999999
                        ),
                    },
                    {
                        "months": -1,
                        "days": -1,
                        "nanoseconds": datetime.timedelta(
                            days=-1, seconds=86398, microseconds=999999
                        ),
                    },
                    {"months": 3558, "days": 0, "nanoseconds": datetime.timedelta(0)},
                    {"months": -3558, "days": 0, "nanoseconds": datetime.timedelta(0)},
                    {
                        "months": 1,
                        "days": -1,
                        "nanoseconds": datetime.timedelta(
                            seconds=1, microseconds=999999
                        ),
                    },
                ],
                dtype=pl.Struct(
                    {
                        "months": pl.Int32,
                        "days": pl.Int32,
                        "nanoseconds": pl.Duration(time_unit="ns"),
                    }
                ),
            ),
        ]
    )

    assert_frame_equal(pl.DataFrame(arrow_tbl), expect)
    assert_series_equal(
        pl.Series(arrow_tbl.column(0)).alias("interval"), expect.to_series()
    )

    # Test IPC scan
    assert pl.scan_ipc(ipc_bytes).collect_schema() == {
        "interval": pl.Struct(
            {
                "months": pl.Int32,
                "days": pl.Int32,
                "nanoseconds": pl.Duration(time_unit="ns"),
            }
        )
    }
    assert_frame_equal(pl.scan_ipc(ipc_bytes).collect(), expect)

    assert_frame_equal(
        pl.DataFrame(
            pa.Table.from_pydict(
                {"interval": pa.array([], type=pa.month_day_nano_interval())}
            )
        ),
        pl.DataFrame(
            schema={
                "interval": pl.Struct(
                    {
                        "months": pl.Int32,
                        "days": pl.Int32,
                        "nanoseconds": pl.Duration(time_unit="ns"),
                    }
                )
            }
        ),
    )

    assert_series_equal(
        pl.Series(pa.array([], type=pa.month_day_nano_interval())),
        pl.Series(
            dtype=pl.Struct(
                {
                    "months": pl.Int32,
                    "days": pl.Int32,
                    "nanoseconds": pl.Duration(time_unit="ns"),
                }
            )
        ),
    )

    f = io.BytesIO()

    # TODO: Add Parquet round-trip test if this starts working.
    with pytest.raises(pa.ArrowNotImplementedError):
        pq.write_table(arrow_tbl, f)


def test_schema_to_arrow_15563() -> None:
    assert pl.Schema({"x": pl.String}).to_arrow() == pa.schema(
        [pa.field("x", pa.string_view())]
    )

    assert pl.Schema({"x": pl.String}).to_arrow(
        compat_level=CompatLevel.oldest()
    ) == pa.schema([pa.field("x", pa.large_string())])


def test_0_width_df_roundtrip() -> None:
    assert pl.DataFrame(height=(1 << 32) - 1).to_numpy().shape == ((1 << 32) - 1, 0)
    assert pl.DataFrame(np.zeros((10, 0))).shape == (10, 0)

    arrow_table = pl.DataFrame(height=(1 << 32) - 1).to_arrow()
    assert arrow_table.shape == ((1 << 32) - 1, 0)
    assert pl.DataFrame(arrow_table).shape == ((1 << 32) - 1, 0)

    pandas_df = pl.DataFrame(height=(1 << 32) - 1).to_pandas()
    assert pandas_df.shape == ((1 << 32) - 1, 0)
    assert pl.DataFrame(pandas_df).shape == ((1 << 32) - 1, 0)

    df = pl.DataFrame(height=5)

    assert pl.DataFrame.deserialize(df.serialize()).shape == (5, 0)
    assert pl.LazyFrame.deserialize(df.lazy().serialize()).collect().shape == (5, 0)

    for file_format in ["parquet", "ipc", "ndjson"]:
        f = io.BytesIO()
        getattr(pl.DataFrame, f"write_{file_format}")(df, f)
        f.seek(0)
        assert getattr(pl, f"read_{file_format}")(f).shape == (5, 0)

        f = io.BytesIO()
        getattr(pl.LazyFrame, f"sink_{file_format}")(df.lazy(), f)
        f.seek(0)
        assert getattr(pl, f"scan_{file_format}")(f).collect().shape == (5, 0)

    f = io.BytesIO()
    pl.LazyFrame().sink_csv(f)
    v = f.getvalue()
    assert v == b"\n"

    with pytest.raises(
        InvalidOperationError,
        match=r"cannot sink 0-width DataFrame with non-zero height \(1\) to CSV",
    ):
        pl.LazyFrame(height=1).sink_csv(io.BytesIO())