Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/io/test_spreadsheet.py
6939 views
1
from __future__ import annotations
2
3
import warnings
4
from collections import OrderedDict
5
from datetime import date, datetime, time
6
from io import BytesIO
7
from pathlib import Path
8
from typing import TYPE_CHECKING, Any, Callable
9
from zoneinfo import ZoneInfo
10
11
import pytest
12
13
import polars as pl
14
import polars.selectors as cs
15
from polars.exceptions import (
16
NoDataError,
17
ParameterCollisionError,
18
)
19
from polars.testing import assert_frame_equal, assert_series_equal
20
from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES
21
22
if TYPE_CHECKING:
23
from collections.abc import Sequence
24
25
from polars._typing import (
26
ExcelSpreadsheetEngine,
27
PolarsDataType,
28
SchemaDict,
29
SelectorType,
30
)
31
32
33
# pytestmark = pytest.mark.slow()
34
35
36
@pytest.fixture
def path_xls(io_files_path: Path) -> Path:
    """Path to an example workbook in the legacy Excel 97-2004 (xls) format."""
    return io_files_path.joinpath("example.xls")
40
41
42
@pytest.fixture
def path_xlsx(io_files_path: Path) -> Path:
    """Path to an example workbook in the modern Excel (xlsx) format."""
    return io_files_path.joinpath("example.xlsx")
46
47
48
@pytest.fixture
def path_xlsb(io_files_path: Path) -> Path:
    """Path to an example workbook in the Excel binary (xlsb) format."""
    return io_files_path.joinpath("example.xlsb")
52
53
54
@pytest.fixture
def path_ods(io_files_path: Path) -> Path:
    """Path to an example OpenDocument spreadsheet (ods)."""
    return io_files_path.joinpath("example.ods")
58
59
60
@pytest.fixture
def path_xls_empty(io_files_path: Path) -> Path:
    """Path to an empty xls workbook."""
    return io_files_path.joinpath("empty.xls")
63
64
65
@pytest.fixture
def path_xlsx_empty(io_files_path: Path) -> Path:
    """Path to an empty xlsx workbook."""
    return io_files_path.joinpath("empty.xlsx")
68
69
70
@pytest.fixture
def path_xlsx_mixed(io_files_path: Path) -> Path:
    """Path to an xlsx workbook containing mixed-dtype columns."""
    return io_files_path.joinpath("mixed.xlsx")
73
74
75
@pytest.fixture
def path_xlsb_empty(io_files_path: Path) -> Path:
    """Path to an empty xlsb workbook."""
    return io_files_path.joinpath("empty.xlsb")
78
79
80
@pytest.fixture
def path_xlsb_mixed(io_files_path: Path) -> Path:
    """Path to an xlsb workbook containing mixed-dtype columns."""
    return io_files_path.joinpath("mixed.xlsb")
83
84
85
@pytest.fixture
def path_ods_empty(io_files_path: Path) -> Path:
    """Path to an empty ods spreadsheet."""
    return io_files_path.joinpath("empty.ods")
88
89
90
@pytest.fixture
def path_ods_mixed(io_files_path: Path) -> Path:
    """Path to an ods spreadsheet containing mixed-dtype columns."""
    return io_files_path.joinpath("mixed.ods")
93
94
95
@pytest.fixture
def path_empty_rows_excel(io_files_path: Path) -> Path:
    """Path to an xlsx workbook that contains empty rows."""
    return io_files_path.joinpath("test_empty_rows.xlsx")
98
99
100
@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "engine_params"),
    [
        # xls file
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
        # xlsx file
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        # xlsb file (binary)
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
        # open document
        (pl.read_ods, "path_ods", {}),
    ],
)
def test_read_spreadsheet(
    read_spreadsheet: Callable[..., pl.DataFrame],
    source: str,
    engine_params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    """A single worksheet should load identically however it is addressed."""
    expected = pl.DataFrame({"hello": ["Row 1", "Row 2"]})

    # address the same sheet by default, by name, and by id
    sheet_params: dict[str, Any]
    for sheet_params in (
        {"sheet_name": None, "sheet_id": None},
        {"sheet_name": "test1"},
        {"sheet_id": 1},
    ):
        df = read_spreadsheet(
            source=request.getfixturevalue(source),
            **engine_params,
            **sheet_params,
        )
        assert_frame_equal(df, expected)
135
136
137
@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "params"),
    [
        # xls file
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
        # xlsx file
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        # xlsb file (binary)
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
        # open document
        (pl.read_ods, "path_ods", {}),
    ],
)
def test_read_excel_multiple_worksheets(
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    """Selecting several sheets (by id or by name) returns a dict keyed by sheet name,
    preserving the requested order."""
    spreadsheet_path = request.getfixturevalue(source)
    frames_by_id = read_spreadsheet(
        spreadsheet_path,
        sheet_id=[2, 1],
        sheet_name=None,
        **params,
    )
    frames_by_name = read_spreadsheet(
        spreadsheet_path,
        sheet_id=None,
        sheet_name=["test2", "test1"],
        **params,
    )
    for frames in (frames_by_id, frames_by_name):
        # BUGFIX: previously asserted on `frames_by_name` inside the loop, so the
        # ordering of `frames_by_id` was never actually checked
        assert list(frames) == ["test2", "test1"]

        expected1 = pl.DataFrame({"hello": ["Row 1", "Row 2"]})
        expected2 = pl.DataFrame({"world": ["Row 3", "Row 4"]})

        assert_frame_equal(frames["test1"], expected1)
        assert_frame_equal(frames["test2"], expected2)
179
180
181
@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "params"),
    [
        # xls file
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
        # xlsx file
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        # xlsb file (binary)
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
        # open document
        (pl.read_ods, "path_ods", {}),
    ],
)
def test_read_excel_multiple_workbooks(
    read_spreadsheet: Callable[..., Any],
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    """Reading a list of workbooks concatenates their data vertically."""
    spreadsheet_path = request.getfixturevalue(source)
    workbook_paths = [spreadsheet_path, spreadsheet_path, spreadsheet_path]

    # multiple workbooks, single worksheet
    df = read_spreadsheet(
        workbook_paths,
        sheet_id=None,
        sheet_name="test1",
        include_file_paths="path",
        **params,
    )
    expected = pl.DataFrame(
        {
            "hello": ["Row 1", "Row 2"] * 3,
            "path": [str(spreadsheet_path)] * 6,
        },
    )
    assert_frame_equal(df, expected)

    # multiple workbooks, multiple worksheets
    res = read_spreadsheet(
        workbook_paths,
        sheet_id=None,
        sheet_name=["test1", "test2"],
        **params,
    )
    expected_frames = {
        "test1": pl.DataFrame({"hello": ["Row 1", "Row 2"] * 3}),
        "test2": pl.DataFrame({"world": ["Row 3", "Row 4"] * 3}),
    }
    assert sorted(res) == sorted(expected_frames)
    assert_frame_equal(res["test1"], expected_frames["test1"])
    assert_frame_equal(res["test2"], expected_frames["test2"])
246
247
248
@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "params"),
    [
        # xls file
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
        # xlsx file
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        # xlsb file (binary)
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
        # open document
        (pl.read_ods, "path_ods", {}),
    ],
)
def test_read_excel_all_sheets(
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    """`sheet_id=0` returns every worksheet, keyed by sheet name."""
    spreadsheet_path = request.getfixturevalue(source)
    frames = read_spreadsheet(
        spreadsheet_path,
        sheet_id=0,
        **params,
    )
    # the ods example workbook carries fewer sheets than the excel ones
    assert len(frames) == (4 if str(spreadsheet_path).endswith("ods") else 6)

    expected1 = pl.DataFrame({"hello": ["Row 1", "Row 2"]})
    expected2 = pl.DataFrame({"world": ["Row 3", "Row 4"]})
    expected3 = pl.DataFrame(
        {
            "cardinality": [1, 3, 15, 30, 150, 300],
            "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
            "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
        }
    )
    assert_frame_equal(frames["test1"], expected1)
    assert_frame_equal(frames["test2"], expected2)
    if params.get("engine") == "openpyxl":
        # TODO: flag that trims trailing all-null rows?
        assert_frame_equal(frames["test3"], expected3)
        assert_frame_equal(frames["test4"].drop_nulls(), expected3)
292
293
294
@pytest.mark.parametrize(
    "engine",
    ["calamine", "openpyxl", "xlsx2csv"],
)
def test_read_excel_basic_datatypes(engine: ExcelSpreadsheetEngine) -> None:
    """Round-trip core dtypes through excel, applying schema overrides on read."""
    df = pl.DataFrame(
        {
            "A": [1, 2, 3, 4, 5],
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
            "floats": [1.1, 1.2, 1.3, 1.4, 1.5],
            "datetime": [datetime(2023, 1, x) for x in range(1, 6)],
            "nulls": [1, None, None, None, 0],
        },
    )
    xls = BytesIO()
    df.write_excel(xls, position="C5")

    schema_overrides = {"datetime": pl.Datetime("us"), "nulls": pl.Boolean()}
    df_compare = df.with_columns(
        pl.col(nm).cast(tp) for nm, tp in schema_overrides.items()
    )
    # the same sheet addressed three different ways should read identically
    for sheet_id, sheet_name in ((None, None), (1, None), (None, "Sheet1")):
        df_from_excel = pl.read_excel(
            xls,
            sheet_id=sheet_id,
            sheet_name=sheet_name,
            engine=engine,
            schema_overrides=schema_overrides,
        )
        assert_frame_equal(df_compare, df_from_excel)

    # check some additional overrides
    # (note: xlsx2csv can't currently convert datetime with trailing '00:00:00' to date)
    dt_override = {"datetime": pl.Date} if engine != "xlsx2csv" else {}
    df = pl.read_excel(
        xls,
        sheet_id=sheet_id,
        sheet_name=sheet_name,
        engine=engine,
        schema_overrides={"A": pl.Float32, **dt_override},
    )
    assert_series_equal(
        df["A"],
        pl.Series(name="A", values=[1.0, 2.0, 3.0, 4.0, 5.0], dtype=pl.Float32),
    )
    if dt_override:
        assert_series_equal(
            df["datetime"],
            pl.Series(
                name="datetime",
                values=[date(2023, 1, x) for x in range(1, 6)],
                dtype=pl.Date,
            ),
        )
348
349
350
@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "params"),
    [
        # TODO: uncomment once fastexcel offers a suitable param
        # (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
    ],
)
def test_read_dropped_cols(
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    """`drop_empty_cols=False` should retain otherwise-empty/unnamed columns."""
    spreadsheet_path = request.getfixturevalue(source)

    df1 = read_spreadsheet(
        spreadsheet_path,
        sheet_name="test4",
        **params,
    )
    df2 = read_spreadsheet(
        spreadsheet_path,
        sheet_name="test4",
        drop_empty_cols=False,
        **params,
    )
    assert df1.to_dict(as_series=False) == {  # type: ignore[attr-defined]
        "cardinality": [1, 3, 15, 30, 150, 300],
        "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
        "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
    }
    assert df2.to_dict(as_series=False) == {  # type: ignore[attr-defined]
        "": [None] * 6,
        "cardinality": [1, 3, 15, 30, 150, 300],
        "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
        "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
        "0": [None] * 6,
        "1": [None] * 6,
    }
391
392
393
@pytest.mark.parametrize(
    ("source", "params"),
    [
        ("path_xls", {"engine": "calamine", "sheet_name": "temporal"}),
        ("path_xlsx", {"engine": "calamine", "table_name": "TemporalData"}),
        ("path_xlsx", {"engine": "openpyxl", "sheet_name": "temporal"}),
        ("path_xlsb", {"engine": "calamine", "sheet_name": "temporal"}),
    ],
)
def test_read_excel_temporal_data(
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    """Temporal columns (incl. string-encoded and tz-aware) honour schema overrides."""
    source_path = request.getfixturevalue(source)

    temporal_schema = {
        "id": pl.UInt16(),
        "dtm": pl.Datetime("ms"),
        "dt": pl.Date(),
        "dtm_str": pl.Datetime(time_zone="Asia/Tokyo"),
        "dt_str": pl.Date(),
        "tm_str": pl.Time(),
    }
    parsed_df = pl.read_excel(  # type: ignore[call-overload]
        source_path,
        **params,
        schema_overrides=temporal_schema,
    )
    TK = ZoneInfo("Asia/Tokyo")

    expected = pl.DataFrame(
        {
            "id": [100, 200, 300, 400],
            "dtm": [
                datetime(1999, 12, 31, 1, 2, 3),
                None,
                datetime(1969, 7, 5, 10, 30, 45),
                datetime(2077, 10, 10, 5, 59, 44),
            ],
            "dt": [
                date(2000, 1, 18),
                date(1965, 8, 8),
                date(2027, 4, 22),
                None,
            ],
            "dtm_str": [
                None,
                datetime(1900, 1, 30, 14, 50, 20, tzinfo=TK),
                datetime(2026, 5, 7, 23, 59, 59, tzinfo=TK),
                datetime(2007, 6, 1, 0, 0, tzinfo=TK),
            ],
            "dt_str": [
                date(2000, 6, 14),
                date(1978, 2, 28),
                None,
                date(2040, 12, 4),
            ],
            "tm_str": [
                time(23, 50, 22),
                time(0, 0, 1),
                time(10, 10, 33),
                time(18, 30, 15),
            ],
        },
        schema=temporal_schema,
    )
    assert_frame_equal(expected, parsed_df)
461
462
463
@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "params"),
    [
        # xls file
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
        # xlsx file
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        # xlsb file (binary)
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
        # open document
        (pl.read_ods, "path_ods", {}),
    ],
)
def test_read_invalid_worksheet(
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    """Unknown sheet ids/names should raise a descriptive ValueError."""
    spreadsheet_path = request.getfixturevalue(source)
    for param, sheet_id, sheet_name in (
        ("id", 999, None),
        ("name", None, "not_a_sheet_name"),
    ):
        value = sheet_id if param == "id" else sheet_name
        with pytest.raises(
            ValueError,
            match=f"no matching sheet found when `sheet_{param}` is {value!r}",
        ):
            read_spreadsheet(
                spreadsheet_path, sheet_id=sheet_id, sheet_name=sheet_name, **params
            )
497
498
499
@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "additional_params"),
    [
        (pl.read_excel, "path_xlsx_mixed", {"engine": "openpyxl"}),
        (pl.read_ods, "path_ods_mixed", {}),
    ],
)
@pytest.mark.may_fail_auto_streaming
def test_read_mixed_dtype_columns(
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
    source: str,
    additional_params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    """Columns with mixed cell types should coerce cleanly via schema overrides."""
    spreadsheet_path = request.getfixturevalue(source)
    schema_overrides = {
        "Employee ID": pl.Utf8(),
        "Employee Name": pl.Utf8(),
        "Date": pl.Date(),
        "Details": pl.Categorical("lexical"),
        "Asset ID": pl.Utf8(),
    }
    df = read_spreadsheet(
        spreadsheet_path,
        sheet_id=0,
        schema_overrides=schema_overrides,
        **additional_params,
    )["Sheet1"]

    assert_frame_equal(
        df,
        pl.DataFrame(
            {
                "Employee ID": ["123456", "44333", "US00011", "135967", "IN86868"],
                "Employee Name": ["Test1", "Test2", "Test4", "Test5", "Test6"],
                "Date": [date(2023, 7, 21)] * 5,
                "Details": [
                    "Healthcare",
                    "Healthcare",
                    "Healthcare",
                    "Healthcare",
                    "Something",
                ],
                "Asset ID": ["84444", "84444", "84444", "84444", "ABC123"],
            },
            schema_overrides=schema_overrides,
        ),
    )
553
554
555
def test_schema_overrides(path_xlsx: Path, path_xlsb: Path, path_ods: Path) -> None:
    """Schema overrides compose across `schema_overrides`, `read_options`, and both."""
    # override via the top-level `schema_overrides` parameter
    df1 = pl.read_excel(
        path_xlsx,
        sheet_name="test4",
        schema_overrides={"cardinality": pl.UInt16},
    ).drop_nulls()

    assert df1.schema["cardinality"] == pl.UInt16
    assert df1.schema["rows_by_key"] == pl.Float64
    assert df1.schema["iter_groups"] == pl.Float64

    # override via the engine-specific `read_options`
    df2 = pl.read_excel(
        path_xlsx,
        sheet_name="test4",
        engine="xlsx2csv",
        read_options={"schema_overrides": {"cardinality": pl.UInt16}},
    ).drop_nulls()

    assert df2.schema["cardinality"] == pl.UInt16
    assert df2.schema["rows_by_key"] == pl.Float64
    assert df2.schema["iter_groups"] == pl.Float64

    # both mechanisms together (on disjoint columns)
    df3 = pl.read_excel(
        path_xlsx,
        sheet_name="test4",
        engine="xlsx2csv",
        schema_overrides={"cardinality": pl.UInt16},
        read_options={
            "schema_overrides": {
                "rows_by_key": pl.Float32,
                "iter_groups": pl.Float32,
            },
        },
    ).drop_nulls()

    assert df3.schema["cardinality"] == pl.UInt16
    assert df3.schema["rows_by_key"] == pl.Float32
    assert df3.schema["iter_groups"] == pl.Float32

    # temporal overrides behave the same across formats
    for workbook_path in (path_xlsx, path_xlsb, path_ods):
        read_spreadsheet = (
            pl.read_ods if workbook_path.suffix == ".ods" else pl.read_excel
        )
        df4 = read_spreadsheet(  # type: ignore[operator]
            workbook_path,
            sheet_name="test5",
            schema_overrides={"dtm": pl.Datetime("ns"), "dt": pl.Date},
        )
        assert_frame_equal(
            df4,
            pl.DataFrame(
                {
                    "dtm": [
                        datetime(1999, 12, 31, 10, 30, 45),
                        datetime(2010, 10, 11, 12, 13, 14),
                    ],
                    "dt": [date(2024, 1, 1), date(2018, 8, 7)],
                    "val": [1.5, -0.5],
                },
                schema={"dtm": pl.Datetime("ns"), "dt": pl.Date, "val": pl.Float64},
            ),
        )

    with pytest.raises(ParameterCollisionError):
        # cannot specify 'cardinality' in both schema_overrides and read_options
        pl.read_excel(
            path_xlsx,
            sheet_name="test4",
            engine="xlsx2csv",
            schema_overrides={"cardinality": pl.UInt16},
            read_options={"schema_overrides": {"cardinality": pl.Int32}},
        )

    # read multiple sheets in conjunction with 'schema_overrides'
    # (note: reading the same sheet twice simulates the issue in #11850)
    overrides = OrderedDict(
        [
            ("cardinality", pl.UInt32),
            ("rows_by_key", pl.Float32),
            ("iter_groups", pl.Float64),
        ]
    )
    df = pl.read_excel(
        path_xlsx,
        sheet_name=["test4", "test4"],
        schema_overrides=overrides,
    )
    for col, dtype in overrides.items():
        assert df["test4"].schema[col] == dtype
644
645
646
@pytest.mark.parametrize(
    ("engine", "read_opts_param"),
    [
        ("xlsx2csv", "infer_schema_length"),
        ("calamine", "schema_sample_rows"),
    ],
)
def test_invalid_parameter_combinations_infer_schema_len(
    path_xlsx: Path, engine: str, read_opts_param: str
) -> None:
    """Setting schema-inference length at both levels must raise a collision error."""
    with pytest.raises(
        ParameterCollisionError,
        match=f"cannot specify both `infer_schema_length`.*{read_opts_param}",
    ):
        pl.read_excel(  # type: ignore[call-overload]
            path_xlsx,
            sheet_id=1,
            engine=engine,
            read_options={read_opts_param: 512},
            infer_schema_length=1024,
        )
667
668
669
@pytest.mark.parametrize(
    ("engine", "read_opts_param"),
    [
        ("xlsx2csv", "columns"),
        ("calamine", "use_columns"),
    ],
)
def test_invalid_parameter_combinations_columns(
    path_xlsx: Path, engine: str, read_opts_param: str
) -> None:
    """Selecting columns at both levels must raise a collision error."""
    with pytest.raises(
        ParameterCollisionError,
        match=f"cannot specify both `columns`.*{read_opts_param}",
    ):
        pl.read_excel(  # type: ignore[call-overload]
            path_xlsx,
            sheet_id=1,
            engine=engine,
            read_options={read_opts_param: ["B", "C", "D"]},
            columns=["A", "B", "C"],
        )
690
691
692
def test_unsupported_engine() -> None:
    """An unrecognised engine name raises NotImplementedError."""
    with pytest.raises(NotImplementedError):
        pl.read_excel(None, engine="foo")  # type: ignore[call-overload]
695
696
697
def test_unsupported_binary_workbook(path_xlsb: Path) -> None:
    """openpyxl cannot read xlsb workbooks; expect an informative error."""
    with pytest.raises(Exception, match="does not support binary format"):
        pl.read_excel(path_xlsb, engine="openpyxl")
700
701
702
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_read_excel_all_sheets_with_sheet_name(path_xlsx: Path, engine: str) -> None:
    """Supplying both `sheet_name` and `sheet_id` is ambiguous and must raise."""
    with pytest.raises(
        ValueError,
        match=r"cannot specify both `sheet_name` \('Sheet1'\) and `sheet_id` \(1\)",
    ):
        pl.read_excel(  # type: ignore[call-overload]
            path_xlsx,
            sheet_id=1,
            sheet_name="Sheet1",
            engine=engine,
        )
714
715
716
# the parameters don't change the data, only the formatting, so we expect
717
# the same result each time. however, it's important to validate that the
718
# parameter permutations don't raise exceptions, or interfere with the
719
# values written to the worksheet, so test multiple variations.
720
@pytest.mark.parametrize(
    "write_params",
    [
        # default parameters
        {},
        # basic formatting
        {
            "autofit": True,
            "table_style": "Table Style Dark 2",
            "column_totals": True,
            "float_precision": 0,
        },
        # slightly customized formatting, with some formulas
        {
            "position": (0, 0),
            "table_style": {
                "style": "Table Style Medium 23",
                "first_column": True,
            },
            "conditional_formats": {"val": "data_bar"},
            "column_formats": {
                "val": "#,##0.000;[White]-#,##0.000",
                ("day", "month", "year"): {"align": "left", "num_format": "0"},
            },
            "header_format": {"italic": True, "bg_color": "#d9d9d9"},
            "column_widths": {"val": 100},
            "row_heights": {0: 35},
            "formulas": {
                # string: formula added to the end of the table (but before row_totals)
                "day": "=DAY([@dtm])",
                "month": "=MONTH([@dtm])",
                "year": {
                    # dict: full control over formula positioning/dtype
                    "formula": "=YEAR([@dtm])",
                    "insert_after": "month",
                    "return_type": pl.Int16,
                },
            },
            "column_totals": True,
            "row_totals": True,
        },
        # heavily customized formatting/definition
        {
            "position": "A1",
            "table_name": "PolarsFrameData",
            "table_style": "Table Style Light 9",
            "conditional_formats": {
                # single dict format
                "str": {
                    "type": "duplicate",
                    "format": {"bg_color": "#ff0000", "font_color": "#ffffff"},
                },
                # multiple dict formats
                "val": [
                    {
                        "type": "3_color_scale",
                        "min_color": "#4bacc6",
                        "mid_color": "#ffffff",
                        "max_color": "#daeef3",
                    },
                    {
                        "type": "cell",
                        "criteria": "<",
                        "value": -90,
                        "format": {"font_color": "white"},
                    },
                ],
                "dtm": [
                    {
                        "type": "top",
                        "value": 1,
                        "format": {"bold": True, "font_color": "green"},
                    },
                    {
                        "type": "bottom",
                        "value": 1,
                        "format": {"bold": True, "font_color": "red"},
                    },
                ],
            },
            "dtype_formats": {
                frozenset(
                    FLOAT_DTYPES
                ): '_(£* #,##0.00_);_(£* (#,##0.00);_(£* "-"??_);_(@_)',
                pl.Date: "dd-mm-yyyy",
            },
            "column_formats": {"dtm": {"font_color": "#31869c", "bg_color": "#b7dee8"}},
            "column_totals": {"val": "average", "dtm": "min"},
            "column_widths": {("str", "val"): 60, "dtm": 80},
            "row_totals": {"tot": True},
            "hidden_columns": ["str"],
            "hide_gridlines": True,
            "include_header": False,
        },
    ],
)
def test_excel_round_trip(write_params: dict[str, Any]) -> None:
    """Formatting options must not alter the underlying values on a write/read cycle.

    The parameters only change presentation, so every permutation should read
    back to the same frame; the point is that none of them raise or corrupt
    the written data.
    """
    df = pl.DataFrame(
        {
            "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)],
            "str": ["xxx", "yyy", "xxx"],
            "val": [100.5, 55.0, -99.5],
        }
    )

    engine: ExcelSpreadsheetEngine
    for engine in ("calamine", "xlsx2csv"):
        # without a header row we must supply the column names on read
        read_options, has_header = (
            ({}, True)
            if write_params.get("include_header", True)
            else (
                {"new_columns": ["dtm", "str", "val"]}
                if engine == "xlsx2csv"
                else {"column_names": ["dtm", "str", "val"]},
                False,
            )
        )

        fmt_strptime = "%Y-%m-%d"
        if write_params.get("dtype_formats", {}).get(pl.Date) == "dd-mm-yyyy":
            fmt_strptime = "%d-%m-%Y"

        # write to xlsx using various parameters...
        xls = BytesIO()
        _wb = df.write_excel(workbook=xls, worksheet="data", **write_params)

        # ...and read it back again:
        xldf = pl.read_excel(
            xls,
            sheet_name="data",
            engine=engine,
            read_options=read_options,
            has_header=has_header,
        )[:3].select(df.columns[:3])

        if engine == "xlsx2csv":
            xldf = xldf.with_columns(pl.col("dtm").str.strptime(pl.Date, fmt_strptime))

        assert_frame_equal(df, xldf)
859
860
861
@pytest.mark.parametrize("engine", ["calamine", "xlsx2csv"])
def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> None:
    """Row/column totals are written as formulae and read back as empty values."""
    df = pl.DataFrame(
        {
            "id": ["aaa", "bbb", "ccc", "ddd", "eee"],
            # float cols
            "q1": [100.0, 55.5, -20.0, 0.5, 35.0],
            "q2": [30.5, -10.25, 15.0, 60.0, 20.5],
            # int cols
            "q3": [-50, 0, 40, 80, 80],
            "q4": [75, 55, 25, -10, -55],
        }
    )
    # the column-total function is case-insensitive and accepts `True`
    for fn_sum in (True, "sum", "SUM"):
        xls = BytesIO()
        df.write_excel(
            xls,
            worksheet="misc",
            sparklines={"trend": ["q1", "q2", "q3", "q4"]},
            row_totals={
                # add semiannual row total columns
                "h1": ("q1", "q2"),
                "h2": ("q3", "q4"),
            },
            column_totals=fn_sum,
        )

        # note that the totals are written as formulae, so we
        # won't have the calculated values in the dataframe
        xldf = pl.read_excel(xls, sheet_name="misc", engine=engine)

        assert xldf.columns == ["id", "q1", "q2", "q3", "q4", "trend", "h1", "h2"]
        assert xldf.row(-1) == (None, 0.0, 0.0, 0, 0, None, 0.0, 0)
894
895
896
@pytest.mark.may_fail_cloud  # reason: eager - return_dtype must be set
@pytest.mark.parametrize(
    ("engine", "list_dtype"),
    [
        ("calamine", pl.List(pl.Int8)),
        ("openpyxl", pl.List(pl.UInt16)),
        ("xlsx2csv", pl.Array(pl.Int32, 3)),
    ],
)
def test_excel_write_compound_types(
    engine: ExcelSpreadsheetEngine,
    list_dtype: PolarsDataType,
) -> None:
    """Nested types (list/array/struct) are stringified on write.

    BUGFIX: the frame previously hard-coded `pl.Array(pl.Int32, 3)` as the "x"
    dtype, so the parametrized `list_dtype` values were never exercised; use
    the parameter so each engine is tested against its declared nested dtype.
    """
    df = pl.DataFrame(
        data={"x": [None, [1, 2, 3], [4, 5, 6]], "y": ["a", "b", "c"], "z": [9, 8, 7]},
        schema_overrides={"x": list_dtype},
    ).select("x", pl.struct(["y", "z"]))

    xls = BytesIO()
    df.write_excel(xls, worksheet="data")

    # also test reading from the various flavours of supported binary data
    # across all backend engines (check bytesio, bytes, and memoryview)
    for binary_data in (
        xls,
        xls.getvalue(),
        xls.getbuffer(),
    ):
        xldf = pl.read_excel(
            binary_data,
            sheet_name="data",
            engine=engine,
            include_file_paths="wbook",
        )

        # expect string conversion (only scalar values are supported)
        assert xldf.rows() == [
            (None, "{'y': 'a', 'z': 9}", "in-mem"),
            ("[1, 2, 3]", "{'y': 'b', 'z': 8}", "in-mem"),
            ("[4, 5, 6]", "{'y': 'c', 'z': 7}", "in-mem"),
        ]
937
938
939
def test_excel_read_named_table_with_total_row(tmp_path: Path) -> None:
    """Reading a named table omits its total row; xlsx2csv lacks table support."""
    df = pl.DataFrame(
        {
            "x": ["aa", "bb", "cc"],
            "y": [100, 325, -250],
            "z": [975, -444, 123],
        }
    )
    # when we read back a named table object with a total row we expect the read
    # to automatically omit that row as it is *not* part of the actual table data
    wb_path = Path(tmp_path).joinpath("test_named_table_read.xlsx")
    df.write_excel(
        wb_path,
        worksheet="data",
        table_name="PolarsFrameTable",
        column_totals=True,
    )
    for engine in ("calamine", "openpyxl"):
        xldf = pl.read_excel(wb_path, table_name="PolarsFrameTable", engine=engine)
        assert_frame_equal(df, xldf)

    # xlsx2csv doesn't support reading named tables, so we see the
    # column total if we don't filter it out after reading the data
    with pytest.raises(
        ValueError,
        match="the `table_name` parameter is not supported by the 'xlsx2csv' engine",
    ):
        pl.read_excel(wb_path, table_name="PolarsFrameTable", engine="xlsx2csv")

    xldf = pl.read_excel(wb_path, sheet_name="data", engine="xlsx2csv")
    assert_frame_equal(df, xldf.head(3))
    assert xldf.height == 4
    assert xldf.row(3) == (None, 0, 0)
972
973
974
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_excel_write_to_bytesio(engine: ExcelSpreadsheetEngine) -> None:
    """A frame written to a BytesIO buffer round-trips through every engine."""
    df = pl.DataFrame({"colx": [1.5, -2, 0], "coly": ["a", None, "c"]})

    excel_bytes = BytesIO()
    df.write_excel(excel_bytes)

    df_read = pl.read_excel(excel_bytes, engine=engine)
    assert_frame_equal(df, df_read)

    # also confirm consistent behaviour when 'infer_schema_length=0'
    df_read = pl.read_excel(excel_bytes, engine=engine, infer_schema_length=0)
    expected = pl.DataFrame({"colx": ["1.5", "-2", "0"], "coly": ["a", None, "c"]})
    assert_frame_equal(expected, df_read)
988
989
990
@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
def test_excel_write_to_file_object(
    engine: ExcelSpreadsheetEngine, tmp_path: Path
) -> None:
    """`write_excel` accepts BytesIO, Path, str-path, and file-object targets.

    BUGFIX: the path and str-path branches previously re-read the earlier
    BytesIO buffer (`xls`) instead of the file just written, so those writes
    were never actually verified; each target is now read back from itself.
    """
    tmp_path.mkdir(exist_ok=True)

    df = pl.DataFrame({"x": ["aaa", "bbb", "ccc"], "y": [123, 456, 789]})

    # write to bytesio
    xls = BytesIO()
    df.write_excel(xls, worksheet="data")
    assert_frame_equal(df, pl.read_excel(xls, engine=engine))

    # write to file path (read back from the path, not the buffer)
    path = Path(tmp_path).joinpath("test_write_path.xlsx")
    df.write_excel(path, worksheet="data")
    assert_frame_equal(df, pl.read_excel(path, engine=engine))

    # write to file path (as string)
    path = Path(tmp_path).joinpath("test_write_path_str.xlsx")
    df.write_excel(str(path), worksheet="data")
    assert_frame_equal(df, pl.read_excel(str(path), engine=engine))

    # write to file object
    path = Path(tmp_path).joinpath("test_write_file_object.xlsx")
    with path.open("wb") as tgt:
        df.write_excel(tgt, worksheet="data")
    with path.open("rb") as src:
        assert_frame_equal(df, pl.read_excel(src, engine=engine))
1019
1020
1021
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_excel_read_no_headers(engine: ExcelSpreadsheetEngine) -> None:
    """Headerless sheets read back with generated `column_N` names."""
    df = pl.DataFrame(
        {"colx": [1, 2, 3], "coly": ["aaa", "bbb", "ccc"], "colz": [0.5, 0.0, -1.0]}
    )
    xls = BytesIO()
    df.write_excel(xls, worksheet="data", include_header=False)

    xldf = pl.read_excel(xls, engine=engine, has_header=False)
    expected = xldf.rename({"column_1": "colx", "column_2": "coly", "column_3": "colz"})
    assert_frame_equal(df, expected)
1032
1033
1034
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_excel_write_sparklines(engine: ExcelSpreadsheetEngine) -> None:
    """Sparklines/row-totals write cleanly; sparkline columns read back as null."""
    from xlsxwriter import Workbook

    # note that we don't (quite) expect sparkline export to round-trip as we
    # inject additional empty columns to hold them (which will read as nulls)
    df = pl.DataFrame(
        {
            "id": ["aaa", "bbb", "ccc", "ddd", "eee"],
            "q1": [100, 55, -20, 0, 35],
            "q2": [30, -10, 15, 60, 20],
            "q3": [-50, 0, 40, 80, 80],
            "q4": [75, 55, 25, -10, -55],
        }
    ).cast(dtypes={pl.Int64: pl.Float64})

    # also: confirm that we can use a Workbook directly with "write_excel"
    xls = BytesIO()
    with Workbook(xls) as wb:
        df.write_excel(
            workbook=wb,
            worksheet="frame_data",
            table_style="Table Style Light 2",
            dtype_formats={frozenset(NUMERIC_DTYPES): "#,##0_);(#,##0)"},
            column_formats={cs.starts_with("h"): "#,##0_);(#,##0)"},
            sparklines={
                "trend": ["q1", "q2", "q3", "q4"],
                "+/-": {
                    "columns": ["q1", "q2", "q3", "q4"],
                    "insert_after": "id",
                    "type": "win_loss",
                },
            },
            conditional_formats={
                cs.starts_with("q", "h"): {
                    "type": "2_color_scale",
                    "min_color": "#95b3d7",
                    "max_color": "#ffffff",
                }
            },
            column_widths={cs.starts_with("q", "h"): 40},
            row_totals={
                "h1": ("q1", "q2"),
                "h2": ("q3", "q4"),
            },
            hide_gridlines=True,
            row_heights=35,
            sheet_zoom=125,
        )

        tables = {tbl["name"] for tbl in wb.get_worksheet_by_name("frame_data").tables}
        assert "Frame0" in tables

    with warnings.catch_warnings():
        # ignore an openpyxl user warning about sparklines
        warnings.simplefilter("ignore", UserWarning)
        xldf = pl.read_excel(xls, sheet_name="frame_data", engine=engine)

    # ┌─────┬──────┬─────┬─────┬─────┬─────┬───────┬─────┬─────┐
    # │ id  ┆ +/-  ┆ q1  ┆ q2  ┆ q3  ┆ q4  ┆ trend ┆ h1  ┆ h2  │
    # │ --- ┆ ---  ┆ --- ┆ --- ┆ --- ┆ --- ┆ ---   ┆ --- ┆ --- │
    # │ str ┆ str  ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ str   ┆ i64 ┆ i64 │
    # ╞═════╪══════╪═════╪═════╪═════╪═════╪═══════╪═════╪═════╡
    # │ aaa ┆ null ┆ 100 ┆ 30  ┆ -50 ┆ 75  ┆ null  ┆ 0   ┆ 0   │
    # │ bbb ┆ null ┆ 55  ┆ -10 ┆ 0   ┆ 55  ┆ null  ┆ 0   ┆ 0   │
    # │ ccc ┆ null ┆ -20 ┆ 15  ┆ 40  ┆ 25  ┆ null  ┆ 0   ┆ 0   │
    # │ ddd ┆ null ┆ 0   ┆ 60  ┆ 80  ┆ -10 ┆ null  ┆ 0   ┆ 0   │
    # │ eee ┆ null ┆ 35  ┆ 20  ┆ 80  ┆ -55 ┆ null  ┆ 0   ┆ 0   │
    # └─────┴──────┴─────┴─────┴─────┴─────┴───────┴─────┴─────┘

    for sparkline_col in ("+/-", "trend"):
        assert set(xldf[sparkline_col]) in ({None}, {""})

    assert xldf.columns == ["id", "+/-", "q1", "q2", "q3", "q4", "trend", "h1", "h2"]
    assert_frame_equal(
        df, xldf.drop("+/-", "trend", "h1", "h2").cast(dtypes={pl.Int64: pl.Float64})
    )
1111
1112
1113
def test_excel_write_multiple_tables() -> None:
    """Write several frames (as named tables) into multiple worksheets of one
    workbook, then read them back both by sheet name and by table name.
    """
    from xlsxwriter import Workbook

    # note: also checks that empty tables don't error on write
    df = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})

    # write multiple frames to multiple worksheets
    xls = BytesIO()
    with Workbook(xls) as wb:
        df.rename({"colx": "colx0", "coly": "coly0", "colz": "colz0"}).write_excel(
            workbook=wb, worksheet="sheet1", position="A1"
        )
        df.rename({"colx": "colx1", "coly": "coly1", "colz": "colz1"}).write_excel(
            workbook=wb, worksheet="sheet1", position="X10"
        )
        df.rename({"colx": "colx2", "coly": "coly2", "colz": "colz2"}).write_excel(
            workbook=wb, worksheet="sheet2", position="C25"
        )

        # also validate integration of externally-added formats
        fmt = wb.add_format({"bg_color": "#ffff00"})
        df.rename({"colx": "colx3", "coly": "coly3", "colz": "colz3"}).write_excel(
            workbook=wb,
            worksheet="sheet3",
            position="D4",
            conditional_formats={
                "colz3": {
                    "type": "formula",
                    "criteria": "=C2=B2",
                    "format": fmt,
                }
            },
        )

        # each written frame is registered as an auto-named table ("Frame<n>")
        table_names = {
            tbl["name"]
            for sheet in wb.sheetnames
            for tbl in wb.get_worksheet_by_name(sheet).tables
        }

    assert table_names == {f"Frame{n}" for n in range(4)}
    assert pl.read_excel(xls, sheet_name="sheet3").rows() == []

    # test loading one of the written tables by name
    for engine in ("calamine", "openpyxl"):
        df1 = pl.read_excel(
            xls,
            sheet_name="sheet2",
            table_name="Frame2",
            engine=engine,
        )
        df2 = pl.read_excel(
            xls,
            table_name="Frame2",
            engine=engine,
        )
        assert df1.columns == ["colx2", "coly2", "colz2"]
        assert_frame_equal(df1, df2)

    # if we supply a sheet name (which is optional when using `table_name`),
    # then the table name must be present in *that* sheet, or we raise an error
    with pytest.raises(
        RuntimeError,
        match="table named 'Frame3' not found in sheet 'sheet1'",
    ):
        pl.read_excel(xls, sheet_name="sheet1", table_name="Frame3")
1178
1179
1180
def test_excel_write_worksheet_object() -> None:
    """Write a frame directly to a pre-created worksheet object, and check the
    parent-workbook validation errors for mismatched/missing workbooks.
    """
    # write to worksheet object
    from xlsxwriter import Workbook

    df = pl.DataFrame({"colx": ["aaa", "bbb", "ccc"], "coly": [-1234, 0, 5678]})

    with Workbook(xls := BytesIO()) as wb:
        ws = wb.add_worksheet("frame_data")
        df.write_excel(wb, worksheet=ws)
        ws.hide_zero()

    assert_frame_equal(df, pl.read_excel(xls, sheet_name="frame_data"))

    # worksheet object passed with a *different* (non-parent) workbook
    with pytest.raises(  # noqa: SIM117
        ValueError,
        match="the given workbook object .* is not the parent of worksheet 'frame_data'",
    ):
        with Workbook(BytesIO()) as wb:
            df.write_excel(wb, worksheet=ws)

    # worksheet object passed with no workbook at all
    with pytest.raises(  # noqa: SIM117
        TypeError,
        match="worksheet object requires the parent workbook object; found workbook=None",
    ):
        with Workbook(BytesIO()) as wb:
            df.write_excel(None, worksheet=ws)
1206
1207
1208
def test_excel_write_beyond_max_rows_cols(tmp_path: Path) -> None:
    """Writing at a position that would overflow Excel's max row count raises
    an InvalidOperationError.
    """
    tmp_path.mkdir(exist_ok=True)
    path = tmp_path / "test_max_dimensions.xlsx"
    sheet = "mysheet"

    df = pl.DataFrame({"col1": range(10), "col2": range(10, 20)})

    # A1048570 + 10 data rows (+ header) exceeds the xlsx limit of 1048576 rows
    with pytest.raises(pl.exceptions.InvalidOperationError):
        df.write_excel(workbook=path, worksheet=sheet, position="A1048570")
1217
1218
1219
def test_excel_freeze_panes() -> None:
    """Exercise all three accepted `freeze_panes` notations: (row, col),
    (row, col, top_row, top_col), and an "A1"-style cell reference.
    """
    from xlsxwriter import Workbook

    # note: checks that empty tables don't error on write
    df1 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})
    df2 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})
    df3 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})

    xls = BytesIO()

    # use all three freeze_pane notations
    with Workbook(xls) as wb:
        df1.write_excel(workbook=wb, worksheet="sheet1", freeze_panes=(1, 0))
        df2.write_excel(workbook=wb, worksheet="sheet2", freeze_panes=(1, 0, 3, 4))
        df3.write_excel(workbook=wb, worksheet="sheet3", freeze_panes=("B2"))

        table_names: set[str] = set()
        for sheet in ("sheet1", "sheet2", "sheet3"):
            table_names.update(
                tbl["name"] for tbl in wb.get_worksheet_by_name(sheet).tables
            )

    assert table_names == {f"Frame{n}" for n in range(3)}
    assert pl.read_excel(xls, sheet_name="sheet3").rows() == []
1242
1243
1244
@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "schema_overrides"),
    [
        (pl.read_excel, "path_xlsx_empty", None),
        (pl.read_excel, "path_xlsb_empty", None),
        (pl.read_excel, "path_xls_empty", None),
        (pl.read_ods, "path_ods_empty", None),
        # Test with schema overrides, to ensure they don't interfere with
        # raising NoDataErrors.
        (pl.read_excel, "path_xlsx_empty", {"a": pl.Int64}),
        (pl.read_excel, "path_xlsb_empty", {"a": pl.Int64}),
        (pl.read_excel, "path_xls_empty", {"a": pl.Int64}),
        (pl.read_ods, "path_ods_empty", {"a": pl.Int64}),
    ],
)
def test_excel_empty_sheet(
    read_spreadsheet: Callable[..., pl.DataFrame],
    source: str,
    request: pytest.FixtureRequest,
    schema_overrides: SchemaDict | None,
) -> None:
    """Empty sheets raise NoDataError by default; with `raise_if_empty=False`
    they yield an empty frame (schema-only when a header row is present).
    """
    ods = (empty_spreadsheet_path := request.getfixturevalue(source)).suffix == ".ods"
    read_spreadsheet = pl.read_ods if ods else pl.read_excel  # type: ignore[assignment]

    with pytest.raises(NoDataError, match="empty Excel sheet"):
        read_spreadsheet(empty_spreadsheet_path, schema_overrides=schema_overrides)

    engine_params = [{}] if ods else [{"engine": "calamine"}]
    for params in engine_params:
        df = read_spreadsheet(
            empty_spreadsheet_path,
            sheet_name="no_data",
            raise_if_empty=False,
            **params,
        )
        expected = pl.DataFrame()
        assert_frame_equal(df, expected)

        df = read_spreadsheet(
            empty_spreadsheet_path,
            sheet_name="no_rows",
            raise_if_empty=False,
            **params,
        )
        expected = pl.DataFrame(schema={f"col{c}": pl.String for c in ("x", "y", "z")})
        assert_frame_equal(df, expected)
1290
1291
1292
@pytest.mark.parametrize(
    ("engine", "hidden_columns"),
    [
        ("xlsx2csv", ["a"]),
        ("openpyxl", ["a", "b"]),
        ("calamine", ["a", "b"]),
        ("xlsx2csv", cs.numeric()),
        ("openpyxl", cs.last()),
    ],
)
def test_excel_hidden_columns(
    hidden_columns: list[str] | SelectorType,
    engine: ExcelSpreadsheetEngine,
) -> None:
    """Columns hidden on write (by name list or selector) are still present
    with their data when the sheet is read back.
    """
    df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})

    xls = BytesIO()
    df.write_excel(xls, hidden_columns=hidden_columns)

    read_df = pl.read_excel(xls)
    assert_frame_equal(df, read_df)
1313
1314
1315
def test_excel_mixed_calamine_float_data(io_files_path: Path) -> None:
    """The calamine engine reads a float column containing NaN cells as
    Float64 (NaN preserved, not nulled or stringified).
    """
    df = pl.read_excel(io_files_path / "nan_test.xlsx", engine="calamine")
    nan = float("nan")
    assert_frame_equal(
        pl.DataFrame({"float_col": [nan, nan, nan, 100.0, 200.0, 300.0]}),
        df,
    )
1322
1323
1324
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
@pytest.mark.may_fail_auto_streaming  # read->scan_csv dispatch, _read_spreadsheet_xlsx2csv needs to be changed not to call `_reorder_columns` on the df
def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None:
    """Round-trip a frame containing nulls in every dtype, selecting columns
    in reversed order (by name and by index) with schema overrides applied.
    """
    df = pl.DataFrame(
        {
            "a": [1, 2, None],
            "b": [1.0, None, 3.5],
            "c": ["x", None, "z"],
            "d": [True, False, None],
            "e": [
                date(2023, 1, 1),
                None,
                date(2023, 1, 4),
            ],
            "f": [
                datetime(2023, 1, 1),
                datetime(2000, 10, 10, 10, 10),
                None,
            ],
            "g": [
                None,
                "1920-08-08 00:00:00",
                "2077-10-20 00:00:00.000000",
            ],
        }
    )
    xls = BytesIO()
    df.write_excel(xls)

    reversed_cols = list(reversed(df.columns))
    read_cols: Sequence[str] | Sequence[int]
    # "g" is overridden to Date, so the expected frame truncates the
    # datetime-like strings to their date component
    expected = df.select(reversed_cols).with_columns(
        pl.col("g").str.slice(0, 10).str.to_date()
    )
    for read_cols in (
        reversed_cols,
        [6, 5, 4, 3, 2, 1, 0],
    ):
        read_df = pl.read_excel(
            xls,
            engine=engine,
            columns=read_cols,
            schema_overrides={
                "e": pl.Date,
                "f": pl.Datetime("us"),
                "g": pl.Date,
            },
        )
        assert_frame_equal(expected, read_df)
1373
1374
1375
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_drop_empty_rows(
    path_empty_rows_excel: Path, engine: ExcelSpreadsheetEngine
) -> None:
    """`drop_empty_rows` defaults to True (8 rows); False keeps the two
    blank rows (10 rows).
    """
    df1 = pl.read_excel(
        source=path_empty_rows_excel,
        engine=engine,
    )  # check default
    assert df1.shape == (8, 4)

    df2 = pl.read_excel(
        source=path_empty_rows_excel,
        engine=engine,
        drop_empty_rows=True,
    )
    assert df2.shape == (8, 4)

    df3 = pl.read_excel(
        source=path_empty_rows_excel,
        engine=engine,
        drop_empty_rows=False,
    )
    assert df3.shape == (10, 4)
1398
1399
1400
def test_excel_write_select_col_dtype() -> None:
    """Dtype-based selectors in `column_widths` / `hidden_columns` must match
    only columns of exactly that dtype (List(String) vs String).
    """
    from openpyxl import load_workbook
    from xlsxwriter import Workbook

    def get_col_widths(wb_bytes: BytesIO) -> dict[str, int]:
        # map of column letter -> rounded width, for the active sheet only
        return {
            k: round(v.width)
            for k, v in load_workbook(wb_bytes).active.column_dimensions.items()
        }

    df = pl.DataFrame(
        {
            "name": [["Alice", "Ben"], ["Charlie", "Delta"]],
            "col2": ["Hi", "Bye"],
        }
    )

    # column_widths test:
    # pl.List(pl.String)) datatype should not match column with no list
    check = BytesIO()
    with Workbook(check) as wb:
        df.write_excel(wb, column_widths={cs.by_dtype(pl.List(pl.String)): 300})

    assert get_col_widths(check) == {"A": 43}

    # column_widths test:
    # pl.String datatype should not match column with list
    check = BytesIO()
    with Workbook(check) as wb:
        df.write_excel(wb, column_widths={cs.by_dtype(pl.String): 300})

    assert get_col_widths(check) == {"B": 43}

    # hidden_columns test:
    # pl.List(pl.String)) datatype should not match column with no list
    check = BytesIO()
    with Workbook(check) as wb:
        df.write_excel(wb, hidden_columns=cs.by_dtype(pl.List(pl.String)))

    assert get_col_widths(check) == {"A": 0}

    # hidden_columns test:
    # pl.String datatype should not match column with list
    check = BytesIO()
    with Workbook(check) as wb:
        df.write_excel(wb, hidden_columns=cs.by_dtype(pl.String))

    assert get_col_widths(check) == {"B": 0}
1448
1449
1450
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_excel_read_columns_nonlist_sequence(engine: ExcelSpreadsheetEngine) -> None:
    """`columns` accepts non-list sequences: a tuple of names, or a single
    bare column-name string.
    """
    df = pl.DataFrame(
        {"colx": [1, 2, 3], "coly": ["aaa", "bbb", "ccc"], "colz": [0.5, 0.0, -1.0]}
    )
    xls = BytesIO()
    df.write_excel(xls, worksheet="data")

    xldf = pl.read_excel(xls, engine=engine, columns=("colx", "coly"))
    expected = df.select("colx", "coly")
    assert_frame_equal(xldf, expected)

    xldf = pl.read_excel(xls, engine=engine, columns="colx")
    expected = df.select("colx")
    assert_frame_equal(xldf, expected)
1465
1466