GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/io/test_spreadsheet.py
from __future__ import annotations

import warnings
from collections import OrderedDict
from datetime import date, datetime, time
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any
from zoneinfo import ZoneInfo

import pytest

import polars as pl
import polars.selectors as cs
from polars.exceptions import (
    NoDataError,
    ParameterCollisionError,
)
from polars.testing import assert_frame_equal, assert_series_equal
from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES

if TYPE_CHECKING:
    from collections.abc import Callable, Sequence

    from polars._typing import (
        ExcelSpreadsheetEngine,
        PolarsDataType,
        SchemaDict,
        SelectorType,
    )


# pytestmark = pytest.mark.slow()


@pytest.fixture
def path_xls(io_files_path: Path) -> Path:
    # old excel 97-2004 format
    return io_files_path / "example.xls"


@pytest.fixture
def path_xlsx(io_files_path: Path) -> Path:
    # modern excel format
    return io_files_path / "example.xlsx"


@pytest.fixture
def path_xlsb(io_files_path: Path) -> Path:
    # excel binary format
    return io_files_path / "example.xlsb"


@pytest.fixture
def path_ods(io_files_path: Path) -> Path:
    # open document spreadsheet
    return io_files_path / "example.ods"


@pytest.fixture
def path_xls_empty(io_files_path: Path) -> Path:
    return io_files_path / "empty.xls"


@pytest.fixture
def path_xlsx_empty(io_files_path: Path) -> Path:
    return io_files_path / "empty.xlsx"


@pytest.fixture
def path_xlsx_mixed(io_files_path: Path) -> Path:
    return io_files_path / "mixed.xlsx"


@pytest.fixture
def path_xlsb_empty(io_files_path: Path) -> Path:
    return io_files_path / "empty.xlsb"


@pytest.fixture
def path_xlsb_mixed(io_files_path: Path) -> Path:
    return io_files_path / "mixed.xlsb"


@pytest.fixture
def path_ods_empty(io_files_path: Path) -> Path:
    return io_files_path / "empty.ods"


@pytest.fixture
def path_ods_mixed(io_files_path: Path) -> Path:
    return io_files_path / "mixed.ods"


@pytest.fixture
def path_empty_rows_excel(io_files_path: Path) -> Path:
    return io_files_path / "test_empty_rows.xlsx"


@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "engine_params"),
    [
        # xls file
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
        # xlsx file
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        # xlsb file (binary)
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
        # open document
        (pl.read_ods, "path_ods", {}),
    ],
)
def test_read_spreadsheet(
    read_spreadsheet: Callable[..., pl.DataFrame],
    source: str,
    engine_params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    sheet_params: dict[str, Any]
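
    # default params, an explicit sheet name, and an explicit sheet id should
    # all resolve to the same worksheet data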
    for sheet_params in (
        {"sheet_name": None, "sheet_id": None},
        {"sheet_name": "test1"},
        {"sheet_id": 1},
    ):
        df = read_spreadsheet(
            source=request.getfixturevalue(source),
            **engine_params,
            **sheet_params,
        )
        expected = pl.DataFrame({"hello": ["Row 1", "Row 2"]})
        assert_frame_equal(df, expected)


@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "params"),
    [
        # xls file
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
        # xlsx file
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        # xlsb file (binary)
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
        # open document
        (pl.read_ods, "path_ods", {}),
    ],
)
def test_read_excel_multiple_worksheets(
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    spreadsheet_path = request.getfixturevalue(source)
    frames_by_id = read_spreadsheet(
        spreadsheet_path,
        sheet_id=[2, 1],
        sheet_name=None,
        **params,
    )
    frames_by_name = read_spreadsheet(
        spreadsheet_path,
        sheet_id=None,
        sheet_name=["test2", "test1"],
        **params,
    )
    for frames in (frames_by_id, frames_by_name):
        assert list(frames) == ["test2", "test1"]

        expected1 = pl.DataFrame({"hello": ["Row 1", "Row 2"]})
        expected2 = pl.DataFrame({"world": ["Row 3", "Row 4"]})

        assert_frame_equal(frames["test1"], expected1)
        assert_frame_equal(frames["test2"], expected2)


@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "params"),
    [
        # xls file
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
        # xlsx file
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        # xlsb file (binary)
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
        # open document
        (pl.read_ods, "path_ods", {}),
    ],
)
def test_read_excel_multiple_workbooks(
    read_spreadsheet: Callable[..., Any],
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    spreadsheet_path = request.getfixturevalue(source)

    # multiple workbooks, single worksheet
    df = read_spreadsheet(
        [
            spreadsheet_path,
            spreadsheet_path,
            spreadsheet_path,
        ],
        sheet_id=None,
        sheet_name="test1",
        include_file_paths="path",
        **params,
    )
    expected = pl.DataFrame(
        {
            "hello": ["Row 1", "Row 2", "Row 1", "Row 2", "Row 1", "Row 2"],
            "path": [str(spreadsheet_path)] * 6,
        },
    )
    assert_frame_equal(df, expected)

    # multiple workbooks, multiple worksheets
    res = read_spreadsheet(
        [
            spreadsheet_path,
            spreadsheet_path,
            spreadsheet_path,
        ],
        sheet_id=None,
        sheet_name=["test1", "test2"],
        **params,
    )
    expected_frames = {
        "test1": pl.DataFrame(
            {"hello": ["Row 1", "Row 2", "Row 1", "Row 2", "Row 1", "Row 2"]}
        ),
        "test2": pl.DataFrame(
            {"world": ["Row 3", "Row 4", "Row 3", "Row 4", "Row 3", "Row 4"]}
        ),
    }
    assert sorted(res) == sorted(expected_frames)
    assert_frame_equal(res["test1"], expected_frames["test1"])
    assert_frame_equal(res["test2"], expected_frames["test2"])


@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "params"),
    [
        # xls file
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
        # xlsx file
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        # xlsb file (binary)
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
        # open document
        (pl.read_ods, "path_ods", {}),
    ],
)
def test_read_excel_all_sheets(
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    spreadsheet_path = request.getfixturevalue(source)
    frames = read_spreadsheet(
        spreadsheet_path,
        sheet_id=0,
        **params,
    )
    assert len(frames) == (4 if str(spreadsheet_path).endswith("ods") else 6)

    expected1 = pl.DataFrame({"hello": ["Row 1", "Row 2"]})
    expected2 = pl.DataFrame({"world": ["Row 3", "Row 4"]})
    expected3 = pl.DataFrame(
        {
            "cardinality": [1, 3, 15, 30, 150, 300],
            "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
            "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
        }
    )
    assert_frame_equal(frames["test1"], expected1)
    assert_frame_equal(frames["test2"], expected2)
    if params.get("engine") == "openpyxl":
        # TODO: flag that trims trailing all-null rows?
        assert_frame_equal(frames["test3"], expected3)
        assert_frame_equal(frames["test4"].drop_nulls(), expected3)


@pytest.mark.parametrize(
    "engine",
    ["calamine", "openpyxl", "xlsx2csv"],
)
def test_read_excel_basic_datatypes(engine: ExcelSpreadsheetEngine) -> None:
    df = pl.DataFrame(
        {
            "A": [1, 2, 3, 4, 5],
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
            "floats": [1.1, 1.2, 1.3, 1.4, 1.5],
            "datetime": [datetime(2023, 1, x) for x in range(1, 6)],
            "nulls": [1, None, None, None, 0],
        },
    )
    xls = BytesIO()
    df.write_excel(xls, position="C5")

    schema_overrides = {"datetime": pl.Datetime("us"), "nulls": pl.Boolean()}
    df_compare = df.with_columns(
        pl.col(nm).cast(tp) for nm, tp in schema_overrides.items()
    )
    for sheet_id, sheet_name in ((None, None), (1, None), (None, "Sheet1")):
        df_from_excel = pl.read_excel(
            xls,
            sheet_id=sheet_id,
            sheet_name=sheet_name,
            engine=engine,
            schema_overrides=schema_overrides,
        )
        assert_frame_equal(df_compare, df_from_excel)

        # check some additional overrides
        # (note: xlsx2csv can't currently convert datetime with trailing '00:00:00' to date)
        dt_override = {"datetime": pl.Date} if engine != "xlsx2csv" else {}
        df = pl.read_excel(
            xls,
            sheet_id=sheet_id,
            sheet_name=sheet_name,
            engine=engine,
            schema_overrides={"A": pl.Float32, **dt_override},
        )
        assert_series_equal(
            df["A"],
            pl.Series(name="A", values=[1.0, 2.0, 3.0, 4.0, 5.0], dtype=pl.Float32),
        )
        if dt_override:
            assert_series_equal(
                df["datetime"],
                pl.Series(
                    name="datetime",
                    values=[date(2023, 1, x) for x in range(1, 6)],
                    dtype=pl.Date,
                ),
            )


@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "params"),
    [
        # TODO: uncomment once fastexcel offers a suitable param
        # (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
    ],
)
def test_read_dropped_cols(
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    spreadsheet_path = request.getfixturevalue(source)

    df1 = read_spreadsheet(
        spreadsheet_path,
        sheet_name="test4",
        **params,
    )
    df2 = read_spreadsheet(
        spreadsheet_path,
        sheet_name="test4",
        drop_empty_cols=False,
        **params,
    )
    assert df1.to_dict(as_series=False) == {  # type: ignore[attr-defined]
        "cardinality": [1, 3, 15, 30, 150, 300],
        "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
        "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
    }
    assert df2.to_dict(as_series=False) == {  # type: ignore[attr-defined]
        "": [None, None, None, None, None, None],
        "cardinality": [1, 3, 15, 30, 150, 300],
        "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
        "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
        "0": [None, None, None, None, None, None],
        "1": [None, None, None, None, None, None],
    }


@pytest.mark.parametrize(
    ("source", "params"),
    [
        ("path_xls", {"engine": "calamine", "sheet_name": "temporal"}),
        ("path_xlsx", {"engine": "calamine", "table_name": "TemporalData"}),
        ("path_xlsx", {"engine": "openpyxl", "sheet_name": "temporal"}),
        ("path_xlsb", {"engine": "calamine", "sheet_name": "temporal"}),
    ],
)
def test_read_excel_temporal_data(
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    source_path = request.getfixturevalue(source)

    temporal_schema = {
        "id": pl.UInt16(),
        "dtm": pl.Datetime("ms"),
        "dt": pl.Date(),
        "dtm_str": pl.Datetime(time_zone="Asia/Tokyo"),
        "dt_str": pl.Date(),
        "tm_str": pl.Time(),
    }
    parsed_df = pl.read_excel(  # type: ignore[call-overload]
        source_path,
        **params,
        schema_overrides=temporal_schema,
    )
    TK = ZoneInfo("Asia/Tokyo")

    expected = pl.DataFrame(
        {
            "id": [100, 200, 300, 400],
            "dtm": [
                datetime(1999, 12, 31, 1, 2, 3),
                None,
                datetime(1969, 7, 5, 10, 30, 45),
                datetime(2077, 10, 10, 5, 59, 44),
            ],
            "dt": [
                date(2000, 1, 18),
                date(1965, 8, 8),
                date(2027, 4, 22),
                None,
            ],
            "dtm_str": [
                None,
                datetime(1900, 1, 30, 14, 50, 20, tzinfo=TK),
                datetime(2026, 5, 7, 23, 59, 59, tzinfo=TK),
                datetime(2007, 6, 1, 0, 0, tzinfo=TK),
            ],
            "dt_str": [
                date(2000, 6, 14),
                date(1978, 2, 28),
                None,
                date(2040, 12, 4),
            ],
            "tm_str": [
                time(23, 50, 22),
                time(0, 0, 1),
                time(10, 10, 33),
                time(18, 30, 15),
            ],
        },
        schema=temporal_schema,
    )
    assert_frame_equal(expected, parsed_df)


@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "params"),
    [
        # xls file
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
        # xlsx file
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        # xlsb file (binary)
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
        # open document
        (pl.read_ods, "path_ods", {}),
    ],
)
def test_read_invalid_worksheet(
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    spreadsheet_path = request.getfixturevalue(source)
    for param, sheet_id, sheet_name in (
        ("id", 999, None),
        ("name", None, "not_a_sheet_name"),
    ):
        value = sheet_id if param == "id" else sheet_name
        with pytest.raises(
            ValueError,
            match=f"no matching sheet found when `sheet_{param}` is {value!r}",
        ):
            read_spreadsheet(
                spreadsheet_path, sheet_id=sheet_id, sheet_name=sheet_name, **params
            )


@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "additional_params"),
    [
        (pl.read_excel, "path_xlsx_mixed", {"engine": "openpyxl"}),
        (pl.read_ods, "path_ods_mixed", {}),
    ],
)
@pytest.mark.may_fail_auto_streaming
def test_read_mixed_dtype_columns(
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
    source: str,
    additional_params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    spreadsheet_path = request.getfixturevalue(source)
    schema_overrides = {
        "Employee ID": pl.Utf8(),
        "Employee Name": pl.Utf8(),
        "Date": pl.Date(),
        "Details": pl.Categorical(),
        "Asset ID": pl.Utf8(),
    }
    df = read_spreadsheet(
        spreadsheet_path,
        sheet_id=0,
        schema_overrides=schema_overrides,
        **additional_params,
    )["Sheet1"]

    assert_frame_equal(
        df,
        pl.DataFrame(
            {
                "Employee ID": ["123456", "44333", "US00011", "135967", "IN86868"],
                "Employee Name": ["Test1", "Test2", "Test4", "Test5", "Test6"],
                "Date": [
                    date(2023, 7, 21),
                    date(2023, 7, 21),
                    date(2023, 7, 21),
                    date(2023, 7, 21),
                    date(2023, 7, 21),
                ],
                "Details": [
                    "Healthcare",
                    "Healthcare",
                    "Healthcare",
                    "Healthcare",
                    "Something",
                ],
                "Asset ID": ["84444", "84444", "84444", "84444", "ABC123"],
            },
            schema_overrides=schema_overrides,
        ),
    )


def test_schema_overrides(path_xlsx: Path, path_xlsb: Path, path_ods: Path) -> None:
    df1 = pl.read_excel(
        path_xlsx,
        sheet_name="test4",
        schema_overrides={"cardinality": pl.UInt16},
    ).drop_nulls()

    assert df1.schema["cardinality"] == pl.UInt16
    assert df1.schema["rows_by_key"] == pl.Float64
    assert df1.schema["iter_groups"] == pl.Float64

    df2 = pl.read_excel(
        path_xlsx,
        sheet_name="test4",
        engine="xlsx2csv",
        read_options={"schema_overrides": {"cardinality": pl.UInt16}},
    ).drop_nulls()

    assert df2.schema["cardinality"] == pl.UInt16
    assert df2.schema["rows_by_key"] == pl.Float64
    assert df2.schema["iter_groups"] == pl.Float64
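
    # schema_overrides supplied at the top level and via engine read_options
    # can be combined when they target different columns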
    df3 = pl.read_excel(
        path_xlsx,
        sheet_name="test4",
        engine="xlsx2csv",
        schema_overrides={"cardinality": pl.UInt16},
        read_options={
            "schema_overrides": {
                "rows_by_key": pl.Float32,
                "iter_groups": pl.Float32,
            },
        },
    ).drop_nulls()

    assert df3.schema["cardinality"] == pl.UInt16
    assert df3.schema["rows_by_key"] == pl.Float32
    assert df3.schema["iter_groups"] == pl.Float32

    for workbook_path in (path_xlsx, path_xlsb, path_ods):
        read_spreadsheet = (
            pl.read_ods if workbook_path.suffix == ".ods" else pl.read_excel
        )
        df4 = read_spreadsheet(  # type: ignore[operator]
            workbook_path,
            sheet_name="test5",
            schema_overrides={"dtm": pl.Datetime("ns"), "dt": pl.Date},
        )
        assert_frame_equal(
            df4,
            pl.DataFrame(
                {
                    "dtm": [
                        datetime(1999, 12, 31, 10, 30, 45),
                        datetime(2010, 10, 11, 12, 13, 14),
                    ],
                    "dt": [date(2024, 1, 1), date(2018, 8, 7)],
                    "val": [1.5, -0.5],
                },
                schema={"dtm": pl.Datetime("ns"), "dt": pl.Date, "val": pl.Float64},
            ),
        )

    with pytest.raises(ParameterCollisionError):
        # cannot specify 'cardinality' in both schema_overrides and read_options
        pl.read_excel(
            path_xlsx,
            sheet_name="test4",
            engine="xlsx2csv",
            schema_overrides={"cardinality": pl.UInt16},
            read_options={"schema_overrides": {"cardinality": pl.Int32}},
        )

    # read multiple sheets in conjunction with 'schema_overrides'
    # (note: reading the same sheet twice simulates the issue in #11850)
    overrides = OrderedDict(
        [
            ("cardinality", pl.UInt32),
            ("rows_by_key", pl.Float32),
            ("iter_groups", pl.Float64),
        ]
    )
    df = pl.read_excel(
        path_xlsx,
        sheet_name=("test4", "test4"),
        schema_overrides=overrides,
    )
    for col, dtype in overrides.items():
        assert df["test4"].schema[col] == dtype


@pytest.mark.parametrize(
    ("engine", "read_opts_param"),
    [
        ("xlsx2csv", "infer_schema_length"),
        ("calamine", "schema_sample_rows"),
    ],
)
def test_invalid_parameter_combinations_infer_schema_len(
    path_xlsx: Path, engine: str, read_opts_param: str
) -> None:
    with pytest.raises(
        ParameterCollisionError,
        match=f"cannot specify both `infer_schema_length`.*{read_opts_param}",
    ):
        pl.read_excel(  # type: ignore[call-overload]
            path_xlsx,
            sheet_id=1,
            engine=engine,
            read_options={read_opts_param: 512},
            infer_schema_length=1024,
        )


@pytest.mark.parametrize(
    ("engine", "read_opts_param"),
    [
        ("xlsx2csv", "columns"),
        ("calamine", "use_columns"),
    ],
)
def test_invalid_parameter_combinations_columns(
    path_xlsx: Path, engine: str, read_opts_param: str
) -> None:
    with pytest.raises(
        ParameterCollisionError,
        match=f"cannot specify both `columns`.*{read_opts_param}",
    ):
        pl.read_excel(  # type: ignore[call-overload]
            path_xlsx,
            sheet_id=1,
            engine=engine,
            read_options={read_opts_param: ["B", "C", "D"]},
            columns=["A", "B", "C"],
        )


def test_unsupported_engine() -> None:
    with pytest.raises(NotImplementedError):
        pl.read_excel(None, engine="foo")  # type: ignore[call-overload]


def test_unsupported_binary_workbook(path_xlsb: Path) -> None:
    with pytest.raises(Exception, match="does not support binary format"):
        pl.read_excel(path_xlsb, engine="openpyxl")


@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_read_excel_all_sheets_with_sheet_name(path_xlsx: Path, engine: str) -> None:
    with pytest.raises(
        ValueError,
        match=r"cannot specify both `sheet_name` \('Sheet1'\) and `sheet_id` \(1\)",
    ):
        pl.read_excel(  # type: ignore[call-overload]
            path_xlsx,
            sheet_id=1,
            sheet_name="Sheet1",
            engine=engine,
        )


# the parameters don't change the data, only the formatting, so we expect
# the same result each time. however, it's important to validate that the
# parameter permutations don't raise exceptions, or interfere with the
# values written to the worksheet, so test multiple variations.
@pytest.mark.parametrize(
    "write_params",
    [
        # default parameters
        {},
        # basic formatting
        {
            "autofit": True,
            "table_style": "Table Style Dark 2",
            "column_totals": True,
            "float_precision": 0,
        },
        # slightly customized formatting, with some formulas
        {
            "position": (0, 0),
            "table_style": {
                "style": "Table Style Medium 23",
                "first_column": True,
            },
            "conditional_formats": {"val": "data_bar"},
            "column_formats": {
                "val": "#,##0.000;[White]-#,##0.000",
                ("day", "month", "year"): {"align": "left", "num_format": "0"},
            },
            "header_format": {"italic": True, "bg_color": "#d9d9d9"},
            "column_widths": {"val": 100},
            "row_heights": {0: 35},
            "formulas": {
                # string: formula added to the end of the table (but before row_totals)
                "day": "=DAY([@dtm])",
                "month": "=MONTH([@dtm])",
                "year": {
                    # dict: full control over formula positioning/dtype
                    "formula": "=YEAR([@dtm])",
                    "insert_after": "month",
                    "return_type": pl.Int16,
                },
            },
            "column_totals": True,
            "row_totals": True,
        },
        # heavily customized formatting/definition
        {
            "position": "A1",
            "table_name": "PolarsFrameData",
            "table_style": "Table Style Light 9",
            "conditional_formats": {
                # single dict format
                "str": {
                    "type": "duplicate",
                    "format": {"bg_color": "#ff0000", "font_color": "#ffffff"},
                },
                # multiple dict formats
                "val": [
                    {
                        "type": "3_color_scale",
                        "min_color": "#4bacc6",
                        "mid_color": "#ffffff",
                        "max_color": "#daeef3",
                    },
                    {
                        "type": "cell",
                        "criteria": "<",
                        "value": -90,
                        "format": {"font_color": "white"},
                    },
                ],
                "dtm": [
                    {
                        "type": "top",
                        "value": 1,
                        "format": {"bold": True, "font_color": "green"},
                    },
                    {
                        "type": "bottom",
                        "value": 1,
                        "format": {"bold": True, "font_color": "red"},
                    },
                ],
            },
            "dtype_formats": {
                frozenset(
                    FLOAT_DTYPES
                ): '_(£* #,##0.00_);_(£* (#,##0.00);_(£* "-"??_);_(@_)',
                pl.Date: "dd-mm-yyyy",
            },
            "column_formats": {"dtm": {"font_color": "#31869c", "bg_color": "#b7dee8"}},
            "column_totals": {"val": "average", "dtm": "min"},
            "column_widths": {("str", "val"): 60, "dtm": 80},
            "row_totals": {"tot": True},
            "hidden_columns": ["str"],
            "hide_gridlines": True,
            "include_header": False,
        },
    ],
)
def test_excel_round_trip(write_params: dict[str, Any]) -> None:
    df = pl.DataFrame(
        {
            "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)],
            "str": ["xxx", "yyy", "xxx"],
            "val": [100.5, 55.0, -99.5],
        }
    )

    engine: ExcelSpreadsheetEngine
    for engine in ("calamine", "xlsx2csv"):
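        # when the header row is not written, supply replacement column names
        # through each engine's own read options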
        read_options, has_header = (
            ({}, True)
            if write_params.get("include_header", True)
            else (
                {"new_columns": ["dtm", "str", "val"]}
                if engine == "xlsx2csv"
                else {"column_names": ["dtm", "str", "val"]},
                False,
            )
        )

        fmt_strptime = "%Y-%m-%d"
        if write_params.get("dtype_formats", {}).get(pl.Date) == "dd-mm-yyyy":
            fmt_strptime = "%d-%m-%Y"

        # write to xlsx using various parameters...
        xls = BytesIO()
        _wb = df.write_excel(workbook=xls, worksheet="data", **write_params)

        # ...and read it back again:
        xldf = pl.read_excel(
            xls,
            sheet_name="data",
            engine=engine,
            read_options=read_options,
            has_header=has_header,
        )[:3].select(df.columns[:3])

        if engine == "xlsx2csv":
            xldf = xldf.with_columns(pl.col("dtm").str.strptime(pl.Date, fmt_strptime))

        assert_frame_equal(df, xldf)


@pytest.mark.parametrize("engine", ["calamine", "xlsx2csv"])
def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> None:
    df = pl.DataFrame(
        {
            "id": ["aaa", "bbb", "ccc", "ddd", "eee"],
            # float cols
            "q1": [100.0, 55.5, -20.0, 0.5, 35.0],
            "q2": [30.5, -10.25, 15.0, 60.0, 20.5],
            # int cols
            "q3": [-50, 0, 40, 80, 80],
            "q4": [75, 55, 25, -10, -55],
        }
    )
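    # column_totals accepts a bool as well as a named aggregation (both casings tested)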
    for fn_sum in (True, "sum", "SUM"):
        xls = BytesIO()
        df.write_excel(
            xls,
            worksheet="misc",
            sparklines={"trend": ["q1", "q2", "q3", "q4"]},
            row_totals={
                # add semiannual row total columns
                "h1": ("q1", "q2"),
                "h2": ("q3", "q4"),
            },
            column_totals=fn_sum,
        )

        # note that the totals are written as formulae, so we
        # won't have the calculated values in the dataframe
        xldf = pl.read_excel(xls, sheet_name="misc", engine=engine)

        assert xldf.columns == ["id", "q1", "q2", "q3", "q4", "trend", "h1", "h2"]
        assert xldf.row(-1) == (None, 0.0, 0.0, 0, 0, None, 0.0, 0)


@pytest.mark.may_fail_cloud  # reason: eager - return_dtype must be set
@pytest.mark.parametrize(
    ("engine", "list_dtype"),
    [
        ("calamine", pl.List(pl.Int8)),
        ("openpyxl", pl.List(pl.UInt16)),
        ("xlsx2csv", pl.Array(pl.Int32, 3)),
    ],
)
def test_excel_write_compound_types(
    engine: ExcelSpreadsheetEngine,
    list_dtype: PolarsDataType,
) -> None:
    df = pl.DataFrame(
        data={"x": [None, [1, 2, 3], [4, 5, 6]], "y": ["a", "b", "c"], "z": [9, 8, 7]},
        schema_overrides={"x": list_dtype},
    ).select("x", pl.struct(["y", "z"]))

    xls = BytesIO()
    df.write_excel(xls, worksheet="data")

    # also test reading from the various flavours of supported binary data
    # across all backend engines (check bytesio, bytes, and memoryview)
    for binary_data in (
        xls,
        xls.getvalue(),
        xls.getbuffer(),
    ):
        xldf = pl.read_excel(
            binary_data,  # type: ignore[arg-type]
            sheet_name="data",
            engine=engine,
            include_file_paths="wbook",
        )

        # expect string conversion (only scalar values are supported)
        assert xldf.rows() == [
            (None, "{'y': 'a', 'z': 9}", "in-mem"),
            ("[1, 2, 3]", "{'y': 'b', 'z': 8}", "in-mem"),
            ("[4, 5, 6]", "{'y': 'c', 'z': 7}", "in-mem"),
        ]


def test_excel_read_named_table_with_total_row(tmp_path: Path) -> None:
    df = pl.DataFrame(
        {
            "x": ["aa", "bb", "cc"],
            "y": [100, 325, -250],
            "z": [975, -444, 123],
        }
    )
    # when we read back a named table object with a total row we expect the read
    # to automatically omit that row as it is *not* part of the actual table data
    wb_path = Path(tmp_path).joinpath("test_named_table_read.xlsx")
    df.write_excel(
        wb_path,
        worksheet="data",
        table_name="PolarsFrameTable",
        column_totals=True,
    )
    for engine in ("calamine", "openpyxl"):
        col_subset = ["x", "z"]
        subset_kwarg = {"columns": col_subset}
        base_kwargs = {"table_name": "PolarsFrameTable", "engine": engine}

        for kwargs, df_expected in (
            (base_kwargs, df),  # all cols
            ({**base_kwargs, **subset_kwarg}, df.select(col_subset)),
        ):
            # read from named table
            xldf = pl.read_excel(wb_path, **kwargs)
            assert_frame_equal(df_expected, xldf)

    # xlsx2csv doesn't support reading named tables, so we see the
    # column total if we don't filter it out after reading the data
    with pytest.raises(
        ValueError,
        match="the `table_name` parameter is not supported by the 'xlsx2csv' engine",
    ):
        pl.read_excel(wb_path, table_name="PolarsFrameTable", engine="xlsx2csv")

    xldf = pl.read_excel(wb_path, sheet_name="data", engine="xlsx2csv")
    assert_frame_equal(df, xldf.head(3))
    assert xldf.height == 4
    assert xldf.row(3) == (None, 0, 0)


@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_excel_write_to_bytesio(engine: ExcelSpreadsheetEngine) -> None:
    df = pl.DataFrame({"colx": [1.5, -2, 0], "coly": ["a", None, "c"]})

    excel_bytes = BytesIO()
    df.write_excel(excel_bytes)

    df_read = pl.read_excel(excel_bytes, engine=engine)
    assert_frame_equal(df, df_read)

    # also confirm consistent behaviour when 'infer_schema_length=0'
    df_read = pl.read_excel(excel_bytes, engine=engine, infer_schema_length=0)
    expected = pl.DataFrame({"colx": ["1.5", "-2", "0"], "coly": ["a", None, "c"]})
    assert_frame_equal(expected, df_read)


@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
def test_excel_write_to_file_object(
    engine: ExcelSpreadsheetEngine, tmp_path: Path
) -> None:
    tmp_path.mkdir(exist_ok=True)

    df = pl.DataFrame({"x": ["aaa", "bbb", "ccc"], "y": [123, 456, 789]})

    # write to bytesio
    xls = BytesIO()
    df.write_excel(xls, worksheet="data")
    assert_frame_equal(df, pl.read_excel(xls, engine=engine))

    # write to file path
    path = Path(tmp_path).joinpath("test_write_path.xlsx")
    df.write_excel(path, worksheet="data")
    assert_frame_equal(df, pl.read_excel(xls, engine=engine))

    # write to file path (as string)
    path = Path(tmp_path).joinpath("test_write_path_str.xlsx")
    df.write_excel(str(path), worksheet="data")
    assert_frame_equal(df, pl.read_excel(xls, engine=engine))

    # write to file object
    path = Path(tmp_path).joinpath("test_write_file_object.xlsx")
    with path.open("wb") as tgt:
        df.write_excel(tgt, worksheet="data")
    with path.open("rb") as src:
        assert_frame_equal(df, pl.read_excel(src, engine=engine))


@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_excel_read_no_headers(engine: ExcelSpreadsheetEngine) -> None:
    df = pl.DataFrame(
        {"colx": [1, 2, 3], "coly": ["aaa", "bbb", "ccc"], "colz": [0.5, 0.0, -1.0]}
    )
    xls = BytesIO()
    df.write_excel(xls, worksheet="data", include_header=False)

    xldf = pl.read_excel(xls, engine=engine, has_header=False)
    expected = xldf.rename({"column_1": "colx", "column_2": "coly", "column_3": "colz"})
    assert_frame_equal(df, expected)


@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_excel_write_sparklines(engine: ExcelSpreadsheetEngine) -> None:
    from xlsxwriter import Workbook

    # note that we don't (quite) expect sparkline export to round-trip as we
    # inject additional empty columns to hold them (which will read as nulls)
    df = pl.DataFrame(
        {
            "id": ["aaa", "bbb", "ccc", "ddd", "eee"],
            "q1": [100, 55, -20, 0, 35],
            "q2": [30, -10, 15, 60, 20],
            "q3": [-50, 0, 40, 80, 80],
            "q4": [75, 55, 25, -10, -55],
        }
    ).cast(dtypes={pl.Int64: pl.Float64})

    # also: confirm that we can use a Workbook directly with "write_excel"
    xls = BytesIO()
    with Workbook(xls) as wb:
        df.write_excel(
            workbook=wb,
            worksheet="frame_data",
            table_style="Table Style Light 2",
            dtype_formats={frozenset(NUMERIC_DTYPES): "#,##0_);(#,##0)"},
            column_formats={cs.starts_with("h"): "#,##0_);(#,##0)"},
            sparklines={
                "trend": ["q1", "q2", "q3", "q4"],
                "+/-": {
                    "columns": ["q1", "q2", "q3", "q4"],
                    "insert_after": "id",
                    "type": "win_loss",
                },
            },
            conditional_formats={
                cs.starts_with("q", "h"): {
                    "type": "2_color_scale",
                    "min_color": "#95b3d7",
                    "max_color": "#ffffff",
                }
            },
            column_widths={cs.starts_with("q", "h"): 40},
            row_totals={
                "h1": ("q1", "q2"),
                "h2": ("q3", "q4"),
            },
            hide_gridlines=True,
            row_heights=35,
            sheet_zoom=125,
        )

    tables = {tbl["name"] for tbl in wb.get_worksheet_by_name("frame_data").tables}
    assert "Frame0" in tables

    with warnings.catch_warnings():
        # ignore an openpyxl user warning about sparklines
        warnings.simplefilter("ignore", UserWarning)
        xldf = pl.read_excel(xls, sheet_name="frame_data", engine=engine)

    # ┌─────┬──────┬─────┬─────┬─────┬─────┬───────┬─────┬─────┐
    # │ id  ┆ +/-  ┆ q1  ┆ q2  ┆ q3  ┆ q4  ┆ trend ┆ h1  ┆ h2  │
    # │ --- ┆ ---  ┆ --- ┆ --- ┆ --- ┆ --- ┆ ---   ┆ --- ┆ --- │
    # │ str ┆ str  ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ str   ┆ i64 ┆ i64 │
    # ╞═════╪══════╪═════╪═════╪═════╪═════╪═══════╪═════╪═════╡
    # │ aaa ┆ null ┆ 100 ┆ 30  ┆ -50 ┆ 75  ┆ null  ┆ 0   ┆ 0   │
    # │ bbb ┆ null ┆ 55  ┆ -10 ┆ 0   ┆ 55  ┆ null  ┆ 0   ┆ 0   │
    # │ ccc ┆ null ┆ -20 ┆ 15  ┆ 40  ┆ 25  ┆ null  ┆ 0   ┆ 0   │
    # │ ddd ┆ null ┆ 0   ┆ 60  ┆ 80  ┆ -10 ┆ null  ┆ 0   ┆ 0   │
    # │ eee ┆ null ┆ 35  ┆ 20  ┆ 80  ┆ -55 ┆ null  ┆ 0   ┆ 0   │
    # └─────┴──────┴─────┴─────┴─────┴─────┴───────┴─────┴─────┘

    for sparkline_col in ("+/-", "trend"):
        assert set(xldf[sparkline_col]) in ({None}, {""})

    assert xldf.columns == ["id", "+/-", "q1", "q2", "q3", "q4", "trend", "h1", "h2"]
    assert_frame_equal(
        df, xldf.drop("+/-", "trend", "h1", "h2").cast(dtypes={pl.Int64: pl.Float64})
    )


def test_excel_write_multiple_tables() -> None:
    from xlsxwriter import Workbook

    # note: also checks that empty tables don't error on write
    df = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})

    # write multiple frames to multiple worksheets
    xls = BytesIO()
    with Workbook(xls) as wb:
        df.rename({"colx": "colx0", "coly": "coly0", "colz": "colz0"}).write_excel(
            workbook=wb, worksheet="sheet1", position="A1"
        )
        df.rename({"colx": "colx1", "coly": "coly1", "colz": "colz1"}).write_excel(
            workbook=wb, worksheet="sheet1", position="X10"
        )
        df.rename({"colx": "colx2", "coly": "coly2", "colz": "colz2"}).write_excel(
            workbook=wb, worksheet="sheet2", position="C25"
        )

        # also validate integration of externally-added formats
        fmt = wb.add_format({"bg_color": "#ffff00"})
        df.rename({"colx": "colx3", "coly": "coly3", "colz": "colz3"}).write_excel(
            workbook=wb,
            worksheet="sheet3",
            position="D4",
            conditional_formats={
                "colz3": {
                    "type": "formula",
                    "criteria": "=C2=B2",
                    "format": fmt,
                }
            },
        )

    table_names = {
        tbl["name"]
        for sheet in wb.sheetnames
        for tbl in wb.get_worksheet_by_name(sheet).tables
    }
    assert table_names == {f"Frame{n}" for n in range(4)}
    assert pl.read_excel(xls, sheet_name="sheet3").rows() == []

    # test loading one of the written tables by name
    for engine in ("calamine", "openpyxl"):
        df1 = pl.read_excel(
            xls,
            sheet_name="sheet2",
            table_name="Frame2",
            engine=engine,
        )
        df2 = pl.read_excel(
            xls,
            table_name="Frame2",
            engine=engine,
        )
        assert df1.columns == ["colx2", "coly2", "colz2"]
        assert_frame_equal(df1, df2)

    # if we supply a sheet name (which is optional when using `table_name`),
    # then the table name must be present in *that* sheet, or we raise an error
    with pytest.raises(
        RuntimeError,
        match="table named 'Frame3' not found in sheet 'sheet1'",
    ):
        pl.read_excel(xls, sheet_name="sheet1", table_name="Frame3")


def test_excel_write_worksheet_object() -> None:
    # write to worksheet object
    from xlsxwriter import Workbook

    df = pl.DataFrame({"colx": ["aaa", "bbb", "ccc"], "coly": [-1234, 0, 5678]})

    with Workbook(xls := BytesIO()) as wb:
        ws = wb.add_worksheet("frame_data")
        df.write_excel(wb, worksheet=ws)
        ws.hide_zero()

    assert_frame_equal(df, pl.read_excel(xls, sheet_name="frame_data"))

    with pytest.raises(  # noqa: SIM117
        ValueError,
        match=r"the given workbook object .* is not the parent of worksheet 'frame_data'",
    ):
        with Workbook(BytesIO()) as wb:
            df.write_excel(wb, worksheet=ws)

    with pytest.raises(  # noqa: SIM117
        TypeError,
        match="worksheet object requires the parent workbook object; found workbook=None",
    ):
        with Workbook(BytesIO()) as wb:
            df.write_excel(None, worksheet=ws)


def test_excel_write_beyond_max_rows_cols(tmp_path: Path) -> None:
    tmp_path.mkdir(exist_ok=True)
    path = tmp_path / "test_max_dimensions.xlsx"
    sheet = "mysheet"

    df = pl.DataFrame({"col1": range(10), "col2": range(10, 20)})

    with pytest.raises(pl.exceptions.InvalidOperationError):
        df.write_excel(workbook=path, worksheet=sheet, position="A1048570")


def test_excel_freeze_panes() -> None:
    from xlsxwriter import Workbook

    # note: checks that empty tables don't error on write
    df1 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})
    df2 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})
    df3 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})

    xls = BytesIO()

    # use all three freeze_pane notations
    with Workbook(xls) as wb:
        df1.write_excel(workbook=wb, worksheet="sheet1", freeze_panes=(1, 0))
        df2.write_excel(workbook=wb, worksheet="sheet2", freeze_panes=(1, 0, 3, 4))
        df3.write_excel(workbook=wb, worksheet="sheet3", freeze_panes=("B2"))

    table_names: set[str] = set()
    for sheet in ("sheet1", "sheet2", "sheet3"):
        table_names.update(
            tbl["name"] for tbl in wb.get_worksheet_by_name(sheet).tables
        )
    assert table_names == {f"Frame{n}" for n in range(3)}
    assert pl.read_excel(xls, sheet_name="sheet3").rows() == []


@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "schema_overrides"),
    [
        (pl.read_excel, "path_xlsx_empty", None),
        (pl.read_excel, "path_xlsb_empty", None),
        (pl.read_excel, "path_xls_empty", None),
        (pl.read_ods, "path_ods_empty", None),
        # Test with schema overrides, to ensure they don't interfere with
        # raising NoDataErrors.
        (pl.read_excel, "path_xlsx_empty", {"a": pl.Int64}),
        (pl.read_excel, "path_xlsb_empty", {"a": pl.Int64}),
        (pl.read_excel, "path_xls_empty", {"a": pl.Int64}),
        (pl.read_ods, "path_ods_empty", {"a": pl.Int64}),
    ],
)
def test_excel_empty_sheet(
    read_spreadsheet: Callable[..., pl.DataFrame],
    source: str,
    request: pytest.FixtureRequest,
    schema_overrides: SchemaDict | None,
) -> None:
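    # resolve the fixture path and dispatch on the file suffix so the same
    # test body covers both read_ods and read_excel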
    ods = (empty_spreadsheet_path := request.getfixturevalue(source)).suffix == ".ods"
    read_spreadsheet = pl.read_ods if ods else pl.read_excel  # type: ignore[assignment]

    with pytest.raises(NoDataError, match="empty Excel sheet"):
        read_spreadsheet(empty_spreadsheet_path, schema_overrides=schema_overrides)

    engine_params = [{}] if ods else [{"engine": "calamine"}]
    for params in engine_params:
        df = read_spreadsheet(
            empty_spreadsheet_path,
            sheet_name="no_data",
            raise_if_empty=False,
            **params,
        )
        expected = pl.DataFrame()
        assert_frame_equal(df, expected)

        df = read_spreadsheet(
            empty_spreadsheet_path,
            sheet_name="no_rows",
            raise_if_empty=False,
            **params,
        )
        expected = pl.DataFrame(schema={f"col{c}": pl.String for c in ("x", "y", "z")})
        assert_frame_equal(df, expected)


@pytest.mark.parametrize(
    ("engine", "hidden_columns"),
    [
        ("xlsx2csv", ["a"]),
        ("openpyxl", ["a", "b"]),
        ("calamine", ["a", "b"]),
        ("xlsx2csv", cs.numeric()),
        ("openpyxl", cs.last()),
    ],
)
def test_excel_hidden_columns(
    hidden_columns: list[str] | SelectorType,
    engine: ExcelSpreadsheetEngine,
) -> None:
    df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})

    xls = BytesIO()
    df.write_excel(xls, hidden_columns=hidden_columns)

    read_df = pl.read_excel(xls)
    assert_frame_equal(df, read_df)


def test_excel_mixed_calamine_float_data(io_files_path: Path) -> None:
    df = pl.read_excel(io_files_path / "nan_test.xlsx", engine="calamine")
    nan = float("nan")
    assert_frame_equal(
        pl.DataFrame({"float_col": [nan, nan, nan, 100.0, 200.0, 300.0]}),
        df,
    )


@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
@pytest.mark.may_fail_auto_streaming  # read->scan_csv dispatch, _read_spreadsheet_xlsx2csv needs to be changed not to call `_reorder_columns` on the df
def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None:
    df = pl.DataFrame(
        {
            "a": [1, 2, None],
            "b": [1.0, None, 3.5],
            "c": ["x", None, "z"],
            "d": [True, False, None],
            "e": [
                date(2023, 1, 1),
                None,
                date(2023, 1, 4),
            ],
            "f": [
                datetime(2023, 1, 1),
                datetime(2000, 10, 10, 10, 10),
                None,
            ],
            "g": [
                None,
                "1920-08-08 00:00:00",
                "2077-10-20 00:00:00.000000",
            ],
        }
    )
    xls = BytesIO()
    df.write_excel(xls)

    reversed_cols = list(reversed(df.columns))
    read_cols: Sequence[str] | Sequence[int]
    expected = df.select(reversed_cols).with_columns(
        pl.col("g").str.slice(0, 10).str.to_date()
    )
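    # selecting columns by name or by positional index should yield the same result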
    for read_cols in (
        reversed_cols,
        [6, 5, 4, 3, 2, 1, 0],
    ):
        read_df = pl.read_excel(
            xls,
            engine=engine,
            columns=read_cols,
            schema_overrides={
                "e": pl.Date,
                "f": pl.Datetime("us"),
                "g": pl.Date,
            },
        )
        assert_frame_equal(expected, read_df)


@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_drop_empty_rows(
    path_empty_rows_excel: Path, engine: ExcelSpreadsheetEngine
) -> None:
    df1 = pl.read_excel(
        source=path_empty_rows_excel,
        engine=engine,
    )  # check default
    assert df1.shape == (8, 4)

    df2 = pl.read_excel(
        source=path_empty_rows_excel,
        engine=engine,
        drop_empty_rows=True,
    )
    assert df2.shape == (8, 4)

    df3 = pl.read_excel(
        source=path_empty_rows_excel,
        engine=engine,
        drop_empty_rows=False,
    )
    assert df3.shape == (10, 4)


def test_excel_write_select_col_dtype() -> None:
    from openpyxl import load_workbook
    from xlsxwriter import Workbook
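
    # helper: map column letters to the (rounded) widths stored in the workbook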
    def get_col_widths(wb_bytes: BytesIO) -> dict[str, int]:
        return {
            k: round(v.width)
            for k, v in load_workbook(wb_bytes).active.column_dimensions.items()
        }

    df = pl.DataFrame(
        {
            "name": [["Alice", "Ben"], ["Charlie", "Delta"]],
            "col2": ["Hi", "Bye"],
        }
    )

    # column_widths test:
    # pl.List(pl.String) datatype should not match column with no list
    check = BytesIO()
    with Workbook(check) as wb:
        df.write_excel(wb, column_widths={cs.by_dtype(pl.List(pl.String)): 300})

    assert get_col_widths(check) == {"A": 43}

    # column_widths test:
    # pl.String datatype should not match column with list
    check = BytesIO()
    with Workbook(check) as wb:
        df.write_excel(wb, column_widths={cs.by_dtype(pl.String): 300})

    assert get_col_widths(check) == {"B": 43}

    # hidden_columns test:
    # pl.List(pl.String) datatype should not match column with no list
    check = BytesIO()
    with Workbook(check) as wb:
        df.write_excel(wb, hidden_columns=cs.by_dtype(pl.List(pl.String)))

    assert get_col_widths(check) == {"A": 0}

    # hidden_columns test:
    # pl.String datatype should not match column with list
    check = BytesIO()
    with Workbook(check) as wb:
        df.write_excel(wb, hidden_columns=cs.by_dtype(pl.String))

    assert get_col_widths(check) == {"B": 0}


@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
def test_excel_read_columns_nonlist_sequence(engine: ExcelSpreadsheetEngine) -> None:
    df = pl.DataFrame(
        {"colx": [1, 2, 3], "coly": ["aaa", "bbb", "ccc"], "colz": [0.5, 0.0, -1.0]}
    )
    xls = BytesIO()
    df.write_excel(xls, worksheet="data")

    xldf = pl.read_excel(xls, engine=engine, columns=("colx", "coly"))
    expected = df.select("colx", "coly")
    assert_frame_equal(xldf, expected)

    xldf = pl.read_excel(xls, engine=engine, columns="colx")
    expected = df.select("colx")
    assert_frame_equal(xldf, expected)