CoCalc -- test_spreadsheet.py

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/io/test_spreadsheet.py
⁶⁹³⁹ views
1
from __future__ import annotations
2

3
import warnings
4
from collections import OrderedDict
5
from datetime import date, datetime, time
6
from io import BytesIO
7
from pathlib import Path
8
from typing import TYPE_CHECKING, Any, Callable
9
from zoneinfo import ZoneInfo
10

11
import pytest
12

13
import polars as pl
14
import polars.selectors as cs
15
from polars.exceptions import (
16
    NoDataError,
17
    ParameterCollisionError,
18
)
19
from polars.testing import assert_frame_equal, assert_series_equal
20
from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES
21

22
if TYPE_CHECKING:
23
    from collections.abc import Sequence
24

25
    from polars._typing import (
26
        ExcelSpreadsheetEngine,
27
        PolarsDataType,
28
        SchemaDict,
29
        SelectorType,
30
    )
31

32

33
# pytestmark = pytest.mark.slow()
34

35

36
@pytest.fixture
37
def path_xls(io_files_path: Path) -> Path:
38
    # old excel 97-2004 format
39
    return io_files_path / "example.xls"
40

41

42
@pytest.fixture
43
def path_xlsx(io_files_path: Path) -> Path:
44
    # modern excel format
45
    return io_files_path / "example.xlsx"
46

47

48
@pytest.fixture
49
def path_xlsb(io_files_path: Path) -> Path:
50
    # excel binary format
51
    return io_files_path / "example.xlsb"
52

53

54
@pytest.fixture
55
def path_ods(io_files_path: Path) -> Path:
56
    # open document spreadsheet
57
    return io_files_path / "example.ods"
58

59

60
@pytest.fixture
61
def path_xls_empty(io_files_path: Path) -> Path:
62
    return io_files_path / "empty.xls"
63

64

65
@pytest.fixture
66
def path_xlsx_empty(io_files_path: Path) -> Path:
67
    return io_files_path / "empty.xlsx"
68

69

70
@pytest.fixture
71
def path_xlsx_mixed(io_files_path: Path) -> Path:
72
    return io_files_path / "mixed.xlsx"
73

74

75
@pytest.fixture
76
def path_xlsb_empty(io_files_path: Path) -> Path:
77
    return io_files_path / "empty.xlsb"
78

79

80
@pytest.fixture
81
def path_xlsb_mixed(io_files_path: Path) -> Path:
82
    return io_files_path / "mixed.xlsb"
83

84

85
@pytest.fixture
86
def path_ods_empty(io_files_path: Path) -> Path:
87
    return io_files_path / "empty.ods"
88

89

90
@pytest.fixture
91
def path_ods_mixed(io_files_path: Path) -> Path:
92
    return io_files_path / "mixed.ods"
93

94

95
@pytest.fixture
96
def path_empty_rows_excel(io_files_path: Path) -> Path:
97
    return io_files_path / "test_empty_rows.xlsx"
98

99

100
@pytest.mark.parametrize(
101
    ("read_spreadsheet", "source", "engine_params"),
102
    [
103
        # xls file
104
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
105
        # xlsx file
106
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
107
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
108
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
109
        # xlsb file (binary)
110
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
111
        # open document
112
        (pl.read_ods, "path_ods", {}),
113
    ],
114
)
115
def test_read_spreadsheet(
116
    read_spreadsheet: Callable[..., pl.DataFrame],
117
    source: str,
118
    engine_params: dict[str, str],
119
    request: pytest.FixtureRequest,
120
) -> None:
121
    sheet_params: dict[str, Any]
122

123
    for sheet_params in (
124
        {"sheet_name": None, "sheet_id": None},
125
        {"sheet_name": "test1"},
126
        {"sheet_id": 1},
127
    ):
128
        df = read_spreadsheet(
129
            source=request.getfixturevalue(source),
130
            **engine_params,
131
            **sheet_params,
132
        )
133
        expected = pl.DataFrame({"hello": ["Row 1", "Row 2"]})
134
        assert_frame_equal(df, expected)
135

136

137
@pytest.mark.parametrize(
138
    ("read_spreadsheet", "source", "params"),
139
    [
140
        # xls file
141
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
142
        # xlsx file
143
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
144
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
145
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
146
        # xlsb file (binary)
147
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
148
        # open document
149
        (pl.read_ods, "path_ods", {}),
150
    ],
151
)
152
def test_read_excel_multiple_worksheets(
153
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
154
    source: str,
155
    params: dict[str, str],
156
    request: pytest.FixtureRequest,
157
) -> None:
158
    spreadsheet_path = request.getfixturevalue(source)
159
    frames_by_id = read_spreadsheet(
160
        spreadsheet_path,
161
        sheet_id=[2, 1],
162
        sheet_name=None,
163
        **params,
164
    )
165
    frames_by_name = read_spreadsheet(
166
        spreadsheet_path,
167
        sheet_id=None,
168
        sheet_name=["test2", "test1"],
169
        **params,
170
    )
171
    for frames in (frames_by_id, frames_by_name):
172
        assert list(frames_by_name) == ["test2", "test1"]
173

174
        expected1 = pl.DataFrame({"hello": ["Row 1", "Row 2"]})
175
        expected2 = pl.DataFrame({"world": ["Row 3", "Row 4"]})
176

177
        assert_frame_equal(frames["test1"], expected1)
178
        assert_frame_equal(frames["test2"], expected2)
179

180

181
@pytest.mark.parametrize(
182
    ("read_spreadsheet", "source", "params"),
183
    [
184
        # xls file
185
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
186
        # xlsx file
187
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
188
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
189
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
190
        # xlsb file (binary)
191
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
192
        # open document
193
        (pl.read_ods, "path_ods", {}),
194
    ],
195
)
196
def test_read_excel_multiple_workbooks(
197
    read_spreadsheet: Callable[..., Any],
198
    source: str,
199
    params: dict[str, str],
200
    request: pytest.FixtureRequest,
201
) -> None:
202
    spreadsheet_path = request.getfixturevalue(source)
203

204
    # multiple workbooks, single worksheet
205
    df = read_spreadsheet(
206
        [
207
            spreadsheet_path,
208
            spreadsheet_path,
209
            spreadsheet_path,
210
        ],
211
        sheet_id=None,
212
        sheet_name="test1",
213
        include_file_paths="path",
214
        **params,
215
    )
216
    expected = pl.DataFrame(
217
        {
218
            "hello": ["Row 1", "Row 2", "Row 1", "Row 2", "Row 1", "Row 2"],
219
            "path": [str(spreadsheet_path)] * 6,
220
        },
221
    )
222
    assert_frame_equal(df, expected)
223

224
    # multiple workbooks, multiple worksheets
225
    res = read_spreadsheet(
226
        [
227
            spreadsheet_path,
228
            spreadsheet_path,
229
            spreadsheet_path,
230
        ],
231
        sheet_id=None,
232
        sheet_name=["test1", "test2"],
233
        **params,
234
    )
235
    expected_frames = {
236
        "test1": pl.DataFrame(
237
            {"hello": ["Row 1", "Row 2", "Row 1", "Row 2", "Row 1", "Row 2"]}
238
        ),
239
        "test2": pl.DataFrame(
240
            {"world": ["Row 3", "Row 4", "Row 3", "Row 4", "Row 3", "Row 4"]}
241
        ),
242
    }
243
    assert sorted(res) == sorted(expected_frames)
244
    assert_frame_equal(res["test1"], expected_frames["test1"])
245
    assert_frame_equal(res["test2"], expected_frames["test2"])
246

247

248
@pytest.mark.parametrize(
249
    ("read_spreadsheet", "source", "params"),
250
    [
251
        # xls file
252
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
253
        # xlsx file
254
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
255
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
256
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
257
        # xlsb file (binary)
258
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
259
        # open document
260
        (pl.read_ods, "path_ods", {}),
261
    ],
262
)
263
def test_read_excel_all_sheets(
264
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
265
    source: str,
266
    params: dict[str, str],
267
    request: pytest.FixtureRequest,
268
) -> None:
269
    spreadsheet_path = request.getfixturevalue(source)
270
    frames = read_spreadsheet(
271
        spreadsheet_path,
272
        sheet_id=0,
273
        **params,
274
    )
275
    assert len(frames) == (4 if str(spreadsheet_path).endswith("ods") else 6)
276

277
    expected1 = pl.DataFrame({"hello": ["Row 1", "Row 2"]})
278
    expected2 = pl.DataFrame({"world": ["Row 3", "Row 4"]})
279
    expected3 = pl.DataFrame(
280
        {
281
            "cardinality": [1, 3, 15, 30, 150, 300],
282
            "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
283
            "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
284
        }
285
    )
286
    assert_frame_equal(frames["test1"], expected1)
287
    assert_frame_equal(frames["test2"], expected2)
288
    if params.get("engine") == "openpyxl":
289
        # TODO: flag that trims trailing all-null rows?
290
        assert_frame_equal(frames["test3"], expected3)
291
        assert_frame_equal(frames["test4"].drop_nulls(), expected3)
292

293

294
@pytest.mark.parametrize(
295
    "engine",
296
    ["calamine", "openpyxl", "xlsx2csv"],
297
)
298
def test_read_excel_basic_datatypes(engine: ExcelSpreadsheetEngine) -> None:
299
    df = pl.DataFrame(
300
        {
301
            "A": [1, 2, 3, 4, 5],
302
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
303
            "floats": [1.1, 1.2, 1.3, 1.4, 1.5],
304
            "datetime": [datetime(2023, 1, x) for x in range(1, 6)],
305
            "nulls": [1, None, None, None, 0],
306
        },
307
    )
308
    xls = BytesIO()
309
    df.write_excel(xls, position="C5")
310

311
    schema_overrides = {"datetime": pl.Datetime("us"), "nulls": pl.Boolean()}
312
    df_compare = df.with_columns(
313
        pl.col(nm).cast(tp) for nm, tp in schema_overrides.items()
314
    )
315
    for sheet_id, sheet_name in ((None, None), (1, None), (None, "Sheet1")):
316
        df_from_excel = pl.read_excel(
317
            xls,
318
            sheet_id=sheet_id,
319
            sheet_name=sheet_name,
320
            engine=engine,
321
            schema_overrides=schema_overrides,
322
        )
323
        assert_frame_equal(df_compare, df_from_excel)
324

325
    # check some additional overrides
326
    # (note: xlsx2csv can't currently convert datetime with trailing '00:00:00' to date)
327
    dt_override = {"datetime": pl.Date} if engine != "xlsx2csv" else {}
328
    df = pl.read_excel(
329
        xls,
330
        sheet_id=sheet_id,
331
        sheet_name=sheet_name,
332
        engine=engine,
333
        schema_overrides={"A": pl.Float32, **dt_override},
334
    )
335
    assert_series_equal(
336
        df["A"],
337
        pl.Series(name="A", values=[1.0, 2.0, 3.0, 4.0, 5.0], dtype=pl.Float32),
338
    )
339
    if dt_override:
340
        assert_series_equal(
341
            df["datetime"],
342
            pl.Series(
343
                name="datetime",
344
                values=[date(2023, 1, x) for x in range(1, 6)],
345
                dtype=pl.Date,
346
            ),
347
        )
348

349

350
@pytest.mark.parametrize(
351
    ("read_spreadsheet", "source", "params"),
352
    [
353
        # TODO: uncomment once fastexcel offers a suitable param
354
        # (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
355
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
356
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
357
    ],
358
)
359
def test_read_dropped_cols(
360
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
361
    source: str,
362
    params: dict[str, str],
363
    request: pytest.FixtureRequest,
364
) -> None:
365
    spreadsheet_path = request.getfixturevalue(source)
366

367
    df1 = read_spreadsheet(
368
        spreadsheet_path,
369
        sheet_name="test4",
370
        **params,
371
    )
372
    df2 = read_spreadsheet(
373
        spreadsheet_path,
374
        sheet_name="test4",
375
        drop_empty_cols=False,
376
        **params,
377
    )
378
    assert df1.to_dict(as_series=False) == {  # type: ignore[attr-defined]
379
        "cardinality": [1, 3, 15, 30, 150, 300],
380
        "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
381
        "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
382
    }
383
    assert df2.to_dict(as_series=False) == {  # type: ignore[attr-defined]
384
        "": [None, None, None, None, None, None],
385
        "cardinality": [1, 3, 15, 30, 150, 300],
386
        "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
387
        "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
388
        "0": [None, None, None, None, None, None],
389
        "1": [None, None, None, None, None, None],
390
    }
391

392

393
@pytest.mark.parametrize(
394
    ("source", "params"),
395
    [
396
        ("path_xls", {"engine": "calamine", "sheet_name": "temporal"}),
397
        ("path_xlsx", {"engine": "calamine", "table_name": "TemporalData"}),
398
        ("path_xlsx", {"engine": "openpyxl", "sheet_name": "temporal"}),
399
        ("path_xlsb", {"engine": "calamine", "sheet_name": "temporal"}),
400
    ],
401
)
402
def test_read_excel_temporal_data(
403
    source: str,
404
    params: dict[str, str],
405
    request: pytest.FixtureRequest,
406
) -> None:
407
    source_path = request.getfixturevalue(source)
408

409
    temporal_schema = {
410
        "id": pl.UInt16(),
411
        "dtm": pl.Datetime("ms"),
412
        "dt": pl.Date(),
413
        "dtm_str": pl.Datetime(time_zone="Asia/Tokyo"),
414
        "dt_str": pl.Date(),
415
        "tm_str": pl.Time(),
416
    }
417
    parsed_df = pl.read_excel(  # type: ignore[call-overload]
418
        source_path,
419
        **params,
420
        schema_overrides=temporal_schema,
421
    )
422
    TK = ZoneInfo("Asia/Tokyo")
423

424
    expected = pl.DataFrame(
425
        {
426
            "id": [100, 200, 300, 400],
427
            "dtm": [
428
                datetime(1999, 12, 31, 1, 2, 3),
429
                None,
430
                datetime(1969, 7, 5, 10, 30, 45),
431
                datetime(2077, 10, 10, 5, 59, 44),
432
            ],
433
            "dt": [
434
                date(2000, 1, 18),
435
                date(1965, 8, 8),
436
                date(2027, 4, 22),
437
                None,
438
            ],
439
            "dtm_str": [
440
                None,
441
                datetime(1900, 1, 30, 14, 50, 20, tzinfo=TK),
442
                datetime(2026, 5, 7, 23, 59, 59, tzinfo=TK),
443
                datetime(2007, 6, 1, 0, 0, tzinfo=TK),
444
            ],
445
            "dt_str": [
446
                date(2000, 6, 14),
447
                date(1978, 2, 28),
448
                None,
449
                date(2040, 12, 4),
450
            ],
451
            "tm_str": [
452
                time(23, 50, 22),
453
                time(0, 0, 1),
454
                time(10, 10, 33),
455
                time(18, 30, 15),
456
            ],
457
        },
458
        schema=temporal_schema,
459
    )
460
    assert_frame_equal(expected, parsed_df)
461

462

463
@pytest.mark.parametrize(
464
    ("read_spreadsheet", "source", "params"),
465
    [
466
        # xls file
467
        (pl.read_excel, "path_xls", {"engine": "calamine"}),
468
        # xlsx file
469
        (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
470
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
471
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
472
        # xlsb file (binary)
473
        (pl.read_excel, "path_xlsb", {"engine": "calamine"}),
474
        # open document
475
        (pl.read_ods, "path_ods", {}),
476
    ],
477
)
478
def test_read_invalid_worksheet(
479
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
480
    source: str,
481
    params: dict[str, str],
482
    request: pytest.FixtureRequest,
483
) -> None:
484
    spreadsheet_path = request.getfixturevalue(source)
485
    for param, sheet_id, sheet_name in (
486
        ("id", 999, None),
487
        ("name", None, "not_a_sheet_name"),
488
    ):
489
        value = sheet_id if param == "id" else sheet_name
490
        with pytest.raises(
491
            ValueError,
492
            match=f"no matching sheet found when `sheet_{param}` is {value!r}",
493
        ):
494
            read_spreadsheet(
495
                spreadsheet_path, sheet_id=sheet_id, sheet_name=sheet_name, **params
496
            )
497

498

499
@pytest.mark.parametrize(
500
    ("read_spreadsheet", "source", "additional_params"),
501
    [
502
        (pl.read_excel, "path_xlsx_mixed", {"engine": "openpyxl"}),
503
        (pl.read_ods, "path_ods_mixed", {}),
504
    ],
505
)
506
@pytest.mark.may_fail_auto_streaming
507
def test_read_mixed_dtype_columns(
508
    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
509
    source: str,
510
    additional_params: dict[str, str],
511
    request: pytest.FixtureRequest,
512
) -> None:
513
    spreadsheet_path = request.getfixturevalue(source)
514
    schema_overrides = {
515
        "Employee ID": pl.Utf8(),
516
        "Employee Name": pl.Utf8(),
517
        "Date": pl.Date(),
518
        "Details": pl.Categorical("lexical"),
519
        "Asset ID": pl.Utf8(),
520
    }
521
    df = read_spreadsheet(
522
        spreadsheet_path,
523
        sheet_id=0,
524
        schema_overrides=schema_overrides,
525
        **additional_params,
526
    )["Sheet1"]
527

528
    assert_frame_equal(
529
        df,
530
        pl.DataFrame(
531
            {
532
                "Employee ID": ["123456", "44333", "US00011", "135967", "IN86868"],
533
                "Employee Name": ["Test1", "Test2", "Test4", "Test5", "Test6"],
534
                "Date": [
535
                    date(2023, 7, 21),
536
                    date(2023, 7, 21),
537
                    date(2023, 7, 21),
538
                    date(2023, 7, 21),
539
                    date(2023, 7, 21),
540
                ],
541
                "Details": [
542
                    "Healthcare",
543
                    "Healthcare",
544
                    "Healthcare",
545
                    "Healthcare",
546
                    "Something",
547
                ],
548
                "Asset ID": ["84444", "84444", "84444", "84444", "ABC123"],
549
            },
550
            schema_overrides=schema_overrides,
551
        ),
552
    )
553

554

555
def test_schema_overrides(path_xlsx: Path, path_xlsb: Path, path_ods: Path) -> None:
556
    df1 = pl.read_excel(
557
        path_xlsx,
558
        sheet_name="test4",
559
        schema_overrides={"cardinality": pl.UInt16},
560
    ).drop_nulls()
561

562
    assert df1.schema["cardinality"] == pl.UInt16
563
    assert df1.schema["rows_by_key"] == pl.Float64
564
    assert df1.schema["iter_groups"] == pl.Float64
565

566
    df2 = pl.read_excel(
567
        path_xlsx,
568
        sheet_name="test4",
569
        engine="xlsx2csv",
570
        read_options={"schema_overrides": {"cardinality": pl.UInt16}},
571
    ).drop_nulls()
572

573
    assert df2.schema["cardinality"] == pl.UInt16
574
    assert df2.schema["rows_by_key"] == pl.Float64
575
    assert df2.schema["iter_groups"] == pl.Float64
576

577
    df3 = pl.read_excel(
578
        path_xlsx,
579
        sheet_name="test4",
580
        engine="xlsx2csv",
581
        schema_overrides={"cardinality": pl.UInt16},
582
        read_options={
583
            "schema_overrides": {
584
                "rows_by_key": pl.Float32,
585
                "iter_groups": pl.Float32,
586
            },
587
        },
588
    ).drop_nulls()
589

590
    assert df3.schema["cardinality"] == pl.UInt16
591
    assert df3.schema["rows_by_key"] == pl.Float32
592
    assert df3.schema["iter_groups"] == pl.Float32
593

594
    for workbook_path in (path_xlsx, path_xlsb, path_ods):
595
        read_spreadsheet = (
596
            pl.read_ods if workbook_path.suffix == ".ods" else pl.read_excel
597
        )
598
        df4 = read_spreadsheet(  # type: ignore[operator]
599
            workbook_path,
600
            sheet_name="test5",
601
            schema_overrides={"dtm": pl.Datetime("ns"), "dt": pl.Date},
602
        )
603
        assert_frame_equal(
604
            df4,
605
            pl.DataFrame(
606
                {
607
                    "dtm": [
608
                        datetime(1999, 12, 31, 10, 30, 45),
609
                        datetime(2010, 10, 11, 12, 13, 14),
610
                    ],
611
                    "dt": [date(2024, 1, 1), date(2018, 8, 7)],
612
                    "val": [1.5, -0.5],
613
                },
614
                schema={"dtm": pl.Datetime("ns"), "dt": pl.Date, "val": pl.Float64},
615
            ),
616
        )
617

618
    with pytest.raises(ParameterCollisionError):
619
        # cannot specify 'cardinality' in both schema_overrides and read_options
620
        pl.read_excel(
621
            path_xlsx,
622
            sheet_name="test4",
623
            engine="xlsx2csv",
624
            schema_overrides={"cardinality": pl.UInt16},
625
            read_options={"schema_overrides": {"cardinality": pl.Int32}},
626
        )
627

628
    # read multiple sheets in conjunction with 'schema_overrides'
629
    # (note: reading the same sheet twice simulates the issue in #11850)
630
    overrides = OrderedDict(
631
        [
632
            ("cardinality", pl.UInt32),
633
            ("rows_by_key", pl.Float32),
634
            ("iter_groups", pl.Float64),
635
        ]
636
    )
637
    df = pl.read_excel(
638
        path_xlsx,
639
        sheet_name=["test4", "test4"],
640
        schema_overrides=overrides,
641
    )
642
    for col, dtype in overrides.items():
643
        assert df["test4"].schema[col] == dtype
644

645

646
@pytest.mark.parametrize(
647
    ("engine", "read_opts_param"),
648
    [
649
        ("xlsx2csv", "infer_schema_length"),
650
        ("calamine", "schema_sample_rows"),
651
    ],
652
)
653
def test_invalid_parameter_combinations_infer_schema_len(
654
    path_xlsx: Path, engine: str, read_opts_param: str
655
) -> None:
656
    with pytest.raises(
657
        ParameterCollisionError,
658
        match=f"cannot specify both `infer_schema_length`.*{read_opts_param}",
659
    ):
660
        pl.read_excel(  # type: ignore[call-overload]
661
            path_xlsx,
662
            sheet_id=1,
663
            engine=engine,
664
            read_options={read_opts_param: 512},
665
            infer_schema_length=1024,
666
        )
667

668

669
@pytest.mark.parametrize(
670
    ("engine", "read_opts_param"),
671
    [
672
        ("xlsx2csv", "columns"),
673
        ("calamine", "use_columns"),
674
    ],
675
)
676
def test_invalid_parameter_combinations_columns(
677
    path_xlsx: Path, engine: str, read_opts_param: str
678
) -> None:
679
    with pytest.raises(
680
        ParameterCollisionError,
681
        match=f"cannot specify both `columns`.*{read_opts_param}",
682
    ):
683
        pl.read_excel(  # type: ignore[call-overload]
684
            path_xlsx,
685
            sheet_id=1,
686
            engine=engine,
687
            read_options={read_opts_param: ["B", "C", "D"]},
688
            columns=["A", "B", "C"],
689
        )
690

691

692
def test_unsupported_engine() -> None:
693
    with pytest.raises(NotImplementedError):
694
        pl.read_excel(None, engine="foo")  # type: ignore[call-overload]
695

696

697
def test_unsupported_binary_workbook(path_xlsb: Path) -> None:
698
    with pytest.raises(Exception, match="does not support binary format"):
699
        pl.read_excel(path_xlsb, engine="openpyxl")
700

701

702
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
703
def test_read_excel_all_sheets_with_sheet_name(path_xlsx: Path, engine: str) -> None:
704
    with pytest.raises(
705
        ValueError,
706
        match=r"cannot specify both `sheet_name` \('Sheet1'\) and `sheet_id` \(1\)",
707
    ):
708
        pl.read_excel(  # type: ignore[call-overload]
709
            path_xlsx,
710
            sheet_id=1,
711
            sheet_name="Sheet1",
712
            engine=engine,
713
        )
714

715

716
# the parameters don't change the data, only the formatting, so we expect
717
# the same result each time. however, it's important to validate that the
718
# parameter permutations don't raise exceptions, or interfere with the
719
# values written to the worksheet, so test multiple variations.
720
@pytest.mark.parametrize(
721
    "write_params",
722
    [
723
        # default parameters
724
        {},
725
        # basic formatting
726
        {
727
            "autofit": True,
728
            "table_style": "Table Style Dark 2",
729
            "column_totals": True,
730
            "float_precision": 0,
731
        },
732
        # slightly customized formatting, with some formulas
733
        {
734
            "position": (0, 0),
735
            "table_style": {
736
                "style": "Table Style Medium 23",
737
                "first_column": True,
738
            },
739
            "conditional_formats": {"val": "data_bar"},
740
            "column_formats": {
741
                "val": "#,##0.000;[White]-#,##0.000",
742
                ("day", "month", "year"): {"align": "left", "num_format": "0"},
743
            },
744
            "header_format": {"italic": True, "bg_color": "#d9d9d9"},
745
            "column_widths": {"val": 100},
746
            "row_heights": {0: 35},
747
            "formulas": {
748
                # string: formula added to the end of the table (but before row_totals)
749
                "day": "=DAY([@dtm])",
750
                "month": "=MONTH([@dtm])",
751
                "year": {
752
                    # dict: full control over formula positioning/dtype
753
                    "formula": "=YEAR([@dtm])",
754
                    "insert_after": "month",
755
                    "return_type": pl.Int16,
756
                },
757
            },
758
            "column_totals": True,
759
            "row_totals": True,
760
        },
761
        # heavily customized formatting/definition
762
        {
763
            "position": "A1",
764
            "table_name": "PolarsFrameData",
765
            "table_style": "Table Style Light 9",
766
            "conditional_formats": {
767
                # single dict format
768
                "str": {
769
                    "type": "duplicate",
770
                    "format": {"bg_color": "#ff0000", "font_color": "#ffffff"},
771
                },
772
                # multiple dict formats
773
                "val": [
774
                    {
775
                        "type": "3_color_scale",
776
                        "min_color": "#4bacc6",
777
                        "mid_color": "#ffffff",
778
                        "max_color": "#daeef3",
779
                    },
780
                    {
781
                        "type": "cell",
782
                        "criteria": "<",
783
                        "value": -90,
784
                        "format": {"font_color": "white"},
785
                    },
786
                ],
787
                "dtm": [
788
                    {
789
                        "type": "top",
790
                        "value": 1,
791
                        "format": {"bold": True, "font_color": "green"},
792
                    },
793
                    {
794
                        "type": "bottom",
795
                        "value": 1,
796
                        "format": {"bold": True, "font_color": "red"},
797
                    },
798
                ],
799
            },
800
            "dtype_formats": {
801
                frozenset(
802
                    FLOAT_DTYPES
803
                ): '_(£* #,##0.00_);_(£* (#,##0.00);_(£* "-"??_);_(@_)',
804
                pl.Date: "dd-mm-yyyy",
805
            },
806
            "column_formats": {"dtm": {"font_color": "#31869c", "bg_color": "#b7dee8"}},
807
            "column_totals": {"val": "average", "dtm": "min"},
808
            "column_widths": {("str", "val"): 60, "dtm": 80},
809
            "row_totals": {"tot": True},
810
            "hidden_columns": ["str"],
811
            "hide_gridlines": True,
812
            "include_header": False,
813
        },
814
    ],
815
)
816
def test_excel_round_trip(write_params: dict[str, Any]) -> None:
817
    df = pl.DataFrame(
818
        {
819
            "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)],
820
            "str": ["xxx", "yyy", "xxx"],
821
            "val": [100.5, 55.0, -99.5],
822
        }
823
    )
824

825
    engine: ExcelSpreadsheetEngine
826
    for engine in ("calamine", "xlsx2csv"):
827
        read_options, has_header = (
828
            ({}, True)
829
            if write_params.get("include_header", True)
830
            else (
831
                {"new_columns": ["dtm", "str", "val"]}
832
                if engine == "xlsx2csv"
833
                else {"column_names": ["dtm", "str", "val"]},
834
                False,
835
            )
836
        )
837

838
        fmt_strptime = "%Y-%m-%d"
839
        if write_params.get("dtype_formats", {}).get(pl.Date) == "dd-mm-yyyy":
840
            fmt_strptime = "%d-%m-%Y"
841

842
        # write to xlsx using various parameters...
843
        xls = BytesIO()
844
        _wb = df.write_excel(workbook=xls, worksheet="data", **write_params)
845

846
        # ...and read it back again:
847
        xldf = pl.read_excel(
848
            xls,
849
            sheet_name="data",
850
            engine=engine,
851
            read_options=read_options,
852
            has_header=has_header,
853
        )[:3].select(df.columns[:3])
854

855
        if engine == "xlsx2csv":
856
            xldf = xldf.with_columns(pl.col("dtm").str.strptime(pl.Date, fmt_strptime))
857

858
        assert_frame_equal(df, xldf)
859

860

861
@pytest.mark.parametrize("engine", ["calamine", "xlsx2csv"])
862
def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> None:
863
    df = pl.DataFrame(
864
        {
865
            "id": ["aaa", "bbb", "ccc", "ddd", "eee"],
866
            # float cols
867
            "q1": [100.0, 55.5, -20.0, 0.5, 35.0],
868
            "q2": [30.5, -10.25, 15.0, 60.0, 20.5],
869
            # int cols
870
            "q3": [-50, 0, 40, 80, 80],
871
            "q4": [75, 55, 25, -10, -55],
872
        }
873
    )
874
    for fn_sum in (True, "sum", "SUM"):
875
        xls = BytesIO()
876
        df.write_excel(
877
            xls,
878
            worksheet="misc",
879
            sparklines={"trend": ["q1", "q2", "q3", "q4"]},
880
            row_totals={
881
                # add semiannual row total columns
882
                "h1": ("q1", "q2"),
883
                "h2": ("q3", "q4"),
884
            },
885
            column_totals=fn_sum,
886
        )
887

888
        # note that the totals are written as formulae, so we
889
        # won't have the calculated values in the dataframe
890
        xldf = pl.read_excel(xls, sheet_name="misc", engine=engine)
891

892
        assert xldf.columns == ["id", "q1", "q2", "q3", "q4", "trend", "h1", "h2"]
893
        assert xldf.row(-1) == (None, 0.0, 0.0, 0, 0, None, 0.0, 0)
894

895

896
@pytest.mark.may_fail_cloud  # reason: eager - return_dtype must be set
897
@pytest.mark.parametrize(
898
    ("engine", "list_dtype"),
899
    [
900
        ("calamine", pl.List(pl.Int8)),
901
        ("openpyxl", pl.List(pl.UInt16)),
902
        ("xlsx2csv", pl.Array(pl.Int32, 3)),
903
    ],
904
)
905
def test_excel_write_compound_types(
906
    engine: ExcelSpreadsheetEngine,
907
    list_dtype: PolarsDataType,
908
) -> None:
909
    df = pl.DataFrame(
910
        data={"x": [None, [1, 2, 3], [4, 5, 6]], "y": ["a", "b", "c"], "z": [9, 8, 7]},
911
        schema_overrides={"x": pl.Array(pl.Int32, 3)},
912
    ).select("x", pl.struct(["y", "z"]))
913

914
    xls = BytesIO()
915
    df.write_excel(xls, worksheet="data")
916

917
    # also test reading from the various flavours of supported binary data
918
    # across all backend engines (check bytesio, bytes, and memoryview)
919
    for binary_data in (
920
        xls,
921
        xls.getvalue(),
922
        xls.getbuffer(),
923
    ):
924
        xldf = pl.read_excel(
925
            binary_data,
926
            sheet_name="data",
927
            engine=engine,
928
            include_file_paths="wbook",
929
        )
930

931
        # expect string conversion (only scalar values are supported)
932
        assert xldf.rows() == [
933
            (None, "{'y': 'a', 'z': 9}", "in-mem"),
934
            ("[1, 2, 3]", "{'y': 'b', 'z': 8}", "in-mem"),
935
            ("[4, 5, 6]", "{'y': 'c', 'z': 7}", "in-mem"),
936
        ]
937

938

939
def test_excel_read_named_table_with_total_row(tmp_path: Path) -> None:
940
    df = pl.DataFrame(
941
        {
942
            "x": ["aa", "bb", "cc"],
943
            "y": [100, 325, -250],
944
            "z": [975, -444, 123],
945
        }
946
    )
947
    # when we read back a named table object with a total row we expect the read
948
    # to automatically omit that row as it is *not* part of the actual table data
949
    wb_path = Path(tmp_path).joinpath("test_named_table_read.xlsx")
950
    df.write_excel(
951
        wb_path,
952
        worksheet="data",
953
        table_name="PolarsFrameTable",
954
        column_totals=True,
955
    )
956
    for engine in ("calamine", "openpyxl"):
957
        xldf = pl.read_excel(wb_path, table_name="PolarsFrameTable", engine=engine)
958
        assert_frame_equal(df, xldf)
959

960
    # xlsx2csv doesn't support reading named tables, so we see the
961
    # column total if we don't filter it out after reading the data
962
    with pytest.raises(
963
        ValueError,
964
        match="the `table_name` parameter is not supported by the 'xlsx2csv' engine",
965
    ):
966
        pl.read_excel(wb_path, table_name="PolarsFrameTable", engine="xlsx2csv")
967

968
    xldf = pl.read_excel(wb_path, sheet_name="data", engine="xlsx2csv")
969
    assert_frame_equal(df, xldf.head(3))
970
    assert xldf.height == 4
971
    assert xldf.row(3) == (None, 0, 0)
972

973

974
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
975
def test_excel_write_to_bytesio(engine: ExcelSpreadsheetEngine) -> None:
976
    df = pl.DataFrame({"colx": [1.5, -2, 0], "coly": ["a", None, "c"]})
977

978
    excel_bytes = BytesIO()
979
    df.write_excel(excel_bytes)
980

981
    df_read = pl.read_excel(excel_bytes, engine=engine)
982
    assert_frame_equal(df, df_read)
983

984
    # also confirm consistent behaviour when 'infer_schema_length=0'
985
    df_read = pl.read_excel(excel_bytes, engine=engine, infer_schema_length=0)
986
    expected = pl.DataFrame({"colx": ["1.5", "-2", "0"], "coly": ["a", None, "c"]})
987
    assert_frame_equal(expected, df_read)
988

989

990
@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
991
def test_excel_write_to_file_object(
992
    engine: ExcelSpreadsheetEngine, tmp_path: Path
993
) -> None:
994
    tmp_path.mkdir(exist_ok=True)
995

996
    df = pl.DataFrame({"x": ["aaa", "bbb", "ccc"], "y": [123, 456, 789]})
997

998
    # write to bytesio
999
    xls = BytesIO()
1000
    df.write_excel(xls, worksheet="data")
1001
    assert_frame_equal(df, pl.read_excel(xls, engine=engine))
1002

1003
    # write to file path
1004
    path = Path(tmp_path).joinpath("test_write_path.xlsx")
1005
    df.write_excel(path, worksheet="data")
1006
    assert_frame_equal(df, pl.read_excel(xls, engine=engine))
1007

1008
    # write to file path (as string)
1009
    path = Path(tmp_path).joinpath("test_write_path_str.xlsx")
1010
    df.write_excel(str(path), worksheet="data")
1011
    assert_frame_equal(df, pl.read_excel(xls, engine=engine))
1012

1013
    # write to file object
1014
    path = Path(tmp_path).joinpath("test_write_file_object.xlsx")
1015
    with path.open("wb") as tgt:
1016
        df.write_excel(tgt, worksheet="data")
1017
    with path.open("rb") as src:
1018
        assert_frame_equal(df, pl.read_excel(src, engine=engine))
1019

1020

1021
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
1022
def test_excel_read_no_headers(engine: ExcelSpreadsheetEngine) -> None:
1023
    df = pl.DataFrame(
1024
        {"colx": [1, 2, 3], "coly": ["aaa", "bbb", "ccc"], "colz": [0.5, 0.0, -1.0]}
1025
    )
1026
    xls = BytesIO()
1027
    df.write_excel(xls, worksheet="data", include_header=False)
1028

1029
    xldf = pl.read_excel(xls, engine=engine, has_header=False)
1030
    expected = xldf.rename({"column_1": "colx", "column_2": "coly", "column_3": "colz"})
1031
    assert_frame_equal(df, expected)
1032

1033

1034
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
1035
def test_excel_write_sparklines(engine: ExcelSpreadsheetEngine) -> None:
1036
    from xlsxwriter import Workbook
1037

1038
    # note that we don't (quite) expect sparkline export to round-trip as we
1039
    # inject additional empty columns to hold them (which will read as nulls)
1040
    df = pl.DataFrame(
1041
        {
1042
            "id": ["aaa", "bbb", "ccc", "ddd", "eee"],
1043
            "q1": [100, 55, -20, 0, 35],
1044
            "q2": [30, -10, 15, 60, 20],
1045
            "q3": [-50, 0, 40, 80, 80],
1046
            "q4": [75, 55, 25, -10, -55],
1047
        }
1048
    ).cast(dtypes={pl.Int64: pl.Float64})
1049

1050
    # also: confirm that we can use a Workbook directly with "write_excel"
1051
    xls = BytesIO()
1052
    with Workbook(xls) as wb:
1053
        df.write_excel(
1054
            workbook=wb,
1055
            worksheet="frame_data",
1056
            table_style="Table Style Light 2",
1057
            dtype_formats={frozenset(NUMERIC_DTYPES): "#,##0_);(#,##0)"},
1058
            column_formats={cs.starts_with("h"): "#,##0_);(#,##0)"},
1059
            sparklines={
1060
                "trend": ["q1", "q2", "q3", "q4"],
1061
                "+/-": {
1062
                    "columns": ["q1", "q2", "q3", "q4"],
1063
                    "insert_after": "id",
1064
                    "type": "win_loss",
1065
                },
1066
            },
1067
            conditional_formats={
1068
                cs.starts_with("q", "h"): {
1069
                    "type": "2_color_scale",
1070
                    "min_color": "#95b3d7",
1071
                    "max_color": "#ffffff",
1072
                }
1073
            },
1074
            column_widths={cs.starts_with("q", "h"): 40},
1075
            row_totals={
1076
                "h1": ("q1", "q2"),
1077
                "h2": ("q3", "q4"),
1078
            },
1079
            hide_gridlines=True,
1080
            row_heights=35,
1081
            sheet_zoom=125,
1082
        )
1083

1084
    tables = {tbl["name"] for tbl in wb.get_worksheet_by_name("frame_data").tables}
1085
    assert "Frame0" in tables
1086

1087
    with warnings.catch_warnings():
1088
        # ignore an openpyxl user warning about sparklines
1089
        warnings.simplefilter("ignore", UserWarning)
1090
        xldf = pl.read_excel(xls, sheet_name="frame_data", engine=engine)
1091

1092
    # ┌─────┬──────┬─────┬─────┬─────┬─────┬───────┬─────┬─────┐
1093
    # │ id  ┆ +/-  ┆ q1  ┆ q2  ┆ q3  ┆ q4  ┆ trend ┆ h1  ┆ h2  │
1094
    # │ --- ┆ ---  ┆ --- ┆ --- ┆ --- ┆ --- ┆ ---   ┆ --- ┆ --- │
1095
    # │ str ┆ str  ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ str   ┆ i64 ┆ i64 │
1096
    # ╞═════╪══════╪═════╪═════╪═════╪═════╪═══════╪═════╪═════╡
1097
    # │ aaa ┆ null ┆ 100 ┆ 30  ┆ -50 ┆ 75  ┆ null  ┆ 0   ┆ 0   │
1098
    # │ bbb ┆ null ┆ 55  ┆ -10 ┆ 0   ┆ 55  ┆ null  ┆ 0   ┆ 0   │
1099
    # │ ccc ┆ null ┆ -20 ┆ 15  ┆ 40  ┆ 25  ┆ null  ┆ 0   ┆ 0   │
1100
    # │ ddd ┆ null ┆ 0   ┆ 60  ┆ 80  ┆ -10 ┆ null  ┆ 0   ┆ 0   │
1101
    # │ eee ┆ null ┆ 35  ┆ 20  ┆ 80  ┆ -55 ┆ null  ┆ 0   ┆ 0   │
1102
    # └─────┴──────┴─────┴─────┴─────┴─────┴───────┴─────┴─────┘
1103

1104
    for sparkline_col in ("+/-", "trend"):
1105
        assert set(xldf[sparkline_col]) in ({None}, {""})
1106

1107
    assert xldf.columns == ["id", "+/-", "q1", "q2", "q3", "q4", "trend", "h1", "h2"]
1108
    assert_frame_equal(
1109
        df, xldf.drop("+/-", "trend", "h1", "h2").cast(dtypes={pl.Int64: pl.Float64})
1110
    )
1111

1112

1113
def test_excel_write_multiple_tables() -> None:
1114
    from xlsxwriter import Workbook
1115

1116
    # note: also checks that empty tables don't error on write
1117
    df = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})
1118

1119
    # write multiple frames to multiple worksheets
1120
    xls = BytesIO()
1121
    with Workbook(xls) as wb:
1122
        df.rename({"colx": "colx0", "coly": "coly0", "colz": "colz0"}).write_excel(
1123
            workbook=wb, worksheet="sheet1", position="A1"
1124
        )
1125
        df.rename({"colx": "colx1", "coly": "coly1", "colz": "colz1"}).write_excel(
1126
            workbook=wb, worksheet="sheet1", position="X10"
1127
        )
1128
        df.rename({"colx": "colx2", "coly": "coly2", "colz": "colz2"}).write_excel(
1129
            workbook=wb, worksheet="sheet2", position="C25"
1130
        )
1131

1132
        # also validate integration of externally-added formats
1133
        fmt = wb.add_format({"bg_color": "#ffff00"})
1134
        df.rename({"colx": "colx3", "coly": "coly3", "colz": "colz3"}).write_excel(
1135
            workbook=wb,
1136
            worksheet="sheet3",
1137
            position="D4",
1138
            conditional_formats={
1139
                "colz3": {
1140
                    "type": "formula",
1141
                    "criteria": "=C2=B2",
1142
                    "format": fmt,
1143
                }
1144
            },
1145
        )
1146

1147
    table_names = {
1148
        tbl["name"]
1149
        for sheet in wb.sheetnames
1150
        for tbl in wb.get_worksheet_by_name(sheet).tables
1151
    }
1152
    assert table_names == {f"Frame{n}" for n in range(4)}
1153
    assert pl.read_excel(xls, sheet_name="sheet3").rows() == []
1154

1155
    # test loading one of the written tables by name
1156
    for engine in ("calamine", "openpyxl"):
1157
        df1 = pl.read_excel(
1158
            xls,
1159
            sheet_name="sheet2",
1160
            table_name="Frame2",
1161
            engine=engine,
1162
        )
1163
        df2 = pl.read_excel(
1164
            xls,
1165
            table_name="Frame2",
1166
            engine=engine,
1167
        )
1168
        assert df1.columns == ["colx2", "coly2", "colz2"]
1169
        assert_frame_equal(df1, df2)
1170

1171
        # if we supply a sheet name (which is optional when using `table_name`),
1172
        # then the table name must be present in *that* sheet, or we raise an error
1173
        with pytest.raises(
1174
            RuntimeError,
1175
            match="table named 'Frame3' not found in sheet 'sheet1'",
1176
        ):
1177
            pl.read_excel(xls, sheet_name="sheet1", table_name="Frame3")
1178

1179

1180
def test_excel_write_worksheet_object() -> None:
1181
    # write to worksheet object
1182
    from xlsxwriter import Workbook
1183

1184
    df = pl.DataFrame({"colx": ["aaa", "bbb", "ccc"], "coly": [-1234, 0, 5678]})
1185

1186
    with Workbook(xls := BytesIO()) as wb:
1187
        ws = wb.add_worksheet("frame_data")
1188
        df.write_excel(wb, worksheet=ws)
1189
        ws.hide_zero()
1190

1191
    assert_frame_equal(df, pl.read_excel(xls, sheet_name="frame_data"))
1192

1193
    with pytest.raises(  # noqa: SIM117
1194
        ValueError,
1195
        match="the given workbook object .* is not the parent of worksheet 'frame_data'",
1196
    ):
1197
        with Workbook(BytesIO()) as wb:
1198
            df.write_excel(wb, worksheet=ws)
1199

1200
    with pytest.raises(  # noqa: SIM117
1201
        TypeError,
1202
        match="worksheet object requires the parent workbook object; found workbook=None",
1203
    ):
1204
        with Workbook(BytesIO()) as wb:
1205
            df.write_excel(None, worksheet=ws)
1206

1207

1208
def test_excel_write_beyond_max_rows_cols(tmp_path: Path) -> None:
1209
    tmp_path.mkdir(exist_ok=True)
1210
    path = tmp_path / "test_max_dimensions.xlsx"
1211
    sheet = "mysheet"
1212

1213
    df = pl.DataFrame({"col1": range(10), "col2": range(10, 20)})
1214

1215
    with pytest.raises(pl.exceptions.InvalidOperationError):
1216
        df.write_excel(workbook=path, worksheet=sheet, position="A1048570")
1217

1218

1219
def test_excel_freeze_panes() -> None:
1220
    from xlsxwriter import Workbook
1221

1222
    # note: checks that empty tables don't error on write
1223
    df1 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})
1224
    df2 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})
1225
    df3 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})
1226

1227
    xls = BytesIO()
1228

1229
    # use all three freeze_pane notations
1230
    with Workbook(xls) as wb:
1231
        df1.write_excel(workbook=wb, worksheet="sheet1", freeze_panes=(1, 0))
1232
        df2.write_excel(workbook=wb, worksheet="sheet2", freeze_panes=(1, 0, 3, 4))
1233
        df3.write_excel(workbook=wb, worksheet="sheet3", freeze_panes=("B2"))
1234

1235
    table_names: set[str] = set()
1236
    for sheet in ("sheet1", "sheet2", "sheet3"):
1237
        table_names.update(
1238
            tbl["name"] for tbl in wb.get_worksheet_by_name(sheet).tables
1239
        )
1240
    assert table_names == {f"Frame{n}" for n in range(3)}
1241
    assert pl.read_excel(xls, sheet_name="sheet3").rows() == []
1242

1243

1244
@pytest.mark.parametrize(
1245
    ("read_spreadsheet", "source", "schema_overrides"),
1246
    [
1247
        (pl.read_excel, "path_xlsx_empty", None),
1248
        (pl.read_excel, "path_xlsb_empty", None),
1249
        (pl.read_excel, "path_xls_empty", None),
1250
        (pl.read_ods, "path_ods_empty", None),
1251
        # Test with schema overrides, to ensure they don't interfere with
1252
        # raising NoDataErrors.
1253
        (pl.read_excel, "path_xlsx_empty", {"a": pl.Int64}),
1254
        (pl.read_excel, "path_xlsb_empty", {"a": pl.Int64}),
1255
        (pl.read_excel, "path_xls_empty", {"a": pl.Int64}),
1256
        (pl.read_ods, "path_ods_empty", {"a": pl.Int64}),
1257
    ],
1258
)
1259
def test_excel_empty_sheet(
1260
    read_spreadsheet: Callable[..., pl.DataFrame],
1261
    source: str,
1262
    request: pytest.FixtureRequest,
1263
    schema_overrides: SchemaDict | None,
1264
) -> None:
1265
    ods = (empty_spreadsheet_path := request.getfixturevalue(source)).suffix == ".ods"
1266
    read_spreadsheet = pl.read_ods if ods else pl.read_excel  # type: ignore[assignment]
1267

1268
    with pytest.raises(NoDataError, match="empty Excel sheet"):
1269
        read_spreadsheet(empty_spreadsheet_path, schema_overrides=schema_overrides)
1270

1271
    engine_params = [{}] if ods else [{"engine": "calamine"}]
1272
    for params in engine_params:
1273
        df = read_spreadsheet(
1274
            empty_spreadsheet_path,
1275
            sheet_name="no_data",
1276
            raise_if_empty=False,
1277
            **params,
1278
        )
1279
        expected = pl.DataFrame()
1280
        assert_frame_equal(df, expected)
1281

1282
        df = read_spreadsheet(
1283
            empty_spreadsheet_path,
1284
            sheet_name="no_rows",
1285
            raise_if_empty=False,
1286
            **params,
1287
        )
1288
        expected = pl.DataFrame(schema={f"col{c}": pl.String for c in ("x", "y", "z")})
1289
        assert_frame_equal(df, expected)
1290

1291

1292
@pytest.mark.parametrize(
1293
    ("engine", "hidden_columns"),
1294
    [
1295
        ("xlsx2csv", ["a"]),
1296
        ("openpyxl", ["a", "b"]),
1297
        ("calamine", ["a", "b"]),
1298
        ("xlsx2csv", cs.numeric()),
1299
        ("openpyxl", cs.last()),
1300
    ],
1301
)
1302
def test_excel_hidden_columns(
1303
    hidden_columns: list[str] | SelectorType,
1304
    engine: ExcelSpreadsheetEngine,
1305
) -> None:
1306
    df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})
1307

1308
    xls = BytesIO()
1309
    df.write_excel(xls, hidden_columns=hidden_columns)
1310

1311
    read_df = pl.read_excel(xls)
1312
    assert_frame_equal(df, read_df)
1313

1314

1315
def test_excel_mixed_calamine_float_data(io_files_path: Path) -> None:
1316
    df = pl.read_excel(io_files_path / "nan_test.xlsx", engine="calamine")
1317
    nan = float("nan")
1318
    assert_frame_equal(
1319
        pl.DataFrame({"float_col": [nan, nan, nan, 100.0, 200.0, 300.0]}),
1320
        df,
1321
    )
1322

1323

1324
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
1325
@pytest.mark.may_fail_auto_streaming  # read->scan_csv dispatch, _read_spreadsheet_xlsx2csv needs to be changed not to call `_reorder_columns` on the df
1326
def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None:
1327
    df = pl.DataFrame(
1328
        {
1329
            "a": [1, 2, None],
1330
            "b": [1.0, None, 3.5],
1331
            "c": ["x", None, "z"],
1332
            "d": [True, False, None],
1333
            "e": [
1334
                date(2023, 1, 1),
1335
                None,
1336
                date(2023, 1, 4),
1337
            ],
1338
            "f": [
1339
                datetime(2023, 1, 1),
1340
                datetime(2000, 10, 10, 10, 10),
1341
                None,
1342
            ],
1343
            "g": [
1344
                None,
1345
                "1920-08-08 00:00:00",
1346
                "2077-10-20 00:00:00.000000",
1347
            ],
1348
        }
1349
    )
1350
    xls = BytesIO()
1351
    df.write_excel(xls)
1352

1353
    reversed_cols = list(reversed(df.columns))
1354
    read_cols: Sequence[str] | Sequence[int]
1355
    expected = df.select(reversed_cols).with_columns(
1356
        pl.col("g").str.slice(0, 10).str.to_date()
1357
    )
1358
    for read_cols in (
1359
        reversed_cols,
1360
        [6, 5, 4, 3, 2, 1, 0],
1361
    ):
1362
        read_df = pl.read_excel(
1363
            xls,
1364
            engine=engine,
1365
            columns=read_cols,
1366
            schema_overrides={
1367
                "e": pl.Date,
1368
                "f": pl.Datetime("us"),
1369
                "g": pl.Date,
1370
            },
1371
        )
1372
        assert_frame_equal(expected, read_df)
1373

1374

1375
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
1376
def test_drop_empty_rows(
1377
    path_empty_rows_excel: Path, engine: ExcelSpreadsheetEngine
1378
) -> None:
1379
    df1 = pl.read_excel(
1380
        source=path_empty_rows_excel,
1381
        engine=engine,
1382
    )  # check default
1383
    assert df1.shape == (8, 4)
1384

1385
    df2 = pl.read_excel(
1386
        source=path_empty_rows_excel,
1387
        engine=engine,
1388
        drop_empty_rows=True,
1389
    )
1390
    assert df2.shape == (8, 4)
1391

1392
    df3 = pl.read_excel(
1393
        source=path_empty_rows_excel,
1394
        engine=engine,
1395
        drop_empty_rows=False,
1396
    )
1397
    assert df3.shape == (10, 4)
1398

1399

1400
def test_excel_write_select_col_dtype() -> None:
1401
    from openpyxl import load_workbook
1402
    from xlsxwriter import Workbook
1403

1404
    def get_col_widths(wb_bytes: BytesIO) -> dict[str, int]:
1405
        return {
1406
            k: round(v.width)
1407
            for k, v in load_workbook(wb_bytes).active.column_dimensions.items()
1408
        }
1409

1410
    df = pl.DataFrame(
1411
        {
1412
            "name": [["Alice", "Ben"], ["Charlie", "Delta"]],
1413
            "col2": ["Hi", "Bye"],
1414
        }
1415
    )
1416

1417
    # column_widths test:
1418
    # pl.List(pl.String)) datatype should not match column with no list
1419
    check = BytesIO()
1420
    with Workbook(check) as wb:
1421
        df.write_excel(wb, column_widths={cs.by_dtype(pl.List(pl.String)): 300})
1422

1423
    assert get_col_widths(check) == {"A": 43}
1424

1425
    # column_widths test:
1426
    # pl.String datatype should not match column with list
1427
    check = BytesIO()
1428
    with Workbook(check) as wb:
1429
        df.write_excel(wb, column_widths={cs.by_dtype(pl.String): 300})
1430

1431
    assert get_col_widths(check) == {"B": 43}
1432

1433
    # hidden_columns test:
1434
    # pl.List(pl.String)) datatype should not match column with no list
1435
    check = BytesIO()
1436
    with Workbook(check) as wb:
1437
        df.write_excel(wb, hidden_columns=cs.by_dtype(pl.List(pl.String)))
1438

1439
    assert get_col_widths(check) == {"A": 0}
1440

1441
    # hidden_columns test:
1442
    # pl.String datatype should not match column with list
1443
    check = BytesIO()
1444
    with Workbook(check) as wb:
1445
        df.write_excel(wb, hidden_columns=cs.by_dtype(pl.String))
1446

1447
    assert get_col_widths(check) == {"B": 0}
1448

1449

1450
@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
1451
def test_excel_read_columns_nonlist_sequence(engine: ExcelSpreadsheetEngine) -> None:
1452
    df = pl.DataFrame(
1453
        {"colx": [1, 2, 3], "coly": ["aaa", "bbb", "ccc"], "colz": [0.5, 0.0, -1.0]}
1454
    )
1455
    xls = BytesIO()
1456
    df.write_excel(xls, worksheet="data")
1457

1458
    xldf = pl.read_excel(xls, engine=engine, columns=("colx", "coly"))
1459
    expected = df.select("colx", "coly")
1460
    assert_frame_equal(xldf, expected)
1461

1462
    xldf = pl.read_excel(xls, engine=engine, columns="colx")
1463
    expected = df.select("colx")
1464
    assert_frame_equal(xldf, expected)
1465

1466
Product

Resources

Company