Path: blob/main/py-polars/tests/unit/io/test_spreadsheet.py
6939 views
from __future__ import annotations12import warnings3from collections import OrderedDict4from datetime import date, datetime, time5from io import BytesIO6from pathlib import Path7from typing import TYPE_CHECKING, Any, Callable8from zoneinfo import ZoneInfo910import pytest1112import polars as pl13import polars.selectors as cs14from polars.exceptions import (15NoDataError,16ParameterCollisionError,17)18from polars.testing import assert_frame_equal, assert_series_equal19from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES2021if TYPE_CHECKING:22from collections.abc import Sequence2324from polars._typing import (25ExcelSpreadsheetEngine,26PolarsDataType,27SchemaDict,28SelectorType,29)303132# pytestmark = pytest.mark.slow()333435@pytest.fixture36def path_xls(io_files_path: Path) -> Path:37# old excel 97-2004 format38return io_files_path / "example.xls"394041@pytest.fixture42def path_xlsx(io_files_path: Path) -> Path:43# modern excel format44return io_files_path / "example.xlsx"454647@pytest.fixture48def path_xlsb(io_files_path: Path) -> Path:49# excel binary format50return io_files_path / "example.xlsb"515253@pytest.fixture54def path_ods(io_files_path: Path) -> Path:55# open document spreadsheet56return io_files_path / "example.ods"575859@pytest.fixture60def path_xls_empty(io_files_path: Path) -> Path:61return io_files_path / "empty.xls"626364@pytest.fixture65def path_xlsx_empty(io_files_path: Path) -> Path:66return io_files_path / "empty.xlsx"676869@pytest.fixture70def path_xlsx_mixed(io_files_path: Path) -> Path:71return io_files_path / "mixed.xlsx"727374@pytest.fixture75def path_xlsb_empty(io_files_path: Path) -> Path:76return io_files_path / "empty.xlsb"777879@pytest.fixture80def path_xlsb_mixed(io_files_path: Path) -> Path:81return io_files_path / "mixed.xlsb"828384@pytest.fixture85def path_ods_empty(io_files_path: Path) -> Path:86return io_files_path / "empty.ods"878889@pytest.fixture90def path_ods_mixed(io_files_path: Path) -> Path:91return io_files_path / "mixed.ods"929394@pytest.fixture95def path_empty_rows_excel(io_files_path: Path) -> Path:96return io_files_path / "test_empty_rows.xlsx"979899@pytest.mark.parametrize(100("read_spreadsheet", "source", "engine_params"),101[102# xls file103(pl.read_excel, "path_xls", {"engine": "calamine"}),104# xlsx file105(pl.read_excel, "path_xlsx", {"engine": "calamine"}),106(pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),107(pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),108# xlsb file (binary)109(pl.read_excel, "path_xlsb", {"engine": "calamine"}),110# open document111(pl.read_ods, "path_ods", {}),112],113)114def test_read_spreadsheet(115read_spreadsheet: Callable[..., pl.DataFrame],116source: str,117engine_params: dict[str, str],118request: pytest.FixtureRequest,119) -> None:120sheet_params: dict[str, Any]121122for sheet_params in (123{"sheet_name": None, "sheet_id": None},124{"sheet_name": "test1"},125{"sheet_id": 1},126):127df = read_spreadsheet(128source=request.getfixturevalue(source),129**engine_params,130**sheet_params,131)132expected = pl.DataFrame({"hello": ["Row 1", "Row 2"]})133assert_frame_equal(df, expected)134135136@pytest.mark.parametrize(137("read_spreadsheet", "source", "params"),138[139# xls file140(pl.read_excel, "path_xls", {"engine": "calamine"}),141# xlsx file142(pl.read_excel, "path_xlsx", {"engine": "calamine"}),143(pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),144(pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),145# xlsb file (binary)146(pl.read_excel, "path_xlsb", {"engine": "calamine"}),147# open document148(pl.read_ods, "path_ods", {}),149],150)151def test_read_excel_multiple_worksheets(152read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],153source: str,154params: dict[str, str],155request: pytest.FixtureRequest,156) -> None:157spreadsheet_path = request.getfixturevalue(source)158frames_by_id = read_spreadsheet(159spreadsheet_path,160sheet_id=[2, 1],161sheet_name=None,162**params,163)164frames_by_name = read_spreadsheet(165spreadsheet_path,166sheet_id=None,167sheet_name=["test2", "test1"],168**params,169)170for frames in (frames_by_id, frames_by_name):171assert list(frames_by_name) == ["test2", "test1"]172173expected1 = pl.DataFrame({"hello": ["Row 1", "Row 2"]})174expected2 = pl.DataFrame({"world": ["Row 3", "Row 4"]})175176assert_frame_equal(frames["test1"], expected1)177assert_frame_equal(frames["test2"], expected2)178179180@pytest.mark.parametrize(181("read_spreadsheet", "source", "params"),182[183# xls file184(pl.read_excel, "path_xls", {"engine": "calamine"}),185# xlsx file186(pl.read_excel, "path_xlsx", {"engine": "calamine"}),187(pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),188(pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),189# xlsb file (binary)190(pl.read_excel, "path_xlsb", {"engine": "calamine"}),191# open document192(pl.read_ods, "path_ods", {}),193],194)195def test_read_excel_multiple_workbooks(196read_spreadsheet: Callable[..., Any],197source: str,198params: dict[str, str],199request: pytest.FixtureRequest,200) -> None:201spreadsheet_path = request.getfixturevalue(source)202203# multiple workbooks, single worksheet204df = read_spreadsheet(205[206spreadsheet_path,207spreadsheet_path,208spreadsheet_path,209],210sheet_id=None,211sheet_name="test1",212include_file_paths="path",213**params,214)215expected = pl.DataFrame(216{217"hello": ["Row 1", "Row 2", "Row 1", "Row 2", "Row 1", "Row 2"],218"path": [str(spreadsheet_path)] * 6,219},220)221assert_frame_equal(df, expected)222223# multiple workbooks, multiple worksheets224res = read_spreadsheet(225[226spreadsheet_path,227spreadsheet_path,228spreadsheet_path,229],230sheet_id=None,231sheet_name=["test1", "test2"],232**params,233)234expected_frames = {235"test1": pl.DataFrame(236{"hello": ["Row 1", "Row 2", "Row 1", "Row 2", "Row 1", "Row 2"]}237),238"test2": pl.DataFrame(239{"world": ["Row 3", "Row 4", "Row 3", "Row 4", "Row 3", "Row 4"]}240),241}242assert sorted(res) == sorted(expected_frames)243assert_frame_equal(res["test1"], expected_frames["test1"])244assert_frame_equal(res["test2"], expected_frames["test2"])245246247@pytest.mark.parametrize(248("read_spreadsheet", "source", "params"),249[250# xls file251(pl.read_excel, "path_xls", {"engine": "calamine"}),252# xlsx file253(pl.read_excel, "path_xlsx", {"engine": "calamine"}),254(pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),255(pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),256# xlsb file (binary)257(pl.read_excel, "path_xlsb", {"engine": "calamine"}),258# open document259(pl.read_ods, "path_ods", {}),260],261)262def test_read_excel_all_sheets(263read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],264source: str,265params: dict[str, str],266request: pytest.FixtureRequest,267) -> None:268spreadsheet_path = request.getfixturevalue(source)269frames = read_spreadsheet(270spreadsheet_path,271sheet_id=0,272**params,273)274assert len(frames) == (4 if str(spreadsheet_path).endswith("ods") else 6)275276expected1 = pl.DataFrame({"hello": ["Row 1", "Row 2"]})277expected2 = pl.DataFrame({"world": ["Row 3", "Row 4"]})278expected3 = pl.DataFrame(279{280"cardinality": [1, 3, 15, 30, 150, 300],281"rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],282"iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],283}284)285assert_frame_equal(frames["test1"], expected1)286assert_frame_equal(frames["test2"], expected2)287if params.get("engine") == "openpyxl":288# TODO: flag that trims trailing all-null rows?289assert_frame_equal(frames["test3"], expected3)290assert_frame_equal(frames["test4"].drop_nulls(), expected3)291292293@pytest.mark.parametrize(294"engine",295["calamine", "openpyxl", "xlsx2csv"],296)297def test_read_excel_basic_datatypes(engine: ExcelSpreadsheetEngine) -> None:298df = pl.DataFrame(299{300"A": [1, 2, 3, 4, 5],301"fruits": ["banana", "banana", "apple", "apple", "banana"],302"floats": [1.1, 1.2, 1.3, 1.4, 1.5],303"datetime": [datetime(2023, 1, x) for x in range(1, 6)],304"nulls": [1, None, None, None, 0],305},306)307xls = BytesIO()308df.write_excel(xls, position="C5")309310schema_overrides = {"datetime": pl.Datetime("us"), "nulls": pl.Boolean()}311df_compare = df.with_columns(312pl.col(nm).cast(tp) for nm, tp in schema_overrides.items()313)314for sheet_id, sheet_name in ((None, None), (1, None), (None, "Sheet1")):315df_from_excel = pl.read_excel(316xls,317sheet_id=sheet_id,318sheet_name=sheet_name,319engine=engine,320schema_overrides=schema_overrides,321)322assert_frame_equal(df_compare, df_from_excel)323324# check some additional overrides325# (note: xlsx2csv can't currently convert datetime with trailing '00:00:00' to date)326dt_override = {"datetime": pl.Date} if engine != "xlsx2csv" else {}327df = pl.read_excel(328xls,329sheet_id=sheet_id,330sheet_name=sheet_name,331engine=engine,332schema_overrides={"A": pl.Float32, **dt_override},333)334assert_series_equal(335df["A"],336pl.Series(name="A", values=[1.0, 2.0, 3.0, 4.0, 5.0], dtype=pl.Float32),337)338if dt_override:339assert_series_equal(340df["datetime"],341pl.Series(342name="datetime",343values=[date(2023, 1, x) for x in range(1, 6)],344dtype=pl.Date,345),346)347348349@pytest.mark.parametrize(350("read_spreadsheet", "source", "params"),351[352# TODO: uncomment once fastexcel offers a suitable param353# (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),354(pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),355(pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),356],357)358def test_read_dropped_cols(359read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],360source: str,361params: dict[str, str],362request: pytest.FixtureRequest,363) -> None:364spreadsheet_path = request.getfixturevalue(source)365366df1 = read_spreadsheet(367spreadsheet_path,368sheet_name="test4",369**params,370)371df2 = read_spreadsheet(372spreadsheet_path,373sheet_name="test4",374drop_empty_cols=False,375**params,376)377assert df1.to_dict(as_series=False) == { # type: ignore[attr-defined]378"cardinality": [1, 3, 15, 30, 150, 300],379"rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],380"iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],381}382assert df2.to_dict(as_series=False) == { # type: ignore[attr-defined]383"": [None, None, None, None, None, None],384"cardinality": [1, 3, 15, 30, 150, 300],385"rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],386"iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],387"0": [None, None, None, None, None, None],388"1": [None, None, None, None, None, None],389}390391392@pytest.mark.parametrize(393("source", "params"),394[395("path_xls", {"engine": "calamine", "sheet_name": "temporal"}),396("path_xlsx", {"engine": "calamine", "table_name": "TemporalData"}),397("path_xlsx", {"engine": "openpyxl", "sheet_name": "temporal"}),398("path_xlsb", {"engine": "calamine", "sheet_name": "temporal"}),399],400)401def test_read_excel_temporal_data(402source: str,403params: dict[str, str],404request: pytest.FixtureRequest,405) -> None:406source_path = request.getfixturevalue(source)407408temporal_schema = {409"id": pl.UInt16(),410"dtm": pl.Datetime("ms"),411"dt": pl.Date(),412"dtm_str": pl.Datetime(time_zone="Asia/Tokyo"),413"dt_str": pl.Date(),414"tm_str": pl.Time(),415}416parsed_df = pl.read_excel( # type: ignore[call-overload]417source_path,418**params,419schema_overrides=temporal_schema,420)421TK = ZoneInfo("Asia/Tokyo")422423expected = pl.DataFrame(424{425"id": [100, 200, 300, 400],426"dtm": [427datetime(1999, 12, 31, 1, 2, 3),428None,429datetime(1969, 7, 5, 10, 30, 45),430datetime(2077, 10, 10, 5, 59, 44),431],432"dt": [433date(2000, 1, 18),434date(1965, 8, 8),435date(2027, 4, 22),436None,437],438"dtm_str": [439None,440datetime(1900, 1, 30, 14, 50, 20, tzinfo=TK),441datetime(2026, 5, 7, 23, 59, 59, tzinfo=TK),442datetime(2007, 6, 1, 0, 0, tzinfo=TK),443],444"dt_str": [445date(2000, 6, 14),446date(1978, 2, 28),447None,448date(2040, 12, 4),449],450"tm_str": [451time(23, 50, 22),452time(0, 0, 1),453time(10, 10, 33),454time(18, 30, 15),455],456},457schema=temporal_schema,458)459assert_frame_equal(expected, parsed_df)460461462@pytest.mark.parametrize(463("read_spreadsheet", "source", "params"),464[465# xls file466(pl.read_excel, "path_xls", {"engine": "calamine"}),467# xlsx file468(pl.read_excel, "path_xlsx", {"engine": "calamine"}),469(pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),470(pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),471# xlsb file (binary)472(pl.read_excel, "path_xlsb", {"engine": "calamine"}),473# open document474(pl.read_ods, "path_ods", {}),475],476)477def test_read_invalid_worksheet(478read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],479source: str,480params: dict[str, str],481request: pytest.FixtureRequest,482) -> None:483spreadsheet_path = request.getfixturevalue(source)484for param, sheet_id, sheet_name in (485("id", 999, None),486("name", None, "not_a_sheet_name"),487):488value = sheet_id if param == "id" else sheet_name489with pytest.raises(490ValueError,491match=f"no matching sheet found when `sheet_{param}` is {value!r}",492):493read_spreadsheet(494spreadsheet_path, sheet_id=sheet_id, sheet_name=sheet_name, **params495)496497498@pytest.mark.parametrize(499("read_spreadsheet", "source", "additional_params"),500[501(pl.read_excel, "path_xlsx_mixed", {"engine": "openpyxl"}),502(pl.read_ods, "path_ods_mixed", {}),503],504)505@pytest.mark.may_fail_auto_streaming506def test_read_mixed_dtype_columns(507read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],508source: str,509additional_params: dict[str, str],510request: pytest.FixtureRequest,511) -> None:512spreadsheet_path = request.getfixturevalue(source)513schema_overrides = {514"Employee ID": pl.Utf8(),515"Employee Name": pl.Utf8(),516"Date": pl.Date(),517"Details": pl.Categorical("lexical"),518"Asset ID": pl.Utf8(),519}520df = read_spreadsheet(521spreadsheet_path,522sheet_id=0,523schema_overrides=schema_overrides,524**additional_params,525)["Sheet1"]526527assert_frame_equal(528df,529pl.DataFrame(530{531"Employee ID": ["123456", "44333", "US00011", "135967", "IN86868"],532"Employee Name": ["Test1", "Test2", "Test4", "Test5", "Test6"],533"Date": [534date(2023, 7, 21),535date(2023, 7, 21),536date(2023, 7, 21),537date(2023, 7, 21),538date(2023, 7, 21),539],540"Details": [541"Healthcare",542"Healthcare",543"Healthcare",544"Healthcare",545"Something",546],547"Asset ID": ["84444", "84444", "84444", "84444", "ABC123"],548},549schema_overrides=schema_overrides,550),551)552553554def test_schema_overrides(path_xlsx: Path, path_xlsb: Path, path_ods: Path) -> None:555df1 = pl.read_excel(556path_xlsx,557sheet_name="test4",558schema_overrides={"cardinality": pl.UInt16},559).drop_nulls()560561assert df1.schema["cardinality"] == pl.UInt16562assert df1.schema["rows_by_key"] == pl.Float64563assert df1.schema["iter_groups"] == pl.Float64564565df2 = pl.read_excel(566path_xlsx,567sheet_name="test4",568engine="xlsx2csv",569read_options={"schema_overrides": {"cardinality": pl.UInt16}},570).drop_nulls()571572assert df2.schema["cardinality"] == pl.UInt16573assert df2.schema["rows_by_key"] == pl.Float64574assert df2.schema["iter_groups"] == pl.Float64575576df3 = pl.read_excel(577path_xlsx,578sheet_name="test4",579engine="xlsx2csv",580schema_overrides={"cardinality": pl.UInt16},581read_options={582"schema_overrides": {583"rows_by_key": pl.Float32,584"iter_groups": pl.Float32,585},586},587).drop_nulls()588589assert df3.schema["cardinality"] == pl.UInt16590assert df3.schema["rows_by_key"] == pl.Float32591assert df3.schema["iter_groups"] == pl.Float32592593for workbook_path in (path_xlsx, path_xlsb, path_ods):594read_spreadsheet = (595pl.read_ods if workbook_path.suffix == ".ods" else pl.read_excel596)597df4 = read_spreadsheet( # type: ignore[operator]598workbook_path,599sheet_name="test5",600schema_overrides={"dtm": pl.Datetime("ns"), "dt": pl.Date},601)602assert_frame_equal(603df4,604pl.DataFrame(605{606"dtm": [607datetime(1999, 12, 31, 10, 30, 45),608datetime(2010, 10, 11, 12, 13, 14),609],610"dt": [date(2024, 1, 1), date(2018, 8, 7)],611"val": [1.5, -0.5],612},613schema={"dtm": pl.Datetime("ns"), "dt": pl.Date, "val": pl.Float64},614),615)616617with pytest.raises(ParameterCollisionError):618# cannot specify 'cardinality' in both schema_overrides and read_options619pl.read_excel(620path_xlsx,621sheet_name="test4",622engine="xlsx2csv",623schema_overrides={"cardinality": pl.UInt16},624read_options={"schema_overrides": {"cardinality": pl.Int32}},625)626627# read multiple sheets in conjunction with 'schema_overrides'628# (note: reading the same sheet twice simulates the issue in #11850)629overrides = OrderedDict(630[631("cardinality", pl.UInt32),632("rows_by_key", pl.Float32),633("iter_groups", pl.Float64),634]635)636df = pl.read_excel(637path_xlsx,638sheet_name=["test4", "test4"],639schema_overrides=overrides,640)641for col, dtype in overrides.items():642assert df["test4"].schema[col] == dtype643644645@pytest.mark.parametrize(646("engine", "read_opts_param"),647[648("xlsx2csv", "infer_schema_length"),649("calamine", "schema_sample_rows"),650],651)652def test_invalid_parameter_combinations_infer_schema_len(653path_xlsx: Path, engine: str, read_opts_param: str654) -> None:655with pytest.raises(656ParameterCollisionError,657match=f"cannot specify both `infer_schema_length`.*{read_opts_param}",658):659pl.read_excel( # type: ignore[call-overload]660path_xlsx,661sheet_id=1,662engine=engine,663read_options={read_opts_param: 512},664infer_schema_length=1024,665)666667668@pytest.mark.parametrize(669("engine", "read_opts_param"),670[671("xlsx2csv", "columns"),672("calamine", "use_columns"),673],674)675def test_invalid_parameter_combinations_columns(676path_xlsx: Path, engine: str, read_opts_param: str677) -> None:678with pytest.raises(679ParameterCollisionError,680match=f"cannot specify both `columns`.*{read_opts_param}",681):682pl.read_excel( # type: ignore[call-overload]683path_xlsx,684sheet_id=1,685engine=engine,686read_options={read_opts_param: ["B", "C", "D"]},687columns=["A", "B", "C"],688)689690691def test_unsupported_engine() -> None:692with pytest.raises(NotImplementedError):693pl.read_excel(None, engine="foo") # type: ignore[call-overload]694695696def test_unsupported_binary_workbook(path_xlsb: Path) -> None:697with pytest.raises(Exception, match="does not support binary format"):698pl.read_excel(path_xlsb, engine="openpyxl")699700701@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])702def test_read_excel_all_sheets_with_sheet_name(path_xlsx: Path, engine: str) -> None:703with pytest.raises(704ValueError,705match=r"cannot specify both `sheet_name` \('Sheet1'\) and `sheet_id` \(1\)",706):707pl.read_excel( # type: ignore[call-overload]708path_xlsx,709sheet_id=1,710sheet_name="Sheet1",711engine=engine,712)713714715# the parameters don't change the data, only the formatting, so we expect716# the same result each time. however, it's important to validate that the717# parameter permutations don't raise exceptions, or interfere with the718# values written to the worksheet, so test multiple variations.719@pytest.mark.parametrize(720"write_params",721[722# default parameters723{},724# basic formatting725{726"autofit": True,727"table_style": "Table Style Dark 2",728"column_totals": True,729"float_precision": 0,730},731# slightly customized formatting, with some formulas732{733"position": (0, 0),734"table_style": {735"style": "Table Style Medium 23",736"first_column": True,737},738"conditional_formats": {"val": "data_bar"},739"column_formats": {740"val": "#,##0.000;[White]-#,##0.000",741("day", "month", "year"): {"align": "left", "num_format": "0"},742},743"header_format": {"italic": True, "bg_color": "#d9d9d9"},744"column_widths": {"val": 100},745"row_heights": {0: 35},746"formulas": {747# string: formula added to the end of the table (but before row_totals)748"day": "=DAY([@dtm])",749"month": "=MONTH([@dtm])",750"year": {751# dict: full control over formula positioning/dtype752"formula": "=YEAR([@dtm])",753"insert_after": "month",754"return_type": pl.Int16,755},756},757"column_totals": True,758"row_totals": True,759},760# heavily customized formatting/definition761{762"position": "A1",763"table_name": "PolarsFrameData",764"table_style": "Table Style Light 9",765"conditional_formats": {766# single dict format767"str": {768"type": "duplicate",769"format": {"bg_color": "#ff0000", "font_color": "#ffffff"},770},771# multiple dict formats772"val": [773{774"type": "3_color_scale",775"min_color": "#4bacc6",776"mid_color": "#ffffff",777"max_color": "#daeef3",778},779{780"type": "cell",781"criteria": "<",782"value": -90,783"format": {"font_color": "white"},784},785],786"dtm": [787{788"type": "top",789"value": 1,790"format": {"bold": True, "font_color": "green"},791},792{793"type": "bottom",794"value": 1,795"format": {"bold": True, "font_color": "red"},796},797],798},799"dtype_formats": {800frozenset(801FLOAT_DTYPES802): '_(£* #,##0.00_);_(£* (#,##0.00);_(£* "-"??_);_(@_)',803pl.Date: "dd-mm-yyyy",804},805"column_formats": {"dtm": {"font_color": "#31869c", "bg_color": "#b7dee8"}},806"column_totals": {"val": "average", "dtm": "min"},807"column_widths": {("str", "val"): 60, "dtm": 80},808"row_totals": {"tot": True},809"hidden_columns": ["str"],810"hide_gridlines": True,811"include_header": False,812},813],814)815def test_excel_round_trip(write_params: dict[str, Any]) -> None:816df = pl.DataFrame(817{818"dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)],819"str": ["xxx", "yyy", "xxx"],820"val": [100.5, 55.0, -99.5],821}822)823824engine: ExcelSpreadsheetEngine825for engine in ("calamine", "xlsx2csv"):826read_options, has_header = (827({}, True)828if write_params.get("include_header", True)829else (830{"new_columns": ["dtm", "str", "val"]}831if engine == "xlsx2csv"832else {"column_names": ["dtm", "str", "val"]},833False,834)835)836837fmt_strptime = "%Y-%m-%d"838if write_params.get("dtype_formats", {}).get(pl.Date) == "dd-mm-yyyy":839fmt_strptime = "%d-%m-%Y"840841# write to xlsx using various parameters...842xls = BytesIO()843_wb = df.write_excel(workbook=xls, worksheet="data", **write_params)844845# ...and read it back again:846xldf = pl.read_excel(847xls,848sheet_name="data",849engine=engine,850read_options=read_options,851has_header=has_header,852)[:3].select(df.columns[:3])853854if engine == "xlsx2csv":855xldf = xldf.with_columns(pl.col("dtm").str.strptime(pl.Date, fmt_strptime))856857assert_frame_equal(df, xldf)858859860@pytest.mark.parametrize("engine", ["calamine", "xlsx2csv"])861def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> None:862df = pl.DataFrame(863{864"id": ["aaa", "bbb", "ccc", "ddd", "eee"],865# float cols866"q1": [100.0, 55.5, -20.0, 0.5, 35.0],867"q2": [30.5, -10.25, 15.0, 60.0, 20.5],868# int cols869"q3": [-50, 0, 40, 80, 80],870"q4": [75, 55, 25, -10, -55],871}872)873for fn_sum in (True, "sum", "SUM"):874xls = BytesIO()875df.write_excel(876xls,877worksheet="misc",878sparklines={"trend": ["q1", "q2", "q3", "q4"]},879row_totals={880# add semiannual row total columns881"h1": ("q1", "q2"),882"h2": ("q3", "q4"),883},884column_totals=fn_sum,885)886887# note that the totals are written as formulae, so we888# won't have the calculated values in the dataframe889xldf = pl.read_excel(xls, sheet_name="misc", engine=engine)890891assert xldf.columns == ["id", "q1", "q2", "q3", "q4", "trend", "h1", "h2"]892assert xldf.row(-1) == (None, 0.0, 0.0, 0, 0, None, 0.0, 0)893894895@pytest.mark.may_fail_cloud # reason: eager - return_dtype must be set896@pytest.mark.parametrize(897("engine", "list_dtype"),898[899("calamine", pl.List(pl.Int8)),900("openpyxl", pl.List(pl.UInt16)),901("xlsx2csv", pl.Array(pl.Int32, 3)),902],903)904def test_excel_write_compound_types(905engine: ExcelSpreadsheetEngine,906list_dtype: PolarsDataType,907) -> None:908df = pl.DataFrame(909data={"x": [None, [1, 2, 3], [4, 5, 6]], "y": ["a", "b", "c"], "z": [9, 8, 7]},910schema_overrides={"x": pl.Array(pl.Int32, 3)},911).select("x", pl.struct(["y", "z"]))912913xls = BytesIO()914df.write_excel(xls, worksheet="data")915916# also test reading from the various flavours of supported binary data917# across all backend engines (check bytesio, bytes, and memoryview)918for binary_data in (919xls,920xls.getvalue(),921xls.getbuffer(),922):923xldf = pl.read_excel(924binary_data,925sheet_name="data",926engine=engine,927include_file_paths="wbook",928)929930# expect string conversion (only scalar values are supported)931assert xldf.rows() == [932(None, "{'y': 'a', 'z': 9}", "in-mem"),933("[1, 2, 3]", "{'y': 'b', 'z': 8}", "in-mem"),934("[4, 5, 6]", "{'y': 'c', 'z': 7}", "in-mem"),935]936937938def test_excel_read_named_table_with_total_row(tmp_path: Path) -> None:939df = pl.DataFrame(940{941"x": ["aa", "bb", "cc"],942"y": [100, 325, -250],943"z": [975, -444, 123],944}945)946# when we read back a named table object with a total row we expect the read947# to automatically omit that row as it is *not* part of the actual table data948wb_path = Path(tmp_path).joinpath("test_named_table_read.xlsx")949df.write_excel(950wb_path,951worksheet="data",952table_name="PolarsFrameTable",953column_totals=True,954)955for engine in ("calamine", "openpyxl"):956xldf = pl.read_excel(wb_path, table_name="PolarsFrameTable", engine=engine)957assert_frame_equal(df, xldf)958959# xlsx2csv doesn't support reading named tables, so we see the960# column total if we don't filter it out after reading the data961with pytest.raises(962ValueError,963match="the `table_name` parameter is not supported by the 'xlsx2csv' engine",964):965pl.read_excel(wb_path, table_name="PolarsFrameTable", engine="xlsx2csv")966967xldf = pl.read_excel(wb_path, sheet_name="data", engine="xlsx2csv")968assert_frame_equal(df, xldf.head(3))969assert xldf.height == 4970assert xldf.row(3) == (None, 0, 0)971972973@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])974def test_excel_write_to_bytesio(engine: ExcelSpreadsheetEngine) -> None:975df = pl.DataFrame({"colx": [1.5, -2, 0], "coly": ["a", None, "c"]})976977excel_bytes = BytesIO()978df.write_excel(excel_bytes)979980df_read = pl.read_excel(excel_bytes, engine=engine)981assert_frame_equal(df, df_read)982983# also confirm consistent behaviour when 'infer_schema_length=0'984df_read = pl.read_excel(excel_bytes, engine=engine, infer_schema_length=0)985expected = pl.DataFrame({"colx": ["1.5", "-2", "0"], "coly": ["a", None, "c"]})986assert_frame_equal(expected, df_read)987988989@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])990def test_excel_write_to_file_object(991engine: ExcelSpreadsheetEngine, tmp_path: Path992) -> None:993tmp_path.mkdir(exist_ok=True)994995df = pl.DataFrame({"x": ["aaa", "bbb", "ccc"], "y": [123, 456, 789]})996997# write to bytesio998xls = BytesIO()999df.write_excel(xls, worksheet="data")1000assert_frame_equal(df, pl.read_excel(xls, engine=engine))10011002# write to file path1003path = Path(tmp_path).joinpath("test_write_path.xlsx")1004df.write_excel(path, worksheet="data")1005assert_frame_equal(df, pl.read_excel(xls, engine=engine))10061007# write to file path (as string)1008path = Path(tmp_path).joinpath("test_write_path_str.xlsx")1009df.write_excel(str(path), worksheet="data")1010assert_frame_equal(df, pl.read_excel(xls, engine=engine))10111012# write to file object1013path = Path(tmp_path).joinpath("test_write_file_object.xlsx")1014with path.open("wb") as tgt:1015df.write_excel(tgt, worksheet="data")1016with path.open("rb") as src:1017assert_frame_equal(df, pl.read_excel(src, engine=engine))101810191020@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])1021def test_excel_read_no_headers(engine: ExcelSpreadsheetEngine) -> None:1022df = pl.DataFrame(1023{"colx": [1, 2, 3], "coly": ["aaa", "bbb", "ccc"], "colz": [0.5, 0.0, -1.0]}1024)1025xls = BytesIO()1026df.write_excel(xls, worksheet="data", include_header=False)10271028xldf = pl.read_excel(xls, engine=engine, has_header=False)1029expected = xldf.rename({"column_1": "colx", "column_2": "coly", "column_3": "colz"})1030assert_frame_equal(df, expected)103110321033@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])1034def test_excel_write_sparklines(engine: ExcelSpreadsheetEngine) -> None:1035from xlsxwriter import Workbook10361037# note that we don't (quite) expect sparkline export to round-trip as we1038# inject additional empty columns to hold them (which will read as nulls)1039df = pl.DataFrame(1040{1041"id": ["aaa", "bbb", "ccc", "ddd", "eee"],1042"q1": [100, 55, -20, 0, 35],1043"q2": [30, -10, 15, 60, 20],1044"q3": [-50, 0, 40, 80, 80],1045"q4": [75, 55, 25, -10, -55],1046}1047).cast(dtypes={pl.Int64: pl.Float64})10481049# also: confirm that we can use a Workbook directly with "write_excel"1050xls = BytesIO()1051with Workbook(xls) as wb:1052df.write_excel(1053workbook=wb,1054worksheet="frame_data",1055table_style="Table Style Light 2",1056dtype_formats={frozenset(NUMERIC_DTYPES): "#,##0_);(#,##0)"},1057column_formats={cs.starts_with("h"): "#,##0_);(#,##0)"},1058sparklines={1059"trend": ["q1", "q2", "q3", "q4"],1060"+/-": {1061"columns": ["q1", "q2", "q3", "q4"],1062"insert_after": "id",1063"type": "win_loss",1064},1065},1066conditional_formats={1067cs.starts_with("q", "h"): {1068"type": "2_color_scale",1069"min_color": "#95b3d7",1070"max_color": "#ffffff",1071}1072},1073column_widths={cs.starts_with("q", "h"): 40},1074row_totals={1075"h1": ("q1", "q2"),1076"h2": ("q3", "q4"),1077},1078hide_gridlines=True,1079row_heights=35,1080sheet_zoom=125,1081)10821083tables = {tbl["name"] for tbl in wb.get_worksheet_by_name("frame_data").tables}1084assert "Frame0" in tables10851086with warnings.catch_warnings():1087# ignore an openpyxl user warning about sparklines1088warnings.simplefilter("ignore", UserWarning)1089xldf = pl.read_excel(xls, sheet_name="frame_data", engine=engine)10901091# ┌─────┬──────┬─────┬─────┬─────┬─────┬───────┬─────┬─────┐1092# │ id ┆ +/- ┆ q1 ┆ q2 ┆ q3 ┆ q4 ┆ trend ┆ h1 ┆ h2 │1093# │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │1094# │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │1095# ╞═════╪══════╪═════╪═════╪═════╪═════╪═══════╪═════╪═════╡1096# │ aaa ┆ null ┆ 100 ┆ 30 ┆ -50 ┆ 75 ┆ null ┆ 0 ┆ 0 │1097# │ bbb ┆ null ┆ 55 ┆ -10 ┆ 0 ┆ 55 ┆ null ┆ 0 ┆ 0 │1098# │ ccc ┆ null ┆ -20 ┆ 15 ┆ 40 ┆ 25 ┆ null ┆ 0 ┆ 0 │1099# │ ddd ┆ null ┆ 0 ┆ 60 ┆ 80 ┆ -10 ┆ null ┆ 0 ┆ 0 │1100# │ eee ┆ null ┆ 35 ┆ 20 ┆ 80 ┆ -55 ┆ null ┆ 0 ┆ 0 │1101# └─────┴──────┴─────┴─────┴─────┴─────┴───────┴─────┴─────┘11021103for sparkline_col in ("+/-", "trend"):1104assert set(xldf[sparkline_col]) in ({None}, {""})11051106assert xldf.columns == ["id", "+/-", "q1", "q2", "q3", "q4", "trend", "h1", "h2"]1107assert_frame_equal(1108df, xldf.drop("+/-", "trend", "h1", "h2").cast(dtypes={pl.Int64: pl.Float64})1109)111011111112def test_excel_write_multiple_tables() -> None:1113from xlsxwriter import Workbook11141115# note: also checks that empty tables don't error on write1116df = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})11171118# write multiple frames to multiple worksheets1119xls = BytesIO()1120with Workbook(xls) as wb:1121df.rename({"colx": "colx0", "coly": "coly0", "colz": "colz0"}).write_excel(1122workbook=wb, worksheet="sheet1", position="A1"1123)1124df.rename({"colx": "colx1", "coly": "coly1", "colz": "colz1"}).write_excel(1125workbook=wb, worksheet="sheet1", position="X10"1126)1127df.rename({"colx": "colx2", "coly": "coly2", "colz": "colz2"}).write_excel(1128workbook=wb, worksheet="sheet2", position="C25"1129)11301131# also validate integration of externally-added formats1132fmt = wb.add_format({"bg_color": "#ffff00"})1133df.rename({"colx": "colx3", "coly": "coly3", "colz": "colz3"}).write_excel(1134workbook=wb,1135worksheet="sheet3",1136position="D4",1137conditional_formats={1138"colz3": {1139"type": "formula",1140"criteria": "=C2=B2",1141"format": fmt,1142}1143},1144)11451146table_names = {1147tbl["name"]1148for sheet in wb.sheetnames1149for tbl in wb.get_worksheet_by_name(sheet).tables1150}1151assert table_names == {f"Frame{n}" for n in range(4)}1152assert pl.read_excel(xls, sheet_name="sheet3").rows() == []11531154# test loading one of the written tables by name1155for engine in ("calamine", "openpyxl"):1156df1 = pl.read_excel(1157xls,1158sheet_name="sheet2",1159table_name="Frame2",1160engine=engine,1161)1162df2 = pl.read_excel(1163xls,1164table_name="Frame2",1165engine=engine,1166)1167assert df1.columns == ["colx2", "coly2", "colz2"]1168assert_frame_equal(df1, df2)11691170# if we supply a sheet name (which is optional when using `table_name`),1171# then the table name must be present in *that* sheet, or we raise an error1172with pytest.raises(1173RuntimeError,1174match="table named 'Frame3' not found in sheet 'sheet1'",1175):1176pl.read_excel(xls, sheet_name="sheet1", table_name="Frame3")117711781179def test_excel_write_worksheet_object() -> None:1180# write to worksheet object1181from xlsxwriter import Workbook11821183df = pl.DataFrame({"colx": ["aaa", "bbb", "ccc"], "coly": [-1234, 0, 5678]})11841185with Workbook(xls := BytesIO()) as wb:1186ws = wb.add_worksheet("frame_data")1187df.write_excel(wb, worksheet=ws)1188ws.hide_zero()11891190assert_frame_equal(df, pl.read_excel(xls, sheet_name="frame_data"))11911192with pytest.raises( # noqa: SIM1171193ValueError,1194match="the given workbook object .* is not the parent of worksheet 'frame_data'",1195):1196with Workbook(BytesIO()) as wb:1197df.write_excel(wb, worksheet=ws)11981199with pytest.raises( # noqa: SIM1171200TypeError,1201match="worksheet object requires the parent workbook object; found workbook=None",1202):1203with Workbook(BytesIO()) as wb:1204df.write_excel(None, worksheet=ws)120512061207def test_excel_write_beyond_max_rows_cols(tmp_path: Path) -> None:1208tmp_path.mkdir(exist_ok=True)1209path = tmp_path / "test_max_dimensions.xlsx"1210sheet = "mysheet"12111212df = pl.DataFrame({"col1": range(10), "col2": range(10, 20)})12131214with pytest.raises(pl.exceptions.InvalidOperationError):1215df.write_excel(workbook=path, worksheet=sheet, position="A1048570")121612171218def test_excel_freeze_panes() -> None:1219from xlsxwriter import Workbook12201221# note: checks that empty tables don't error on write1222df1 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})1223df2 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})1224df3 = pl.DataFrame(schema={"colx": pl.Date, "coly": pl.String, "colz": pl.Float64})12251226xls = BytesIO()12271228# use all three freeze_pane notations1229with Workbook(xls) as wb:1230df1.write_excel(workbook=wb, worksheet="sheet1", freeze_panes=(1, 0))1231df2.write_excel(workbook=wb, worksheet="sheet2", freeze_panes=(1, 0, 3, 4))1232df3.write_excel(workbook=wb, worksheet="sheet3", freeze_panes=("B2"))12331234table_names: set[str] = set()1235for sheet in ("sheet1", "sheet2", "sheet3"):1236table_names.update(1237tbl["name"] for tbl in wb.get_worksheet_by_name(sheet).tables1238)1239assert table_names == {f"Frame{n}" for n in range(3)}1240assert pl.read_excel(xls, sheet_name="sheet3").rows() == []124112421243@pytest.mark.parametrize(1244("read_spreadsheet", "source", "schema_overrides"),1245[1246(pl.read_excel, "path_xlsx_empty", None),1247(pl.read_excel, "path_xlsb_empty", None),1248(pl.read_excel, "path_xls_empty", None),1249(pl.read_ods, "path_ods_empty", None),1250# Test with schema overrides, to ensure they don't interfere with1251# raising NoDataErrors.1252(pl.read_excel, "path_xlsx_empty", {"a": pl.Int64}),1253(pl.read_excel, "path_xlsb_empty", {"a": pl.Int64}),1254(pl.read_excel, "path_xls_empty", {"a": pl.Int64}),1255(pl.read_ods, "path_ods_empty", {"a": pl.Int64}),1256],1257)1258def test_excel_empty_sheet(1259read_spreadsheet: Callable[..., pl.DataFrame],1260source: str,1261request: pytest.FixtureRequest,1262schema_overrides: SchemaDict | None,1263) -> None:1264ods = (empty_spreadsheet_path := request.getfixturevalue(source)).suffix == ".ods"1265read_spreadsheet = pl.read_ods if ods else pl.read_excel # type: ignore[assignment]12661267with pytest.raises(NoDataError, match="empty Excel sheet"):1268read_spreadsheet(empty_spreadsheet_path, schema_overrides=schema_overrides)12691270engine_params = [{}] if ods else [{"engine": "calamine"}]1271for params in engine_params:1272df = read_spreadsheet(1273empty_spreadsheet_path,1274sheet_name="no_data",1275raise_if_empty=False,1276**params,1277)1278expected = pl.DataFrame()1279assert_frame_equal(df, expected)12801281df = read_spreadsheet(1282empty_spreadsheet_path,1283sheet_name="no_rows",1284raise_if_empty=False,1285**params,1286)1287expected = pl.DataFrame(schema={f"col{c}": pl.String for c in ("x", "y", "z")})1288assert_frame_equal(df, expected)128912901291@pytest.mark.parametrize(1292("engine", "hidden_columns"),1293[1294("xlsx2csv", ["a"]),1295("openpyxl", ["a", "b"]),1296("calamine", ["a", "b"]),1297("xlsx2csv", cs.numeric()),1298("openpyxl", cs.last()),1299],1300)1301def test_excel_hidden_columns(1302hidden_columns: list[str] | SelectorType,1303engine: ExcelSpreadsheetEngine,1304) -> None:1305df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})13061307xls = BytesIO()1308df.write_excel(xls, hidden_columns=hidden_columns)13091310read_df = pl.read_excel(xls)1311assert_frame_equal(df, read_df)131213131314def test_excel_mixed_calamine_float_data(io_files_path: Path) -> None:1315df = pl.read_excel(io_files_path / "nan_test.xlsx", engine="calamine")1316nan = float("nan")1317assert_frame_equal(1318pl.DataFrame({"float_col": [nan, nan, nan, 100.0, 200.0, 300.0]}),1319df,1320)132113221323@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])1324@pytest.mark.may_fail_auto_streaming # read->scan_csv dispatch, _read_spreadsheet_xlsx2csv needs to be changed not to call `_reorder_columns` on the df1325def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None:1326df = pl.DataFrame(1327{1328"a": [1, 2, None],1329"b": [1.0, None, 3.5],1330"c": ["x", None, "z"],1331"d": [True, False, None],1332"e": [1333date(2023, 1, 1),1334None,1335date(2023, 1, 4),1336],1337"f": [1338datetime(2023, 1, 1),1339datetime(2000, 10, 10, 10, 10),1340None,1341],1342"g": [1343None,1344"1920-08-08 00:00:00",1345"2077-10-20 00:00:00.000000",1346],1347}1348)1349xls = BytesIO()1350df.write_excel(xls)13511352reversed_cols = list(reversed(df.columns))1353read_cols: Sequence[str] | Sequence[int]1354expected = df.select(reversed_cols).with_columns(1355pl.col("g").str.slice(0, 10).str.to_date()1356)1357for read_cols in (1358reversed_cols,1359[6, 5, 4, 3, 2, 1, 0],1360):1361read_df = pl.read_excel(1362xls,1363engine=engine,1364columns=read_cols,1365schema_overrides={1366"e": pl.Date,1367"f": pl.Datetime("us"),1368"g": pl.Date,1369},1370)1371assert_frame_equal(expected, read_df)137213731374@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])1375def test_drop_empty_rows(1376path_empty_rows_excel: Path, engine: ExcelSpreadsheetEngine1377) -> None:1378df1 = pl.read_excel(1379source=path_empty_rows_excel,1380engine=engine,1381) # check default1382assert df1.shape == (8, 4)13831384df2 = pl.read_excel(1385source=path_empty_rows_excel,1386engine=engine,1387drop_empty_rows=True,1388)1389assert df2.shape == (8, 4)13901391df3 = pl.read_excel(1392source=path_empty_rows_excel,1393engine=engine,1394drop_empty_rows=False,1395)1396assert df3.shape == (10, 4)139713981399def test_excel_write_select_col_dtype() -> None:1400from openpyxl import load_workbook1401from xlsxwriter import Workbook14021403def get_col_widths(wb_bytes: BytesIO) -> dict[str, int]:1404return {1405k: round(v.width)1406for k, v in load_workbook(wb_bytes).active.column_dimensions.items()1407}14081409df = pl.DataFrame(1410{1411"name": [["Alice", "Ben"], ["Charlie", "Delta"]],1412"col2": ["Hi", "Bye"],1413}1414)14151416# column_widths test:1417# pl.List(pl.String)) datatype should not match column with no list1418check = BytesIO()1419with Workbook(check) as wb:1420df.write_excel(wb, column_widths={cs.by_dtype(pl.List(pl.String)): 300})14211422assert get_col_widths(check) == {"A": 43}14231424# column_widths test:1425# pl.String datatype should not match column with list1426check = BytesIO()1427with Workbook(check) as wb:1428df.write_excel(wb, column_widths={cs.by_dtype(pl.String): 300})14291430assert get_col_widths(check) == {"B": 43}14311432# hidden_columns test:1433# pl.List(pl.String)) datatype should not match column with no list1434check = BytesIO()1435with Workbook(check) as wb:1436df.write_excel(wb, hidden_columns=cs.by_dtype(pl.List(pl.String)))14371438assert get_col_widths(check) == {"A": 0}14391440# hidden_columns test:1441# pl.String datatype should not match column with list1442check = BytesIO()1443with Workbook(check) as wb:1444df.write_excel(wb, hidden_columns=cs.by_dtype(pl.String))14451446assert get_col_widths(check) == {"B": 0}144714481449@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])1450def test_excel_read_columns_nonlist_sequence(engine: ExcelSpreadsheetEngine) -> None:1451df = pl.DataFrame(1452{"colx": [1, 2, 3], "coly": ["aaa", "bbb", "ccc"], "colz": [0.5, 0.0, -1.0]}1453)1454xls = BytesIO()1455df.write_excel(xls, worksheet="data")14561457xldf = pl.read_excel(xls, engine=engine, columns=("colx", "coly"))1458expected = df.select("colx", "coly")1459assert_frame_equal(xldf, expected)14601461xldf = pl.read_excel(xls, engine=engine, columns="colx")1462expected = df.select("colx")1463assert_frame_equal(xldf, expected)146414651466