# Path: blob/main/py-polars/tests/unit/io/test_scan_lines.py
# 8424 views
from __future__ import annotations

from typing import TYPE_CHECKING, Any

import pytest

import polars as pl
from polars.exceptions import ComputeError
from polars.testing.asserts.frame import assert_frame_equal

if TYPE_CHECKING:
    from tests.conftest import PlMonkeyPatch


def lazified_read_lines(*a: Any, **kw: Any) -> pl.LazyFrame:
    """Call `pl.read_lines` with the given arguments and return it lazily.

    Used as a stand-in for `pl.scan_lines` so the same assertions can be run
    against the eager reader.
    """
    return pl.read_lines(*a, **kw).lazy()


@pytest.mark.parametrize("patch_scan_lines", [True, False])
@pytest.mark.parametrize("force_unit_chunk_size", [True, False])
@pytest.mark.parametrize("carriage_return", [True, False])
def test_scan_lines(
    patch_scan_lines: bool,
    force_unit_chunk_size: bool,
    carriage_return: bool,
    capfd: pytest.CaptureFixture[str],
    plmonkeypatch: PlMonkeyPatch,
) -> None:
    """Check `pl.scan_lines` output, column naming and slicing.

    Parameters
    ----------
    patch_scan_lines
        Replace `pl.scan_lines` with `lazified_read_lines` for the test.
    force_unit_chunk_size
        Set `POLARS_FORCE_NDJSON_READ_SIZE=1` and verify via the verbose log
        that the setting was picked up.
    carriage_return
        Wrap `pl.scan_lines` so every `\\n` in the input bytes is rewritten to
        `\\r\\n` before scanning; the expected frames stay the same.
    capfd
        Pytest capture fixture, used to read the verbose stderr output.
    plmonkeypatch
        Project monkeypatch fixture used for attribute and env patching.
    """
    if patch_scan_lines:
        plmonkeypatch.setattr(pl, "scan_lines", lazified_read_lines)
        assert pl.scan_lines is lazified_read_lines

    if carriage_return:
        # Capture whichever implementation is currently installed (the real
        # one or the patched one above) so the wrapper composes with it.
        inner = pl.scan_lines
        last_bytes = b""

        def wrapped(data: Any, *a: Any, **kw: Any) -> Any:
            """Rewrite LF to CRLF in `data`, then delegate to `inner`."""
            nonlocal last_bytes
            last_bytes = bytes.replace(data, b"\n", b"\r\n")
            return inner(last_bytes, *a, **kw)

        plmonkeypatch.setattr(pl, "scan_lines", wrapped)

        # Sanity-check that the wrapper is active and rewrites as intended.
        pl.scan_lines(b"\n\n")
        assert last_bytes == b"\r\n\r\n"

    if force_unit_chunk_size:
        plmonkeypatch.setenv("POLARS_FORCE_NDJSON_READ_SIZE", "1")

        with plmonkeypatch.context() as cx:
            # Discard anything captured so far so only this scan's log is read.
            capfd.readouterr()
            cx.setenv("POLARS_VERBOSE", "1")
            pl.scan_lines(b"").collect()
            capture = capfd.readouterr().err

        assert "fixed_read_size: Some(1)" in capture

    # Empty input yields an empty String column with the default name.
    assert_frame_equal(
        pl.scan_lines(b"").collect(),
        pl.DataFrame(schema={"lines": pl.String}),
    )

    # A custom column name is honoured.
    assert_frame_equal(
        pl.scan_lines(b"", name="A").collect(),
        pl.DataFrame(schema={"A": pl.String}),
    )

    # The default name is used again on a call without `name`.
    assert_frame_equal(
        pl.scan_lines(b"").collect(),
        pl.DataFrame(schema={"lines": pl.String}),
    )

    lf = pl.scan_lines(b"""\
AAA
BBB
CCC
DDD
EEE
""")

    # Positive-offset slice, without and with a row index.
    assert_frame_equal(
        lf.slice(2, 1).collect(),
        pl.DataFrame({"lines": ["CCC"]}),
    )

    assert_frame_equal(
        lf.with_row_index().slice(2, 1).collect(),
        pl.DataFrame(
            {"index": [2], "lines": ["CCC"]},
            schema_overrides={"index": pl.get_index_type()},
        ),
    )

    # Negative-offset slice, without and with a row index.
    assert_frame_equal(
        lf.slice(-2, 1).collect(),
        pl.DataFrame({"lines": ["DDD"]}),
    )

    assert_frame_equal(
        lf.with_row_index().slice(-2, 1).collect(),
        pl.DataFrame(
            {"index": [3], "lines": ["DDD"]},
            schema_overrides={"index": pl.get_index_type()},
        ),
    )

    def f(n_spaces: int, use_file_eol: bool) -> None:
        """Run slice and row-count checks on five identical lines.

        Each line consists of `n_spaces` spaces; `use_file_eol` controls
        whether the input ends with a trailing newline.
        """
        v = n_spaces * " "
        file_eol = "\n" if use_file_eol else ""

        lf = pl.scan_lines(f"{v}\n{v}\n{v}\n{v}\n{v}{file_eol}".encode())

        # Full read.
        q = lf

        assert_frame_equal(
            q.collect(),
            pl.DataFrame({"lines": 5 * [v]}),
        )

        assert q.select(pl.len()).collect().item() == 5

        # Skip all but the last line.
        q = lf.slice(4)

        assert_frame_equal(
            q.collect(),
            pl.DataFrame({"lines": [v]}),
        )

        assert q.select(pl.len()).collect().item() == 1

        q = lf.with_row_index().slice(4)

        assert_frame_equal(
            q.collect(),
            pl.DataFrame(
                {"index": [4], "lines": [v]},
                schema_overrides={"index": pl.get_index_type()},
            ),
        )

        assert q.select(pl.len()).collect().item() == 1

        # Offset equal to the number of lines gives an empty frame.
        q = lf.slice(5)

        assert_frame_equal(
            q.collect(),
            pl.DataFrame(schema={"lines": pl.String}),
        )

        assert q.select(pl.len()).collect().item() == 0

        # Negative offsets count from the end.
        q = lf.slice(-1)

        assert_frame_equal(
            q.collect(),
            pl.DataFrame({"lines": [v]}),
        )

        assert q.select(pl.len()).collect().item() == 1

        q = lf.with_row_index().slice(-1)

        assert_frame_equal(
            q.collect(),
            pl.DataFrame(
                {"index": [4], "lines": [v]},
                schema_overrides={"index": pl.get_index_type()},
            ),
        )

        assert q.select(pl.len()).collect().item() == 1

        q = lf.slice(-4)

        assert_frame_equal(
            q.collect(),
            pl.DataFrame({"lines": 4 * [v]}),
        )

        assert q.select(pl.len()).collect().item() == 4

        # A negative offset larger than the input returns every line.
        q = lf.slice(-99)

        assert_frame_equal(
            q.collect(),
            pl.DataFrame({"lines": 5 * [v]}),
        )

        assert q.select(pl.len()).collect().item() == 5

    # Empty lines are only exercised with a trailing newline here.
    f(n_spaces=0, use_file_eol=True)

    for n_spaces in [1, 100]:
        for use_file_eol in [True, False]:
            f(n_spaces, use_file_eol)


def test_scan_lines_negative_slice_reversed_read(
    plmonkeypatch: PlMonkeyPatch,
) -> None:
    """Check `tail(1)` on input whose first byte is invalid UTF-8.

    A full `collect()` raises `ComputeError`, while `tail(1)` returns the last
    line and its count.
    NOTE(review): presumably the negative slice is served by reading from the
    end of the input, so the invalid leading byte is never decoded - confirm
    against the reader implementation.
    """
    plmonkeypatch.setenv("POLARS_FORCE_NDJSON_READ_SIZE", "1")
    q = pl.scan_lines(b"\xff" + 5000 * b"abc\n")

    with pytest.raises(ComputeError, match="invalid utf8"):
        q.collect()

    assert q.tail(1).collect().item() == "abc"
    assert q.tail(1).select(pl.len()).collect().item() == 1

    # This succeeds because the line counter simply counts '\n' bytes without
    # parsing to string.
    assert q.select(pl.len()).collect().item() == 5000