# Path: blob/main/py-polars/tests/unit/io/test_lazy_json.py
# (scraped blob metadata: 6939 views)
from __future__ import annotations12from typing import TYPE_CHECKING34import pytest56import polars as pl7from polars.testing import assert_frame_equal89if TYPE_CHECKING:10from pathlib import Path111213@pytest.fixture14def foods_ndjson_path(io_files_path: Path) -> Path:15return io_files_path / "foods1.ndjson"161718def test_scan_ndjson(foods_ndjson_path: Path) -> None:19df = pl.scan_ndjson(foods_ndjson_path, row_index_name="row_index").collect()20assert df["row_index"].to_list() == list(range(27))2122df = (23pl.scan_ndjson(foods_ndjson_path, row_index_name="row_index")24.filter(pl.col("category") == pl.lit("vegetables"))25.collect()26)2728assert df["row_index"].to_list() == [0, 6, 11, 13, 14, 20, 25]2930df = (31pl.scan_ndjson(foods_ndjson_path, row_index_name="row_index")32.with_row_index("foo", 10)33.filter(pl.col("category") == pl.lit("vegetables"))34.collect()35)3637assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]383940def test_scan_ndjson_with_schema(foods_ndjson_path: Path) -> None:41schema = {42"category": pl.Categorical,43"calories": pl.Int64,44"fats_g": pl.Float64,45"sugars_g": pl.Int64,46}47df = pl.scan_ndjson(foods_ndjson_path, schema=schema).collect()48assert df["category"].dtype == pl.Categorical49assert df["calories"].dtype == pl.Int6450assert df["fats_g"].dtype == pl.Float6451assert df["sugars_g"].dtype == pl.Int645253schema["sugars_g"] = pl.Float6454df = pl.scan_ndjson(foods_ndjson_path, schema=schema).collect()55assert df["sugars_g"].dtype == pl.Float64565758def test_scan_ndjson_infer_0(foods_ndjson_path: Path) -> None:59with pytest.raises(ValueError):60pl.scan_ndjson(foods_ndjson_path, infer_schema_length=0)616263def test_scan_ndjson_batch_size_zero() -> None:64with pytest.raises(ValueError, match="invalid zero value"):65pl.scan_ndjson("test.ndjson", batch_size=0)666768@pytest.mark.write_disk69def test_scan_with_projection(tmp_path: Path) -> None:70tmp_path.mkdir(exist_ok=True)7172json = r"""73{"text": "\"hello", "id": 1}74{"text": 
"\n{\n\t\t\"inner\": \"json\n}\n", "id": 10}75{"id": 0, "text":"\"","date":"2013-08-03 15:17:23"}76{"id": 1, "text":"\"123\"","date":"2009-05-19 21:07:53"}77{"id": 2, "text":"/....","date":"2009-05-19 21:07:53"}78{"id": 3, "text":"\n\n..","date":"2"}79{"id": 4, "text":"\"'/\n...","date":"2009-05-19 21:07:53"}80{"id": 5, "text":".h\"h1hh\\21hi1e2emm...","date":"2009-05-19 21:07:53"}81{"id": 6, "text":"xxxx....","date":"2009-05-19 21:07:53"}82{"id": 7, "text":".\"quoted text\".","date":"2009-05-19 21:07:53"}83"""84json_bytes = bytes(json, "utf-8")8586file_path = tmp_path / "escape_chars.json"87file_path.write_bytes(json_bytes)8889actual = pl.scan_ndjson(file_path).select(["id", "text"]).collect()9091expected = pl.DataFrame(92{93"id": [1, 10, 0, 1, 2, 3, 4, 5, 6, 7],94"text": [95'"hello',96'\n{\n\t\t"inner": "json\n}\n',97'"',98'"123"',99"/....",100"\n\n..",101"\"'/\n...",102'.h"h1hh\\21hi1e2emm...',103"xxxx....",104'."quoted text".',105],106}107)108assert_frame_equal(actual, expected)109110111def test_projection_pushdown_ndjson(io_files_path: Path) -> None:112file_path = io_files_path / "foods1.ndjson"113df = pl.scan_ndjson(file_path).select(pl.col.calories)114115explain = df.explain()116117assert "simple π" not in explain118assert "PROJECT 1/4 COLUMNS" in explain119120assert_frame_equal(df.collect(optimizations=pl.QueryOptFlags.none()), df.collect())121122123def test_predicate_pushdown_ndjson(io_files_path: Path) -> None:124file_path = io_files_path / "foods1.ndjson"125df = pl.scan_ndjson(file_path).filter(pl.col.calories > 80)126127explain = df.explain()128129assert "FILTER" not in explain130assert """SELECTION: [(col("calories")) > (80)]""" in explain131132assert_frame_equal(df.collect(optimizations=pl.QueryOptFlags.none()), df.collect())133134135def test_glob_n_rows(io_files_path: Path) -> None:136file_path = io_files_path / "foods*.ndjson"137df = pl.scan_ndjson(file_path, n_rows=40).collect()138139# 27 rows from foods1.ndjson and 13 from foods2.ndjson140assert 
df.shape == (40, 4)141142# take first and last rows143assert df[[0, 39]].to_dict(as_series=False) == {144"category": ["vegetables", "seafood"],145"calories": [45, 146],146"fats_g": [0.5, 6.0],147"sugars_g": [2, 2],148}149150151# See #10661.152def test_json_no_unicode_truncate() -> None:153assert pl.read_ndjson(rb'{"field": "\ufffd1234"}')[0, 0] == "\ufffd1234"154155156def test_ndjson_list_arg(io_files_path: Path) -> None:157first = io_files_path / "foods1.ndjson"158second = io_files_path / "foods2.ndjson"159160df = pl.scan_ndjson(source=[first, second]).collect()161assert df.shape == (54, 4)162assert df.row(-1) == ("seafood", 194, 12.0, 1)163assert df.row(0) == ("vegetables", 45, 0.5, 2)164165166def test_glob_single_scan(io_files_path: Path) -> None:167file_path = io_files_path / "foods*.ndjson"168df = pl.scan_ndjson(file_path, n_rows=40)169170explain = df.explain()171172assert explain.count("SCAN") == 1173assert "UNION" not in explain174175176def test_scan_ndjson_empty_lines_in_middle() -> None:177assert_frame_equal(178pl.scan_ndjson(179f"""\180{{"a": 1}}181{" "}182{{"a": 2}}{" "}183{" "}184{{"a": 3}}185""".encode()186).collect(),187pl.DataFrame({"a": [1, 2, 3]}),188)189190191@pytest.mark.parametrize("row_index_offset", [None, 0, 20])192def test_scan_ndjson_slicing(193foods_ndjson_path: Path, row_index_offset: int | None194) -> None:195lf = pl.scan_ndjson(foods_ndjson_path)196197if row_index_offset is not None:198lf = lf.with_row_index(offset=row_index_offset)199200for q in [201lf.head(5),202lf.tail(5),203lf.head(0),204lf.tail(0),205lf.slice(-999, 3),206lf.slice(999, 3),207lf.slice(-999, 0),208lf.slice(999, 0),209lf.slice(-999),210lf.slice(-3, 999),211]:212assert_frame_equal(213q.collect(), q.collect(optimizations=pl.QueryOptFlags.none())214)215216217@pytest.mark.parametrize(218"dtype",219[220pl.Boolean,221pl.Int32,222pl.Int64,223pl.UInt64,224pl.UInt32,225pl.Float32,226pl.Float64,227pl.Datetime,228pl.Date,229pl.Null,230],231)232def 
test_scan_ndjson_raises_on_parse_error(dtype: pl.DataType) -> None:233buf = b"""\234{"a": "AAAA"}235"""236237cx = (238pytest.raises(239pl.exceptions.ComputeError,240match="got non-null value for NULL-typed column: AAAA",241)242if str(dtype) == "Null"243else pytest.raises(pl.exceptions.ComputeError, match="cannot parse 'AAAA' as ")244)245246with cx:247pl.scan_ndjson(248buf,249schema={"a": dtype},250).collect()251252assert_frame_equal(253pl.scan_ndjson(buf, schema={"a": dtype}, ignore_errors=True).collect(),254pl.DataFrame({"a": [None]}, schema={"a": dtype}),255)256257258def test_scan_ndjson_parse_string() -> None:259assert_frame_equal(260pl.scan_ndjson(261b"""\262{"a": "123"}263""",264schema={"a": pl.String},265).collect(),266pl.DataFrame({"a": "123"}),267)268269270def test_scan_ndjson_raises_on_parse_error_nested() -> None:271buf = b"""\272{"a": {"b": "AAAA"}}273"""274q = pl.scan_ndjson(275buf,276schema={"a": pl.Struct({"b": pl.Int64})},277)278279with pytest.raises(pl.exceptions.ComputeError):280q.collect()281282q = pl.scan_ndjson(283buf, schema={"a": pl.Struct({"b": pl.Int64})}, ignore_errors=True284)285286assert_frame_equal(287q.collect(),288pl.DataFrame({"a": [{"b": None}]}, schema={"a": pl.Struct({"b": pl.Int64})}),289)290291292def test_scan_ndjson_nested_as_string() -> None:293buf = b"""\294{"a": {"x": 1}, "b": [1,2,3], "c": {"y": null}, "d": [{"k": "abc"}, {"j": "123"}, {"l": 7}]}295"""296297df = pl.scan_ndjson(298buf,299schema={"a": pl.String, "b": pl.String, "c": pl.String, "d": pl.String},300).collect()301302assert_frame_equal(303df,304pl.DataFrame(305{306"a": '{"x": 1}',307"b": "[1, 2, 3]",308"c": '{"y": null}',309"d": '[{"k": "abc"}, {"j": "123"}, {"l": 7}]',310}311),312)313314315def test_scan_ndjson_schema_overwrite_22514() -> None:316buf = b"""\317{"a": 1}318"""319320q = pl.scan_ndjson(buf)321322# Baseline: Infers as Int64323assert q.collect_schema() == {"a": pl.Int64}324assert_frame_equal(q.collect(), pl.DataFrame({"a": 1}))325326q = 
pl.scan_ndjson(buf, schema_overrides={"a": pl.String})327assert q.collect_schema() == {"a": pl.String}328assert_frame_equal(q.collect(), pl.DataFrame({"a": "1"}))329330331