# Path: blob/main/py-polars/tests/unit/io/test_lazy_json.py
# (scraped blob metadata: 6939 views)
from __future__ import annotations12from typing import TYPE_CHECKING34import pytest56import polars as pl7from polars.testing import assert_frame_equal89if TYPE_CHECKING:10from pathlib import Path111213@pytest.fixture14def foods_ndjson_path(io_files_path: Path) -> Path:15return io_files_path / "foods1.ndjson"161718def test_scan_ndjson(foods_ndjson_path: Path) -> None:19df = pl.scan_ndjson(foods_ndjson_path, row_index_name="row_index").collect()20assert df["row_index"].to_list() == list(range(27))2122df = (23pl.scan_ndjson(foods_ndjson_path, row_index_name="row_index")24.filter(pl.col("category") == pl.lit("vegetables"))25.collect()26)2728assert df["row_index"].to_list() == [0, 6, 11, 13, 14, 20, 25]2930df = (31pl.scan_ndjson(foods_ndjson_path, row_index_name="row_index")32.with_row_index("foo", 10)33.filter(pl.col("category") == pl.lit("vegetables"))34.collect()35)3637assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]383940def test_scan_ndjson_with_schema(foods_ndjson_path: Path) -> None:41schema = {42"category": pl.Categorical,43"calories": pl.Int64,44"fats_g": pl.Float64,45"sugars_g": pl.Int64,46}47df = pl.scan_ndjson(foods_ndjson_path, schema=schema).collect()48assert df["category"].dtype == pl.Categorical49assert df["calories"].dtype == pl.Int6450assert df["fats_g"].dtype == pl.Float6451assert df["sugars_g"].dtype == pl.Int645253schema["sugars_g"] = pl.Float6454df = pl.scan_ndjson(foods_ndjson_path, schema=schema).collect()55assert df["sugars_g"].dtype == pl.Float64565758def test_scan_ndjson_infer_0(foods_ndjson_path: Path) -> None:59with pytest.raises(ValueError):60pl.scan_ndjson(foods_ndjson_path, infer_schema_length=0)616263def test_scan_ndjson_batch_size_zero() -> None:64with pytest.raises(ValueError, match="invalid zero value"):65pl.scan_ndjson("test.ndjson", batch_size=0)666768@pytest.mark.write_disk69def test_scan_with_projection(tmp_path: Path) -> None:70tmp_path.mkdir(exist_ok=True)7172json = r"""73{"text": "\"hello", "id": 1}74{"text": 
"\n{\n\t\t\"inner\": \"json\n}\n", "id": 10}75{"id": 0, "text":"\"","date":"2013-08-03 15:17:23"}76{"id": 1, "text":"\"123\"","date":"2009-05-19 21:07:53"}77{"id": 2, "text":"/....","date":"2009-05-19 21:07:53"}78{"id": 3, "text":"\n\n..","date":"2"}79{"id": 4, "text":"\"'/\n...","date":"2009-05-19 21:07:53"}80{"id": 5, "text":".h\"h1hh\\21hi1e2emm...","date":"2009-05-19 21:07:53"}81{"id": 6, "text":"xxxx....","date":"2009-05-19 21:07:53"}82{"id": 7, "text":".\"quoted text\".","date":"2009-05-19 21:07:53"}83"""84json_bytes = bytes(json, "utf-8")8586file_path = tmp_path / "escape_chars.json"87file_path.write_bytes(json_bytes)8889actual = pl.scan_ndjson(file_path).select(["id", "text"]).collect()9091expected = pl.DataFrame(92{93"id": [1, 10, 0, 1, 2, 3, 4, 5, 6, 7],94"text": [95'"hello',96'\n{\n\t\t"inner": "json\n}\n',97'"',98'"123"',99"/....",100"\n\n..",101"\"'/\n...",102'.h"h1hh\\21hi1e2emm...',103"xxxx....",104'."quoted text".',105],106}107)108assert_frame_equal(actual, expected)109110111def test_projection_pushdown_ndjson(io_files_path: Path) -> None:112file_path = io_files_path / "foods1.ndjson"113df = pl.scan_ndjson(file_path).select(pl.col.calories)114115explain = df.explain()116117assert "simple π" not in explain118assert "PROJECT 1/4 COLUMNS" in explain119120assert_frame_equal(df.collect(optimizations=pl.QueryOptFlags.none()), df.collect())121122123def test_predicate_pushdown_ndjson(io_files_path: Path) -> None:124file_path = io_files_path / "foods1.ndjson"125df = pl.scan_ndjson(file_path).filter(pl.col.calories > 80)126127explain = df.explain()128129assert "FILTER" not in explain130assert """SELECTION: [(col("calories")) > (80)]""" in explain131132assert_frame_equal(df.collect(optimizations=pl.QueryOptFlags.none()), df.collect())133134135def test_glob_n_rows(io_files_path: Path) -> None:136file_path = io_files_path / "foods*.ndjson"137df = pl.scan_ndjson(file_path, n_rows=40).collect()138139# 27 rows from foods1.ndjson and 13 from foods2.ndjson140assert 
df.shape == (40, 4)141142# take first and last rows143assert df[[0, 39]].to_dict(as_series=False) == {144"category": ["vegetables", "seafood"],145"calories": [45, 146],146"fats_g": [0.5, 6.0],147"sugars_g": [2, 2],148}149150151# See #10661.152def test_json_no_unicode_truncate() -> None:153assert pl.read_ndjson(rb'{"field": "\ufffd1234"}')[0, 0] == "\ufffd1234"154155156def test_ndjson_list_arg(io_files_path: Path) -> None:157first = io_files_path / "foods1.ndjson"158second = io_files_path / "foods2.ndjson"159160df = pl.scan_ndjson(source=[first, second]).collect()161assert df.shape == (54, 4)162assert df.row(-1) == ("seafood", 194, 12.0, 1)163assert df.row(0) == ("vegetables", 45, 0.5, 2)164165166def test_glob_single_scan(io_files_path: Path) -> None:167file_path = io_files_path / "foods*.ndjson"168df = pl.scan_ndjson(file_path, n_rows=40)169170explain = df.explain()171172assert explain.count("SCAN") == 1173assert "UNION" not in explain174175176def test_scan_ndjson_empty_lines_in_middle() -> None:177assert_frame_equal(178pl.scan_ndjson(179f"""\180{{"a": 1}}181{" "}182{{"a": 2}}{" "}183{" "}184{{"a": 3}}185""".encode()186).collect(),187pl.DataFrame({"a": [1, 2, 3]}),188)189190191@pytest.mark.parametrize("row_index_offset", [None, 0, 20])192def test_scan_ndjson_slicing(193foods_ndjson_path: Path, row_index_offset: int | None194) -> None:195lf = pl.scan_ndjson(foods_ndjson_path)196197if row_index_offset is not None:198lf = lf.with_row_index(offset=row_index_offset)199200for q in [201lf.head(5),202lf.tail(5),203lf.head(0),204lf.tail(0),205lf.slice(-999, 3),206lf.slice(999, 3),207lf.slice(-999, 0),208lf.slice(999, 0),209lf.slice(-999),210lf.slice(-3, 999),211]:212assert_frame_equal(213q.collect(), q.collect(optimizations=pl.QueryOptFlags.none())214)215216217@pytest.mark.parametrize(218"dtype",219[220pl.Boolean,221pl.Int32,222pl.Int64,223pl.UInt64,224pl.UInt32,225pl.Float32,226pl.Float64,227pl.Datetime,228pl.Date,229pl.Null,230],231)232def 
test_scan_ndjson_raises_on_parse_error(dtype: pl.DataType) -> None:233buf = b"""\234{"a": "AAAA"}235"""236237cx = (238pytest.raises(239pl.exceptions.ComputeError,240match="got non-null value for NULL-typed column: AAAA",241)242if str(dtype) == "Null"243else pytest.raises(pl.exceptions.ComputeError, match="cannot parse 'AAAA' as ")244)245246with cx:247pl.scan_ndjson(248buf,249schema={"a": dtype},250).collect()251252assert_frame_equal(253pl.scan_ndjson(buf, schema={"a": dtype}, ignore_errors=True).collect(),254pl.DataFrame({"a": [None]}, schema={"a": dtype}),255)256257258def test_scan_ndjson_parse_string() -> None:259assert_frame_equal(260pl.scan_ndjson(261b"""\262{"a": "123"}263""",264schema={"a": pl.String},265).collect(),266pl.DataFrame({"a": "123"}),267)268269270def test_scan_ndjson_raises_on_parse_error_nested() -> None:271buf = b"""\272{"a": {"b": "AAAA"}}273"""274q = pl.scan_ndjson(275buf,276schema={"a": pl.Struct({"b": pl.Int64})},277)278279with pytest.raises(pl.exceptions.ComputeError):280q.collect()281282q = pl.scan_ndjson(283buf, schema={"a": pl.Struct({"b": pl.Int64})}, ignore_errors=True284)285286assert_frame_equal(287q.collect(),288pl.DataFrame({"a": [{"b": None}]}, schema={"a": pl.Struct({"b": pl.Int64})}),289)290291292def test_scan_ndjson_nested_as_string() -> None:293buf = b"""\294{"a": {"x": 1}, "b": [1,2,3], "c": {"y": null}, "d": [{"k": "abc"}, {"j": "123"}, {"l": 7}]}295"""296297df = pl.scan_ndjson(298buf,299schema={"a": pl.String, "b": pl.String, "c": pl.String, "d": pl.String},300).collect()301302assert_frame_equal(303df,304pl.DataFrame(305{306"a": '{"x": 1}',307"b": "[1, 2, 3]",308"c": '{"y": null}',309"d": '[{"k": "abc"}, {"j": "123"}, {"l": 7}]',310}311),312)313314315def test_scan_ndjson_schema_overwrite_22514() -> None:316buf = b"""\317{"a": 1}318"""319320q = pl.scan_ndjson(buf)321322# Baseline: Infers as Int64323assert q.collect_schema() == {"a": pl.Int64}324assert_frame_equal(q.collect(), pl.DataFrame({"a": 1}))325326q = 
pl.scan_ndjson(buf, schema_overrides={"a": pl.String})327assert q.collect_schema() == {"a": pl.String}328assert_frame_equal(q.collect(), pl.DataFrame({"a": "1"}))329330331