GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/io/test_lazy_json.py

from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

import polars as pl
from polars.testing import assert_frame_equal

if TYPE_CHECKING:
    from pathlib import Path


@pytest.fixture
def foods_ndjson_path(io_files_path: Path) -> Path:
    return io_files_path / "foods1.ndjson"


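# Row indices reflect positions in the source file, not in the filtered output;
# with_row_index adds the given offset on top of those positions.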
def test_scan_ndjson(foods_ndjson_path: Path) -> None:
    df = pl.scan_ndjson(foods_ndjson_path, row_index_name="row_index").collect()
    assert df["row_index"].to_list() == list(range(27))

    df = (
        pl.scan_ndjson(foods_ndjson_path, row_index_name="row_index")
        .filter(pl.col("category") == pl.lit("vegetables"))
        .collect()
    )

    assert df["row_index"].to_list() == [0, 6, 11, 13, 14, 20, 25]

    df = (
        pl.scan_ndjson(foods_ndjson_path, row_index_name="row_index")
        .with_row_index("foo", 10)
        .filter(pl.col("category") == pl.lit("vegetables"))
        .collect()
    )

    assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]


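# An explicit schema overrides dtype inference; changing a dtype in the schema
# changes the dtype of the collected column accordingly.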
def test_scan_ndjson_with_schema(foods_ndjson_path: Path) -> None:
    schema = {
        "category": pl.Categorical,
        "calories": pl.Int64,
        "fats_g": pl.Float64,
        "sugars_g": pl.Int64,
    }
    df = pl.scan_ndjson(foods_ndjson_path, schema=schema).collect()
    assert df["category"].dtype == pl.Categorical
    assert df["calories"].dtype == pl.Int64
    assert df["fats_g"].dtype == pl.Float64
    assert df["sugars_g"].dtype == pl.Int64

    schema["sugars_g"] = pl.Float64
    df = pl.scan_ndjson(foods_ndjson_path, schema=schema).collect()
    assert df["sugars_g"].dtype == pl.Float64


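# Invalid arguments raise ValueError when the scan is constructed:
# infer_schema_length=0 and batch_size=0 are both rejected.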
def test_scan_ndjson_infer_0(foods_ndjson_path: Path) -> None:
    with pytest.raises(ValueError):
        pl.scan_ndjson(foods_ndjson_path, infer_schema_length=0)


def test_scan_ndjson_batch_size_zero() -> None:
    with pytest.raises(ValueError, match="invalid zero value"):
        pl.scan_ndjson("test.ndjson", batch_size=0)


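# Column projection over NDJSON whose string fields contain escaped quotes,
# newlines and tabs: the escapes must round-trip correctly.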
@pytest.mark.write_disk
def test_scan_with_projection(tmp_path: Path) -> None:
    tmp_path.mkdir(exist_ok=True)

    json = r"""
{"text": "\"hello", "id": 1}
{"text": "\n{\n\t\t\"inner\": \"json\n}\n", "id": 10}
{"id": 0, "text":"\"","date":"2013-08-03 15:17:23"}
{"id": 1, "text":"\"123\"","date":"2009-05-19 21:07:53"}
{"id": 2, "text":"/....","date":"2009-05-19 21:07:53"}
{"id": 3, "text":"\n\n..","date":"2"}
{"id": 4, "text":"\"'/\n...","date":"2009-05-19 21:07:53"}
{"id": 5, "text":".h\"h1hh\\21hi1e2emm...","date":"2009-05-19 21:07:53"}
{"id": 6, "text":"xxxx....","date":"2009-05-19 21:07:53"}
{"id": 7, "text":".\"quoted text\".","date":"2009-05-19 21:07:53"}
"""
    json_bytes = bytes(json, "utf-8")

    file_path = tmp_path / "escape_chars.json"
    file_path.write_bytes(json_bytes)

    actual = pl.scan_ndjson(file_path).select(["id", "text"]).collect()

    expected = pl.DataFrame(
        {
            "id": [1, 10, 0, 1, 2, 3, 4, 5, 6, 7],
            "text": [
                '"hello',
                '\n{\n\t\t"inner": "json\n}\n',
                '"',
                '"123"',
                "/....",
                "\n\n..",
                "\"'/\n...",
                '.h"h1hh\\21hi1e2emm...',
                "xxxx....",
                '."quoted text".',
            ],
        }
    )
    assert_frame_equal(actual, expected)


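# Projection pushdown: selecting one column shows up as PROJECT 1/4 COLUMNS
# in the plan, with no separate projection node.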
def test_projection_pushdown_ndjson(io_files_path: Path) -> None:
    file_path = io_files_path / "foods1.ndjson"
    df = pl.scan_ndjson(file_path).select(pl.col.calories)

    explain = df.explain()

    assert "simple π" not in explain
    assert "PROJECT 1/4 COLUMNS" in explain

    assert_frame_equal(df.collect(optimizations=pl.QueryOptFlags.none()), df.collect())


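# Predicate pushdown: the filter is pushed into the scan as a SELECTION
# instead of appearing as a separate FILTER node.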
def test_predicate_pushdown_ndjson(io_files_path: Path) -> None:
    file_path = io_files_path / "foods1.ndjson"
    df = pl.scan_ndjson(file_path).filter(pl.col.calories > 80)

    explain = df.explain()

    assert "FILTER" not in explain
    assert """SELECTION: [(col("calories")) > (80)]""" in explain

    assert_frame_equal(df.collect(optimizations=pl.QueryOptFlags.none()), df.collect())


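# n_rows limits the total row count across all files matched by a glob.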
def test_glob_n_rows(io_files_path: Path) -> None:
    file_path = io_files_path / "foods*.ndjson"
    df = pl.scan_ndjson(file_path, n_rows=40).collect()

    # 27 rows from foods1.ndjson and 13 from foods2.ndjson
    assert df.shape == (40, 4)

    # take first and last rows
    assert df[[0, 39]].to_dict(as_series=False) == {
        "category": ["vegetables", "seafood"],
        "calories": [45, 146],
        "fats_g": [0.5, 6.0],
        "sugars_g": [2, 2],
    }


# See #10661.
def test_json_no_unicode_truncate() -> None:
    assert pl.read_ndjson(rb'{"field": "\ufffd1234"}')[0, 0] == "\ufffd1234"


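# A list of paths is scanned into a single concatenated frame.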
def test_ndjson_list_arg(io_files_path: Path) -> None:
    first = io_files_path / "foods1.ndjson"
    second = io_files_path / "foods2.ndjson"

    df = pl.scan_ndjson(source=[first, second]).collect()
    assert df.shape == (54, 4)
    assert df.row(-1) == ("seafood", 194, 12.0, 1)
    assert df.row(0) == ("vegetables", 45, 0.5, 2)


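# A glob pattern should produce a single SCAN node in the plan, not a UNION of scans.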
def test_glob_single_scan(io_files_path: Path) -> None:
    file_path = io_files_path / "foods*.ndjson"
    df = pl.scan_ndjson(file_path, n_rows=40)

    explain = df.explain()

    assert explain.count("SCAN") == 1
    assert "UNION" not in explain


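# Empty and whitespace-only lines between records are skipped.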
def test_scan_ndjson_empty_lines_in_middle() -> None:
    assert_frame_equal(
        pl.scan_ndjson(
            f"""\
{{"a": 1}}
{" "}
{{"a": 2}}{" "}
{" "}
{{"a": 3}}
""".encode()
        ).collect(),
        pl.DataFrame({"a": [1, 2, 3]}),
    )


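# head/tail/slice must give the same result with and without query optimizations,
# with or without a row index.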
@pytest.mark.parametrize("row_index_offset", [None, 0, 20])
193
def test_scan_ndjson_slicing(
194
foods_ndjson_path: Path, row_index_offset: int | None
195
) -> None:
196
lf = pl.scan_ndjson(foods_ndjson_path)
197
198
if row_index_offset is not None:
199
lf = lf.with_row_index(offset=row_index_offset)
200
201
for q in [
202
lf.head(5),
203
lf.tail(5),
204
lf.head(0),
205
lf.tail(0),
206
lf.slice(-999, 3),
207
lf.slice(999, 3),
208
lf.slice(-999, 0),
209
lf.slice(999, 0),
210
lf.slice(-999),
211
lf.slice(-3, 999),
212
]:
213
assert_frame_equal(
214
q.collect(), q.collect(optimizations=pl.QueryOptFlags.none())
215
)
216
217
218
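# A value that cannot be parsed as the schema dtype raises ComputeError;
# with ignore_errors=True it is read as null instead.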
@pytest.mark.parametrize(
    "dtype",
    [
        pl.Boolean,
        pl.Int32,
        pl.Int64,
        pl.UInt64,
        pl.UInt32,
        pl.Float32,
        pl.Float64,
        pl.Datetime,
        pl.Date,
        pl.Null,
    ],
)
def test_scan_ndjson_raises_on_parse_error(dtype: pl.DataType) -> None:
    buf = b"""\
{"a": "AAAA"}
"""

    cx = (
        pytest.raises(
            pl.exceptions.ComputeError,
            match="got non-null value for NULL-typed column: AAAA",
        )
        if str(dtype) == "Null"
        else pytest.raises(pl.exceptions.ComputeError, match="cannot parse 'AAAA' as ")
    )

    with cx:
        pl.scan_ndjson(
            buf,
            schema={"a": dtype},
        ).collect()

    assert_frame_equal(
        pl.scan_ndjson(buf, schema={"a": dtype}, ignore_errors=True).collect(),
        pl.DataFrame({"a": [None]}, schema={"a": dtype}),
    )


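# With an explicit String schema, a numeric-looking value is kept as a string.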
def test_scan_ndjson_parse_string() -> None:
    assert_frame_equal(
        pl.scan_ndjson(
            b"""\
{"a": "123"}
""",
            schema={"a": pl.String},
        ).collect(),
        pl.DataFrame({"a": "123"}),
    )


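# Same parse-error behavior, but for a value nested inside a Struct field.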
def test_scan_ndjson_raises_on_parse_error_nested() -> None:
    buf = b"""\
{"a": {"b": "AAAA"}}
"""
    q = pl.scan_ndjson(
        buf,
        schema={"a": pl.Struct({"b": pl.Int64})},
    )

    with pytest.raises(pl.exceptions.ComputeError):
        q.collect()

    q = pl.scan_ndjson(
        buf, schema={"a": pl.Struct({"b": pl.Int64})}, ignore_errors=True
    )

    assert_frame_equal(
        q.collect(),
        pl.DataFrame({"a": [{"b": None}]}, schema={"a": pl.Struct({"b": pl.Int64})}),
    )


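# Nested objects and arrays can be read as their JSON string representation
# by declaring the columns as String in the schema.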
def test_scan_ndjson_nested_as_string() -> None:
    buf = b"""\
{"a": {"x": 1}, "b": [1,2,3], "c": {"y": null}, "d": [{"k": "abc"}, {"j": "123"}, {"l": 7}]}
"""

    df = pl.scan_ndjson(
        buf,
        schema={"a": pl.String, "b": pl.String, "c": pl.String, "d": pl.String},
    ).collect()

    assert_frame_equal(
        df,
        pl.DataFrame(
            {
                "a": '{"x": 1}',
                "b": "[1, 2, 3]",
                "c": '{"y": null}',
                "d": '[{"k": "abc"}, {"j": "123"}, {"l": 7}]',
            }
        ),
    )


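# schema_overrides replaces the inferred dtype for the given column (see issue 22514).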
def test_scan_ndjson_schema_overwrite_22514() -> None:
    buf = b"""\
{"a": 1}
"""

    q = pl.scan_ndjson(buf)

    # Baseline: Infers as Int64
    assert q.collect_schema() == {"a": pl.Int64}
    assert_frame_equal(q.collect(), pl.DataFrame({"a": 1}))

    q = pl.scan_ndjson(buf, schema_overrides={"a": pl.String})
    assert q.collect_schema() == {"a": pl.String}
    assert_frame_equal(q.collect(), pl.DataFrame({"a": "1"}))
331