Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/io/test_json.py
6939 views
1
from __future__ import annotations
2
3
import gzip
4
import io
5
import json
6
import zlib
7
from collections import OrderedDict
8
from datetime import datetime
9
from decimal import Decimal as D
10
from io import BytesIO
11
from typing import TYPE_CHECKING
12
13
import zstandard
14
15
if TYPE_CHECKING:
16
from pathlib import Path
17
18
import orjson
19
import pytest
20
21
import polars as pl
22
from polars.exceptions import ComputeError
23
from polars.testing import assert_frame_equal, assert_series_equal
24
25
26
@pytest.mark.may_fail_cloud  # reason: object
def test_write_json() -> None:
    """Serialize a simple frame to a JSON array, then read it back unchanged."""
    frame = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", None]})
    serialized = frame.write_json()
    assert serialized == '[{"a":1,"b":"a"},{"a":2,"b":"b"},{"a":3,"b":null}]'

    # Round trip: the emitted bytes must parse back to an identical frame.
    buf = io.BytesIO(serialized.encode())
    assert_frame_equal(pl.read_json(buf), frame)
38
39
40
def test_write_json_categoricals() -> None:
    """Categorical columns serialize as their underlying string values."""
    frame = pl.DataFrame({"column": ["test1", "test2", "test3", "test4"]}).with_columns(
        pl.col("column").cast(pl.Categorical)
    )
    assert frame.write_json() == (
        '[{"column":"test1"},{"column":"test2"},'
        '{"column":"test3"},{"column":"test4"}]'
    )
47
48
49
def test_write_json_duration() -> None:
    """Durations serialize to ISO-8601-style strings.

    The exact format is not guaranteed; this only pins round-tripping behavior.
    """
    durations = pl.Series(
        "a", [91762939, 91762890, 6020836], dtype=pl.Duration(time_unit="ms")
    )
    out = durations.to_frame().write_json()
    assert out == '[{"a":"PT91762.939S"},{"a":"PT91762.89S"},{"a":"PT6020.836S"}]'
62
63
64
def test_write_json_time() -> None:
    """Time values (nanoseconds since midnight) serialize as HH:MM:SS[.frac]."""
    ns_per_second = 1_000_000_000
    times = pl.Series(
        "a",
        [7291 * ns_per_second + 54321, 54321 * ns_per_second + 12345, 86399 * ns_per_second],
        dtype=pl.Time,
    )
    assert times.to_frame().write_json() == (
        '[{"a":"02:01:31.000054321"},{"a":"15:05:21.000012345"},{"a":"23:59:59"}]'
    )
80
81
82
def test_write_json_list_of_arrays() -> None:
    """List(Array) columns serialize as nested JSON arrays."""
    column = pl.Series(
        "a",
        [[(1.0, 2.0, 3.0), (4.0, 5.0, 6.0)], [(7.0, 8.0, 9.0)]],
        dtype=pl.List(pl.Array(pl.Float32, 3)),
    )
    assert column.to_frame().write_json() == (
        '[{"a":[[1.0,2.0,3.0],[4.0,5.0,6.0]]},{"a":[[7.0,8.0,9.0]]}]'
    )
95
96
97
def test_write_json_decimal() -> None:
    """Decimals serialize as strings; null stays null.

    The exact format is not guaranteed; this only pins round-tripping behavior.
    """
    frame = pl.Series("a", [D("1.00"), D("2.00"), None]).to_frame()
    assert frame.write_json() == '[{"a":"1.00"},{"a":"2.00"},{"a":null}]'
103
104
105
def test_json_infer_schema_length_11148() -> None:
    """Fields beyond the inference window raise; widening the window succeeds."""
    payload = json.dumps([{"col1": 1}] * 2 + [{"col1": 1, "col2": 2}]).encode()

    # Only the first two rows are scanned, so the third row's extra key errors.
    with pytest.raises(
        pl.exceptions.ComputeError, match="extra field in struct data: col2"
    ):
        pl.read_json(payload, infer_schema_length=2)

    # Scanning all three rows discovers both columns.
    result = pl.read_json(payload, infer_schema_length=3)
    assert set(result.columns) == {"col1", "col2"}
115
116
117
def test_to_from_buffer_arraywise_schema() -> None:
    """An explicit `schema` selects exactly those columns, nulling missing ones."""
    buf = io.StringIO(
        """
        [
            {"a": 5, "b": "foo", "c": null},
            {"a": 11.4, "b": null, "c": true, "d": 8},
            {"a": -25.8, "b": "bar", "c": false}
        ]"""
    )

    read_df = pl.read_json(buf, schema={"b": pl.String, "e": pl.Int16})

    # "b" is taken from the data; "e" does not exist and becomes all-null Int16.
    expected = pl.DataFrame(
        {
            "b": pl.Series(["foo", None, "bar"], dtype=pl.String),
            "e": pl.Series([None, None, None], dtype=pl.Int16),
        }
    )
    assert_frame_equal(read_df, expected)
138
139
140
def test_to_from_buffer_arraywise_schema_override() -> None:
    """`schema_overrides` keeps inference but forces dtypes for listed columns."""
    buf = io.StringIO(
        """
        [
            {"a": 5, "b": "foo", "c": null},
            {"a": 11.4, "b": null, "c": true, "d": 8},
            {"a": -25.8, "b": "bar", "c": false}
        ]"""
    )

    read_df = pl.read_json(buf, schema_overrides={"c": pl.Int64, "d": pl.Float64})

    # "a" and "b" are inferred; booleans in "c" are cast to Int64 per override.
    expected = pl.DataFrame(
        {
            "a": pl.Series([5, 11.4, -25.8], dtype=pl.Float64),
            "b": pl.Series(["foo", None, "bar"], dtype=pl.String),
            "c": pl.Series([None, 1, 0], dtype=pl.Int64),
            "d": pl.Series([None, 8, None], dtype=pl.Float64),
        }
    )
    assert_frame_equal(read_df, expected, check_column_order=False)
164
165
166
def test_write_ndjson() -> None:
    """NDJSON serialization emits one JSON object per line; round trip holds."""
    frame = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", None]})
    text = frame.write_ndjson()
    assert text == '{"a":1,"b":"a"}\n{"a":2,"b":"b"}\n{"a":3,"b":null}\n'

    # Round trip through a binary buffer.
    assert_frame_equal(pl.read_ndjson(io.BytesIO(text.encode())), frame)
177
178
179
def test_write_ndjson_with_trailing_newline() -> None:
    """A trailing newline after the last NDJSON record is accepted on read."""
    # Renamed local from `input` — don't shadow the builtin of the same name.
    payload = """{"Column1":"Value1"}\n"""
    df = pl.read_ndjson(io.StringIO(payload))

    expected = pl.DataFrame({"Column1": ["Value1"]})
    assert_frame_equal(df, expected)
185
186
187
def test_read_ndjson_empty_array() -> None:
    """An empty JSON array nested inside a struct survives reading."""
    frame = pl.read_ndjson(io.StringIO("""{"foo": {"bar": []}}"""))
    assert frame.to_dict(as_series=False) == {"foo": [{"bar": []}]}
191
192
193
def test_ndjson_nested_null() -> None:
    """Schema inference for `{}` inside a nested list (ref: #11301)."""
    payload = """{"foo":{"bar":[{}]}}"""
    frame = pl.read_ndjson(io.StringIO(payload))

    # 'bar' must be recognized as a list of structs even though the structs are
    # empty; empty structs are the closest representation available since a
    # true empty-struct dtype is not fully supported.
    expected_schema = {"foo": pl.Struct([pl.Field("bar", pl.List(pl.Struct({})))])}
    assert frame.schema == expected_schema
    assert frame.to_dict(as_series=False) == {"foo": [{"bar": [{}]}]}
202
203
204
def test_ndjson_nested_string_int() -> None:
    """Mixed int/string values in a nested struct field unify to strings."""
    payload = """{"Accumulables":[{"Value":32395888},{"Value":"539454"}]}"""
    result = pl.read_ndjson(io.StringIO(payload)).to_dict(as_series=False)
    assert result == {"Accumulables": [[{"Value": "32395888"}, {"Value": "539454"}]]}
209
210
211
def test_json_supertype_infer() -> None:
    """Polars' native JSON inference matches inference via Python records."""
    json_string = """[
    {"c":[{"b": [], "a": "1"}]},
    {"c":[{"b":[]}]},
    {"c":[{"b":["1"], "a": "1"}]}]
    """
    via_python = pl.from_records(json.loads(json_string))
    via_polars = pl.read_json(io.StringIO(json_string))
    assert_frame_equal(via_python, via_polars)
220
221
222
@pytest.mark.may_fail_cloud  # reason: object
def test_ndjson_sliced_list_serialization() -> None:
    """Writing a sliced row emits only that row's (offset) list data."""
    frame = pl.DataFrame({"col1": [0, 2], "col2": [[3, 4, 5], [6, 7, 8]]})
    sink = io.BytesIO()
    frame[1, :].write_ndjson(sink)
    assert sink.getvalue() == b'{"col1":2,"col2":[6,7,8]}\n'
230
231
232
def test_json_deserialize_9687() -> None:
    """Column-oriented JSON (dict of lists) reads as a single row of lists."""
    prices = [1263.0, 1263.0, 1263.0]
    response = {
        "volume": [0.0, 0.0, 0.0],
        "open": prices,
        "close": prices,
        "high": prices,
        "low": prices,
    }

    result = pl.read_json(json.dumps(response).encode())

    # Each top-level list becomes a single-row list value.
    assert result.to_dict(as_series=False) == {k: [v] for k, v in response.items()}
244
245
246
def test_ndjson_ignore_errors() -> None:
    """`ignore_errors=True` coerces or nulls-out inconsistently typed fields.

    Three reads of the same payload: plain inference (values become strings),
    an explicit `schema` (only listed columns parsed, bad values nulled), and
    `schema_overrides` (full inference plus a forced dtype for one column).
    """
    # this schema is inconsistent as "value" is string and object
    jsonl = r"""{"Type":"insert","Key":[1],"SeqNo":1,"Timestamp":1,"Fields":[{"Name":"added_id","Value":2},{"Name":"body","Value":{"a": 1}}]}
{"Type":"insert","Key":[1],"SeqNo":1,"Timestamp":1,"Fields":[{"Name":"added_id","Value":2},{"Name":"body","Value":{"a": 1}}]}"""

    buf = io.BytesIO(jsonl.encode())

    # check if we can replace with nulls: under inference the mixed "Value"
    # field is unified to String (objects rendered as their JSON text)
    assert pl.read_ndjson(buf, ignore_errors=True).to_dict(as_series=False) == {
        "Type": ["insert", "insert"],
        "Key": [[1], [1]],
        "SeqNo": [1, 1],
        "Timestamp": [1, 1],
        "Fields": [
            [{"Name": "added_id", "Value": "2"}, {"Name": "body", "Value": '{"a": 1}'}],
            [{"Name": "added_id", "Value": "2"}, {"Name": "body", "Value": '{"a": 1}'}],
        ],
    }

    schema = {
        "Fields": pl.List(
            pl.Struct([pl.Field("Name", pl.String), pl.Field("Value", pl.Int64)])
        )
    }
    # schema argument only parses Fields; the object value cannot be an Int64,
    # so it is replaced with null rather than raising
    assert pl.read_ndjson(buf, schema=schema, ignore_errors=True).to_dict(
        as_series=False
    ) == {
        "Fields": [
            [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}],
            [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}],
        ]
    }

    # schema_overrides argument does schema inference, but overrides Fields
    result = pl.read_ndjson(buf, schema_overrides=schema, ignore_errors=True)
    expected = {
        "Type": ["insert", "insert"],
        "Key": [[1], [1]],
        "SeqNo": [1, 1],
        "Timestamp": [1, 1],
        "Fields": [
            [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}],
            [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}],
        ],
    }
    assert result.to_dict(as_series=False) == expected
293
294
295
def test_json_null_infer() -> None:
    """A column containing only null infers as the Null dtype."""
    # Renamed local from `json` — it shadowed the stdlib `json` module imported
    # at the top of this file.
    buf = BytesIO(
        bytes(
            """
            [
              {
                "a": 1,
                "b": null
              }
            ]
            """,
            "UTF-8",
        )
    )

    assert pl.read_json(buf).schema == OrderedDict({"a": pl.Int64, "b": pl.Null})
311
312
313
def test_ndjson_null_buffer() -> None:
    """Zero, empty-list, empty-object and null columns each infer a dtype."""
    data = io.BytesIO(
        b"""\
{"id": 1, "zero_column": 0, "empty_array_column": [], "empty_object_column": {}, "null_column": null}
{"id": 2, "zero_column": 0, "empty_array_column": [], "empty_object_column": {}, "null_column": null}
{"id": 3, "zero_column": 0, "empty_array_column": [], "empty_object_column": {}, "null_column": null}
{"id": 4, "zero_column": 0, "empty_array_column": [], "empty_object_column": {}, "null_column": null}
"""
    )

    expected_schema = OrderedDict(
        [
            ("id", pl.Int64),
            ("zero_column", pl.Int64),
            ("empty_array_column", pl.List(pl.Null)),
            ("empty_object_column", pl.Struct([])),
            ("null_column", pl.Null),
        ]
    )
    assert pl.read_ndjson(data).schema == expected_schema
332
333
334
def test_ndjson_null_inference_13183() -> None:
    """Nulls appearing after floats must not break float inference."""
    payload = b"""
    {"map": "a", "start_time": 0.795, "end_time": 1.495}
    {"map": "a", "start_time": 1.6239999999999999, "end_time": 2.0540000000000003}
    {"map": "c", "start_time": 2.184, "end_time": 2.645}
    {"map": "a", "start_time": null, "end_time": null}
    """.strip()

    result = pl.read_ndjson(payload).to_dict(as_series=False)
    assert result == {
        "map": ["a", "a", "c", "a"],
        "start_time": [0.795, 1.6239999999999999, 2.184, None],
        "end_time": [1.495, 2.0540000000000003, 2.645, None],
    }
347
348
349
def test_ndjson_expected_null_got_object_inference_22807() -> None:
    """A late non-empty nested object must refine an earlier Null inference."""
    # 100 empty lists first, then one row that reveals the inner struct type.
    lines = ['{"a":[]}'] * 100 + ['{"a":[{"b":[]}]}']
    buf = io.StringIO("\n".join(lines) + "\n")

    expected = {"a": pl.List(pl.Struct([pl.Field("b", pl.List(pl.Null))]))}
    assert pl.read_ndjson(buf, infer_schema_length=None).schema == expected
360
361
362
@pytest.mark.may_fail_cloud  # reason: object
@pytest.mark.write_disk
def test_json_wrong_input_handle_textio(tmp_path: Path) -> None:
    """A text-mode file handle, while not the intended input, is handled gracefully."""
    frame = pl.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})

    target = tmp_path / "test.ndjson"
    frame.write_ndjson(target)

    # Text mode (no "b") is the "wrong" way to pass the file; it must still work.
    with target.open() as handle:
        assert_frame_equal(pl.read_ndjson(handle), frame)
378
379
380
def test_json_normalize() -> None:
    """Flatten nested records with `json_normalize` across several settings.

    Covers: an empty input with an explicit schema, `max_level` 0 vs 1,
    a custom `separator`, and a custom `encoder` (orjson, which emits bytes).
    """
    data = [
        {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
        {"name": {"given": "Mark", "family": "Regner"}},
        {"id": 2, "name": "Faye Raker"},
    ]

    # Empty input with an explicit schema yields the schema's (empty) columns.
    assert pl.json_normalize([], schema=pl.Schema({"test": pl.Int32})).to_dict(
        as_series=False
    ) == {
        "test": [],
    }

    # max_level=0: nested dicts are not flattened; they are serialized to text.
    assert pl.json_normalize(data, max_level=0).to_dict(as_series=False) == {
        "id": [1, None, 2],
        "name": [
            '{"first": "Coleen", "last": "Volk"}',
            '{"given": "Mark", "family": "Regner"}',
            "Faye Raker",
        ],
    }

    # max_level=1: one level of flattening; scalar "name" stays in its own column.
    assert pl.json_normalize(data, max_level=1).to_dict(as_series=False) == {
        "id": [1, None, 2],
        "name.first": ["Coleen", None, None],
        "name.last": ["Volk", None, None],
        "name.given": [None, "Mark", None],
        "name.family": [None, "Regner", None],
        "name": [None, None, "Faye Raker"],
    }

    data = [
        {
            "id": 1,
            "name": "Cole Volk",
            "fitness": {"height": 130, "weight": 60},
        },
        {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
        {
            "id": 2,
            "name": "Faye Raker",
            "fitness": {"height": 130, "weight": 60},
        },
    ]
    # Custom separator is used when joining nested keys.
    assert pl.json_normalize(data, max_level=1, separator=":").to_dict(
        as_series=False,
    ) == {
        "id": [1, None, 2],
        "name": ["Cole Volk", "Mark Reg", "Faye Raker"],
        "fitness:height": [130, 130, 130],
        "fitness:weight": [60, 60, 60],
    }
    # max_level=0 with the default encoder: nested dicts become str JSON.
    assert pl.json_normalize(data, max_level=0).to_dict(
        as_series=False,
    ) == {
        "id": [1, None, 2],
        "name": ["Cole Volk", "Mark Reg", "Faye Raker"],
        "fitness": [
            '{"height": 130, "weight": 60}',
            '{"height": 130, "weight": 60}',
            '{"height": 130, "weight": 60}',
        ],
    }
    # A custom encoder controls the serialized form; orjson.dumps emits bytes.
    assert pl.json_normalize(data, max_level=0, encoder=orjson.dumps).to_dict(
        as_series=False,
    ) == {
        "id": [1, None, 2],
        "name": ["Cole Volk", "Mark Reg", "Faye Raker"],
        "fitness": [
            b'{"height":130,"weight":60}',
            b'{"height":130,"weight":60}',
            b'{"height":130,"weight":60}',
        ],
    }
454
455
456
def test_empty_json() -> None:
    """`{}` parses to an empty frame; `{"j":{}}` to one empty-struct row."""
    frame = pl.read_json(io.StringIO("{}"))
    assert isinstance(frame, pl.DataFrame)
    assert frame.shape == (0, 0)

    frame = pl.read_json(b'{"j":{}}')
    assert frame.shape == (1, 1)
    assert frame.dtypes == [pl.Struct([])]
464
465
466
def test_compressed_json() -> None:
    """gzip-, zlib- and zstd-compressed JSON are all decompressed transparently."""
    records = [
        {"id": 1, "name": "Alice", "trusted": True},
        {"id": 2, "name": "Bob", "trusted": True},
        {"id": 3, "name": "Carol", "trusted": False},
    ]
    expected = pl.DataFrame(records, orient="row")
    json_bytes = json.dumps(records).encode()

    # Each supported compression format round-trips to the same frame.
    for compress in (gzip.compress, zlib.compress, zstandard.compress):
        assert_frame_equal(pl.read_json(compress(json_bytes)), expected)

    # No compression (plain buffer) also works.
    assert_frame_equal(pl.read_json(io.BytesIO(json_bytes)), expected)
495
496
497
def test_empty_list_json() -> None:
    """An empty JSON array yields an empty (0, 0) DataFrame, from text or bytes."""
    for source in (io.StringIO("[]"), b"[]"):
        frame = pl.read_json(source)
        assert isinstance(frame, pl.DataFrame)
        assert frame.shape == (0, 0)
505
506
507
def test_json_infer_3_dtypes() -> None:
    """Decoding mixed object/int/list strings: error vs. list-dtype coercion.

    This used to stack-overflow.
    """
    # Three incompatible shapes with an Int64 target must raise.
    mixed = pl.DataFrame({"a": ["{}", "1", "[1, 2]"]})
    with pytest.raises(pl.exceptions.ComputeError):
        mixed.select(pl.col("a").str.json_decode(pl.Int64))

    # With an explicit list dtype, scalars are wrapped and nulls preserved.
    coercible = pl.DataFrame({"a": [None, "1", "[1, 2]"]})
    decoded = coercible.select(pl.col("a").str.json_decode(dtype=pl.List(pl.String)))
    assert decoded["a"].to_list() == [None, ["1"], ["1", "2"]]
    assert decoded.dtypes[0] == pl.List(pl.String)
518
519
520
# NOTE: This doesn't work for 0, but that is normal
@pytest.mark.parametrize("size", [1, 2, 13])
def test_zfs_json_roundtrip(size: int) -> None:
    """Frames of empty structs survive a JSON write/read round trip."""
    frame = pl.Series("a", [{}] * size, pl.Struct([])).to_frame()

    buf = io.StringIO()
    frame.write_json(buf)
    buf.seek(0)

    assert_frame_equal(frame, pl.read_json(buf))
530
531
532
def test_read_json_raise_on_data_type_mismatch() -> None:
    """A dtype mismatch beyond the inference window raises ComputeError."""
    payload = b"""\
[
    {"a": null},
    {"a": 1}
]
"""
    # Only the first row (null) is scanned, so the later int cannot fit.
    with pytest.raises(ComputeError):
        pl.read_json(payload, infer_schema_length=1)
543
544
545
def test_read_json_struct_schema() -> None:
    """Extra struct fields: error under narrow inference, merged otherwise."""
    payload = b"""\
[
    {"a": 1},
    {"a": 2, "b": 2}
]
"""

    # Inference from only the first row does not know about "b".
    with pytest.raises(ComputeError, match="extra field in struct data: b"):
        pl.read_json(payload, infer_schema_length=1)

    # Scanning both rows yields both columns, with null back-filled.
    assert_frame_equal(
        pl.read_json(payload, infer_schema_length=2),
        pl.DataFrame({"a": [1, 2], "b": [None, 2]}),
    )

    # If the schema was explicitly given, then we ignore extra fields.
    # TODO: There should be a `columns=` parameter to this.
    assert_frame_equal(
        pl.read_json(payload, schema={"a": pl.Int64}),
        pl.DataFrame({"a": [1, 2]}),
    )
584
585
586
def test_read_ndjson_inner_list_types_18244() -> None:
    """Explicit list dtypes are preserved even when every value is null."""
    schema = {
        "a": pl.List(pl.String),
        "b": pl.List(pl.Int32),
        "c": pl.List(pl.Float64),
    }
    frame = pl.read_ndjson(
        io.StringIO("""{"a":null,"b":null,"c":null}"""),
        schema=schema,
    )
    assert frame.schema == schema
597
598
599
def test_read_json_utf_8_sig_encoding() -> None:
    """A UTF-8 byte-order mark (utf-8-sig) is tolerated when reading JSON."""
    data = [{"a": [1, 2], "b": [1, 2]}]
    encoded = json.dumps(data).encode("utf-8-sig")
    assert_frame_equal(pl.read_json(encoded), pl.DataFrame(data))
604
605
606
@pytest.mark.may_fail_cloud  # reason: object
def test_write_masked_out_list_22202() -> None:
    """Eager write and lazy sink agree on NDJSON output for masked-out lists."""
    df = pl.DataFrame({"x": [1, 2], "y": [None, 3]})

    # when/then without otherwise masks out rows that fail the predicate.
    query = (
        df.group_by("x", maintain_order=True)
        .all()
        .select(pl.when(pl.col("y").list.sum() > 0).then("y"))
    )

    eager_bytes = query.write_ndjson().encode()

    sink = io.BytesIO()
    query.lazy().sink_ndjson(sink)
    lazy_bytes = sink.getvalue()

    assert eager_bytes == lazy_bytes
624
625
626
def test_nested_datetime_ndjson() -> None:
    """Datetime parsing applies inside nested List(Struct) schema entries."""
    source = io.StringIO(
        """{"start_date":"2025-03-14T09:30:27Z","steps":[{"id":1,"start_date":"2025-03-14T09:30:27Z"},{"id":2,"start_date":"2025-03-14T09:31:27Z"}]}"""
    )

    schema = {
        "start_date": pl.Datetime,
        "steps": pl.List(pl.Struct({"id": pl.Int64, "start_date": pl.Datetime})),
    }

    result = pl.read_ndjson(source, schema=schema).to_dict(as_series=False)  # type: ignore[arg-type]
    assert result == {
        "start_date": [datetime(2025, 3, 14, 9, 30, 27)],
        "steps": [
            [
                {"id": 1, "start_date": datetime(2025, 3, 14, 9, 30, 27)},
                {"id": 2, "start_date": datetime(2025, 3, 14, 9, 31, 27)},
            ]
        ],
    }
645
646
647
def test_ndjson_22229() -> None:
    """Mixed int/float in a nested struct field reads without error."""
    rows = [
        '{ "campaign": { "id": "123456" }, "metrics": { "conversions": 7}}',
        '{ "campaign": { "id": "654321" }, "metrics": { "conversions": 3.5}}',
    ]
    assert pl.read_ndjson(io.StringIO("\n".join(rows))).to_dict(as_series=False)
654
655
656
def test_json_encode_enum_23826() -> None:
    """Enum values inside a struct JSON-encode via their physical index."""
    enum_series = pl.Series("a", ["b"], dtype=pl.Enum(["b"]))
    encoded = (
        enum_series.to_frame()
        .select(c=pl.struct("a").struct.json_encode())
        .to_series()
    )
    assert_series_equal(encoded, pl.Series("c", ['{"a":"0"}'], pl.String))
662
663