# GitHub repository: pola-rs/polars
# Path: py-polars/tests/unit/io/test_json.py
from __future__ import annotations

import gzip
import io
import json
import math
import re
import zlib
from collections import OrderedDict
from datetime import datetime
from decimal import Decimal as D
from io import BytesIO
from typing import TYPE_CHECKING

import zstandard
from hypothesis import given

from polars.datatypes.group import FLOAT_DTYPES

if TYPE_CHECKING:
    from pathlib import Path

import orjson
import pytest

import polars as pl
from polars.exceptions import ComputeError
from polars.testing import assert_frame_equal, assert_series_equal
from polars.testing.parametric import dataframes


@pytest.mark.may_fail_cloud  # reason: object
def test_write_json_example() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", None]})
    out = df.write_json()
    assert out == '[{"a":1,"b":"a"},{"a":2,"b":"b"},{"a":3,"b":null}]'

    # Test round trip
    f = io.BytesIO()
    f.write(out.encode())
    f.seek(0)
    result = pl.read_json(f)
    assert_frame_equal(result, df)


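# Illustrative aside (not upstream code): `pl.read_json` also accepts raw
# bytes directly, as later tests in this file rely on, so the round trip
# above can skip the intermediate BytesIO buffer.
def example_read_json_from_bytes() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", None]})
    # encode the JSON string and hand the bytes straight to read_json
    result = pl.read_json(df.write_json().encode())
    assert_frame_equal(result, df)

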
@pytest.mark.may_fail_cloud  # reason: object
@given(
    df=dataframes(
        min_size=1,
        allow_infinity=False,
        allow_nan=False,
        excluded_dtypes=[
            pl.Binary,
            pl.Categorical,
            pl.Date,
            pl.Datetime,
            pl.Decimal,
            pl.Duration,  # See #20198
            pl.Enum,
            pl.Extension,
            pl.Int128,
            pl.Struct,
            pl.Time,
            pl.UInt128,
        ],
    )
)
def test_write_json_roundtrip(df: pl.DataFrame) -> None:
    f = io.BytesIO()
    f.write(df.write_json().encode())
    f.seek(0)
    result = pl.read_json(f, schema=df.schema)
    assert_frame_equal(result, df)


@given(
    df=dataframes(
        min_size=1,
        allowed_dtypes=FLOAT_DTYPES,
    )
)
def test_write_json_floats(df: pl.DataFrame) -> None:
    f = io.BytesIO()
    f.write(df.write_json().encode())
    f.seek(0)
    result = pl.read_json(f, schema=df.schema)

    df = df.select(
        pl.all()
        .replace(math.inf, None)
        .replace(-math.inf, None)
        .replace(math.nan, None)
    )
    assert_frame_equal(result, df)


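# A minimal sketch of why the `.replace(...)` calls above are needed: standard
# JSON has no literal for NaN or +/-Infinity, so those values serialize to
# null, and the round-tripped frame only matches once they are replaced with
# None on the original side as well.
def example_float_specials_serialize_to_null() -> None:
    df = pl.DataFrame({"x": [1.5, math.nan, math.inf]})
    assert df.write_json() == '[{"x":1.5},{"x":null},{"x":null}]'

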
def test_write_json_categoricals() -> None:
    data = {"column": ["test1", "test2", "test3", "test4"]}
    df = pl.DataFrame(data).with_columns(pl.col("column").cast(pl.Categorical))
    expected = (
        '[{"column":"test1"},{"column":"test2"},{"column":"test3"},{"column":"test4"}]'
    )
    assert df.write_json() == expected


def test_write_json_duration() -> None:
    df = pl.DataFrame(
        {
            "a": pl.Series(
                [91762939, 91762890, 6020836], dtype=pl.Duration(time_unit="ms")
            )
        }
    )

    # we don't guarantee a format, just round-tripping
    value = df.write_json()
    expected = '[{"a":"PT91762.939S"},{"a":"PT91762.89S"},{"a":"PT6020.836S"}]'
    assert value == expected


def test_write_json_time() -> None:
    ns = 1_000_000_000
    df = pl.DataFrame(
        {
            "a": pl.Series(
                [7291 * ns + 54321, 54321 * ns + 12345, 86399 * ns],
                dtype=pl.Time,
            ),
        }
    )

    value = df.write_json()
    expected = (
        '[{"a":"02:01:31.000054321"},{"a":"15:05:21.000012345"},{"a":"23:59:59"}]'
    )
    assert value == expected


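# Worked arithmetic for the expected strings above: 7291 s = 2 * 3600 + 60 + 31,
# so the first value is 02:01:31 plus 54321 ns, printed with nanosecond
# precision as "02:01:31.000054321". Likewise 86399 s = 24 * 3600 - 1, i.e.
# "23:59:59", with the zero fractional part omitted.

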
def test_write_json_list_of_arrays() -> None:
    df = pl.DataFrame(
        {
            "a": pl.Series(
                [[(1.0, 2.0, 3.0), (4.0, 5.0, 6.0)], [(7.0, 8.0, 9.0)]],
                dtype=pl.List(pl.Array(pl.Float32, 3)),
            ),
        }
    )

    value = df.write_json()
    expected = '[{"a":[[1.0,2.0,3.0],[4.0,5.0,6.0]]},{"a":[[7.0,8.0,9.0]]}]'
    assert value == expected


def test_write_json_decimal() -> None:
    df = pl.DataFrame({"a": pl.Series([D("1.00"), D("2.00"), None])})

    # we don't guarantee a format, just round-tripping
    value = df.write_json()
    assert value == """[{"a":"1.00"},{"a":"2.00"},{"a":null}]"""


def test_json_infer_schema_length_11148() -> None:
    response = [{"col1": 1}] * 2 + [{"col1": 1, "col2": 2}] * 1
    with pytest.raises(
        pl.exceptions.ComputeError, match="extra field in struct data: col2"
    ):
        pl.read_json(json.dumps(response).encode(), infer_schema_length=2)

    response = [{"col1": 1}] * 2 + [{"col1": 1, "col2": 2}] * 1
    result = pl.read_json(json.dumps(response).encode(), infer_schema_length=3)
    assert set(result.columns) == {"col1", "col2"}


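# A small sketch on the same data (an assumption, mirroring the
# `infer_schema_length=None` usage in the NDJSON tests below): passing None
# scans every row, so the late "col2" field is picked up without having to
# guess a cutoff.
def example_infer_schema_from_whole_input() -> None:
    response = [{"col1": 1}] * 2 + [{"col1": 1, "col2": 2}]
    result = pl.read_json(json.dumps(response).encode(), infer_schema_length=None)
    assert set(result.columns) == {"col1", "col2"}

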
def test_to_from_buffer_arraywise_schema() -> None:
    buf = io.StringIO(
        """
        [
            {"a": 5, "b": "foo", "c": null},
            {"a": 11.4, "b": null, "c": true, "d": 8},
            {"a": -25.8, "b": "bar", "c": false}
        ]"""
    )

    read_df = pl.read_json(buf, schema={"b": pl.String, "e": pl.Int16})

    assert_frame_equal(
        read_df,
        pl.DataFrame(
            {
                "b": pl.Series(["foo", None, "bar"], dtype=pl.String),
                "e": pl.Series([None, None, None], dtype=pl.Int16),
            }
        ),
    )


def test_to_from_buffer_arraywise_schema_override() -> None:
    buf = io.StringIO(
        """
        [
            {"a": 5, "b": "foo", "c": null},
            {"a": 11.4, "b": null, "c": true, "d": 8},
            {"a": -25.8, "b": "bar", "c": false}
        ]"""
    )

    read_df = pl.read_json(buf, schema_overrides={"c": pl.Int64, "d": pl.Float64})

    assert_frame_equal(
        read_df,
        pl.DataFrame(
            {
                "a": pl.Series([5, 11.4, -25.8], dtype=pl.Float64),
                "b": pl.Series(["foo", None, "bar"], dtype=pl.String),
                "c": pl.Series([None, 1, 0], dtype=pl.Int64),
                "d": pl.Series([None, 8, None], dtype=pl.Float64),
            }
        ),
        check_column_order=False,
    )


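# Taken together, the two tests above pin down the split between the two
# parameters: `schema=` skips inference entirely and reads only the listed
# columns (missing ones come back as null), while `schema_overrides=` still
# infers the full schema and then overrides just the listed columns.

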
def test_write_ndjson() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", None]})
    out = df.write_ndjson()
    assert out == '{"a":1,"b":"a"}\n{"a":2,"b":"b"}\n{"a":3,"b":null}\n'

    # Test round trip
    f = io.BytesIO()
    f.write(out.encode())
    f.seek(0)
    result = pl.read_ndjson(f)
    assert_frame_equal(result, df)


def test_write_ndjson_with_trailing_newline() -> None:
    input = """{"Column1":"Value1"}\n"""
    df = pl.read_ndjson(io.StringIO(input))

    expected = pl.DataFrame({"Column1": ["Value1"]})
    assert_frame_equal(df, expected)


def test_read_ndjson_empty_array() -> None:
    assert pl.read_ndjson(io.StringIO("""{"foo": {"bar": []}}""")).to_dict(
        as_series=False
    ) == {"foo": [{"bar": []}]}


def test_ndjson_nested_null() -> None:
    json_payload = """{"foo":{"bar":[{}]}}"""
    df = pl.read_ndjson(io.StringIO(json_payload))

    # 'bar' is a list of field-less structs; check the schema is correct (eg: picks
    # up that it IS a list of structs), and confirm the empty-struct values
    # survive the round trip (ref: #11301)
    assert df.schema == {"foo": pl.Struct([pl.Field("bar", pl.List(pl.Struct({})))])}
    assert df.to_dict(as_series=False) == {"foo": [{"bar": [{}]}]}


def test_ndjson_nested_string_int() -> None:
    ndjson = """{"Accumulables":[{"Value":32395888},{"Value":"539454"}]}"""
    assert pl.read_ndjson(io.StringIO(ndjson)).to_dict(as_series=False) == {
        "Accumulables": [[{"Value": "32395888"}, {"Value": "539454"}]]
    }


def test_json_supertype_infer() -> None:
    json_string = """[
    {"c":[{"b": [], "a": "1"}]},
    {"c":[{"b":[]}]},
    {"c":[{"b":["1"], "a": "1"}]}]
    """
    python_infer = pl.from_records(json.loads(json_string))
    polars_infer = pl.read_json(io.StringIO(json_string))
    assert_frame_equal(python_infer, polars_infer)


@pytest.mark.may_fail_cloud  # reason: object
def test_ndjson_sliced_list_serialization() -> None:
    data = {"col1": [0, 2], "col2": [[3, 4, 5], [6, 7, 8]]}
    df = pl.DataFrame(data)
    f = io.BytesIO()
    sliced_df = df[1, :]
    sliced_df.write_ndjson(f)
    assert f.getvalue() == b'{"col1":2,"col2":[6,7,8]}\n'


def test_json_deserialize_9687() -> None:
    response = {
        "volume": [0.0, 0.0, 0.0],
        "open": [1263.0, 1263.0, 1263.0],
        "close": [1263.0, 1263.0, 1263.0],
        "high": [1263.0, 1263.0, 1263.0],
        "low": [1263.0, 1263.0, 1263.0],
    }

    result = pl.read_json(json.dumps(response).encode())

    assert result.to_dict(as_series=False) == {k: [v] for k, v in response.items()}


def test_ndjson_ignore_errors() -> None:
    # this schema is inconsistent, as "Value" is both a number and an object
    jsonl = r"""{"Type":"insert","Key":[1],"SeqNo":1,"Timestamp":1,"Fields":[{"Name":"added_id","Value":2},{"Name":"body","Value":{"a": 1}}]}
{"Type":"insert","Key":[1],"SeqNo":1,"Timestamp":1,"Fields":[{"Name":"added_id","Value":2},{"Name":"body","Value":{"a": 1}}]}"""

    buf = io.BytesIO(jsonl.encode())

    # check if we can replace with nulls
    assert pl.read_ndjson(buf, ignore_errors=True).to_dict(as_series=False) == {
        "Type": ["insert", "insert"],
        "Key": [[1], [1]],
        "SeqNo": [1, 1],
        "Timestamp": [1, 1],
        "Fields": [
            [{"Name": "added_id", "Value": "2"}, {"Name": "body", "Value": '{"a": 1}'}],
            [{"Name": "added_id", "Value": "2"}, {"Name": "body", "Value": '{"a": 1}'}],
        ],
    }

    schema = {
        "Fields": pl.List(
            pl.Struct([pl.Field("Name", pl.String), pl.Field("Value", pl.Int64)])
        )
    }
    # schema argument only parses Fields
    assert pl.read_ndjson(buf, schema=schema, ignore_errors=True).to_dict(
        as_series=False
    ) == {
        "Fields": [
            [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}],
            [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}],
        ]
    }

    # schema_overrides argument does schema inference, but overrides Fields
    result = pl.read_ndjson(buf, schema_overrides=schema, ignore_errors=True)
    expected = {
        "Type": ["insert", "insert"],
        "Key": [[1], [1]],
        "SeqNo": [1, 1],
        "Timestamp": [1, 1],
        "Fields": [
            [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}],
            [{"Name": "added_id", "Value": 2}, {"Name": "body", "Value": None}],
        ],
    }
    assert result.to_dict(as_series=False) == expected


def test_json_null_infer() -> None:
    json = BytesIO(
        bytes(
            """
            [
                {
                    "a": 1,
                    "b": null
                }
            ]
            """,
            "UTF-8",
        )
    )

    assert pl.read_json(json).schema == OrderedDict({"a": pl.Int64, "b": pl.Null})


def test_ndjson_null_buffer() -> None:
    data = io.BytesIO(
        b"""\
{"id": 1, "zero_column": 0, "empty_array_column": [], "empty_object_column": {}, "null_column": null}
{"id": 2, "zero_column": 0, "empty_array_column": [], "empty_object_column": {}, "null_column": null}
{"id": 3, "zero_column": 0, "empty_array_column": [], "empty_object_column": {}, "null_column": null}
{"id": 4, "zero_column": 0, "empty_array_column": [], "empty_object_column": {}, "null_column": null}
"""
    )

    assert pl.read_ndjson(data).schema == OrderedDict(
        [
            ("id", pl.Int64),
            ("zero_column", pl.Int64),
            ("empty_array_column", pl.List(pl.Null)),
            ("empty_object_column", pl.Struct([])),
            ("null_column", pl.Null),
        ]
    )


def test_ndjson_null_inference_13183() -> None:
    assert pl.read_ndjson(
        b"""
{"map": "a", "start_time": 0.795, "end_time": 1.495}
{"map": "a", "start_time": 1.6239999999999999, "end_time": 2.0540000000000003}
{"map": "c", "start_time": 2.184, "end_time": 2.645}
{"map": "a", "start_time": null, "end_time": null}
""".strip()
    ).to_dict(as_series=False) == {
        "map": ["a", "a", "c", "a"],
        "start_time": [0.795, 1.6239999999999999, 2.184, None],
        "end_time": [1.495, 2.0540000000000003, 2.645, None],
    }


def test_ndjson_expected_null_got_object_inference_22807() -> None:
    buf = io.StringIO()
    for _ in range(100):
        buf.write('{"a":[]}\n')
    buf.write('{"a":[{"b":[]}]}\n')

    buf.seek(0)

    assert pl.read_ndjson(buf, infer_schema_length=None).schema == (
        {"a": pl.List(pl.Struct([pl.Field("b", pl.List(pl.Null))]))}
    )


@pytest.mark.may_fail_cloud  # reason: object
@pytest.mark.write_disk
def test_json_wrong_input_handle_textio(tmp_path: Path) -> None:
    # A text-mode handle shouldn't be passed here, but we still test that it is handled gracefully
    df = pl.DataFrame(
        {
            "x": [1, 2, 3],
            "y": ["a", "b", "c"],
        }
    )
    file_path = tmp_path / "test.ndjson"
    df.write_ndjson(file_path)

    with file_path.open() as f:
        result = pl.read_ndjson(f)
        assert_frame_equal(result, df)


def test_json_normalize() -> None:
    data = [
        {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
        {"name": {"given": "Mark", "family": "Regner"}},
        {"id": 2, "name": "Faye Raker"},
    ]

    assert pl.json_normalize([], schema=pl.Schema({"test": pl.Int32})).to_dict(
        as_series=False
    ) == {
        "test": [],
    }

    assert pl.json_normalize(data, max_level=0).to_dict(as_series=False) == {
        "id": [1, None, 2],
        "name": [
            '{"first": "Coleen", "last": "Volk"}',
            '{"given": "Mark", "family": "Regner"}',
            "Faye Raker",
        ],
    }

    assert pl.json_normalize(data, max_level=1).to_dict(as_series=False) == {
        "id": [1, None, 2],
        "name.first": ["Coleen", None, None],
        "name.last": ["Volk", None, None],
        "name.given": [None, "Mark", None],
        "name.family": [None, "Regner", None],
        "name": [None, None, "Faye Raker"],
    }

    data = [
        {
            "id": 1,
            "name": "Cole Volk",
            "fitness": {"height": 130, "weight": 60},
        },
        {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
        {
            "id": 2,
            "name": "Faye Raker",
            "fitness": {"height": 130, "weight": 60},
        },
    ]
    assert pl.json_normalize(data, max_level=1, separator=":").to_dict(
        as_series=False,
    ) == {
        "id": [1, None, 2],
        "name": ["Cole Volk", "Mark Reg", "Faye Raker"],
        "fitness:height": [130, 130, 130],
        "fitness:weight": [60, 60, 60],
    }
    assert pl.json_normalize(data, max_level=0).to_dict(
        as_series=False,
    ) == {
        "id": [1, None, 2],
        "name": ["Cole Volk", "Mark Reg", "Faye Raker"],
        "fitness": [
            '{"height": 130, "weight": 60}',
            '{"height": 130, "weight": 60}',
            '{"height": 130, "weight": 60}',
        ],
    }
    assert pl.json_normalize(data, max_level=0, encoder=orjson.dumps).to_dict(
        as_series=False,
    ) == {
        "id": [1, None, 2],
        "name": ["Cole Volk", "Mark Reg", "Faye Raker"],
        "fitness": [
            b'{"height":130,"weight":60}',
            b'{"height":130,"weight":60}',
            b'{"height":130,"weight":60}',
        ],
    }


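# In short: `max_level=0` leaves nested objects as JSON-encoded leaves (strings
# by default, bytes with `encoder=orjson.dumps`), while `max_level=1` flattens
# one level of keys, joining parent and child names with `separator`.

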
def test_empty_json() -> None:
    df = pl.read_json(io.StringIO("{}"))
    assert df.shape == (0, 0)
    assert isinstance(df, pl.DataFrame)

    df = pl.read_json(b'{"j":{}}')
    assert df.dtypes == [pl.Struct([])]
    assert df.shape == (1, 1)


def test_compressed_json() -> None:
    # shared setup
    json_obj = [
        {"id": 1, "name": "Alice", "trusted": True},
        {"id": 2, "name": "Bob", "trusted": True},
        {"id": 3, "name": "Carol", "trusted": False},
    ]
    expected = pl.DataFrame(json_obj, orient="row")
    json_bytes = json.dumps(json_obj).encode()

    # gzip
    compressed_bytes = gzip.compress(json_bytes)
    out = pl.read_json(compressed_bytes)
    assert_frame_equal(out, expected)

    # zlib
    compressed_bytes = zlib.compress(json_bytes)
    out = pl.read_json(compressed_bytes)
    assert_frame_equal(out, expected)

    # zstd
    compressed_bytes = zstandard.compress(json_bytes)
    out = pl.read_json(compressed_bytes)
    assert_frame_equal(out, expected)

    # no compression
    uncompressed = io.BytesIO(json_bytes)
    out = pl.read_json(uncompressed)
    assert_frame_equal(out, expected)


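# A hedged aside (assumption: the compression detection above is based on the
# payload's leading magic bytes rather than on the argument type), so the same
# compressed payload should also decompress when wrapped in a file-like object.
def example_read_compressed_json_from_buffer() -> None:
    payload = gzip.compress(b'[{"a": 1}]')
    assert pl.read_json(io.BytesIO(payload)).to_dict(as_series=False) == {"a": [1]}

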
def test_empty_list_json() -> None:
    df = pl.read_json(io.StringIO("[]"))
    assert df.shape == (0, 0)
    assert isinstance(df, pl.DataFrame)

    df = pl.read_json(b"[]")
    assert df.shape == (0, 0)
    assert isinstance(df, pl.DataFrame)


def test_json_infer_3_dtypes() -> None:
    # this would stack overflow before (inference across object, scalar, and list values)
    df = pl.DataFrame({"a": ["{}", "1", "[1, 2]"]})

    with pytest.raises(pl.exceptions.ComputeError):
        df.select(pl.col("a").str.json_decode(pl.Int64))

    df = pl.DataFrame({"a": [None, "1", "[1, 2]"]})
    out = df.select(pl.col("a").str.json_decode(dtype=pl.List(pl.String)))
    assert out["a"].to_list() == [None, ["1"], ["1", "2"]]
    assert out.dtypes[0] == pl.List(pl.String)


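# A hedged companion sketch: `struct.json_encode` (exercised near the end of
# this file) and `str.json_decode` should be inverses for plain struct data.
def example_json_encode_decode_roundtrip() -> None:
    df = pl.DataFrame({"a": [{"x": 1}, {"x": 2}]})
    encoded = df.select(pl.col("a").struct.json_encode())
    # the struct dtype is inferred back from the encoded strings
    decoded = encoded.select(pl.col("a").str.json_decode())
    assert_frame_equal(decoded, df)

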
# NOTE: this doesn't round-trip for size=0, which is expected: an empty frame
# serializes to "[]", from which the zero-field struct schema cannot be recovered
@pytest.mark.parametrize("size", [1, 2, 13])
def test_zfs_json_roundtrip(size: int) -> None:
    a = pl.Series("a", [{}] * size, pl.Struct([])).to_frame()

    f = io.StringIO()
    a.write_json(f)

    f.seek(0)
    assert_frame_equal(a, pl.read_json(f))


def test_read_json_raise_on_data_type_mismatch() -> None:
    with pytest.raises(ComputeError):
        pl.read_json(
            b"""\
[
    {"a": null},
    {"a": 1}
]
""",
            infer_schema_length=1,
        )


def test_read_json_struct_schema() -> None:
    with pytest.raises(ComputeError, match="extra field in struct data: b"):
        pl.read_json(
            b"""\
[
    {"a": 1},
    {"a": 2, "b": 2}
]
""",
            infer_schema_length=1,
        )

    assert_frame_equal(
        pl.read_json(
            b"""\
[
    {"a": 1},
    {"a": 2, "b": 2}
]
""",
            infer_schema_length=2,
        ),
        pl.DataFrame({"a": [1, 2], "b": [None, 2]}),
    )

    # If the schema was explicitly given, then we ignore extra fields.
    # TODO: There should be a `columns=` parameter to this.
    assert_frame_equal(
        pl.read_json(
            b"""\
[
    {"a": 1},
    {"a": 2, "b": 2}
]
""",
            schema={"a": pl.Int64},
        ),
        pl.DataFrame({"a": [1, 2]}),
    )


def test_read_ndjson_inner_list_types_18244() -> None:
    assert pl.read_ndjson(
        io.StringIO("""{"a":null,"b":null,"c":null}"""),
        schema={
            "a": pl.List(pl.String),
            "b": pl.List(pl.Int32),
            "c": pl.List(pl.Float64),
        },
    ).schema == (
        {"a": pl.List(pl.String), "b": pl.List(pl.Int32), "c": pl.List(pl.Float64)}
    )


def test_read_json_utf_8_sig_encoding() -> None:
    data = [{"a": [1, 2], "b": [1, 2]}]
    result = pl.read_json(json.dumps(data).encode("utf-8-sig"))
    expected = pl.DataFrame(data)
    assert_frame_equal(result, expected)


@pytest.mark.may_fail_cloud  # reason: object
def test_write_masked_out_list_22202() -> None:
    df = pl.DataFrame({"x": [1, 2], "y": [None, 3]})

    output_file = io.BytesIO()

    query = (
        df.group_by("x", maintain_order=True)
        .all()
        .select(pl.when(pl.col("y").list.sum() > 0).then("y"))
    )

    eager = query.write_ndjson().encode()

    query.lazy().sink_ndjson(output_file)
    lazy = output_file.getvalue()

    assert eager == lazy


def test_nested_datetime_ndjson() -> None:
    f = io.StringIO(
        """{"start_date":"2025-03-14T09:30:27Z","steps":[{"id":1,"start_date":"2025-03-14T09:30:27Z"},{"id":2,"start_date":"2025-03-14T09:31:27Z"}]}"""
    )

    schema = {
        "start_date": pl.Datetime,
        "steps": pl.List(pl.Struct({"id": pl.Int64, "start_date": pl.Datetime})),
    }

    assert pl.read_ndjson(f, schema=schema).to_dict(as_series=False) == {  # type: ignore[arg-type]
        "start_date": [datetime(2025, 3, 14, 9, 30, 27)],
        "steps": [
            [
                {"id": 1, "start_date": datetime(2025, 3, 14, 9, 30, 27)},
                {"id": 2, "start_date": datetime(2025, 3, 14, 9, 31, 27)},
            ]
        ],
    }


def test_ndjson_22229() -> None:
    li = [
        '{ "campaign": { "id": "123456" }, "metrics": { "conversions": 7}}',
        '{ "campaign": { "id": "654321" }, "metrics": { "conversions": 3.5}}',
    ]

    assert pl.read_ndjson(io.StringIO("\n".join(li))).to_dict(as_series=False)


def test_json_encode_enum_23826() -> None:
    s = pl.Series("a", ["foo", "bar"], dtype=pl.Enum(["bar", "foo"]))
    assert_series_equal(
        s.to_frame().select(c=pl.struct("a").struct.json_encode()).to_series(),
        pl.Series("c", ['{"a":"foo"}', '{"a":"bar"}'], pl.String),
    )


def test_json_encode_categorical() -> None:
    s = pl.Series("a", ["foo", "bar"], dtype=pl.Categorical)
    assert_series_equal(
        s.to_frame().select(c=pl.struct("a").struct.json_encode()).to_series(),
        pl.Series("c", ['{"a":"foo"}', '{"a":"bar"}'], pl.String),
    )


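# Both tests above pin the same behavior: Enum and Categorical values are
# JSON-encoded as their string labels, so the dictionary/physical encoding
# never leaks into the output.

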
@pytest.mark.parametrize(
    "ndjson_str", ["10", "null", "true", "false", "1.5", "[]", "[1, 2]"]
)
def test_ndjson_row_not_an_object_24267(ndjson_str: str) -> None:
    with pytest.raises(
        ComputeError, match="NDJSON line expected to contain JSON object: "
    ):
        pl.read_ndjson(
            io.StringIO(ndjson_str), ignore_errors=False, infer_schema_length=100
        )

    with pytest.raises(
        ComputeError, match="NDJSON line expected to contain JSON object: "
    ):
        pl.read_ndjson(
            io.StringIO('{"a": 10}\n' + ndjson_str),
            ignore_errors=False,
            infer_schema_length=1,
        )

    assert_frame_equal(
        pl.read_ndjson(
            io.StringIO('{"a": 10}\n' + ndjson_str),
            infer_schema_length=1,
            ignore_errors=True,
        ),
        pl.DataFrame({"a": [10, None]}),
    )


def test_ndjson_no_cast_int_to_float_19138() -> None:
    with pytest.raises(
        ComputeError, match=re.escape("cannot parse '2.7' (f64) as Int64")
    ):
        pl.read_ndjson(
            io.StringIO('{"a": 1}\n{"a": 2.7}\n'),
            infer_schema_length=1,
            ignore_errors=False,
        )

    assert_frame_equal(
        pl.read_ndjson(
            io.StringIO('{"a": 1}\n{"a": 2.7}\n'),
            infer_schema_length=1,
            ignore_errors=True,
        ),
        pl.DataFrame({"a": [1, None]}),
    )

    assert_frame_equal(
        pl.read_ndjson(
            io.StringIO('{"a": 2.7}\n{"a": 1}\n'),
            infer_schema_length=1,
            ignore_errors=False,
        ),
        pl.DataFrame({"a": [2.7, 1]}),
    )


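# The asymmetry above is deliberate: a schema inferred as Int64 refuses to
# silently accept 2.7 (that cast would be lossy), whereas a schema inferred as
# Float64 from the first row can represent the later integer exactly, so that
# direction parses without error.

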
def test_ndjson_large_u64_infer_25894() -> None:
    data = b"""{"id":14933243513335727983}"""
    df = pl.read_ndjson(data)
    assert_frame_equal(
        df,
        pl.DataFrame({"id": pl.Series("id", [14933243513335727983], dtype=pl.Int128)}),
    )


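# The value above exceeds Int64's maximum of 9_223_372_036_854_775_807
# (2**63 - 1), which is why inference widens to Int128 instead of overflowing.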