Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-json/src/json/deserialize.rs
8420 views
1
use std::borrow::Borrow;
2
use std::fmt::Write;
3
4
use arrow::array::*;
5
use arrow::bitmap::BitmapBuilder;
6
use arrow::datatypes::{ArrowDataType, IntervalUnit};
7
use arrow::offset::{Offset, Offsets};
8
use arrow::temporal_conversions;
9
use arrow::types::NativeType;
10
use num_traits::NumCast;
11
#[cfg(feature = "dtype-decimal")]
12
use polars_compute::decimal::{f64_to_dec128, i128_to_dec128, str_to_dec128};
13
use polars_utils::float16::pf16;
14
use simd_json::{BorrowedValue, StaticNode};
15
16
use super::*;
17
18
/// Shared JSON `null` node. `deserialize_struct` pushes a reference to it for struct
/// fields that are missing from an object and for every field of a null row.
const JSON_NULL_VALUE: BorrowedValue = BorrowedValue::Static(StaticNode::Null);
19
20
fn deserialize_boolean_into<'a, A: Borrow<BorrowedValue<'a>>>(
21
target: &mut MutableBooleanArray,
22
rows: &[A],
23
) -> PolarsResult<()> {
24
let mut err_idx = rows.len();
25
let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {
26
BorrowedValue::Static(StaticNode::Bool(v)) => Some(v),
27
BorrowedValue::Static(StaticNode::Null) => None,
28
_ => {
29
err_idx = if err_idx == rows.len() { i } else { err_idx };
30
None
31
},
32
});
33
target.extend_trusted_len(iter);
34
check_err_idx(rows, err_idx, "boolean")
35
}
36
37
fn deserialize_primitive_into<'a, T: NativeType + NumCast, A: Borrow<BorrowedValue<'a>>>(
38
target: &mut MutablePrimitiveArray<T>,
39
rows: &[A],
40
) -> PolarsResult<()> {
41
let mut err_idx = rows.len();
42
let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {
43
BorrowedValue::Static(StaticNode::I64(v)) => T::from(*v),
44
BorrowedValue::Static(StaticNode::U64(v)) => T::from(*v),
45
BorrowedValue::Static(StaticNode::I128(v)) => T::from(*v),
46
BorrowedValue::Static(StaticNode::U128(v)) => T::from(*v),
47
BorrowedValue::Static(StaticNode::F64(v)) => T::from(*v),
48
BorrowedValue::Static(StaticNode::Bool(v)) => T::from(*v as u8),
49
BorrowedValue::Static(StaticNode::Null) => None,
50
_ => {
51
err_idx = if err_idx == rows.len() { i } else { err_idx };
52
None
53
},
54
});
55
target.extend_trusted_len(iter);
56
check_err_idx(rows, err_idx, "numeric")
57
}
58
59
/// Builds a decimal array (`i128` storage) from JSON rows.
///
/// Integers, floats and strings are converted with the `*_to_dec128` helpers using the
/// precision and scale carried by `dtype`. JSON `null` becomes a null slot. Every other
/// outcome that produces no value (a helper returning `None`, a `u128` that does not fit
/// in `i128`, or an unsupported JSON value) is an error; the first such row is reported.
///
/// `dtype` must be `ArrowDataType::Decimal`; any other variant hits `unreachable!()`.
#[cfg(feature = "dtype-decimal")]
fn deserialize_decimal<'a, A: Borrow<BorrowedValue<'a>>>(
    rows: &[A],
    dtype: ArrowDataType,
) -> PolarsResult<Int128Array> {
    let ArrowDataType::Decimal(prec, scale) = dtype else {
        unreachable!()
    };
    // `rows.len()` doubles as the "no offending row seen yet" sentinel.
    let mut first_bad = rows.len();
    let values = rows.iter().enumerate().map(|(idx, row)| {
        let value = row.borrow();
        // A genuine null is the only way to get an empty slot without an error.
        if matches!(value, BorrowedValue::Static(StaticNode::Null)) {
            return None;
        }
        let parsed = match value {
            BorrowedValue::Static(StaticNode::I64(n)) => {
                i128_to_dec128(i128::from(*n), prec, scale)
            },
            BorrowedValue::Static(StaticNode::U64(n)) => {
                i128_to_dec128(i128::from(*n), prec, scale)
            },
            BorrowedValue::Static(StaticNode::I128(n)) => i128_to_dec128(*n, prec, scale),
            BorrowedValue::Static(StaticNode::U128(n)) => i128::try_from(*n)
                .ok()
                .and_then(|wide| i128_to_dec128(wide, prec, scale)),
            BorrowedValue::Static(StaticNode::F64(x)) => f64_to_dec128(*x, prec, scale),
            BorrowedValue::String(text) => str_to_dec128(text.as_bytes(), prec, scale, false),
            _ => None,
        };
        if parsed.is_none() && first_bad == rows.len() {
            first_bad = idx;
        }
        parsed
    });

    let arr = Int128Array::from_trusted_len_iter(values);
    if first_bad == rows.len() {
        return Ok(arr.to(dtype));
    }
    polars_bail!(
        ComputeError:
        r#"error deserializing value "{:?}" as Decimal({prec}, {scale}).

Try increasing `infer_schema_length` or specifying a schema."#,
        rows[first_bad].borrow()
    )
}
99
100
fn deserialize_binary<'a, A: Borrow<BorrowedValue<'a>>>(
101
rows: &[A],
102
) -> PolarsResult<BinaryArray<i64>> {
103
let mut err_idx = rows.len();
104
let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {
105
BorrowedValue::String(v) => Some(v.as_bytes()),
106
BorrowedValue::Static(StaticNode::Null) => None,
107
_ => {
108
err_idx = if err_idx == rows.len() { i } else { err_idx };
109
None
110
},
111
});
112
let out = BinaryArray::from_trusted_len_iter(iter);
113
check_err_idx(rows, err_idx, "binary")?;
114
Ok(out)
115
}
116
117
fn deserialize_utf8_into<'a, O: Offset, A: Borrow<BorrowedValue<'a>>>(
118
target: &mut MutableUtf8Array<O>,
119
rows: &[A],
120
) -> PolarsResult<()> {
121
let mut err_idx = rows.len();
122
let mut scratch = String::new();
123
for (i, row) in rows.iter().enumerate() {
124
match row.borrow() {
125
BorrowedValue::String(v) => target.push(Some(v.as_ref())),
126
BorrowedValue::Static(StaticNode::Bool(v)) => {
127
target.push(Some(if *v { "true" } else { "false" }))
128
},
129
BorrowedValue::Static(StaticNode::Null) => target.push_null(),
130
BorrowedValue::Static(node) => {
131
write!(scratch, "{node}").unwrap();
132
target.push(Some(scratch.as_str()));
133
scratch.clear();
134
},
135
_ => {
136
err_idx = if err_idx == rows.len() { i } else { err_idx };
137
},
138
}
139
}
140
check_err_idx(rows, err_idx, "string")
141
}
142
143
fn deserialize_utf8view_into<'a, A: Borrow<BorrowedValue<'a>>>(
144
target: &mut MutableBinaryViewArray<str>,
145
rows: &[A],
146
) -> PolarsResult<()> {
147
let mut err_idx = rows.len();
148
let mut scratch = String::new();
149
for (i, row) in rows.iter().enumerate() {
150
match row.borrow() {
151
BorrowedValue::String(v) => target.push_value(v.as_ref()),
152
BorrowedValue::Static(StaticNode::Bool(v)) => {
153
target.push_value(if *v { "true" } else { "false" })
154
},
155
BorrowedValue::Static(StaticNode::Null) => target.push_null(),
156
BorrowedValue::Static(node) => {
157
write!(scratch, "{node}").unwrap();
158
target.push_value(scratch.as_str());
159
scratch.clear();
160
},
161
_ => {
162
err_idx = if err_idx == rows.len() { i } else { err_idx };
163
},
164
}
165
}
166
check_err_idx(rows, err_idx, "string")
167
}
168
169
fn deserialize_list<'a, A: Borrow<BorrowedValue<'a>>>(
170
rows: &[A],
171
dtype: ArrowDataType,
172
allow_extra_fields_in_struct: bool,
173
) -> PolarsResult<ListArray<i64>> {
174
let mut err_idx = rows.len();
175
let child = ListArray::<i64>::get_child_type(&dtype);
176
177
let mut validity = BitmapBuilder::with_capacity(rows.len());
178
let mut offsets = Offsets::<i64>::with_capacity(rows.len());
179
let mut inner = vec![];
180
rows.iter()
181
.enumerate()
182
.for_each(|(i, row)| match row.borrow() {
183
BorrowedValue::Array(value) => {
184
inner.extend(value.iter());
185
validity.push(true);
186
offsets
187
.try_push(value.len())
188
.expect("List offset is too large :/");
189
},
190
BorrowedValue::Static(StaticNode::Null) => {
191
validity.push(false);
192
offsets.extend_constant(1)
193
},
194
value @ (BorrowedValue::Static(_) | BorrowedValue::String(_)) => {
195
inner.push(value);
196
validity.push(true);
197
offsets.try_push(1).expect("List offset is too large :/");
198
},
199
_ => {
200
err_idx = if err_idx == rows.len() { i } else { err_idx };
201
},
202
});
203
204
check_err_idx(rows, err_idx, "list")?;
205
206
let values = _deserialize(&inner, child.clone(), allow_extra_fields_in_struct)?;
207
208
Ok(ListArray::<i64>::new(
209
dtype,
210
offsets.into(),
211
values,
212
validity.into_opt_validity(),
213
))
214
}
215
216
fn deserialize_struct<'a, A: Borrow<BorrowedValue<'a>>>(
217
rows: &[A],
218
dtype: ArrowDataType,
219
allow_extra_fields_in_struct: bool,
220
) -> PolarsResult<StructArray> {
221
let mut err_idx = rows.len();
222
let fields = StructArray::get_fields(&dtype);
223
224
let mut out_values = fields
225
.iter()
226
.map(|f| (f.name.as_str(), (f.dtype(), vec![])))
227
.collect::<PlHashMap<_, _>>();
228
229
let mut validity = BitmapBuilder::with_capacity(rows.len());
230
// Custom error tracker
231
let mut extra_field = None;
232
233
rows.iter().enumerate().for_each(|(i, row)| {
234
match row.borrow() {
235
BorrowedValue::Object(values) => {
236
let mut n_matched = 0usize;
237
for (&key, &mut (_, ref mut inner)) in out_values.iter_mut() {
238
if let Some(v) = values.get(key) {
239
n_matched += 1;
240
inner.push(v)
241
} else {
242
inner.push(&JSON_NULL_VALUE)
243
}
244
}
245
246
validity.push(true);
247
248
if n_matched < values.len() && extra_field.is_none() {
249
for k in values.keys() {
250
if !out_values.contains_key(k.as_ref()) {
251
extra_field = Some(k.as_ref())
252
}
253
}
254
}
255
},
256
BorrowedValue::Static(StaticNode::Null) => {
257
out_values
258
.iter_mut()
259
.for_each(|(_, (_, inner))| inner.push(&JSON_NULL_VALUE));
260
validity.push(false);
261
},
262
_ => {
263
err_idx = if err_idx == rows.len() { i } else { err_idx };
264
},
265
};
266
});
267
268
if let Some(v) = extra_field {
269
if !allow_extra_fields_in_struct {
270
polars_bail!(
271
ComputeError:
272
"extra field in struct data: {}, consider increasing infer_schema_length, or \
273
manually specifying the full schema to ignore extra fields",
274
v
275
)
276
}
277
}
278
279
check_err_idx(rows, err_idx, "struct")?;
280
281
// ensure we collect in the proper order
282
let values = fields
283
.iter()
284
.map(|fld| {
285
let (dtype, vals) = out_values.get(fld.name.as_str()).unwrap();
286
_deserialize(vals, (*dtype).clone(), allow_extra_fields_in_struct)
287
})
288
.collect::<PolarsResult<Vec<_>>>()?;
289
290
Ok(StructArray::new(
291
dtype.clone(),
292
rows.len(),
293
values,
294
validity.into_opt_validity(),
295
))
296
}
297
298
fn fill_array_from<B, T, A>(
299
f: fn(&mut MutablePrimitiveArray<T>, &[B]) -> PolarsResult<()>,
300
dtype: ArrowDataType,
301
rows: &[B],
302
) -> PolarsResult<Box<dyn Array>>
303
where
304
T: NativeType,
305
A: From<MutablePrimitiveArray<T>> + Array,
306
{
307
let mut array = MutablePrimitiveArray::<T>::with_capacity(rows.len()).to(dtype);
308
f(&mut array, rows)?;
309
Ok(Box::new(A::from(array)))
310
}
311
312
/// A trait describing an array with a backing store that can be preallocated to
313
/// a given size.
314
pub(crate) trait Container {
315
/// Create this array with a given capacity.
316
fn with_capacity(capacity: usize) -> Self
317
where
318
Self: Sized;
319
}
320
321
impl<O: Offset> Container for MutableBinaryArray<O> {
322
fn with_capacity(capacity: usize) -> Self {
323
MutableBinaryArray::with_capacity(capacity)
324
}
325
}
326
327
impl Container for MutableBooleanArray {
328
fn with_capacity(capacity: usize) -> Self {
329
MutableBooleanArray::with_capacity(capacity)
330
}
331
}
332
333
impl Container for MutableFixedSizeBinaryArray {
334
fn with_capacity(capacity: usize) -> Self {
335
MutableFixedSizeBinaryArray::with_capacity(capacity, 0)
336
}
337
}
338
339
impl Container for MutableBinaryViewArray<str> {
340
fn with_capacity(capacity: usize) -> Self
341
where
342
Self: Sized,
343
{
344
MutableBinaryViewArray::with_capacity(capacity)
345
}
346
}
347
348
impl<O: Offset, M: MutableArray + Default + 'static> Container for MutableListArray<O, M> {
349
fn with_capacity(capacity: usize) -> Self {
350
MutableListArray::with_capacity(capacity)
351
}
352
}
353
354
impl<T: NativeType> Container for MutablePrimitiveArray<T> {
355
fn with_capacity(capacity: usize) -> Self {
356
MutablePrimitiveArray::with_capacity(capacity)
357
}
358
}
359
360
impl<O: Offset> Container for MutableUtf8Array<O> {
361
fn with_capacity(capacity: usize) -> Self {
362
MutableUtf8Array::with_capacity(capacity)
363
}
364
}
365
366
fn fill_generic_array_from<B, M, A>(
367
f: fn(&mut M, &[B]) -> PolarsResult<()>,
368
rows: &[B],
369
) -> PolarsResult<Box<dyn Array>>
370
where
371
M: Container,
372
A: From<M> + Array,
373
{
374
let mut array = M::with_capacity(rows.len());
375
f(&mut array, rows)?;
376
Ok(Box::new(A::from(array)))
377
}
378
379
pub(crate) fn _deserialize<'a, A: Borrow<BorrowedValue<'a>>>(
380
rows: &[A],
381
dtype: ArrowDataType,
382
allow_extra_fields_in_struct: bool,
383
) -> PolarsResult<Box<dyn Array>> {
384
match &dtype {
385
ArrowDataType::Null => {
386
if let Some(err_idx) = (0..rows.len())
387
.find(|i| !matches!(rows[*i].borrow(), BorrowedValue::Static(StaticNode::Null)))
388
{
389
check_err_idx(rows, err_idx, "null")?;
390
}
391
392
Ok(Box::new(NullArray::new(dtype, rows.len())))
393
},
394
ArrowDataType::Boolean => {
395
fill_generic_array_from::<_, _, BooleanArray>(deserialize_boolean_into, rows)
396
},
397
ArrowDataType::Int8 => {
398
fill_array_from::<_, _, PrimitiveArray<i8>>(deserialize_primitive_into, dtype, rows)
399
},
400
ArrowDataType::Int16 => {
401
fill_array_from::<_, _, PrimitiveArray<i16>>(deserialize_primitive_into, dtype, rows)
402
},
403
ArrowDataType::Int32
404
| ArrowDataType::Date32
405
| ArrowDataType::Time32(_)
406
| ArrowDataType::Interval(IntervalUnit::YearMonth) => {
407
fill_array_from::<_, _, PrimitiveArray<i32>>(deserialize_primitive_into, dtype, rows)
408
},
409
ArrowDataType::Interval(IntervalUnit::DayTime) => {
410
unimplemented!("There is no natural representation of DayTime in JSON.")
411
},
412
ArrowDataType::Int64
413
| ArrowDataType::Date64
414
| ArrowDataType::Time64(_)
415
| ArrowDataType::Duration(_) => {
416
fill_array_from::<_, _, PrimitiveArray<i64>>(deserialize_primitive_into, dtype, rows)
417
},
418
ArrowDataType::Int128 => {
419
fill_array_from::<_, _, PrimitiveArray<i128>>(deserialize_primitive_into, dtype, rows)
420
},
421
ArrowDataType::Timestamp(tu, tz) => {
422
let mut err_idx = rows.len();
423
let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {
424
BorrowedValue::Static(StaticNode::I64(v)) => Some(*v),
425
BorrowedValue::String(v) => match (tu, tz) {
426
(_, None) => {
427
polars_compute::cast::temporal::utf8_to_naive_timestamp_scalar(v, "%+", tu)
428
},
429
(_, Some(tz)) => {
430
let tz = temporal_conversions::parse_offset(tz.as_str()).unwrap();
431
temporal_conversions::utf8_to_timestamp_scalar(v, "%+", &tz, tu)
432
},
433
},
434
BorrowedValue::Static(StaticNode::Null) => None,
435
_ => {
436
err_idx = if err_idx == rows.len() { i } else { err_idx };
437
None
438
},
439
});
440
let out = Box::new(Int64Array::from_iter(iter).to(dtype));
441
check_err_idx(rows, err_idx, "timestamp")?;
442
Ok(out)
443
},
444
ArrowDataType::UInt8 => {
445
fill_array_from::<_, _, PrimitiveArray<u8>>(deserialize_primitive_into, dtype, rows)
446
},
447
ArrowDataType::UInt16 => {
448
fill_array_from::<_, _, PrimitiveArray<u16>>(deserialize_primitive_into, dtype, rows)
449
},
450
ArrowDataType::UInt32 => {
451
fill_array_from::<_, _, PrimitiveArray<u32>>(deserialize_primitive_into, dtype, rows)
452
},
453
ArrowDataType::UInt64 => {
454
fill_array_from::<_, _, PrimitiveArray<u64>>(deserialize_primitive_into, dtype, rows)
455
},
456
ArrowDataType::UInt128 => {
457
fill_array_from::<_, _, PrimitiveArray<u128>>(deserialize_primitive_into, dtype, rows)
458
},
459
ArrowDataType::Float16 => {
460
fill_array_from::<_, _, PrimitiveArray<pf16>>(deserialize_primitive_into, dtype, rows)
461
},
462
ArrowDataType::Float32 => {
463
fill_array_from::<_, _, PrimitiveArray<f32>>(deserialize_primitive_into, dtype, rows)
464
},
465
ArrowDataType::Float64 => {
466
fill_array_from::<_, _, PrimitiveArray<f64>>(deserialize_primitive_into, dtype, rows)
467
},
468
#[cfg(feature = "dtype-decimal")]
469
ArrowDataType::Decimal(_, _) => Ok(Box::new(deserialize_decimal(rows, dtype)?)),
470
ArrowDataType::LargeUtf8 => {
471
fill_generic_array_from::<_, _, Utf8Array<i64>>(deserialize_utf8_into, rows)
472
},
473
ArrowDataType::Utf8View => {
474
fill_generic_array_from::<_, _, Utf8ViewArray>(deserialize_utf8view_into, rows)
475
},
476
ArrowDataType::LargeList(_) => Ok(Box::new(deserialize_list(
477
rows,
478
dtype,
479
allow_extra_fields_in_struct,
480
)?)),
481
ArrowDataType::LargeBinary => Ok(Box::new(deserialize_binary(rows)?)),
482
ArrowDataType::Struct(_) => Ok(Box::new(deserialize_struct(
483
rows,
484
dtype,
485
allow_extra_fields_in_struct,
486
)?)),
487
adt => unimplemented!("Deserialization from JSON not implemented for {adt:?}"),
488
}
489
}
490
491
pub fn deserialize(
492
json: &BorrowedValue,
493
dtype: ArrowDataType,
494
allow_extra_fields_in_struct: bool,
495
) -> PolarsResult<Box<dyn Array>> {
496
match json {
497
BorrowedValue::Array(rows) => match dtype {
498
ArrowDataType::LargeList(inner) => {
499
_deserialize(rows, inner.dtype, allow_extra_fields_in_struct)
500
},
501
_ => todo!("read an Array from a non-Array data type"),
502
},
503
_ => _deserialize(&[json], dtype, allow_extra_fields_in_struct),
504
}
505
}
506
507
fn check_err_idx<'a>(
508
rows: &[impl Borrow<BorrowedValue<'a>>],
509
err_idx: usize,
510
type_name: &'static str,
511
) -> PolarsResult<()> {
512
if err_idx != rows.len() {
513
polars_bail!(
514
ComputeError:
515
r#"error deserializing value "{:?}" as {}.
516
517
Try increasing `infer_schema_length` or specifying a schema."#,
518
rows[err_idx].borrow(), type_name,
519
)
520
}
521
522
Ok(())
523
}
524
525