Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/chunked_array/strings/json_path.rs
8391 views
1
use std::borrow::Cow;
2
3
use arrow::array::ValueSize;
4
use jsonpath_lib::PathCompiled;
5
use polars_core::prelude::arity::{broadcast_try_binary_elementwise, unary_elementwise};
6
use serde_json::Value;
7
8
use super::*;
9
10
pub fn extract_json(expr: &PathCompiled, json_str: &str) -> Option<String> {
11
serde_json::from_str(json_str).ok().and_then(|value| {
12
// TODO: a lot of heap allocations here. Improve json path by adding a take?
13
let result = expr.select(&value).ok()?;
14
let first = *result.first()?;
15
16
match first {
17
Value::String(s) => Some(s.clone()),
18
Value::Null => None,
19
v => Some(v.to_string()),
20
}
21
})
22
}
23
24
/// Returns a string of the most specific value given the compiled JSON path expression.
25
/// This avoids creating a list to represent individual elements so that they can be
26
/// selected directly.
27
pub fn select_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option<Cow<'a, str>> {
28
serde_json::from_str(json_str).ok().and_then(|value| {
29
// TODO: a lot of heap allocations here. Improve json path by adding a take?
30
let result = expr.select(&value).ok()?;
31
32
let result_str = match result.len() {
33
0 => None,
34
1 => serde_json::to_string(&result[0]).ok(),
35
_ => serde_json::to_string(&result).ok(),
36
};
37
38
result_str.map(Cow::Owned)
39
})
40
}
41
42
pub trait Utf8JsonPathImpl: AsString {
43
/// Extract json path, first match
44
/// Refer to <https://goessner.net/articles/JsonPath/>
45
fn json_path_match(&self, json_path: &StringChunked) -> PolarsResult<StringChunked> {
46
let ca = self.as_string();
47
match (ca.len(), json_path.len()) {
48
(_, 1) => {
49
// SAFETY: `json_path` was verified to have exactly 1 element.
50
let opt_path = unsafe { json_path.get_unchecked(0) };
51
let out = if let Some(path) = opt_path {
52
let pat = PathCompiled::compile(path).map_err(
53
|e| polars_err!(ComputeError: "error compiling JSON path expression {}", e),
54
)?;
55
unary_elementwise(ca, |opt_s| opt_s.and_then(|s| extract_json(&pat, s)))
56
} else {
57
StringChunked::full_null(ca.name().clone(), ca.len())
58
};
59
Ok(out)
60
},
61
(len_ca, len_path) if len_ca == 1 || len_ca == len_path => {
62
broadcast_try_binary_elementwise(ca, json_path, |opt_str, opt_path| {
63
match (opt_str, opt_path) {
64
(Some(str_val), Some(path)) => {
65
PathCompiled::compile(path)
66
.map_err(|e| polars_err!(ComputeError: "error compiling JSON path expression {}", e))
67
.map(|path| extract_json(&path, str_val))
68
},
69
_ => Ok(None),
70
}
71
})
72
},
73
(len_ca, len_path) => {
74
polars_bail!(ComputeError: "The length of `ca` and `json_path` should either 1 or the same, but `{}`, `{}` founded", len_ca, len_path)
75
},
76
}
77
}
78
79
/// Returns the inferred DataType for JSON values for each row
80
/// in the StringChunked, with an optional number of rows to inspect.
81
/// When None is passed for the number of rows, all rows are inspected.
82
fn json_infer(&self, number_of_rows: Option<usize>) -> PolarsResult<DataType> {
83
let ca = self.as_string();
84
let values_iter = ca
85
.iter()
86
.map(|x| x.unwrap_or("null"))
87
.take(number_of_rows.unwrap_or(ca.len()));
88
89
polars_json::ndjson::infer_iter(values_iter)
90
.map(|d| DataType::from_arrow_dtype(&d))
91
.map_err(|e| polars_err!(ComputeError: "error inferring JSON: {}", e))
92
}
93
94
/// Extracts a typed-JSON value for each row in the StringChunked
95
fn json_decode(
96
&self,
97
dtype: Option<DataType>,
98
infer_schema_len: Option<usize>,
99
) -> PolarsResult<Series> {
100
let ca = self.as_string();
101
// Ignore extra fields instead of erroring if the dtype was explicitly given.
102
let allow_extra_fields_in_struct = dtype.is_some();
103
let mut needs_cast = false;
104
let decode_dtype = match &dtype {
105
Some(dt) => dt.clone().map_leaves(&mut |leaf_dt| {
106
match leaf_dt {
107
#[cfg(feature = "dtype-categorical")]
108
DataType::Enum(..) | DataType::Categorical(..) => {
109
// Decode enums and categoricals as string, will cast later.
110
needs_cast = true;
111
DataType::String
112
},
113
leaf_dt => leaf_dt,
114
}
115
}),
116
None => ca.json_infer(infer_schema_len)?,
117
};
118
let buf_size = ca.get_values_size() + ca.null_count() * "null".len();
119
let iter = ca.iter().map(|x| x.unwrap_or("null"));
120
121
let array = polars_json::ndjson::deserialize::deserialize_iter(
122
iter,
123
decode_dtype.to_arrow(CompatLevel::newest()),
124
buf_size,
125
ca.len(),
126
allow_extra_fields_in_struct,
127
)
128
.map_err(|e| polars_err!(ComputeError: "error deserializing JSON: {}", e))?;
129
let s = Series::try_from((PlSmallStr::EMPTY, array))?;
130
if needs_cast {
131
s.strict_cast(&dtype.unwrap())
132
} else {
133
Ok(s)
134
}
135
}
136
137
fn json_path_select(&self, json_path: &str) -> PolarsResult<StringChunked> {
138
let pat = PathCompiled::compile(json_path)
139
.map_err(|e| polars_err!(ComputeError: "error compiling JSONpath expression: {}", e))?;
140
Ok(self
141
.as_string()
142
.apply(|opt_s| opt_s.and_then(|s| select_json(&pat, s))))
143
}
144
145
fn json_path_extract(
146
&self,
147
json_path: &str,
148
dtype: Option<DataType>,
149
infer_schema_len: Option<usize>,
150
) -> PolarsResult<Series> {
151
let selected_json = self.as_string().json_path_select(json_path)?;
152
selected_json.json_decode(dtype, infer_schema_len)
153
}
154
}
155
156
// `StringChunked` gets every `Utf8JsonPathImpl` method from the trait's default
// implementations, so the impl body is intentionally empty.
impl Utf8JsonPathImpl for StringChunked {}
157
158
#[cfg(test)]
mod tests {
    use arrow::bitmap::Bitmap;

    use super::*;

    /// Two nested JSON documents surrounded by null rows, shared by the
    /// path-select and path-extract tests.
    fn nested_json_series() -> Series {
        Series::new(
            "json".into(),
            [
                None,
                Some(r#"{"a":1,"b":[{"c":0},{"c":1}]}"#),
                Some(r#"{"a":2,"b":[{"c":2},{"c":5}]}"#),
                None,
            ],
        )
    }

    #[test]
    fn test_json_select() {
        let json_str = r#"{"a":1,"b":{"c":"hello"},"d":[{"e":0},{"e":2},{"e":null}]}"#;

        // (JSON path, expected serialized selection)
        let cases = [
            ("$", json_str),
            ("$.a", "1"),
            ("$.b.c", r#""hello""#),
            ("$.d[0].e", "0"),
            ("$.d[2].e", "null"),
            ("$.d[:].e", "[0,2,null]"),
        ];

        for (path, expected) in cases {
            let compiled = PathCompiled::compile(path).unwrap();
            assert_eq!(
                select_json(&compiled, json_str),
                Some(Cow::Owned(expected.to_string())),
                "unexpected selection for path {}",
                path
            );
        }
    }

    #[test]
    fn test_json_infer() {
        let rows = [
            None,
            Some(r#"{"a": 1, "b": [{"c": 0}, {"c": 1}]}"#),
            Some(r#"{"a": 2, "b": [{"c": 2}, {"c": 5}]}"#),
            None,
        ];
        let s = Series::new("json".into(), rows);
        let ca = s.str().unwrap();

        let item_dtype = DataType::Struct(vec![Field::new("c".into(), DataType::Int64)]);
        let list_dtype = DataType::List(Box::new(item_dtype));
        let expected_dtype = DataType::Struct(vec![
            Field::new("a".into(), DataType::Int64),
            Field::new("b".into(), list_dtype),
        ]);

        // Inferring from the first row alone only sees the leading null.
        assert_eq!(ca.json_infer(Some(1)).unwrap(), DataType::Null);
        assert_eq!(ca.json_infer(Some(2)).unwrap(), expected_dtype);
        assert_eq!(ca.json_infer(None).unwrap(), expected_dtype);
    }

    #[test]
    fn test_json_decode() {
        let rows = [
            None,
            Some(r#"{"a": 1, "b": "hello"}"#),
            Some(r#"{"a": 2, "b": "goodbye"}"#),
            None,
        ];
        let s = Series::new("json".into(), rows);
        let ca = s.str().unwrap();

        let validity = Bitmap::from_iter([false, true, true, false]);
        let expected_series = StructChunked::from_series(
            "".into(),
            4,
            [
                Series::new("a".into(), &[None, Some(1), Some(2), None]),
                Series::new("b".into(), &[None, Some("hello"), Some("goodbye"), None]),
            ]
            .iter(),
        )
        .unwrap()
        .with_outer_validity(Some(validity))
        .into_series();
        let expected_dtype = expected_series.dtype().clone();

        // Once with an inferred dtype, once with the dtype given explicitly.
        let inferred = ca.json_decode(None, None).unwrap();
        assert!(inferred.equals_missing(&expected_series));

        let explicit = ca.json_decode(Some(expected_dtype), None).unwrap();
        assert!(explicit.equals_missing(&expected_series));
    }

    #[test]
    fn test_json_path_select() {
        let s = nested_json_series();
        let ca = s.str().unwrap();

        // Selecting the root returns every document unchanged.
        let root = ca.json_path_select("$").unwrap().into_series();
        assert!(root.equals_missing(&s));

        let b_series = Series::new(
            "json".into(),
            [
                None,
                Some(r#"[{"c":0},{"c":1}]"#),
                Some(r#"[{"c":2},{"c":5}]"#),
                None,
            ],
        );
        let b_selected = ca.json_path_select("$.b").unwrap().into_series();
        assert!(b_selected.equals_missing(&b_series));

        let c_series = Series::new(
            "json".into(),
            [None, Some(r#"[0,1]"#), Some(r#"[2,5]"#), None],
        );
        let c_selected = ca.json_path_select("$.b[:].c").unwrap().into_series();
        assert!(c_selected.equals_missing(&c_series));
    }

    #[test]
    fn test_json_path_extract() {
        let s = nested_json_series();
        let ca = s.str().unwrap();

        let c_series = Series::new(
            "".into(),
            [
                None,
                Some(Series::new("".into(), &[0, 1])),
                Some(Series::new("".into(), &[2, 5])),
                None,
            ],
        );

        let extracted = ca.json_path_extract("$.b[:].c", None, None).unwrap();
        assert!(extracted.into_series().equals_missing(&c_series));
    }
}
331
332