Path: blob/main/crates/polars-ops/src/chunked_array/strings/json_path.rs
8391 views
use std::borrow::Cow;12use arrow::array::ValueSize;3use jsonpath_lib::PathCompiled;4use polars_core::prelude::arity::{broadcast_try_binary_elementwise, unary_elementwise};5use serde_json::Value;67use super::*;89pub fn extract_json(expr: &PathCompiled, json_str: &str) -> Option<String> {10serde_json::from_str(json_str).ok().and_then(|value| {11// TODO: a lot of heap allocations here. Improve json path by adding a take?12let result = expr.select(&value).ok()?;13let first = *result.first()?;1415match first {16Value::String(s) => Some(s.clone()),17Value::Null => None,18v => Some(v.to_string()),19}20})21}2223/// Returns a string of the most specific value given the compiled JSON path expression.24/// This avoids creating a list to represent individual elements so that they can be25/// selected directly.26pub fn select_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option<Cow<'a, str>> {27serde_json::from_str(json_str).ok().and_then(|value| {28// TODO: a lot of heap allocations here. Improve json path by adding a take?29let result = expr.select(&value).ok()?;3031let result_str = match result.len() {320 => None,331 => serde_json::to_string(&result[0]).ok(),34_ => serde_json::to_string(&result).ok(),35};3637result_str.map(Cow::Owned)38})39}4041pub trait Utf8JsonPathImpl: AsString {42/// Extract json path, first match43/// Refer to <https://goessner.net/articles/JsonPath/>44fn json_path_match(&self, json_path: &StringChunked) -> PolarsResult<StringChunked> {45let ca = self.as_string();46match (ca.len(), json_path.len()) {47(_, 1) => {48// SAFETY: `json_path` was verified to have exactly 1 element.49let opt_path = unsafe { json_path.get_unchecked(0) };50let out = if let Some(path) = opt_path {51let pat = PathCompiled::compile(path).map_err(52|e| polars_err!(ComputeError: "error compiling JSON path expression {}", e),53)?;54unary_elementwise(ca, |opt_s| opt_s.and_then(|s| extract_json(&pat, s)))55} else {56StringChunked::full_null(ca.name().clone(), ca.len())57};58Ok(out)59},60(len_ca, len_path) if len_ca == 1 || len_ca == len_path => {61broadcast_try_binary_elementwise(ca, json_path, |opt_str, opt_path| {62match (opt_str, opt_path) {63(Some(str_val), Some(path)) => {64PathCompiled::compile(path)65.map_err(|e| polars_err!(ComputeError: "error compiling JSON path expression {}", e))66.map(|path| extract_json(&path, str_val))67},68_ => Ok(None),69}70})71},72(len_ca, len_path) => {73polars_bail!(ComputeError: "The length of `ca` and `json_path` should either 1 or the same, but `{}`, `{}` founded", len_ca, len_path)74},75}76}7778/// Returns the inferred DataType for JSON values for each row79/// in the StringChunked, with an optional number of rows to inspect.80/// When None is passed for the number of rows, all rows are inspected.81fn json_infer(&self, number_of_rows: Option<usize>) -> PolarsResult<DataType> {82let ca = self.as_string();83let values_iter = ca84.iter()85.map(|x| x.unwrap_or("null"))86.take(number_of_rows.unwrap_or(ca.len()));8788polars_json::ndjson::infer_iter(values_iter)89.map(|d| DataType::from_arrow_dtype(&d))90.map_err(|e| polars_err!(ComputeError: "error inferring JSON: {}", e))91}9293/// Extracts a typed-JSON value for each row in the StringChunked94fn json_decode(95&self,96dtype: Option<DataType>,97infer_schema_len: Option<usize>,98) -> PolarsResult<Series> {99let ca = self.as_string();100// Ignore extra fields instead of erroring if the dtype was explicitly given.101let allow_extra_fields_in_struct = dtype.is_some();102let mut needs_cast = false;103let decode_dtype = match &dtype {104Some(dt) => dt.clone().map_leaves(&mut |leaf_dt| {105match leaf_dt {106#[cfg(feature = "dtype-categorical")]107DataType::Enum(..) | DataType::Categorical(..) => {108// Decode enums and categoricals as string, will cast later.109needs_cast = true;110DataType::String111},112leaf_dt => leaf_dt,113}114}),115None => ca.json_infer(infer_schema_len)?,116};117let buf_size = ca.get_values_size() + ca.null_count() * "null".len();118let iter = ca.iter().map(|x| x.unwrap_or("null"));119120let array = polars_json::ndjson::deserialize::deserialize_iter(121iter,122decode_dtype.to_arrow(CompatLevel::newest()),123buf_size,124ca.len(),125allow_extra_fields_in_struct,126)127.map_err(|e| polars_err!(ComputeError: "error deserializing JSON: {}", e))?;128let s = Series::try_from((PlSmallStr::EMPTY, array))?;129if needs_cast {130s.strict_cast(&dtype.unwrap())131} else {132Ok(s)133}134}135136fn json_path_select(&self, json_path: &str) -> PolarsResult<StringChunked> {137let pat = PathCompiled::compile(json_path)138.map_err(|e| polars_err!(ComputeError: "error compiling JSONpath expression: {}", e))?;139Ok(self140.as_string()141.apply(|opt_s| opt_s.and_then(|s| select_json(&pat, s))))142}143144fn json_path_extract(145&self,146json_path: &str,147dtype: Option<DataType>,148infer_schema_len: Option<usize>,149) -> PolarsResult<Series> {150let selected_json = self.as_string().json_path_select(json_path)?;151selected_json.json_decode(dtype, infer_schema_len)152}153}154155impl Utf8JsonPathImpl for StringChunked {}156157#[cfg(test)]158mod tests {159use arrow::bitmap::Bitmap;160161use super::*;162163#[test]164fn test_json_select() {165let json_str = r#"{"a":1,"b":{"c":"hello"},"d":[{"e":0},{"e":2},{"e":null}]}"#;166167let compile = |s| PathCompiled::compile(s).unwrap();168let some_cow = |s: &str| Some(Cow::Owned(s.to_string()));169170assert_eq!(select_json(&compile("$"), json_str), some_cow(json_str));171assert_eq!(select_json(&compile("$.a"), json_str), some_cow("1"));172assert_eq!(173select_json(&compile("$.b.c"), json_str),174some_cow(r#""hello""#)175);176assert_eq!(select_json(&compile("$.d[0].e"), json_str), some_cow("0"));177assert_eq!(178select_json(&compile("$.d[2].e"), json_str),179some_cow("null")180);181assert_eq!(182select_json(&compile("$.d[:].e"), json_str),183some_cow("[0,2,null]")184);185}186187#[test]188fn test_json_infer() {189let s = Series::new(190"json".into(),191[192None,193Some(r#"{"a": 1, "b": [{"c": 0}, {"c": 1}]}"#),194Some(r#"{"a": 2, "b": [{"c": 2}, {"c": 5}]}"#),195None,196],197);198let ca = s.str().unwrap();199200let inner_dtype = DataType::Struct(vec![Field::new("c".into(), DataType::Int64)]);201let expected_dtype = DataType::Struct(vec![202Field::new("a".into(), DataType::Int64),203Field::new("b".into(), DataType::List(Box::new(inner_dtype))),204]);205206assert_eq!(ca.json_infer(None).unwrap(), expected_dtype);207// Infereing with the first row will only see None208assert_eq!(ca.json_infer(Some(1)).unwrap(), DataType::Null);209assert_eq!(ca.json_infer(Some(2)).unwrap(), expected_dtype);210}211212#[test]213fn test_json_decode() {214let s = Series::new(215"json".into(),216[217None,218Some(r#"{"a": 1, "b": "hello"}"#),219Some(r#"{"a": 2, "b": "goodbye"}"#),220None,221],222);223let ca = s.str().unwrap();224225let expected_series = StructChunked::from_series(226"".into(),2274,228[229Series::new("a".into(), &[None, Some(1), Some(2), None]),230Series::new("b".into(), &[None, Some("hello"), Some("goodbye"), None]),231]232.iter(),233)234.unwrap()235.with_outer_validity(Some(Bitmap::from_iter([false, true, true, false])))236.into_series();237let expected_dtype = expected_series.dtype().clone();238239assert!(240ca.json_decode(None, None)241.unwrap()242.equals_missing(&expected_series)243);244assert!(245ca.json_decode(Some(expected_dtype), None)246.unwrap()247.equals_missing(&expected_series)248);249}250251#[test]252fn test_json_path_select() {253let s = Series::new(254"json".into(),255[256None,257Some(r#"{"a":1,"b":[{"c":0},{"c":1}]}"#),258Some(r#"{"a":2,"b":[{"c":2},{"c":5}]}"#),259None,260],261);262let ca = s.str().unwrap();263264assert!(265ca.json_path_select("$")266.unwrap()267.into_series()268.equals_missing(&s)269);270271let b_series = Series::new(272"json".into(),273[274None,275Some(r#"[{"c":0},{"c":1}]"#),276Some(r#"[{"c":2},{"c":5}]"#),277None,278],279);280assert!(281ca.json_path_select("$.b")282.unwrap()283.into_series()284.equals_missing(&b_series)285);286287let c_series = Series::new(288"json".into(),289[None, Some(r#"[0,1]"#), Some(r#"[2,5]"#), None],290);291assert!(292ca.json_path_select("$.b[:].c")293.unwrap()294.into_series()295.equals_missing(&c_series)296);297}298299#[test]300fn test_json_path_extract() {301let s = Series::new(302"json".into(),303[304None,305Some(r#"{"a":1,"b":[{"c":0},{"c":1}]}"#),306Some(r#"{"a":2,"b":[{"c":2},{"c":5}]}"#),307None,308],309);310let ca = s.str().unwrap();311312let c_series = Series::new(313"".into(),314[315None,316Some(Series::new("".into(), &[0, 1])),317Some(Series::new("".into(), &[2, 5])),318None,319],320);321322assert!(323ca.json_path_extract("$.b[:].c", None, None)324.unwrap()325.into_series()326.equals_missing(&c_series)327);328}329}330331332