Path: blob/main/crates/polars-json/src/json/deserialize.rs
6939 views
use std::borrow::Borrow;1use std::fmt::Write;23use arrow::array::*;4use arrow::bitmap::BitmapBuilder;5use arrow::datatypes::{ArrowDataType, IntervalUnit};6use arrow::offset::{Offset, Offsets};7use arrow::temporal_conversions;8use arrow::types::NativeType;9use num_traits::NumCast;10use simd_json::{BorrowedValue, StaticNode};1112use super::*;1314const JSON_NULL_VALUE: BorrowedValue = BorrowedValue::Static(StaticNode::Null);1516fn deserialize_boolean_into<'a, A: Borrow<BorrowedValue<'a>>>(17target: &mut MutableBooleanArray,18rows: &[A],19) -> PolarsResult<()> {20let mut err_idx = rows.len();21let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {22BorrowedValue::Static(StaticNode::Bool(v)) => Some(v),23BorrowedValue::Static(StaticNode::Null) => None,24_ => {25err_idx = if err_idx == rows.len() { i } else { err_idx };26None27},28});29target.extend_trusted_len(iter);30check_err_idx(rows, err_idx, "boolean")31}3233fn deserialize_primitive_into<'a, T: NativeType + NumCast, A: Borrow<BorrowedValue<'a>>>(34target: &mut MutablePrimitiveArray<T>,35rows: &[A],36) -> PolarsResult<()> {37let mut err_idx = rows.len();38let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {39BorrowedValue::Static(StaticNode::I64(v)) => T::from(*v),40BorrowedValue::Static(StaticNode::U64(v)) => T::from(*v),41BorrowedValue::Static(StaticNode::F64(v)) => T::from(*v),42BorrowedValue::Static(StaticNode::Bool(v)) => T::from(*v as u8),43BorrowedValue::Static(StaticNode::Null) => None,44_ => {45err_idx = if err_idx == rows.len() { i } else { err_idx };46None47},48});49target.extend_trusted_len(iter);50check_err_idx(rows, err_idx, "numeric")51}5253fn deserialize_binary<'a, A: Borrow<BorrowedValue<'a>>>(54rows: &[A],55) -> PolarsResult<BinaryArray<i64>> {56let mut err_idx = rows.len();57let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {58BorrowedValue::String(v) => Some(v.as_bytes()),59BorrowedValue::Static(StaticNode::Null) => None,60_ => {61err_idx = if err_idx == rows.len() { i } else { err_idx };62None63},64});65let out = BinaryArray::from_trusted_len_iter(iter);66check_err_idx(rows, err_idx, "binary")?;67Ok(out)68}6970fn deserialize_utf8_into<'a, O: Offset, A: Borrow<BorrowedValue<'a>>>(71target: &mut MutableUtf8Array<O>,72rows: &[A],73) -> PolarsResult<()> {74let mut err_idx = rows.len();75let mut scratch = String::new();76for (i, row) in rows.iter().enumerate() {77match row.borrow() {78BorrowedValue::String(v) => target.push(Some(v.as_ref())),79BorrowedValue::Static(StaticNode::Bool(v)) => {80target.push(Some(if *v { "true" } else { "false" }))81},82BorrowedValue::Static(StaticNode::Null) => target.push_null(),83BorrowedValue::Static(node) => {84write!(scratch, "{node}").unwrap();85target.push(Some(scratch.as_str()));86scratch.clear();87},88_ => {89err_idx = if err_idx == rows.len() { i } else { err_idx };90},91}92}93check_err_idx(rows, err_idx, "string")94}9596fn deserialize_utf8view_into<'a, A: Borrow<BorrowedValue<'a>>>(97target: &mut MutableBinaryViewArray<str>,98rows: &[A],99) -> PolarsResult<()> {100let mut err_idx = rows.len();101let mut scratch = String::new();102for (i, row) in rows.iter().enumerate() {103match row.borrow() {104BorrowedValue::String(v) => target.push_value(v.as_ref()),105BorrowedValue::Static(StaticNode::Bool(v)) => {106target.push_value(if *v { "true" } else { "false" })107},108BorrowedValue::Static(StaticNode::Null) => target.push_null(),109BorrowedValue::Static(node) => {110write!(scratch, "{node}").unwrap();111target.push_value(scratch.as_str());112scratch.clear();113},114_ => {115err_idx = if err_idx == rows.len() { i } else { err_idx };116},117}118}119check_err_idx(rows, err_idx, "string")120}121122fn deserialize_list<'a, A: Borrow<BorrowedValue<'a>>>(123rows: &[A],124dtype: ArrowDataType,125allow_extra_fields_in_struct: bool,126) -> PolarsResult<ListArray<i64>> {127let mut err_idx = rows.len();128let child = ListArray::<i64>::get_child_type(&dtype);129130let mut validity = BitmapBuilder::with_capacity(rows.len());131let mut offsets = Offsets::<i64>::with_capacity(rows.len());132let mut inner = vec![];133rows.iter()134.enumerate()135.for_each(|(i, row)| match row.borrow() {136BorrowedValue::Array(value) => {137inner.extend(value.iter());138validity.push(true);139offsets140.try_push(value.len())141.expect("List offset is too large :/");142},143BorrowedValue::Static(StaticNode::Null) => {144validity.push(false);145offsets.extend_constant(1)146},147value @ (BorrowedValue::Static(_) | BorrowedValue::String(_)) => {148inner.push(value);149validity.push(true);150offsets.try_push(1).expect("List offset is too large :/");151},152_ => {153err_idx = if err_idx == rows.len() { i } else { err_idx };154},155});156157check_err_idx(rows, err_idx, "list")?;158159let values = _deserialize(&inner, child.clone(), allow_extra_fields_in_struct)?;160161Ok(ListArray::<i64>::new(162dtype,163offsets.into(),164values,165validity.into_opt_validity(),166))167}168169fn deserialize_struct<'a, A: Borrow<BorrowedValue<'a>>>(170rows: &[A],171dtype: ArrowDataType,172allow_extra_fields_in_struct: bool,173) -> PolarsResult<StructArray> {174let mut err_idx = rows.len();175let fields = StructArray::get_fields(&dtype);176177let mut out_values = fields178.iter()179.map(|f| (f.name.as_str(), (f.dtype(), vec![])))180.collect::<PlHashMap<_, _>>();181182let mut validity = BitmapBuilder::with_capacity(rows.len());183// Custom error tracker184let mut extra_field = None;185186rows.iter().enumerate().for_each(|(i, row)| {187match row.borrow() {188BorrowedValue::Object(values) => {189let mut n_matched = 0usize;190for (&key, &mut (_, ref mut inner)) in out_values.iter_mut() {191if let Some(v) = values.get(key) {192n_matched += 1;193inner.push(v)194} else {195inner.push(&JSON_NULL_VALUE)196}197}198199validity.push(true);200201if n_matched < values.len() && extra_field.is_none() {202for k in values.keys() {203if !out_values.contains_key(k.as_ref()) {204extra_field = Some(k.as_ref())205}206}207}208},209BorrowedValue::Static(StaticNode::Null) => {210out_values211.iter_mut()212.for_each(|(_, (_, inner))| inner.push(&JSON_NULL_VALUE));213validity.push(false);214},215_ => {216err_idx = if err_idx == rows.len() { i } else { err_idx };217},218};219});220221if let Some(v) = extra_field {222if !allow_extra_fields_in_struct {223polars_bail!(224ComputeError:225"extra field in struct data: {}, consider increasing infer_schema_length, or \226manually specifying the full schema to ignore extra fields",227v228)229}230}231232check_err_idx(rows, err_idx, "struct")?;233234// ensure we collect in the proper order235let values = fields236.iter()237.map(|fld| {238let (dtype, vals) = out_values.get(fld.name.as_str()).unwrap();239_deserialize(vals, (*dtype).clone(), allow_extra_fields_in_struct)240})241.collect::<PolarsResult<Vec<_>>>()?;242243Ok(StructArray::new(244dtype.clone(),245rows.len(),246values,247validity.into_opt_validity(),248))249}250251fn fill_array_from<B, T, A>(252f: fn(&mut MutablePrimitiveArray<T>, &[B]) -> PolarsResult<()>,253dtype: ArrowDataType,254rows: &[B],255) -> PolarsResult<Box<dyn Array>>256where257T: NativeType,258A: From<MutablePrimitiveArray<T>> + Array,259{260let mut array = MutablePrimitiveArray::<T>::with_capacity(rows.len()).to(dtype);261f(&mut array, rows)?;262Ok(Box::new(A::from(array)))263}264265/// A trait describing an array with a backing store that can be preallocated to266/// a given size.267pub(crate) trait Container {268/// Create this array with a given capacity.269fn with_capacity(capacity: usize) -> Self270where271Self: Sized;272}273274impl<O: Offset> Container for MutableBinaryArray<O> {275fn with_capacity(capacity: usize) -> Self {276MutableBinaryArray::with_capacity(capacity)277}278}279280impl Container for MutableBooleanArray {281fn with_capacity(capacity: usize) -> Self {282MutableBooleanArray::with_capacity(capacity)283}284}285286impl Container for MutableFixedSizeBinaryArray {287fn with_capacity(capacity: usize) -> Self {288MutableFixedSizeBinaryArray::with_capacity(capacity, 0)289}290}291292impl Container for MutableBinaryViewArray<str> {293fn with_capacity(capacity: usize) -> Self294where295Self: Sized,296{297MutableBinaryViewArray::with_capacity(capacity)298}299}300301impl<O: Offset, M: MutableArray + Default + 'static> Container for MutableListArray<O, M> {302fn with_capacity(capacity: usize) -> Self {303MutableListArray::with_capacity(capacity)304}305}306307impl<T: NativeType> Container for MutablePrimitiveArray<T> {308fn with_capacity(capacity: usize) -> Self {309MutablePrimitiveArray::with_capacity(capacity)310}311}312313impl<O: Offset> Container for MutableUtf8Array<O> {314fn with_capacity(capacity: usize) -> Self {315MutableUtf8Array::with_capacity(capacity)316}317}318319fn fill_generic_array_from<B, M, A>(320f: fn(&mut M, &[B]) -> PolarsResult<()>,321rows: &[B],322) -> PolarsResult<Box<dyn Array>>323where324M: Container,325A: From<M> + Array,326{327let mut array = M::with_capacity(rows.len());328f(&mut array, rows)?;329Ok(Box::new(A::from(array)))330}331332pub(crate) fn _deserialize<'a, A: Borrow<BorrowedValue<'a>>>(333rows: &[A],334dtype: ArrowDataType,335allow_extra_fields_in_struct: bool,336) -> PolarsResult<Box<dyn Array>> {337match &dtype {338ArrowDataType::Null => {339if let Some(err_idx) = (0..rows.len())340.find(|i| !matches!(rows[*i].borrow(), BorrowedValue::Static(StaticNode::Null)))341{342check_err_idx(rows, err_idx, "null")?;343}344345Ok(Box::new(NullArray::new(dtype, rows.len())))346},347ArrowDataType::Boolean => {348fill_generic_array_from::<_, _, BooleanArray>(deserialize_boolean_into, rows)349},350ArrowDataType::Int8 => {351fill_array_from::<_, _, PrimitiveArray<i8>>(deserialize_primitive_into, dtype, rows)352},353ArrowDataType::Int16 => {354fill_array_from::<_, _, PrimitiveArray<i16>>(deserialize_primitive_into, dtype, rows)355},356ArrowDataType::Int32357| ArrowDataType::Date32358| ArrowDataType::Time32(_)359| ArrowDataType::Interval(IntervalUnit::YearMonth) => {360fill_array_from::<_, _, PrimitiveArray<i32>>(deserialize_primitive_into, dtype, rows)361},362ArrowDataType::Interval(IntervalUnit::DayTime) => {363unimplemented!("There is no natural representation of DayTime in JSON.")364},365ArrowDataType::Int64366| ArrowDataType::Date64367| ArrowDataType::Time64(_)368| ArrowDataType::Duration(_) => {369fill_array_from::<_, _, PrimitiveArray<i64>>(deserialize_primitive_into, dtype, rows)370},371ArrowDataType::Timestamp(tu, tz) => {372let mut err_idx = rows.len();373let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {374BorrowedValue::Static(StaticNode::I64(v)) => Some(*v),375BorrowedValue::String(v) => match (tu, tz) {376(_, None) => {377polars_compute::cast::temporal::utf8_to_naive_timestamp_scalar(v, "%+", tu)378},379(_, Some(tz)) => {380let tz = temporal_conversions::parse_offset(tz.as_str()).unwrap();381temporal_conversions::utf8_to_timestamp_scalar(v, "%+", &tz, tu)382},383},384BorrowedValue::Static(StaticNode::Null) => None,385_ => {386err_idx = if err_idx == rows.len() { i } else { err_idx };387None388},389});390let out = Box::new(Int64Array::from_iter(iter).to(dtype));391check_err_idx(rows, err_idx, "timestamp")?;392Ok(out)393},394ArrowDataType::UInt8 => {395fill_array_from::<_, _, PrimitiveArray<u8>>(deserialize_primitive_into, dtype, rows)396},397ArrowDataType::UInt16 => {398fill_array_from::<_, _, PrimitiveArray<u16>>(deserialize_primitive_into, dtype, rows)399},400ArrowDataType::UInt32 => {401fill_array_from::<_, _, PrimitiveArray<u32>>(deserialize_primitive_into, dtype, rows)402},403ArrowDataType::UInt64 => {404fill_array_from::<_, _, PrimitiveArray<u64>>(deserialize_primitive_into, dtype, rows)405},406ArrowDataType::Float16 => unreachable!(),407ArrowDataType::Float32 => {408fill_array_from::<_, _, PrimitiveArray<f32>>(deserialize_primitive_into, dtype, rows)409},410ArrowDataType::Float64 => {411fill_array_from::<_, _, PrimitiveArray<f64>>(deserialize_primitive_into, dtype, rows)412},413ArrowDataType::LargeUtf8 => {414fill_generic_array_from::<_, _, Utf8Array<i64>>(deserialize_utf8_into, rows)415},416ArrowDataType::Utf8View => {417fill_generic_array_from::<_, _, Utf8ViewArray>(deserialize_utf8view_into, rows)418},419ArrowDataType::LargeList(_) => Ok(Box::new(deserialize_list(420rows,421dtype,422allow_extra_fields_in_struct,423)?)),424ArrowDataType::LargeBinary => Ok(Box::new(deserialize_binary(rows)?)),425ArrowDataType::Struct(_) => Ok(Box::new(deserialize_struct(426rows,427dtype,428allow_extra_fields_in_struct,429)?)),430_ => todo!(),431}432}433434pub fn deserialize(435json: &BorrowedValue,436dtype: ArrowDataType,437allow_extra_fields_in_struct: bool,438) -> PolarsResult<Box<dyn Array>> {439match json {440BorrowedValue::Array(rows) => match dtype {441ArrowDataType::LargeList(inner) => {442_deserialize(rows, inner.dtype, allow_extra_fields_in_struct)443},444_ => todo!("read an Array from a non-Array data type"),445},446_ => _deserialize(&[json], dtype, allow_extra_fields_in_struct),447}448}449450fn check_err_idx<'a>(451rows: &[impl Borrow<BorrowedValue<'a>>],452err_idx: usize,453type_name: &'static str,454) -> PolarsResult<()> {455if err_idx != rows.len() {456polars_bail!(457ComputeError:458r#"error deserializing value "{:?}" as {}. \459Try increasing `infer_schema_length` or specifying a schema.460"#,461rows[err_idx].borrow(), type_name,462)463}464465Ok(())466}467468469