Path: blob/main/crates/polars-json/src/json/deserialize.rs
8420 views
use std::borrow::Borrow;1use std::fmt::Write;23use arrow::array::*;4use arrow::bitmap::BitmapBuilder;5use arrow::datatypes::{ArrowDataType, IntervalUnit};6use arrow::offset::{Offset, Offsets};7use arrow::temporal_conversions;8use arrow::types::NativeType;9use num_traits::NumCast;10#[cfg(feature = "dtype-decimal")]11use polars_compute::decimal::{f64_to_dec128, i128_to_dec128, str_to_dec128};12use polars_utils::float16::pf16;13use simd_json::{BorrowedValue, StaticNode};1415use super::*;1617const JSON_NULL_VALUE: BorrowedValue = BorrowedValue::Static(StaticNode::Null);1819fn deserialize_boolean_into<'a, A: Borrow<BorrowedValue<'a>>>(20target: &mut MutableBooleanArray,21rows: &[A],22) -> PolarsResult<()> {23let mut err_idx = rows.len();24let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {25BorrowedValue::Static(StaticNode::Bool(v)) => Some(v),26BorrowedValue::Static(StaticNode::Null) => None,27_ => {28err_idx = if err_idx == rows.len() { i } else { err_idx };29None30},31});32target.extend_trusted_len(iter);33check_err_idx(rows, err_idx, "boolean")34}3536fn deserialize_primitive_into<'a, T: NativeType + NumCast, A: Borrow<BorrowedValue<'a>>>(37target: &mut MutablePrimitiveArray<T>,38rows: &[A],39) -> PolarsResult<()> {40let mut err_idx = rows.len();41let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {42BorrowedValue::Static(StaticNode::I64(v)) => T::from(*v),43BorrowedValue::Static(StaticNode::U64(v)) => T::from(*v),44BorrowedValue::Static(StaticNode::I128(v)) => T::from(*v),45BorrowedValue::Static(StaticNode::U128(v)) => T::from(*v),46BorrowedValue::Static(StaticNode::F64(v)) => T::from(*v),47BorrowedValue::Static(StaticNode::Bool(v)) => T::from(*v as u8),48BorrowedValue::Static(StaticNode::Null) => None,49_ => {50err_idx = if err_idx == rows.len() { i } else { err_idx };51None52},53});54target.extend_trusted_len(iter);55check_err_idx(rows, err_idx, "numeric")56}5758#[cfg(feature = "dtype-decimal")]59fn deserialize_decimal<'a, A: Borrow<BorrowedValue<'a>>>(60rows: &[A],61dtype: ArrowDataType,62) -> PolarsResult<Int128Array> {63let ArrowDataType::Decimal(prec, scale) = dtype else {64unreachable!()65};66let mut err_idx = rows.len();67let iter = rows.iter().enumerate().map(|(i, row)| {68let decode = match row.borrow() {69BorrowedValue::Static(StaticNode::I64(v)) => i128_to_dec128(*v as i128, prec, scale),70BorrowedValue::Static(StaticNode::U64(v)) => i128_to_dec128(*v as i128, prec, scale),71BorrowedValue::Static(StaticNode::I128(v)) => i128_to_dec128(*v, prec, scale),72BorrowedValue::Static(StaticNode::U128(v)) => i128::try_from(*v)73.ok()74.and_then(|v| i128_to_dec128(v, prec, scale)),75BorrowedValue::Static(StaticNode::F64(v)) => f64_to_dec128(*v, prec, scale),76BorrowedValue::String(s) => str_to_dec128(s.as_bytes(), prec, scale, false),77BorrowedValue::Static(StaticNode::Null) => return None,78_ => None,79};80if decode.is_none() && err_idx == rows.len() {81err_idx = i;82}83decode84});8586let arr = Int128Array::from_trusted_len_iter(iter);87if err_idx != rows.len() {88polars_bail!(89ComputeError:90r#"error deserializing value "{:?}" as Decimal({prec}, {scale}).9192Try increasing `infer_schema_length` or specifying a schema."#,93rows[err_idx].borrow()94)95}96Ok(arr.to(dtype))97}9899fn deserialize_binary<'a, A: Borrow<BorrowedValue<'a>>>(100rows: &[A],101) -> PolarsResult<BinaryArray<i64>> {102let mut err_idx = rows.len();103let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {104BorrowedValue::String(v) => Some(v.as_bytes()),105BorrowedValue::Static(StaticNode::Null) => None,106_ => {107err_idx = if err_idx == rows.len() { i } else { err_idx };108None109},110});111let out = BinaryArray::from_trusted_len_iter(iter);112check_err_idx(rows, err_idx, "binary")?;113Ok(out)114}115116fn deserialize_utf8_into<'a, O: Offset, A: Borrow<BorrowedValue<'a>>>(117target: &mut MutableUtf8Array<O>,118rows: &[A],119) -> PolarsResult<()> {120let mut err_idx = rows.len();121let mut scratch = String::new();122for (i, row) in rows.iter().enumerate() {123match row.borrow() {124BorrowedValue::String(v) => target.push(Some(v.as_ref())),125BorrowedValue::Static(StaticNode::Bool(v)) => {126target.push(Some(if *v { "true" } else { "false" }))127},128BorrowedValue::Static(StaticNode::Null) => target.push_null(),129BorrowedValue::Static(node) => {130write!(scratch, "{node}").unwrap();131target.push(Some(scratch.as_str()));132scratch.clear();133},134_ => {135err_idx = if err_idx == rows.len() { i } else { err_idx };136},137}138}139check_err_idx(rows, err_idx, "string")140}141142fn deserialize_utf8view_into<'a, A: Borrow<BorrowedValue<'a>>>(143target: &mut MutableBinaryViewArray<str>,144rows: &[A],145) -> PolarsResult<()> {146let mut err_idx = rows.len();147let mut scratch = String::new();148for (i, row) in rows.iter().enumerate() {149match row.borrow() {150BorrowedValue::String(v) => target.push_value(v.as_ref()),151BorrowedValue::Static(StaticNode::Bool(v)) => {152target.push_value(if *v { "true" } else { "false" })153},154BorrowedValue::Static(StaticNode::Null) => target.push_null(),155BorrowedValue::Static(node) => {156write!(scratch, "{node}").unwrap();157target.push_value(scratch.as_str());158scratch.clear();159},160_ => {161err_idx = if err_idx == rows.len() { i } else { err_idx };162},163}164}165check_err_idx(rows, err_idx, "string")166}167168fn deserialize_list<'a, A: Borrow<BorrowedValue<'a>>>(169rows: &[A],170dtype: ArrowDataType,171allow_extra_fields_in_struct: bool,172) -> PolarsResult<ListArray<i64>> {173let mut err_idx = rows.len();174let child = ListArray::<i64>::get_child_type(&dtype);175176let mut validity = BitmapBuilder::with_capacity(rows.len());177let mut offsets = Offsets::<i64>::with_capacity(rows.len());178let mut inner = vec![];179rows.iter()180.enumerate()181.for_each(|(i, row)| match row.borrow() {182BorrowedValue::Array(value) => {183inner.extend(value.iter());184validity.push(true);185offsets186.try_push(value.len())187.expect("List offset is too large :/");188},189BorrowedValue::Static(StaticNode::Null) => {190validity.push(false);191offsets.extend_constant(1)192},193value @ (BorrowedValue::Static(_) | BorrowedValue::String(_)) => {194inner.push(value);195validity.push(true);196offsets.try_push(1).expect("List offset is too large :/");197},198_ => {199err_idx = if err_idx == rows.len() { i } else { err_idx };200},201});202203check_err_idx(rows, err_idx, "list")?;204205let values = _deserialize(&inner, child.clone(), allow_extra_fields_in_struct)?;206207Ok(ListArray::<i64>::new(208dtype,209offsets.into(),210values,211validity.into_opt_validity(),212))213}214215fn deserialize_struct<'a, A: Borrow<BorrowedValue<'a>>>(216rows: &[A],217dtype: ArrowDataType,218allow_extra_fields_in_struct: bool,219) -> PolarsResult<StructArray> {220let mut err_idx = rows.len();221let fields = StructArray::get_fields(&dtype);222223let mut out_values = fields224.iter()225.map(|f| (f.name.as_str(), (f.dtype(), vec![])))226.collect::<PlHashMap<_, _>>();227228let mut validity = BitmapBuilder::with_capacity(rows.len());229// Custom error tracker230let mut extra_field = None;231232rows.iter().enumerate().for_each(|(i, row)| {233match row.borrow() {234BorrowedValue::Object(values) => {235let mut n_matched = 0usize;236for (&key, &mut (_, ref mut inner)) in out_values.iter_mut() {237if let Some(v) = values.get(key) {238n_matched += 1;239inner.push(v)240} else {241inner.push(&JSON_NULL_VALUE)242}243}244245validity.push(true);246247if n_matched < values.len() && extra_field.is_none() {248for k in values.keys() {249if !out_values.contains_key(k.as_ref()) {250extra_field = Some(k.as_ref())251}252}253}254},255BorrowedValue::Static(StaticNode::Null) => {256out_values257.iter_mut()258.for_each(|(_, (_, inner))| inner.push(&JSON_NULL_VALUE));259validity.push(false);260},261_ => {262err_idx = if err_idx == rows.len() { i } else { err_idx };263},264};265});266267if let Some(v) = extra_field {268if !allow_extra_fields_in_struct {269polars_bail!(270ComputeError:271"extra field in struct data: {}, consider increasing infer_schema_length, or \272manually specifying the full schema to ignore extra fields",273v274)275}276}277278check_err_idx(rows, err_idx, "struct")?;279280// ensure we collect in the proper order281let values = fields282.iter()283.map(|fld| {284let (dtype, vals) = out_values.get(fld.name.as_str()).unwrap();285_deserialize(vals, (*dtype).clone(), allow_extra_fields_in_struct)286})287.collect::<PolarsResult<Vec<_>>>()?;288289Ok(StructArray::new(290dtype.clone(),291rows.len(),292values,293validity.into_opt_validity(),294))295}296297fn fill_array_from<B, T, A>(298f: fn(&mut MutablePrimitiveArray<T>, &[B]) -> PolarsResult<()>,299dtype: ArrowDataType,300rows: &[B],301) -> PolarsResult<Box<dyn Array>>302where303T: NativeType,304A: From<MutablePrimitiveArray<T>> + Array,305{306let mut array = MutablePrimitiveArray::<T>::with_capacity(rows.len()).to(dtype);307f(&mut array, rows)?;308Ok(Box::new(A::from(array)))309}310311/// A trait describing an array with a backing store that can be preallocated to312/// a given size.313pub(crate) trait Container {314/// Create this array with a given capacity.315fn with_capacity(capacity: usize) -> Self316where317Self: Sized;318}319320impl<O: Offset> Container for MutableBinaryArray<O> {321fn with_capacity(capacity: usize) -> Self {322MutableBinaryArray::with_capacity(capacity)323}324}325326impl Container for MutableBooleanArray {327fn with_capacity(capacity: usize) -> Self {328MutableBooleanArray::with_capacity(capacity)329}330}331332impl Container for MutableFixedSizeBinaryArray {333fn with_capacity(capacity: usize) -> Self {334MutableFixedSizeBinaryArray::with_capacity(capacity, 0)335}336}337338impl Container for MutableBinaryViewArray<str> {339fn with_capacity(capacity: usize) -> Self340where341Self: Sized,342{343MutableBinaryViewArray::with_capacity(capacity)344}345}346347impl<O: Offset, M: MutableArray + Default + 'static> Container for MutableListArray<O, M> {348fn with_capacity(capacity: usize) -> Self {349MutableListArray::with_capacity(capacity)350}351}352353impl<T: NativeType> Container for MutablePrimitiveArray<T> {354fn with_capacity(capacity: usize) -> Self {355MutablePrimitiveArray::with_capacity(capacity)356}357}358359impl<O: Offset> Container for MutableUtf8Array<O> {360fn with_capacity(capacity: usize) -> Self {361MutableUtf8Array::with_capacity(capacity)362}363}364365fn fill_generic_array_from<B, M, A>(366f: fn(&mut M, &[B]) -> PolarsResult<()>,367rows: &[B],368) -> PolarsResult<Box<dyn Array>>369where370M: Container,371A: From<M> + Array,372{373let mut array = M::with_capacity(rows.len());374f(&mut array, rows)?;375Ok(Box::new(A::from(array)))376}377378pub(crate) fn _deserialize<'a, A: Borrow<BorrowedValue<'a>>>(379rows: &[A],380dtype: ArrowDataType,381allow_extra_fields_in_struct: bool,382) -> PolarsResult<Box<dyn Array>> {383match &dtype {384ArrowDataType::Null => {385if let Some(err_idx) = (0..rows.len())386.find(|i| !matches!(rows[*i].borrow(), BorrowedValue::Static(StaticNode::Null)))387{388check_err_idx(rows, err_idx, "null")?;389}390391Ok(Box::new(NullArray::new(dtype, rows.len())))392},393ArrowDataType::Boolean => {394fill_generic_array_from::<_, _, BooleanArray>(deserialize_boolean_into, rows)395},396ArrowDataType::Int8 => {397fill_array_from::<_, _, PrimitiveArray<i8>>(deserialize_primitive_into, dtype, rows)398},399ArrowDataType::Int16 => {400fill_array_from::<_, _, PrimitiveArray<i16>>(deserialize_primitive_into, dtype, rows)401},402ArrowDataType::Int32403| ArrowDataType::Date32404| ArrowDataType::Time32(_)405| ArrowDataType::Interval(IntervalUnit::YearMonth) => {406fill_array_from::<_, _, PrimitiveArray<i32>>(deserialize_primitive_into, dtype, rows)407},408ArrowDataType::Interval(IntervalUnit::DayTime) => {409unimplemented!("There is no natural representation of DayTime in JSON.")410},411ArrowDataType::Int64412| ArrowDataType::Date64413| ArrowDataType::Time64(_)414| ArrowDataType::Duration(_) => {415fill_array_from::<_, _, PrimitiveArray<i64>>(deserialize_primitive_into, dtype, rows)416},417ArrowDataType::Int128 => {418fill_array_from::<_, _, PrimitiveArray<i128>>(deserialize_primitive_into, dtype, rows)419},420ArrowDataType::Timestamp(tu, tz) => {421let mut err_idx = rows.len();422let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {423BorrowedValue::Static(StaticNode::I64(v)) => Some(*v),424BorrowedValue::String(v) => match (tu, tz) {425(_, None) => {426polars_compute::cast::temporal::utf8_to_naive_timestamp_scalar(v, "%+", tu)427},428(_, Some(tz)) => {429let tz = temporal_conversions::parse_offset(tz.as_str()).unwrap();430temporal_conversions::utf8_to_timestamp_scalar(v, "%+", &tz, tu)431},432},433BorrowedValue::Static(StaticNode::Null) => None,434_ => {435err_idx = if err_idx == rows.len() { i } else { err_idx };436None437},438});439let out = Box::new(Int64Array::from_iter(iter).to(dtype));440check_err_idx(rows, err_idx, "timestamp")?;441Ok(out)442},443ArrowDataType::UInt8 => {444fill_array_from::<_, _, PrimitiveArray<u8>>(deserialize_primitive_into, dtype, rows)445},446ArrowDataType::UInt16 => {447fill_array_from::<_, _, PrimitiveArray<u16>>(deserialize_primitive_into, dtype, rows)448},449ArrowDataType::UInt32 => {450fill_array_from::<_, _, PrimitiveArray<u32>>(deserialize_primitive_into, dtype, rows)451},452ArrowDataType::UInt64 => {453fill_array_from::<_, _, PrimitiveArray<u64>>(deserialize_primitive_into, dtype, rows)454},455ArrowDataType::UInt128 => {456fill_array_from::<_, _, PrimitiveArray<u128>>(deserialize_primitive_into, dtype, rows)457},458ArrowDataType::Float16 => {459fill_array_from::<_, _, PrimitiveArray<pf16>>(deserialize_primitive_into, dtype, rows)460},461ArrowDataType::Float32 => {462fill_array_from::<_, _, PrimitiveArray<f32>>(deserialize_primitive_into, dtype, rows)463},464ArrowDataType::Float64 => {465fill_array_from::<_, _, PrimitiveArray<f64>>(deserialize_primitive_into, dtype, rows)466},467#[cfg(feature = "dtype-decimal")]468ArrowDataType::Decimal(_, _) => Ok(Box::new(deserialize_decimal(rows, dtype)?)),469ArrowDataType::LargeUtf8 => {470fill_generic_array_from::<_, _, Utf8Array<i64>>(deserialize_utf8_into, rows)471},472ArrowDataType::Utf8View => {473fill_generic_array_from::<_, _, Utf8ViewArray>(deserialize_utf8view_into, rows)474},475ArrowDataType::LargeList(_) => Ok(Box::new(deserialize_list(476rows,477dtype,478allow_extra_fields_in_struct,479)?)),480ArrowDataType::LargeBinary => Ok(Box::new(deserialize_binary(rows)?)),481ArrowDataType::Struct(_) => Ok(Box::new(deserialize_struct(482rows,483dtype,484allow_extra_fields_in_struct,485)?)),486adt => unimplemented!("Deserialization from JSON not implemented for {adt:?}"),487}488}489490pub fn deserialize(491json: &BorrowedValue,492dtype: ArrowDataType,493allow_extra_fields_in_struct: bool,494) -> PolarsResult<Box<dyn Array>> {495match json {496BorrowedValue::Array(rows) => match dtype {497ArrowDataType::LargeList(inner) => {498_deserialize(rows, inner.dtype, allow_extra_fields_in_struct)499},500_ => todo!("read an Array from a non-Array data type"),501},502_ => _deserialize(&[json], dtype, allow_extra_fields_in_struct),503}504}505506fn check_err_idx<'a>(507rows: &[impl Borrow<BorrowedValue<'a>>],508err_idx: usize,509type_name: &'static str,510) -> PolarsResult<()> {511if err_idx != rows.len() {512polars_bail!(513ComputeError:514r#"error deserializing value "{:?}" as {}.515516Try increasing `infer_schema_length` or specifying a schema."#,517rows[err_idx].borrow(), type_name,518)519}520521Ok(())522}523524525