Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-json/src/json/infer_schema.rs
8427 views
1
use std::borrow::Borrow;
2
3
use arrow::datatypes::{ArrowDataType, Field};
4
use indexmap::map::Entry;
5
use polars_utils::pl_str::PlSmallStr;
6
use simd_json::borrowed::Object;
7
use simd_json::{BorrowedValue, StaticNode};
8
9
use super::*;
10
11
const ITEM_NAME: &str = "item";
12
13
/// Infers [`ArrowDataType`] from [`Value`][Value].
14
///
15
/// [Value]: simd_json::value::Value
16
pub fn infer(json: &BorrowedValue) -> PolarsResult<ArrowDataType> {
17
Ok(match json {
18
BorrowedValue::Static(StaticNode::Bool(_)) => ArrowDataType::Boolean,
19
BorrowedValue::Static(StaticNode::I64(_)) => ArrowDataType::Int64,
20
BorrowedValue::Static(StaticNode::U64(x)) if *x <= i64::MAX as u64 => ArrowDataType::Int64,
21
BorrowedValue::Static(StaticNode::U64(_) | StaticNode::U128(_) | StaticNode::I128(_)) => {
22
ArrowDataType::Int128
23
},
24
BorrowedValue::Static(StaticNode::F64(_)) => ArrowDataType::Float64,
25
BorrowedValue::Static(StaticNode::Null) => ArrowDataType::Null,
26
BorrowedValue::Array(array) => infer_array(array)?,
27
BorrowedValue::String(_) => ArrowDataType::LargeUtf8,
28
BorrowedValue::Object(inner) => infer_object(inner)?,
29
})
30
}
31
32
fn infer_object(inner: &Object) -> PolarsResult<ArrowDataType> {
33
let fields = inner
34
.iter()
35
.map(|(key, value)| infer(value).map(|dt| (key, dt)))
36
.map(|maybe_dt| {
37
let (key, dt) = maybe_dt?;
38
Ok(Field::new(key.as_ref().into(), dt, true))
39
})
40
.collect::<PolarsResult<Vec<_>>>()?;
41
Ok(ArrowDataType::Struct(fields))
42
}
43
44
fn infer_array(values: &[BorrowedValue]) -> PolarsResult<ArrowDataType> {
45
let types = values
46
.iter()
47
.map(infer)
48
// deduplicate entries
49
.collect::<PolarsResult<PlIndexSet<_>>>()?;
50
51
let dt = if !types.is_empty() {
52
let types = types.into_iter().collect::<Vec<_>>();
53
coerce_dtype(&types)
54
} else {
55
ArrowDataType::Null
56
};
57
58
Ok(ArrowDataType::LargeList(Box::new(Field::new(
59
PlSmallStr::from_static(ITEM_NAME),
60
dt,
61
true,
62
))))
63
}
64
65
/// Coerce an heterogeneous set of [`ArrowDataType`] into a single one. Rules:
66
/// * The empty set is coerced to `Null`
67
/// * `Int64` and `Float64` are `Float64`
68
/// * Lists and scalars are coerced to a list of a compatible scalar
69
/// * Structs contain the union of all fields
70
/// * All other types are coerced to `Utf8`
71
pub(crate) fn coerce_dtype<A: Borrow<ArrowDataType>>(datatypes: &[A]) -> ArrowDataType {
72
use ArrowDataType::*;
73
74
if datatypes.is_empty() {
75
return Null;
76
}
77
78
let are_all_equal = datatypes.windows(2).all(|w| w[0].borrow() == w[1].borrow());
79
80
if are_all_equal {
81
return datatypes[0].borrow().clone();
82
}
83
let mut are_all_structs = true;
84
let mut are_all_lists = true;
85
for dt in datatypes {
86
are_all_structs &= matches!(dt.borrow(), Struct(_));
87
are_all_lists &= matches!(dt.borrow(), LargeList(_));
88
}
89
90
if are_all_structs {
91
// all are structs => union of all fields (that may have equal names)
92
let fields = datatypes.iter().fold(vec![], |mut acc, dt| {
93
if let Struct(new_fields) = dt.borrow() {
94
acc.extend(new_fields);
95
};
96
acc
97
});
98
// group fields by unique
99
let fields = fields.iter().fold(
100
PlIndexMap::<&str, PlHashSet<&ArrowDataType>>::default(),
101
|mut acc, field| {
102
match acc.entry(field.name.as_str()) {
103
Entry::Occupied(mut v) => {
104
v.get_mut().insert(&field.dtype);
105
},
106
Entry::Vacant(v) => {
107
let mut a = PlHashSet::default();
108
a.insert(&field.dtype);
109
v.insert(a);
110
},
111
}
112
acc
113
},
114
);
115
// and finally, coerce each of the fields within the same name
116
let fields = fields
117
.into_iter()
118
.map(|(name, dts)| {
119
let dts = dts.into_iter().collect::<Vec<_>>();
120
Field::new(name.into(), coerce_dtype(&dts), true)
121
})
122
.collect();
123
return Struct(fields);
124
} else if are_all_lists {
125
let inner_types: Vec<&ArrowDataType> = datatypes
126
.iter()
127
.map(|dt| {
128
if let LargeList(inner) = dt.borrow() {
129
inner.dtype()
130
} else {
131
unreachable!();
132
}
133
})
134
.collect();
135
return LargeList(Box::new(Field::new(
136
PlSmallStr::from_static(ITEM_NAME),
137
coerce_dtype(inner_types.as_slice()),
138
true,
139
)));
140
} else if datatypes.len() > 2 {
141
return datatypes
142
.iter()
143
.map(|t| t.borrow().clone())
144
.reduce(|a, b| coerce_dtype(&[a, b]))
145
.expect("not empty");
146
}
147
let (lhs, rhs) = (datatypes[0].borrow(), datatypes[1].borrow());
148
149
match (lhs, rhs) {
150
(lhs, rhs) if lhs == rhs => lhs.clone(),
151
(LargeList(lhs), LargeList(rhs)) => {
152
let inner = coerce_dtype(&[lhs.dtype(), rhs.dtype()]);
153
LargeList(Box::new(Field::new(
154
PlSmallStr::from_static(ITEM_NAME),
155
inner,
156
true,
157
)))
158
},
159
(scalar, LargeList(list)) => {
160
let inner = coerce_dtype(&[scalar, list.dtype()]);
161
LargeList(Box::new(Field::new(
162
PlSmallStr::from_static(ITEM_NAME),
163
inner,
164
true,
165
)))
166
},
167
(LargeList(list), scalar) => {
168
let inner = coerce_dtype(&[scalar, list.dtype()]);
169
LargeList(Box::new(Field::new(
170
PlSmallStr::from_static(ITEM_NAME),
171
inner,
172
true,
173
)))
174
},
175
(Float64, Int64) => Float64,
176
(Int64, Float64) => Float64,
177
(Int64, Boolean) => Int64,
178
(Boolean, Int64) => Int64,
179
(Null, rhs) => rhs.clone(),
180
(lhs, Null) => lhs.clone(),
181
(_, _) => LargeUtf8,
182
}
183
}
184
185