Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-core/src/schema/mod.rs
6940 views
1
use std::fmt::Debug;
2
3
use arrow::bitmap::Bitmap;
4
use polars_utils::pl_str::PlSmallStr;
5
6
use crate::prelude::*;
7
use crate::utils::try_get_supertype;
8
9
pub mod iceberg;
10
11
pub type SchemaRef = Arc<Schema>;
12
pub type Schema = polars_schema::Schema<DataType>;
13
14
pub trait SchemaExt {
15
fn from_arrow_schema(value: &ArrowSchema) -> Self;
16
17
fn get_field(&self, name: &str) -> Option<Field>;
18
19
fn try_get_field(&self, name: &str) -> PolarsResult<Field>;
20
21
fn to_arrow(&self, compat_level: CompatLevel) -> ArrowSchema;
22
23
fn iter_fields(&self) -> impl ExactSizeIterator<Item = Field> + '_;
24
25
fn to_supertype(&mut self, other: &Schema) -> PolarsResult<bool>;
26
27
/// Select fields using a bitmap.
28
fn project_select(&self, select: &Bitmap) -> Self;
29
}
30
31
impl SchemaExt for Schema {
32
fn from_arrow_schema(value: &ArrowSchema) -> Self {
33
value
34
.iter_values()
35
.map(|x| (x.name.clone(), DataType::from_arrow_field(x)))
36
.collect()
37
}
38
39
/// Look up the name in the schema and return an owned [`Field`] by cloning the data.
40
///
41
/// Returns `None` if the field does not exist.
42
///
43
/// This method constructs the `Field` by cloning the name and dtype. For a version that returns references, see
44
/// [`get`][Self::get] or [`get_full`][Self::get_full].
45
fn get_field(&self, name: &str) -> Option<Field> {
46
self.get_full(name)
47
.map(|(_, name, dtype)| Field::new(name.clone(), dtype.clone()))
48
}
49
50
/// Look up the name in the schema and return an owned [`Field`] by cloning the data.
51
///
52
/// Returns `Err(PolarsErr)` if the field does not exist.
53
///
54
/// This method constructs the `Field` by cloning the name and dtype. For a version that returns references, see
55
/// [`get`][Self::get] or [`get_full`][Self::get_full].
56
fn try_get_field(&self, name: &str) -> PolarsResult<Field> {
57
self.get_full(name)
58
.ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name))
59
.map(|(_, name, dtype)| Field::new(name.clone(), dtype.clone()))
60
}
61
62
/// Convert self to `ArrowSchema` by cloning the fields.
63
fn to_arrow(&self, compat_level: CompatLevel) -> ArrowSchema {
64
self.iter()
65
.map(|(name, dtype)| {
66
(
67
name.clone(),
68
dtype.to_arrow_field(name.clone(), compat_level),
69
)
70
})
71
.collect()
72
}
73
74
/// Iterates the [`Field`]s in this schema, constructing them anew by cloning each `(&name, &dtype)` pair.
75
///
76
/// Note that this clones each name and dtype in order to form an owned [`Field`]. For a clone-free version, use
77
/// [`iter`][Self::iter], which returns `(&name, &dtype)`.
78
fn iter_fields(&self) -> impl ExactSizeIterator<Item = Field> + '_ {
79
self.iter()
80
.map(|(name, dtype)| Field::new(name.clone(), dtype.clone()))
81
}
82
83
/// Take another [`Schema`] and try to find the supertypes between them.
84
fn to_supertype(&mut self, other: &Schema) -> PolarsResult<bool> {
85
polars_ensure!(self.len() == other.len(), ComputeError: "schema lengths differ");
86
87
let mut changed = false;
88
for ((k, dt), (other_k, other_dt)) in self.iter_mut().zip(other.iter()) {
89
polars_ensure!(k == other_k, ComputeError: "schema names differ: got {}, expected {}", k, other_k);
90
91
let st = try_get_supertype(dt, other_dt)?;
92
changed |= (&st != dt) || (&st != other_dt);
93
*dt = st
94
}
95
Ok(changed)
96
}
97
98
fn project_select(&self, select: &Bitmap) -> Self {
99
assert_eq!(self.len(), select.len());
100
self.iter()
101
.zip(select.iter())
102
.filter(|(_, select)| *select)
103
.map(|((n, dt), _)| (n.clone(), dt.clone()))
104
.collect()
105
}
106
}
107
108
pub trait SchemaNamesAndDtypes {
109
const IS_ARROW: bool;
110
type DataType: Debug + Clone + Default + PartialEq;
111
112
fn iter_names_and_dtypes(
113
&self,
114
) -> impl ExactSizeIterator<Item = (&PlSmallStr, &Self::DataType)>;
115
}
116
117
impl SchemaNamesAndDtypes for ArrowSchema {
118
const IS_ARROW: bool = true;
119
type DataType = ArrowDataType;
120
121
fn iter_names_and_dtypes(
122
&self,
123
) -> impl ExactSizeIterator<Item = (&PlSmallStr, &Self::DataType)> {
124
self.iter_values().map(|x| (&x.name, &x.dtype))
125
}
126
}
127
128
impl SchemaNamesAndDtypes for Schema {
129
const IS_ARROW: bool = false;
130
type DataType = DataType;
131
132
fn iter_names_and_dtypes(
133
&self,
134
) -> impl ExactSizeIterator<Item = (&PlSmallStr, &Self::DataType)> {
135
self.iter()
136
}
137
}
138
139
pub fn ensure_matching_schema<D>(
140
lhs: &polars_schema::Schema<D>,
141
rhs: &polars_schema::Schema<D>,
142
) -> PolarsResult<()>
143
where
144
polars_schema::Schema<D>: SchemaNamesAndDtypes,
145
{
146
let lhs = lhs.iter_names_and_dtypes();
147
let rhs = rhs.iter_names_and_dtypes();
148
149
if lhs.len() != rhs.len() {
150
polars_bail!(
151
SchemaMismatch:
152
"schemas contained differing number of columns: {} != {}",
153
lhs.len(), rhs.len(),
154
);
155
}
156
157
for (i, ((l_name, l_dtype), (r_name, r_dtype))) in lhs.zip(rhs).enumerate() {
158
if l_name != r_name {
159
polars_bail!(
160
SchemaMismatch:
161
"schema names differ at index {}: {} != {}",
162
i, l_name, r_name
163
)
164
}
165
if l_dtype != r_dtype
166
&& (!polars_schema::Schema::<D>::IS_ARROW
167
|| unsafe {
168
// For timezone normalization. Easier than writing out the entire PartialEq.
169
DataType::from_arrow_dtype(std::mem::transmute::<
170
&<polars_schema::Schema<D> as SchemaNamesAndDtypes>::DataType,
171
&ArrowDataType,
172
>(l_dtype))
173
!= DataType::from_arrow_dtype(std::mem::transmute::<
174
&<polars_schema::Schema<D> as SchemaNamesAndDtypes>::DataType,
175
&ArrowDataType,
176
>(r_dtype))
177
})
178
{
179
polars_bail!(
180
SchemaMismatch:
181
"schema dtypes differ at index {} for column {}: {:?} != {:?}",
182
i, l_name, l_dtype, r_dtype
183
)
184
}
185
}
186
187
Ok(())
188
}
189
190