// Source: GitHub repository pola-rs/polars
// Path: blob/main/crates/polars-schema/src/schema.rs
1
use core::fmt::Debug;
2
use core::hash::{Hash, Hasher};
3
4
use indexmap::map::MutableKeys;
5
use polars_error::{PolarsError, PolarsResult, polars_bail, polars_ensure, polars_err};
6
use polars_utils::aliases::{InitHashMaps, PlIndexMap};
7
use polars_utils::pl_str::PlSmallStr;
8
9
#[derive(Debug, Clone)]
10
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
11
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
12
pub struct Schema<Field, Metadata> {
13
fields: PlIndexMap<PlSmallStr, Field>,
14
metadata: Metadata,
15
}
16
17
impl<Field, Metadata: Default> Default for Schema<Field, Metadata> {
18
fn default() -> Self {
19
Self {
20
fields: PlIndexMap::default(),
21
metadata: Metadata::default(),
22
}
23
}
24
}
25
26
// Marker impl only: the comparison itself is defined by the `PartialEq` impl for `Schema`.
impl<Field: Eq, Metadata: Eq> Eq for Schema<Field, Metadata> {}
27
28
impl<Field, Metadata: Default> Schema<Field, Metadata> {
29
pub fn with_capacity(capacity: usize) -> Self {
30
let fields = PlIndexMap::with_capacity(capacity);
31
Self {
32
fields,
33
metadata: Metadata::default(),
34
}
35
}
36
37
pub fn from_iter_check_duplicates<I, F>(iter: I) -> PolarsResult<Self>
38
where
39
I: IntoIterator<Item = F>,
40
F: Into<(PlSmallStr, Field)>,
41
{
42
Self::try_from_iter_check_duplicates(
43
iter.into_iter().map(PolarsResult::Ok),
44
|name: &str| polars_err!(Duplicate: "duplicate name when building schema '{}'", &name),
45
)
46
}
47
48
pub fn try_from_iter_check_duplicates<I, F, E>(iter: I, err_func: E) -> PolarsResult<Self>
49
where
50
I: IntoIterator<Item = PolarsResult<F>>,
51
F: Into<(PlSmallStr, Field)>,
52
E: Fn(&str) -> PolarsError,
53
{
54
let iter = iter.into_iter();
55
let mut slf = Self::with_capacity(iter.size_hint().1.unwrap_or(0));
56
57
for v in iter {
58
let (name, d) = v?.into();
59
60
if slf.contains(&name) {
61
return Err(err_func(&name));
62
}
63
64
slf.fields.insert(name, d);
65
}
66
67
Ok(slf)
68
}
69
}
70
71
impl<Field, Metadata> Schema<Field, Metadata> {
72
/// Reserve `additional` memory spaces in the schema.
73
pub fn reserve(&mut self, additional: usize) {
74
self.fields.reserve(additional);
75
}
76
77
/// The number of fields in the schema.
78
#[inline]
79
pub fn len(&self) -> usize {
80
self.fields.len()
81
}
82
83
#[inline]
84
pub fn is_empty(&self) -> bool {
85
self.fields.is_empty()
86
}
87
88
pub fn metadata(&self) -> &Metadata {
89
&self.metadata
90
}
91
92
pub fn metadata_mut(&mut self) -> &mut Metadata {
93
&mut self.metadata
94
}
95
96
/// Rename field `old` to `new`, and return the (owned) old name.
97
///
98
/// If `old` is not present in the schema, the schema is not modified and `None` is returned. Otherwise the schema
99
/// is updated and `Some(old_name)` is returned.
100
pub fn rename(&mut self, old: &str, new: PlSmallStr) -> Option<PlSmallStr> {
101
// Remove `old`, get the corresponding index and dtype, and move the last item in the map to that position
102
let (old_index, old_name, dtype) = self.fields.swap_remove_full(old)?;
103
// Insert the same dtype under the new name at the end of the map and store that index
104
let (new_index, _) = self.fields.insert_full(new, dtype);
105
// Swap the two indices to move the originally last element back to the end and to move the new element back to
106
// its original position
107
self.fields.swap_indices(old_index, new_index);
108
109
Some(old_name)
110
}
111
112
/// Store `value` under `key` and return the value previously stored under that key, if any.
pub fn insert(&mut self, key: PlSmallStr, value: Field) -> Option<Field> {
    let (_index, previous) = self.fields.insert_full(key, value);
    previous
}
115
116
/// Insert a field with `name` and `dtype` at the given `index` into this schema.
117
///
118
/// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is
119
/// always moved to the given index. Valid indices range from `0` (front of the schema) to `self.len()` (after the
120
/// end of the schema).
121
///
122
/// For a non-mutating version that clones the schema, see [`new_inserting_at_index`][Self::new_inserting_at_index].
123
///
124
/// Runtime: **O(n)** where `n` is the number of fields in the schema.
125
///
126
/// Returns:
127
/// - If index is out of bounds, `Err(PolarsError)`
128
/// - Else if `name` was already in the schema, `Ok(Some(old_dtype))`
129
/// - Else `Ok(None)`
130
pub fn insert_at_index(
131
&mut self,
132
mut index: usize,
133
name: PlSmallStr,
134
dtype: Field,
135
) -> PolarsResult<Option<Field>> {
136
polars_ensure!(
137
index <= self.len(),
138
OutOfBounds:
139
"index {} is out of bounds for schema with length {} (the max index allowed is self.len())",
140
index,
141
self.len()
142
);
143
144
let (old_index, old_dtype) = self.fields.insert_full(name, dtype);
145
146
// If we're moving an existing field, one-past-the-end will actually be out of bounds. Also, self.len() won't
147
// have changed after inserting, so `index == self.len()` is the same as it was before inserting.
148
if old_dtype.is_some() && index == self.len() {
149
index -= 1;
150
}
151
self.fields.move_index(old_index, index);
152
Ok(old_dtype)
153
}
154
155
/// Get a reference to the dtype of the field named `name`, or `None` if the field doesn't exist.
156
pub fn get(&self, name: &str) -> Option<&Field> {
157
self.fields.get(name)
158
}
159
160
/// Get a mutable reference to the dtype of the field named `name`, or `None` if the field doesn't exist.
161
pub fn get_mut(&mut self, name: &str) -> Option<&mut Field> {
162
self.fields.get_mut(name)
163
}
164
165
/// Get a reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist.
166
pub fn try_get(&self, name: &str) -> PolarsResult<&Field> {
167
self.get(name)
168
.ok_or_else(|| polars_err!(SchemaFieldNotFound: "{name}"))
169
}
170
171
/// Get a mutable reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist.
172
pub fn try_get_mut(&mut self, name: &str) -> PolarsResult<&mut Field> {
173
self.fields
174
.get_mut(name)
175
.ok_or_else(|| polars_err!(SchemaFieldNotFound: "{name}"))
176
}
177
178
/// Return all data about the field named `name`: its index in the schema, its name, and its dtype.
179
///
180
/// Returns `Some((index, &name, &dtype))` if the field exists, `None` if it doesn't.
181
pub fn get_full(&self, name: &str) -> Option<(usize, &PlSmallStr, &Field)> {
182
self.fields.get_full(name)
183
}
184
185
/// Return all data about the field named `name`: its index in the schema, its name, and its dtype.
186
///
187
/// Returns `Ok((index, &name, &dtype))` if the field exists, `Err(PolarsErr)` if it doesn't.
188
pub fn try_get_full(&self, name: &str) -> PolarsResult<(usize, &PlSmallStr, &Field)> {
189
self.fields
190
.get_full(name)
191
.ok_or_else(|| polars_err!(SchemaFieldNotFound: "{name}"))
192
}
193
194
/// Get references to the name and dtype of the field at `index`.
195
///
196
/// If `index` is inbounds, returns `Some((&name, &dtype))`, else `None`. See
197
/// [`get_at_index_mut`][Self::get_at_index_mut] for a mutable version.
198
pub fn get_at_index(&self, index: usize) -> Option<(&PlSmallStr, &Field)> {
199
self.fields.get_index(index)
200
}
201
202
pub fn try_get_at_index(&self, index: usize) -> PolarsResult<(&PlSmallStr, &Field)> {
203
self.fields.get_index(index).ok_or_else(|| polars_err!(ComputeError: "index {index} out of bounds with 'schema' of len: {}", self.len()))
204
}
205
206
/// Get mutable references to the name and dtype of the field at `index`.
207
///
208
/// If `index` is inbounds, returns `Some((&mut name, &mut dtype))`, else `None`. See
209
/// [`get_at_index`][Self::get_at_index] for an immutable version.
210
pub fn get_at_index_mut(&mut self, index: usize) -> Option<(&mut PlSmallStr, &mut Field)> {
211
self.fields.get_index_mut2(index)
212
}
213
214
/// Swap-remove a field by name and, if the field existed, return its dtype.
215
///
216
/// If the field does not exist, the schema is not modified and `None` is returned.
217
///
218
/// This method does a `swap_remove`, which is O(1) but **changes the order of the schema**: the field named `name`
219
/// is replaced by the last field, which takes its position. For a slower, but order-preserving, method, use
220
/// [`shift_remove`][Self::shift_remove].
221
pub fn remove(&mut self, name: &str) -> Option<Field> {
222
self.fields.swap_remove(name)
223
}
224
225
/// Remove a field by name, preserving order, and, if the field existed, return its dtype.
226
///
227
/// If the field does not exist, the schema is not modified and `None` is returned.
228
///
229
/// This method does a `shift_remove`, which preserves the order of the fields in the schema but **is O(n)**. For a
230
/// faster, but not order-preserving, method, use [`remove`][Self::remove].
231
pub fn shift_remove(&mut self, name: &str) -> Option<Field> {
232
self.fields.shift_remove(name)
233
}
234
235
/// Remove a field by name, preserving order, and, if the field existed, return its dtype.
236
///
237
/// If the field does not exist, the schema is not modified and `None` is returned.
238
///
239
/// This method does a `shift_remove`, which preserves the order of the fields in the schema but **is O(n)**. For a
240
/// faster, but not order-preserving, method, use [`remove`][Self::remove].
241
pub fn shift_remove_index(&mut self, index: usize) -> Option<(PlSmallStr, Field)> {
242
self.fields.shift_remove_index(index)
243
}
244
245
/// Whether the schema contains a field named `name`.
246
pub fn contains(&self, name: &str) -> bool {
247
self.get(name).is_some()
248
}
249
250
/// Change the field named `name` to the given `dtype` and return the previous dtype.
251
///
252
/// If `name` doesn't already exist in the schema, the schema is not modified and `None` is returned. Otherwise
253
/// returns `Some(old_dtype)`.
254
///
255
/// This method only ever modifies an existing field and never adds a new field to the schema. To add a new field,
256
/// use [`with_column`][Self::with_column] or [`insert_at_index`][Self::insert_at_index].
257
pub fn set_dtype(&mut self, name: &str, dtype: Field) -> Option<Field> {
258
let old_dtype = self.fields.get_mut(name)?;
259
Some(std::mem::replace(old_dtype, dtype))
260
}
261
262
/// Change the field at the given index to the given `dtype` and return the previous dtype.
263
///
264
/// If the index is out of bounds, the schema is not modified and `None` is returned. Otherwise returns
265
/// `Some(old_dtype)`.
266
///
267
/// This method only ever modifies an existing index and never adds a new field to the schema. To add a new field,
268
/// use [`with_column`][Self::with_column] or [`insert_at_index`][Self::insert_at_index].
269
pub fn set_dtype_at_index(&mut self, index: usize, dtype: Field) -> Option<Field> {
270
let (_, old_dtype) = self.fields.get_index_mut(index)?;
271
Some(std::mem::replace(old_dtype, dtype))
272
}
273
274
/// Insert a column into the [`Schema`].
275
///
276
/// If the schema already has this column, this instead updates it with the new value and
277
/// returns the old one. Otherwise, the column is inserted at the end.
278
///
279
/// To enforce the index of the resulting field, use [`insert_at_index`][Self::insert_at_index].
280
pub fn with_column(&mut self, name: PlSmallStr, dtype: Field) -> Option<Field> {
281
self.fields.insert(name, dtype)
282
}
283
284
/// Raises DuplicateError if this column already exists in the schema.
285
pub fn try_insert(&mut self, name: PlSmallStr, value: Field) -> PolarsResult<()> {
286
if self.fields.contains_key(&name) {
287
polars_bail!(Duplicate: "column '{}' is duplicate", name)
288
}
289
290
self.fields.insert(name, value);
291
292
Ok(())
293
}
294
295
/// Performs [`Schema::try_insert`] for every column.
296
///
297
/// Raises DuplicateError if a column already exists in the schema.
298
pub fn hstack_mut(
299
&mut self,
300
columns: impl IntoIterator<Item = impl Into<(PlSmallStr, Field)>>,
301
) -> PolarsResult<()> {
302
for v in columns {
303
let (k, v) = v.into();
304
self.try_insert(k, v)?;
305
}
306
307
Ok(())
308
}
309
310
/// Performs [`Schema::try_insert`] for every column.
311
///
312
/// Raises DuplicateError if a column already exists in the schema.
313
pub fn hstack(
314
mut self,
315
columns: impl IntoIterator<Item = impl Into<(PlSmallStr, Field)>>,
316
) -> PolarsResult<Self> {
317
self.hstack_mut(columns)?;
318
Ok(self)
319
}
320
321
pub fn sort_by_key<T, F>(&mut self, sort_key: F)
322
where
323
T: Ord,
324
F: FnMut(&PlSmallStr, &Field) -> T,
325
{
326
self.fields.sort_by_key(sort_key);
327
}
328
329
/// Merge `other` into `self`.
330
///
331
/// Merging logic:
332
/// - Fields that occur in `self` but not `other` are unmodified
333
/// - Fields that occur in `other` but not `self` are appended, in order, to the end of `self`
334
/// - Fields that occur in both `self` and `other` are updated with the dtype from `other`, but keep their original
335
/// index
336
pub fn merge(&mut self, other: Self) {
337
self.fields.extend(other.fields)
338
}
339
340
/// Iterates over the `(&name, &dtype)` pairs in this schema.
341
///
342
/// For an owned version, use [`iter_fields`][Self::iter_fields], which clones the data to iterate owned `Field`s
343
pub fn iter(&self) -> impl ExactSizeIterator<Item = (&PlSmallStr, &Field)> + '_ {
344
self.fields.iter()
345
}
346
347
pub fn iter_mut(&mut self) -> impl ExactSizeIterator<Item = (&PlSmallStr, &mut Field)> + '_ {
348
self.fields.iter_mut()
349
}
350
351
/// Iterates over references to the names in this schema.
352
pub fn iter_names(&self) -> impl '_ + ExactSizeIterator<Item = &PlSmallStr> {
353
self.fields.iter().map(|(name, _dtype)| name)
354
}
355
356
pub fn iter_names_cloned(&self) -> impl '_ + ExactSizeIterator<Item = PlSmallStr> {
357
self.iter_names().cloned()
358
}
359
360
/// Iterates over references to the dtypes in this schema.
361
pub fn iter_values(&self) -> impl '_ + ExactSizeIterator<Item = &Field> {
362
self.fields.iter().map(|(_name, dtype)| dtype)
363
}
364
365
pub fn into_iter_values(self) -> impl ExactSizeIterator<Item = Field> {
366
self.fields.into_values()
367
}
368
369
/// Iterates over mut references to the dtypes in this schema.
370
pub fn iter_values_mut(&mut self) -> impl '_ + ExactSizeIterator<Item = &mut Field> {
371
self.fields.iter_mut().map(|(_name, dtype)| dtype)
372
}
373
374
pub fn index_of(&self, name: &str) -> Option<usize> {
375
self.fields.get_index_of(name)
376
}
377
378
pub fn try_index_of(&self, name: &str) -> PolarsResult<usize> {
379
let Some(i) = self.fields.get_index_of(name) else {
380
polars_bail!(
381
ColumnNotFound:
382
"unable to find column {:?}; valid columns: {:?}",
383
name, self.iter_names().collect::<Vec<_>>(),
384
)
385
};
386
387
Ok(i)
388
}
389
390
/// Compare the fields between two schema returning the additional columns that each schema has.
391
pub fn field_compare<'a, 'b>(
392
&'a self,
393
other: &'b Self,
394
self_extra: &mut Vec<(usize, (&'a PlSmallStr, &'a Field))>,
395
other_extra: &mut Vec<(usize, (&'b PlSmallStr, &'b Field))>,
396
) {
397
self_extra.extend(
398
self.iter()
399
.enumerate()
400
.filter(|(_, (n, _))| !other.contains(n)),
401
);
402
other_extra.extend(
403
other
404
.iter()
405
.enumerate()
406
.filter(|(_, (n, _))| !self.contains(n)),
407
);
408
}
409
}
410
411
impl<Field, Metadata> Schema<Field, Metadata>
412
where
413
Field: Clone,
414
Metadata: Clone,
415
{
416
/// Create a new schema from this one, inserting a field with `name` and `dtype` at the given `index`.
417
///
418
/// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is
419
/// always moved to the given index. Valid indices range from `0` (front of the schema) to `self.len()` (after the
420
/// end of the schema).
421
///
422
/// For a mutating version that doesn't clone, see [`insert_at_index`][Self::insert_at_index].
423
///
424
/// Runtime: **O(m * n)** where `m` is the (average) length of the field names and `n` is the number of fields in
425
/// the schema. This method clones every field in the schema.
426
///
427
/// Returns: `Ok(new_schema)` if `index <= self.len()`, else `Err(PolarsError)`
428
pub fn new_inserting_at_index(
429
&self,
430
index: usize,
431
name: PlSmallStr,
432
field: Field,
433
) -> PolarsResult<Self> {
434
polars_ensure!(
435
index <= self.len(),
436
OutOfBounds:
437
"index {} is out of bounds for schema with length {} (the max index allowed is self.len())",
438
index,
439
self.len()
440
);
441
442
let mut new = Self {
443
fields: Default::default(),
444
metadata: self.metadata().clone(),
445
};
446
let mut iter = self.fields.iter().filter_map(|(fld_name, dtype)| {
447
(fld_name != &name).then_some((fld_name.clone(), dtype.clone()))
448
});
449
new.fields.extend(iter.by_ref().take(index));
450
new.fields.insert(name.clone(), field);
451
new.fields.extend(iter);
452
Ok(new)
453
}
454
455
/// Merge borrowed `other` into `self`.
456
///
457
/// Merging logic:
458
/// - Fields that occur in `self` but not `other` are unmodified
459
/// - Fields that occur in `other` but not `self` are appended, in order, to the end of `self`
460
/// - Fields that occur in both `self` and `other` are updated with the dtype from `other`, but keep their original
461
/// index
462
pub fn merge_from_ref(&mut self, other: &Self) {
463
self.fields.extend(
464
other
465
.iter()
466
.map(|(column, field)| (column.clone(), field.clone())),
467
)
468
}
469
470
/// Generates another schema with just the specified columns selected from this one.
471
pub fn try_project<I>(&self, columns: I) -> PolarsResult<Self>
472
where
473
I: IntoIterator,
474
I::Item: AsRef<str>,
475
{
476
let fields = columns
477
.into_iter()
478
.map(|c| {
479
let name = c.as_ref();
480
let (_, name, dtype) = self
481
.fields
482
.get_full(name)
483
.ok_or_else(|| polars_err!(col_not_found = name))?;
484
PolarsResult::Ok((name.clone(), dtype.clone()))
485
})
486
.collect::<PolarsResult<PlIndexMap<PlSmallStr, _>>>()?;
487
Ok(Self {
488
fields,
489
metadata: self.metadata().clone(),
490
})
491
}
492
493
pub fn try_project_indices(&self, indices: &[usize]) -> PolarsResult<Self> {
494
let fields = indices
495
.iter()
496
.map(|&i| {
497
let Some((k, v)) = self.fields.get_index(i) else {
498
polars_bail!(
499
SchemaFieldNotFound:
500
"projection index {} is out of bounds for schema of length {}",
501
i, self.fields.len()
502
);
503
};
504
505
Ok((k.clone(), v.clone()))
506
})
507
.collect::<PolarsResult<PlIndexMap<_, _>>>()?;
508
509
Ok(Self {
510
fields,
511
metadata: self.metadata().clone(),
512
})
513
}
514
515
/// Returns a new [`Schema`] with a subset of all fields whose `predicate`
516
/// evaluates to true.
517
pub fn filter<F: Fn(usize, &Field) -> bool>(self, predicate: F) -> Self {
518
let metadata = self.metadata().clone();
519
let fields = self
520
.fields
521
.into_iter()
522
.enumerate()
523
.filter_map(|(index, (name, d))| {
524
if (predicate)(index, &d) {
525
Some((name, d))
526
} else {
527
None
528
}
529
})
530
.collect();
531
532
Self { fields, metadata }
533
}
534
}
535
536
impl<Field: Hash, Metadata: Hash> Hash for Schema<Field, Metadata> {
537
fn hash<H: Hasher>(&self, state: &mut H) {
538
Hash::hash(&SchemaHashEqWrap::from(self), state)
539
}
540
}
541
542
// Schemas will only compare equal if they have the same fields in the same order. We can't use `self.inner ==
543
// other.inner` because [`IndexMap`] ignores order when checking equality, but we don't want to ignore it.
544
impl<Field: PartialEq, Metadata: PartialEq> PartialEq for Schema<Field, Metadata> {
545
fn eq(&self, other: &Self) -> bool {
546
PartialEq::eq(
547
&SchemaHashEqWrap::from(self),
548
&SchemaHashEqWrap::from(other),
549
)
550
}
551
}
552
553
/// Specialization
554
/// * `IndexMap` eq impl does not consider key ordering, but we want key ordering.
555
/// * `IndexMap` does not impl Hash.
556
#[derive(Hash, PartialEq)]
557
struct SchemaHashEqWrap<'a, Field, Metadata> {
558
fields: &'a indexmap::map::Slice<PlSmallStr, Field>,
559
metadata: &'a Metadata,
560
}
561
562
impl<'a, Field, Metadata> From<&'a Schema<Field, Metadata>>
563
for SchemaHashEqWrap<'a, Field, Metadata>
564
{
565
fn from(value: &'a Schema<Field, Metadata>) -> Self {
566
let Schema { fields, metadata } = value;
567
568
Self {
569
fields: fields.as_slice(),
570
metadata,
571
}
572
}
573
}
574
575
/// Wrap an existing field map as a schema with `Metadata::default()`.
impl<Field, Metadata: Default> From<PlIndexMap<PlSmallStr, Field>> for Schema<Field, Metadata> {
    fn from(fields: PlIndexMap<PlSmallStr, Field>) -> Self {
        let metadata = Default::default();
        Self { fields, metadata }
    }
}
583
584
impl<F, Field, Metadata: Default> FromIterator<F> for Schema<Field, Metadata>
585
where
586
F: Into<(PlSmallStr, Field)>,
587
{
588
fn from_iter<I: IntoIterator<Item = F>>(iter: I) -> Self {
589
let fields = PlIndexMap::from_iter(iter.into_iter().map(|x| x.into()));
590
Self {
591
fields,
592
metadata: Metadata::default(),
593
}
594
}
595
}
596
597
impl<F, Field, Metadata> Extend<F> for Schema<Field, Metadata>
598
where
599
F: Into<(PlSmallStr, Field)>,
600
{
601
fn extend<T: IntoIterator<Item = F>>(&mut self, iter: T) {
602
self.fields.extend(iter.into_iter().map(|x| x.into()))
603
}
604
}
605
606
impl<Field, Metadata> IntoIterator for Schema<Field, Metadata> {
607
type IntoIter = <PlIndexMap<PlSmallStr, Field> as IntoIterator>::IntoIter;
608
type Item = (PlSmallStr, Field);
609
610
fn into_iter(self) -> Self::IntoIter {
611
self.fields.into_iter()
612
}
613
}
614
615
#[cfg(test)]
mod tests {
    use super::Schema;

    /// Schemas holding the same fields in a different order must not compare equal.
    #[test]
    fn test_schema_eq_checks_key_order() {
        let forward: Schema<(), ()> = Schema::from_iter([("a".into(), ()), ("b".into(), ())]);
        let reversed: Schema<(), ()> = Schema::from_iter([("b".into(), ()), ("a".into(), ())]);

        assert_ne!(forward, reversed);
    }
}
627
628