use core::fmt::{Debug, Formatter};1use core::hash::{Hash, Hasher};23use indexmap::map::MutableKeys;4use polars_error::{PolarsError, PolarsResult, polars_bail, polars_ensure, polars_err};5use polars_utils::aliases::{InitHashMaps, PlIndexMap};6use polars_utils::pl_str::PlSmallStr;78#[derive(Clone, Default)]9#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]10#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]11pub struct Schema<D> {12fields: PlIndexMap<PlSmallStr, D>,13}1415impl<D: Eq> Eq for Schema<D> {}1617impl<D> Schema<D> {18pub fn with_capacity(capacity: usize) -> Self {19let fields = PlIndexMap::with_capacity(capacity);20Self { fields }21}2223/// Reserve `additional` memory spaces in the schema.24pub fn reserve(&mut self, additional: usize) {25self.fields.reserve(additional);26}2728/// The number of fields in the schema.29#[inline]30pub fn len(&self) -> usize {31self.fields.len()32}3334#[inline]35pub fn is_empty(&self) -> bool {36self.fields.is_empty()37}3839/// Rename field `old` to `new`, and return the (owned) old name.40///41/// If `old` is not present in the schema, the schema is not modified and `None` is returned. Otherwise the schema42/// is updated and `Some(old_name)` is returned.43pub fn rename(&mut self, old: &str, new: PlSmallStr) -> Option<PlSmallStr> {44// Remove `old`, get the corresponding index and dtype, and move the last item in the map to that position45let (old_index, old_name, dtype) = self.fields.swap_remove_full(old)?;46// Insert the same dtype under the new name at the end of the map and store that index47let (new_index, _) = self.fields.insert_full(new, dtype);48// Swap the two indices to move the originally last element back to the end and to move the new element back to49// its original position50self.fields.swap_indices(old_index, new_index);5152Some(old_name)53}5455pub fn insert(&mut self, key: PlSmallStr, value: D) -> Option<D> {56self.fields.insert(key, value)57}5859/// Insert a field with `name` and `dtype` at the given `index` into this schema.60///61/// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is62/// always moved to the given index. Valid indices range from `0` (front of the schema) to `self.len()` (after the63/// end of the schema).64///65/// For a non-mutating version that clones the schema, see [`new_inserting_at_index`][Self::new_inserting_at_index].66///67/// Runtime: **O(n)** where `n` is the number of fields in the schema.68///69/// Returns:70/// - If index is out of bounds, `Err(PolarsError)`71/// - Else if `name` was already in the schema, `Ok(Some(old_dtype))`72/// - Else `Ok(None)`73pub fn insert_at_index(74&mut self,75mut index: usize,76name: PlSmallStr,77dtype: D,78) -> PolarsResult<Option<D>> {79polars_ensure!(80index <= self.len(),81OutOfBounds:82"index {} is out of bounds for schema with length {} (the max index allowed is self.len())",83index,84self.len()85);8687let (old_index, old_dtype) = self.fields.insert_full(name, dtype);8889// If we're moving an existing field, one-past-the-end will actually be out of bounds. Also, self.len() won't90// have changed after inserting, so `index == self.len()` is the same as it was before inserting.91if old_dtype.is_some() && index == self.len() {92index -= 1;93}94self.fields.move_index(old_index, index);95Ok(old_dtype)96}9798/// Get a reference to the dtype of the field named `name`, or `None` if the field doesn't exist.99pub fn get(&self, name: &str) -> Option<&D> {100self.fields.get(name)101}102103/// Get a mutable reference to the dtype of the field named `name`, or `None` if the field doesn't exist.104pub fn get_mut(&mut self, name: &str) -> Option<&mut D> {105self.fields.get_mut(name)106}107108/// Get a reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist.109pub fn try_get(&self, name: &str) -> PolarsResult<&D> {110self.get(name)111.ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name))112}113114/// Get a mutable reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist.115pub fn try_get_mut(&mut self, name: &str) -> PolarsResult<&mut D> {116self.fields117.get_mut(name)118.ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name))119}120121/// Return all data about the field named `name`: its index in the schema, its name, and its dtype.122///123/// Returns `Some((index, &name, &dtype))` if the field exists, `None` if it doesn't.124pub fn get_full(&self, name: &str) -> Option<(usize, &PlSmallStr, &D)> {125self.fields.get_full(name)126}127128/// Return all data about the field named `name`: its index in the schema, its name, and its dtype.129///130/// Returns `Ok((index, &name, &dtype))` if the field exists, `Err(PolarsErr)` if it doesn't.131pub fn try_get_full(&self, name: &str) -> PolarsResult<(usize, &PlSmallStr, &D)> {132self.fields133.get_full(name)134.ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name))135}136137/// Get references to the name and dtype of the field at `index`.138///139/// If `index` is inbounds, returns `Some((&name, &dtype))`, else `None`. See140/// [`get_at_index_mut`][Self::get_at_index_mut] for a mutable version.141pub fn get_at_index(&self, index: usize) -> Option<(&PlSmallStr, &D)> {142self.fields.get_index(index)143}144145pub fn try_get_at_index(&self, index: usize) -> PolarsResult<(&PlSmallStr, &D)> {146self.fields.get_index(index).ok_or_else(|| polars_err!(ComputeError: "index {index} out of bounds with 'schema' of len: {}", self.len()))147}148149/// Get mutable references to the name and dtype of the field at `index`.150///151/// If `index` is inbounds, returns `Some((&mut name, &mut dtype))`, else `None`. See152/// [`get_at_index`][Self::get_at_index] for an immutable version.153pub fn get_at_index_mut(&mut self, index: usize) -> Option<(&mut PlSmallStr, &mut D)> {154self.fields.get_index_mut2(index)155}156157/// Swap-remove a field by name and, if the field existed, return its dtype.158///159/// If the field does not exist, the schema is not modified and `None` is returned.160///161/// This method does a `swap_remove`, which is O(1) but **changes the order of the schema**: the field named `name`162/// is replaced by the last field, which takes its position. For a slower, but order-preserving, method, use163/// [`shift_remove`][Self::shift_remove].164pub fn remove(&mut self, name: &str) -> Option<D> {165self.fields.swap_remove(name)166}167168/// Remove a field by name, preserving order, and, if the field existed, return its dtype.169///170/// If the field does not exist, the schema is not modified and `None` is returned.171///172/// This method does a `shift_remove`, which preserves the order of the fields in the schema but **is O(n)**. For a173/// faster, but not order-preserving, method, use [`remove`][Self::remove].174pub fn shift_remove(&mut self, name: &str) -> Option<D> {175self.fields.shift_remove(name)176}177178/// Remove a field by name, preserving order, and, if the field existed, return its dtype.179///180/// If the field does not exist, the schema is not modified and `None` is returned.181///182/// This method does a `shift_remove`, which preserves the order of the fields in the schema but **is O(n)**. For a183/// faster, but not order-preserving, method, use [`remove`][Self::remove].184pub fn shift_remove_index(&mut self, index: usize) -> Option<(PlSmallStr, D)> {185self.fields.shift_remove_index(index)186}187188/// Whether the schema contains a field named `name`.189pub fn contains(&self, name: &str) -> bool {190self.get(name).is_some()191}192193/// Change the field named `name` to the given `dtype` and return the previous dtype.194///195/// If `name` doesn't already exist in the schema, the schema is not modified and `None` is returned. Otherwise196/// returns `Some(old_dtype)`.197///198/// This method only ever modifies an existing field and never adds a new field to the schema. To add a new field,199/// use [`with_column`][Self::with_column] or [`insert_at_index`][Self::insert_at_index].200pub fn set_dtype(&mut self, name: &str, dtype: D) -> Option<D> {201let old_dtype = self.fields.get_mut(name)?;202Some(std::mem::replace(old_dtype, dtype))203}204205/// Change the field at the given index to the given `dtype` and return the previous dtype.206///207/// If the index is out of bounds, the schema is not modified and `None` is returned. Otherwise returns208/// `Some(old_dtype)`.209///210/// This method only ever modifies an existing index and never adds a new field to the schema. To add a new field,211/// use [`with_column`][Self::with_column] or [`insert_at_index`][Self::insert_at_index].212pub fn set_dtype_at_index(&mut self, index: usize, dtype: D) -> Option<D> {213let (_, old_dtype) = self.fields.get_index_mut(index)?;214Some(std::mem::replace(old_dtype, dtype))215}216217/// Insert a column into the [`Schema`].218///219/// If the schema already has this column, this instead updates it with the new value and220/// returns the old one. Otherwise, the column is inserted at the end.221///222/// To enforce the index of the resulting field, use [`insert_at_index`][Self::insert_at_index].223pub fn with_column(&mut self, name: PlSmallStr, dtype: D) -> Option<D> {224self.fields.insert(name, dtype)225}226227/// Raises DuplicateError if this column already exists in the schema.228pub fn try_insert(&mut self, name: PlSmallStr, value: D) -> PolarsResult<()> {229if self.fields.contains_key(&name) {230polars_bail!(Duplicate: "column '{}' is duplicate", name)231}232233self.fields.insert(name, value);234235Ok(())236}237238/// Performs [`Schema::try_insert`] for every column.239///240/// Raises DuplicateError if a column already exists in the schema.241pub fn hstack_mut(242&mut self,243columns: impl IntoIterator<Item = impl Into<(PlSmallStr, D)>>,244) -> PolarsResult<()> {245for v in columns {246let (k, v) = v.into();247self.try_insert(k, v)?;248}249250Ok(())251}252253/// Performs [`Schema::try_insert`] for every column.254///255/// Raises DuplicateError if a column already exists in the schema.256pub fn hstack(257mut self,258columns: impl IntoIterator<Item = impl Into<(PlSmallStr, D)>>,259) -> PolarsResult<Self> {260self.hstack_mut(columns)?;261Ok(self)262}263264/// Merge `other` into `self`.265///266/// Merging logic:267/// - Fields that occur in `self` but not `other` are unmodified268/// - Fields that occur in `other` but not `self` are appended, in order, to the end of `self`269/// - Fields that occur in both `self` and `other` are updated with the dtype from `other`, but keep their original270/// index271pub fn merge(&mut self, other: Self) {272self.fields.extend(other.fields)273}274275/// Iterates over the `(&name, &dtype)` pairs in this schema.276///277/// For an owned version, use [`iter_fields`][Self::iter_fields], which clones the data to iterate owned `Field`s278pub fn iter(&self) -> impl ExactSizeIterator<Item = (&PlSmallStr, &D)> + '_ {279self.fields.iter()280}281282pub fn iter_mut(&mut self) -> impl ExactSizeIterator<Item = (&PlSmallStr, &mut D)> + '_ {283self.fields.iter_mut()284}285286/// Iterates over references to the names in this schema.287pub fn iter_names(&self) -> impl '_ + ExactSizeIterator<Item = &PlSmallStr> {288self.fields.iter().map(|(name, _dtype)| name)289}290291pub fn iter_names_cloned(&self) -> impl '_ + ExactSizeIterator<Item = PlSmallStr> {292self.iter_names().cloned()293}294295/// Iterates over references to the dtypes in this schema.296pub fn iter_values(&self) -> impl '_ + ExactSizeIterator<Item = &D> {297self.fields.iter().map(|(_name, dtype)| dtype)298}299300pub fn into_iter_values(self) -> impl ExactSizeIterator<Item = D> {301self.fields.into_values()302}303304/// Iterates over mut references to the dtypes in this schema.305pub fn iter_values_mut(&mut self) -> impl '_ + ExactSizeIterator<Item = &mut D> {306self.fields.iter_mut().map(|(_name, dtype)| dtype)307}308309pub fn index_of(&self, name: &str) -> Option<usize> {310self.fields.get_index_of(name)311}312313pub fn try_index_of(&self, name: &str) -> PolarsResult<usize> {314let Some(i) = self.fields.get_index_of(name) else {315polars_bail!(316ColumnNotFound:317"unable to find column {:?}; valid columns: {:?}",318name, self.iter_names().collect::<Vec<_>>(),319)320};321322Ok(i)323}324325/// Compare the fields between two schema returning the additional columns that each schema has.326pub fn field_compare<'a, 'b>(327&'a self,328other: &'b Self,329self_extra: &mut Vec<(usize, (&'a PlSmallStr, &'a D))>,330other_extra: &mut Vec<(usize, (&'b PlSmallStr, &'b D))>,331) {332self_extra.extend(333self.iter()334.enumerate()335.filter(|(_, (n, _))| !other.contains(n)),336);337other_extra.extend(338other339.iter()340.enumerate()341.filter(|(_, (n, _))| !self.contains(n)),342);343}344}345346impl<D> Schema<D>347where348D: Clone + Default,349{350/// Create a new schema from this one, inserting a field with `name` and `dtype` at the given `index`.351///352/// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is353/// always moved to the given index. Valid indices range from `0` (front of the schema) to `self.len()` (after the354/// end of the schema).355///356/// For a mutating version that doesn't clone, see [`insert_at_index`][Self::insert_at_index].357///358/// Runtime: **O(m * n)** where `m` is the (average) length of the field names and `n` is the number of fields in359/// the schema. This method clones every field in the schema.360///361/// Returns: `Ok(new_schema)` if `index <= self.len()`, else `Err(PolarsError)`362pub fn new_inserting_at_index(363&self,364index: usize,365name: PlSmallStr,366field: D,367) -> PolarsResult<Self> {368polars_ensure!(369index <= self.len(),370OutOfBounds:371"index {} is out of bounds for schema with length {} (the max index allowed is self.len())",372index,373self.len()374);375376let mut new = Self::default();377let mut iter = self.fields.iter().filter_map(|(fld_name, dtype)| {378(fld_name != &name).then_some((fld_name.clone(), dtype.clone()))379});380new.fields.extend(iter.by_ref().take(index));381new.fields.insert(name.clone(), field);382new.fields.extend(iter);383Ok(new)384}385386/// Merge borrowed `other` into `self`.387///388/// Merging logic:389/// - Fields that occur in `self` but not `other` are unmodified390/// - Fields that occur in `other` but not `self` are appended, in order, to the end of `self`391/// - Fields that occur in both `self` and `other` are updated with the dtype from `other`, but keep their original392/// index393pub fn merge_from_ref(&mut self, other: &Self) {394self.fields.extend(395other396.iter()397.map(|(column, field)| (column.clone(), field.clone())),398)399}400401/// Generates another schema with just the specified columns selected from this one.402pub fn try_project<I>(&self, columns: I) -> PolarsResult<Self>403where404I: IntoIterator,405I::Item: AsRef<str>,406{407let schema = columns408.into_iter()409.map(|c| {410let name = c.as_ref();411let (_, name, dtype) = self412.fields413.get_full(name)414.ok_or_else(|| polars_err!(col_not_found = name))?;415PolarsResult::Ok((name.clone(), dtype.clone()))416})417.collect::<PolarsResult<PlIndexMap<PlSmallStr, _>>>()?;418Ok(Self::from(schema))419}420421pub fn try_project_indices(&self, indices: &[usize]) -> PolarsResult<Self> {422let fields = indices423.iter()424.map(|&i| {425let Some((k, v)) = self.fields.get_index(i) else {426polars_bail!(427SchemaFieldNotFound:428"projection index {} is out of bounds for schema of length {}",429i, self.fields.len()430);431};432433Ok((k.clone(), v.clone()))434})435.collect::<PolarsResult<PlIndexMap<_, _>>>()?;436437Ok(Self { fields })438}439440/// Returns a new [`Schema`] with a subset of all fields whose `predicate`441/// evaluates to true.442pub fn filter<F: Fn(usize, &D) -> bool>(self, predicate: F) -> Self {443let fields = self444.fields445.into_iter()446.enumerate()447.filter_map(|(index, (name, d))| {448if (predicate)(index, &d) {449Some((name, d))450} else {451None452}453})454.collect();455456Self { fields }457}458459pub fn from_iter_check_duplicates<I, F>(iter: I) -> PolarsResult<Self>460where461I: IntoIterator<Item = F>,462F: Into<(PlSmallStr, D)>,463{464let iter = iter.into_iter();465let mut slf = Self::with_capacity(iter.size_hint().1.unwrap_or(0));466467for v in iter {468let (name, d) = v.into();469470if slf.contains(&name) {471return Err(err_msg(&name));472473fn err_msg(name: &str) -> PolarsError {474polars_err!(Duplicate: "duplicate name when building schema '{}'", &name)475}476}477478slf.fields.insert(name, d);479}480481Ok(slf)482}483}484485pub fn ensure_matching_schema_names<D>(lhs: &Schema<D>, rhs: &Schema<D>) -> PolarsResult<()> {486let lhs_names = lhs.iter_names();487let rhs_names = rhs.iter_names();488489if !(lhs_names.len() == rhs_names.len() && lhs_names.zip(rhs_names).all(|(l, r)| l == r)) {490polars_bail!(491SchemaMismatch:492"lhs: {:?} rhs: {:?}",493lhs.iter_names().collect::<Vec<_>>(), rhs.iter_names().collect::<Vec<_>>()494)495}496497Ok(())498}499500impl<D: Debug> Debug for Schema<D> {501fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {502writeln!(f, "Schema:")?;503for (name, field) in self.fields.iter() {504writeln!(f, "name: {name}, field: {field:?}")?;505}506Ok(())507}508}509510impl<D: Hash> Hash for Schema<D> {511fn hash<H: Hasher>(&self, state: &mut H) {512self.fields.iter().for_each(|v| v.hash(state))513}514}515516// Schemas will only compare equal if they have the same fields in the same order. We can't use `self.inner ==517// other.inner` because [`IndexMap`] ignores order when checking equality, but we don't want to ignore it.518impl<D: PartialEq> PartialEq for Schema<D> {519fn eq(&self, other: &Self) -> bool {520self.fields.len() == other.fields.len()521&& self522.fields523.iter()524.zip(other.fields.iter())525.all(|(a, b)| a == b)526}527}528529impl<D> From<PlIndexMap<PlSmallStr, D>> for Schema<D> {530fn from(fields: PlIndexMap<PlSmallStr, D>) -> Self {531Self { fields }532}533}534535impl<F, D> FromIterator<F> for Schema<D>536where537F: Into<(PlSmallStr, D)>,538{539fn from_iter<I: IntoIterator<Item = F>>(iter: I) -> Self {540let fields = PlIndexMap::from_iter(iter.into_iter().map(|x| x.into()));541Self { fields }542}543}544545impl<F, D> Extend<F> for Schema<D>546where547F: Into<(PlSmallStr, D)>,548{549fn extend<T: IntoIterator<Item = F>>(&mut self, iter: T) {550self.fields.extend(iter.into_iter().map(|x| x.into()))551}552}553554impl<D> IntoIterator for Schema<D> {555type IntoIter = <PlIndexMap<PlSmallStr, D> as IntoIterator>::IntoIter;556type Item = (PlSmallStr, D);557558fn into_iter(self) -> Self::IntoIter {559self.fields.into_iter()560}561}562563564