Path: blob/main/crates/polars-arrow/src/array/utf8/mutable.rs
6939 views
use std::sync::Arc;12use polars_error::{PolarsResult, polars_bail};34use super::{MutableUtf8ValuesArray, MutableUtf8ValuesIter, StrAsBytes, Utf8Array};5use crate::array::physical_binary::*;6use crate::array::{Array, MutableArray, TryExtend, TryExtendFromSelf, TryPush};7use crate::bitmap::utils::{BitmapIter, ZipValidity};8use crate::bitmap::{Bitmap, MutableBitmap};9use crate::datatypes::ArrowDataType;10use crate::offset::{Offset, Offsets};11use crate::trusted_len::TrustedLen;1213/// A [`MutableArray`] that builds a [`Utf8Array`]. It differs14/// from [`MutableUtf8ValuesArray`] in that it can build nullable [`Utf8Array`]s.15#[derive(Debug, Clone)]16pub struct MutableUtf8Array<O: Offset> {17values: MutableUtf8ValuesArray<O>,18validity: Option<MutableBitmap>,19}2021impl<O: Offset> From<MutableUtf8Array<O>> for Utf8Array<O> {22fn from(other: MutableUtf8Array<O>) -> Self {23let validity = other.validity.and_then(|x| {24let validity: Option<Bitmap> = x.into();25validity26});27let array: Utf8Array<O> = other.values.into();28array.with_validity(validity)29}30}3132impl<O: Offset> Default for MutableUtf8Array<O> {33fn default() -> Self {34Self::new()35}36}3738impl<O: Offset> MutableUtf8Array<O> {39/// Initializes a new empty [`MutableUtf8Array`].40pub fn new() -> Self {41Self {42values: Default::default(),43validity: None,44}45}4647/// Returns a [`MutableUtf8Array`] created from its internal representation.48///49/// # Errors50/// This function returns an error iff:51/// * The last offset is not equal to the values' length.52/// * the validity's length is not equal to `offsets.len()`.53/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.54/// * The `values` between two consecutive `offsets` are not valid utf855/// # Implementation56/// This function is `O(N)` - checking utf8 is `O(N)`57pub fn try_new(58dtype: ArrowDataType,59offsets: Offsets<O>,60values: Vec<u8>,61validity: Option<MutableBitmap>,62) -> PolarsResult<Self> {63let values = MutableUtf8ValuesArray::try_new(dtype, offsets, values)?;6465if validity66.as_ref()67.is_some_and(|validity| validity.len() != values.len())68{69polars_bail!(ComputeError: "validity's length must be equal to the number of values")70}7172Ok(Self { values, validity })73}7475/// Create a [`MutableUtf8Array`] out of low-end APIs.76///77/// # Safety78/// The caller must ensure that every value between offsets is a valid utf8.79/// # Panics80/// This function panics iff:81/// * The `offsets` and `values` are inconsistent82/// * The validity is not `None` and its length is different from `offsets`'s length minus one.83pub unsafe fn new_unchecked(84dtype: ArrowDataType,85offsets: Offsets<O>,86values: Vec<u8>,87validity: Option<MutableBitmap>,88) -> Self {89let values = MutableUtf8ValuesArray::new_unchecked(dtype, offsets, values);90if let Some(ref validity) = validity {91assert_eq!(values.len(), validity.len());92}93Self { values, validity }94}9596/// Creates a new [`MutableUtf8Array`] from a slice of optional `&[u8]`.97// Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.98pub fn from<T: AsRef<str>, P: AsRef<[Option<T>]>>(slice: P) -> Self {99Self::from_trusted_len_iter(slice.as_ref().iter().map(|x| x.as_ref()))100}101102fn default_dtype() -> ArrowDataType {103Utf8Array::<O>::default_dtype()104}105106/// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots.107pub fn with_capacity(capacity: usize) -> Self {108Self::with_capacities(capacity, 0)109}110111/// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots and values.112pub fn with_capacities(capacity: usize, values: usize) -> Self {113Self {114values: MutableUtf8ValuesArray::with_capacities(capacity, values),115validity: None,116}117}118119/// Reserves `additional` elements and `additional_values` on the values buffer.120pub fn reserve(&mut self, additional: usize, additional_values: usize) {121self.values.reserve(additional, additional_values);122if let Some(x) = self.validity.as_mut() {123x.reserve(additional)124}125}126127/// Reserves `additional` elements and `additional_values` on the values buffer.128pub fn capacity(&self) -> usize {129self.values.capacity()130}131132/// Returns the length of this array133#[inline]134pub fn len(&self) -> usize {135self.values.len()136}137138/// Pushes a new element to the array.139/// # Panic140/// This operation panics iff the length of all values (in bytes) exceeds `O` maximum value.141#[inline]142pub fn push<T: AsRef<str>>(&mut self, value: Option<T>) {143self.try_push(value).unwrap()144}145146/// Returns the value of the element at index `i`, ignoring the array's validity.147#[inline]148pub fn value(&self, i: usize) -> &str {149self.values.value(i)150}151152/// Returns the value of the element at index `i`, ignoring the array's validity.153///154/// # Safety155/// This function is safe iff `i < self.len`.156#[inline]157pub unsafe fn value_unchecked(&self, i: usize) -> &str {158self.values.value_unchecked(i)159}160161/// Pop the last entry from [`MutableUtf8Array`].162/// This function returns `None` iff this array is empty.163pub fn pop(&mut self) -> Option<String> {164let value = self.values.pop()?;165self.validity166.as_mut()167.map(|x| x.pop()?.then(|| ()))168.unwrap_or_else(|| Some(()))169.map(|_| value)170}171172fn init_validity(&mut self) {173let mut validity = MutableBitmap::with_capacity(self.values.capacity());174validity.extend_constant(self.len(), true);175validity.set(self.len() - 1, false);176self.validity = Some(validity);177}178179/// Returns an iterator of `Option<&str>`180pub fn iter(&self) -> ZipValidity<&str, MutableUtf8ValuesIter<'_, O>, BitmapIter<'_>> {181ZipValidity::new(self.values_iter(), self.validity.as_ref().map(|x| x.iter()))182}183184/// Converts itself into an [`Array`].185pub fn into_arc(self) -> Arc<dyn Array> {186let a: Utf8Array<O> = self.into();187Arc::new(a)188}189190/// Shrinks the capacity of the [`MutableUtf8Array`] to fit its current length.191pub fn shrink_to_fit(&mut self) {192self.values.shrink_to_fit();193if let Some(validity) = &mut self.validity {194validity.shrink_to_fit()195}196}197198/// Extract the low-end APIs from the [`MutableUtf8Array`].199pub fn into_data(self) -> (ArrowDataType, Offsets<O>, Vec<u8>, Option<MutableBitmap>) {200let (dtype, offsets, values) = self.values.into_inner();201(dtype, offsets, values, self.validity)202}203204/// Returns an iterator of `&str`205pub fn values_iter(&self) -> MutableUtf8ValuesIter<'_, O> {206self.values.iter()207}208209/// Sets the validity.210/// # Panic211/// Panics iff the validity's len is not equal to the existing values' length.212pub fn set_validity(&mut self, validity: Option<MutableBitmap>) {213if let Some(validity) = &validity {214assert_eq!(self.values.len(), validity.len())215}216self.validity = validity;217}218219/// Applies a function `f` to the validity of this array.220///221/// This is an API to leverage clone-on-write222/// # Panics223/// This function panics if the function `f` modifies the length of the [`Bitmap`].224pub fn apply_validity<F: FnOnce(MutableBitmap) -> MutableBitmap>(&mut self, f: F) {225if let Some(validity) = std::mem::take(&mut self.validity) {226self.set_validity(Some(f(validity)))227}228}229}230231impl<O: Offset> MutableUtf8Array<O> {232/// returns its values.233pub fn values(&self) -> &Vec<u8> {234self.values.values()235}236237/// returns its offsets.238pub fn offsets(&self) -> &Offsets<O> {239self.values.offsets()240}241}242243impl<O: Offset> MutableArray for MutableUtf8Array<O> {244fn len(&self) -> usize {245self.len()246}247248fn validity(&self) -> Option<&MutableBitmap> {249self.validity.as_ref()250}251252fn as_box(&mut self) -> Box<dyn Array> {253let array: Utf8Array<O> = std::mem::take(self).into();254array.boxed()255}256257fn as_arc(&mut self) -> Arc<dyn Array> {258let array: Utf8Array<O> = std::mem::take(self).into();259array.arced()260}261262fn dtype(&self) -> &ArrowDataType {263if O::IS_LARGE {264&ArrowDataType::LargeUtf8265} else {266&ArrowDataType::Utf8267}268}269270fn as_any(&self) -> &dyn std::any::Any {271self272}273274fn as_mut_any(&mut self) -> &mut dyn std::any::Any {275self276}277278#[inline]279fn push_null(&mut self) {280self.push::<&str>(None)281}282283fn reserve(&mut self, additional: usize) {284self.reserve(additional, 0)285}286287fn shrink_to_fit(&mut self) {288self.shrink_to_fit()289}290}291292impl<O: Offset, P: AsRef<str>> FromIterator<Option<P>> for MutableUtf8Array<O> {293fn from_iter<I: IntoIterator<Item = Option<P>>>(iter: I) -> Self {294Self::try_from_iter(iter).unwrap()295}296}297298impl<O: Offset> MutableUtf8Array<O> {299/// Extends the [`MutableUtf8Array`] from an iterator of values of trusted len.300/// This differs from `extended_trusted_len` which accepts iterator of optional values.301#[inline]302pub fn extend_trusted_len_values<I, P>(&mut self, iterator: I)303where304P: AsRef<str>,305I: TrustedLen<Item = P>,306{307unsafe { self.extend_trusted_len_values_unchecked(iterator) }308}309310/// Extends the [`MutableUtf8Array`] from an iterator of values.311/// This differs from `extended_trusted_len` which accepts iterator of optional values.312#[inline]313pub fn extend_values<I, P>(&mut self, iterator: I)314where315P: AsRef<str>,316I: Iterator<Item = P>,317{318let length = self.values.len();319self.values.extend(iterator);320let additional = self.values.len() - length;321322if let Some(validity) = self.validity.as_mut() {323validity.extend_constant(additional, true);324}325}326327/// Extends the [`MutableUtf8Array`] from an iterator of values of trusted len.328/// This differs from `extended_trusted_len_unchecked` which accepts iterator of optional329/// values.330///331/// # Safety332/// The iterator must be trusted len.333#[inline]334pub unsafe fn extend_trusted_len_values_unchecked<I, P>(&mut self, iterator: I)335where336P: AsRef<str>,337I: Iterator<Item = P>,338{339let length = self.values.len();340self.values.extend_trusted_len_unchecked(iterator);341let additional = self.values.len() - length;342343if let Some(validity) = self.validity.as_mut() {344validity.extend_constant(additional, true);345}346}347348/// Extends the [`MutableUtf8Array`] from an iterator of trusted len.349#[inline]350pub fn extend_trusted_len<I, P>(&mut self, iterator: I)351where352P: AsRef<str>,353I: TrustedLen<Item = Option<P>>,354{355unsafe { self.extend_trusted_len_unchecked(iterator) }356}357358/// Extends [`MutableUtf8Array`] from an iterator of trusted len.359///360/// # Safety361/// The iterator must be trusted len.362#[inline]363pub unsafe fn extend_trusted_len_unchecked<I, P>(&mut self, iterator: I)364where365P: AsRef<str>,366I: Iterator<Item = Option<P>>,367{368if self.validity.is_none() {369let mut validity = MutableBitmap::new();370validity.extend_constant(self.len(), true);371self.validity = Some(validity);372}373374self.values375.extend_from_trusted_len_iter(self.validity.as_mut().unwrap(), iterator);376}377378/// Creates a [`MutableUtf8Array`] from an iterator of trusted length.379///380/// # Safety381/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).382/// I.e. that `size_hint().1` correctly reports its length.383#[inline]384pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self385where386P: AsRef<str>,387I: Iterator<Item = Option<P>>,388{389let iterator = iterator.map(|x| x.map(StrAsBytes));390let (validity, offsets, values) = trusted_len_unzip(iterator);391392// soundness: P is `str`393Self::new_unchecked(Self::default_dtype(), offsets, values, validity)394}395396/// Creates a [`MutableUtf8Array`] from an iterator of trusted length.397#[inline]398pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self399where400P: AsRef<str>,401I: TrustedLen<Item = Option<P>>,402{403// soundness: I is `TrustedLen`404unsafe { Self::from_trusted_len_iter_unchecked(iterator) }405}406407/// Creates a [`MutableUtf8Array`] from an iterator of trusted length of `&str`.408///409/// # Safety410/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).411/// I.e. that `size_hint().1` correctly reports its length.412#[inline]413pub unsafe fn from_trusted_len_values_iter_unchecked<T: AsRef<str>, I: Iterator<Item = T>>(414iterator: I,415) -> Self {416MutableUtf8ValuesArray::from_trusted_len_iter_unchecked(iterator).into()417}418419/// Creates a new [`MutableUtf8Array`] from a [`TrustedLen`] of `&str`.420#[inline]421pub fn from_trusted_len_values_iter<T: AsRef<str>, I: TrustedLen<Item = T>>(422iterator: I,423) -> Self {424// soundness: I is `TrustedLen`425unsafe { Self::from_trusted_len_values_iter_unchecked(iterator) }426}427428/// Creates a new [`MutableUtf8Array`] from an iterator.429/// # Error430/// This operation errors iff the total length in bytes on the iterator exceeds `O`'s maximum value.431/// (`i32::MAX` or `i64::MAX` respectively).432fn try_from_iter<P: AsRef<str>, I: IntoIterator<Item = Option<P>>>(433iter: I,434) -> PolarsResult<Self> {435let iterator = iter.into_iter();436let (lower, _) = iterator.size_hint();437let mut array = Self::with_capacity(lower);438for item in iterator {439array.try_push(item)?;440}441Ok(array)442}443444/// Creates a [`MutableUtf8Array`] from an falible iterator of trusted length.445///446/// # Safety447/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).448/// I.e. that `size_hint().1` correctly reports its length.449#[inline]450pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(451iterator: I,452) -> std::result::Result<Self, E>453where454P: AsRef<str>,455I: IntoIterator<Item = std::result::Result<Option<P>, E>>,456{457let iterator = iterator.into_iter();458459let iterator = iterator.map(|x| x.map(|x| x.map(StrAsBytes)));460let (validity, offsets, values) = try_trusted_len_unzip(iterator)?;461462// soundness: P is `str`463Ok(Self::new_unchecked(464Self::default_dtype(),465offsets,466values,467validity,468))469}470471/// Creates a [`MutableUtf8Array`] from an falible iterator of trusted length.472#[inline]473pub fn try_from_trusted_len_iter<E, I, P>(iterator: I) -> std::result::Result<Self, E>474where475P: AsRef<str>,476I: TrustedLen<Item = std::result::Result<Option<P>, E>>,477{478// soundness: I: TrustedLen479unsafe { Self::try_from_trusted_len_iter_unchecked(iterator) }480}481482/// Creates a new [`MutableUtf8Array`] from a [`Iterator`] of `&str`.483pub fn from_iter_values<T: AsRef<str>, I: Iterator<Item = T>>(iterator: I) -> Self {484MutableUtf8ValuesArray::from_iter(iterator).into()485}486487/// Extend with a fallible iterator488pub fn extend_fallible<T, I, E>(&mut self, iter: I) -> std::result::Result<(), E>489where490E: std::error::Error,491I: IntoIterator<Item = std::result::Result<Option<T>, E>>,492T: AsRef<str>,493{494let mut iter = iter.into_iter();495self.reserve(iter.size_hint().0, 0);496iter.try_for_each(|x| {497self.push(x?);498Ok(())499})500}501}502503impl<O: Offset, T: AsRef<str>> Extend<Option<T>> for MutableUtf8Array<O> {504fn extend<I: IntoIterator<Item = Option<T>>>(&mut self, iter: I) {505self.try_extend(iter).unwrap();506}507}508509impl<O: Offset, T: AsRef<str>> TryExtend<Option<T>> for MutableUtf8Array<O> {510fn try_extend<I: IntoIterator<Item = Option<T>>>(&mut self, iter: I) -> PolarsResult<()> {511let mut iter = iter.into_iter();512self.reserve(iter.size_hint().0, 0);513iter.try_for_each(|x| self.try_push(x))514}515}516517impl<O: Offset, T: AsRef<str>> TryPush<Option<T>> for MutableUtf8Array<O> {518#[inline]519fn try_push(&mut self, value: Option<T>) -> PolarsResult<()> {520match value {521Some(value) => {522self.values.try_push(value.as_ref())?;523524if let Some(validity) = &mut self.validity {525validity.push(true)526}527},528None => {529self.values.push("");530match &mut self.validity {531Some(validity) => validity.push(false),532None => self.init_validity(),533}534},535}536Ok(())537}538}539540impl<O: Offset> PartialEq for MutableUtf8Array<O> {541fn eq(&self, other: &Self) -> bool {542self.iter().eq(other.iter())543}544}545546impl<O: Offset> TryExtendFromSelf for MutableUtf8Array<O> {547fn try_extend_from_self(&mut self, other: &Self) -> PolarsResult<()> {548extend_validity(self.len(), &mut self.validity, &other.validity);549550self.values.try_extend_from_self(&other.values)551}552}553554555