Path: blob/main/crates/polars-arrow/src/array/utf8/mutable_values.rs
6939 views
use std::sync::Arc;12use polars_error::{PolarsResult, polars_bail};34use super::{MutableUtf8Array, StrAsBytes, Utf8Array};5use crate::array::physical_binary::*;6use crate::array::specification::{try_check_offsets_bounds, try_check_utf8};7use crate::array::{Array, ArrayValuesIter, MutableArray, TryExtend, TryExtendFromSelf, TryPush};8use crate::bitmap::MutableBitmap;9use crate::datatypes::ArrowDataType;10use crate::offset::{Offset, Offsets};11use crate::trusted_len::TrustedLen;1213/// A [`MutableArray`] that builds a [`Utf8Array`]. It differs14/// from [`MutableUtf8Array`] in that it builds non-null [`Utf8Array`].15#[derive(Debug, Clone)]16pub struct MutableUtf8ValuesArray<O: Offset> {17dtype: ArrowDataType,18offsets: Offsets<O>,19values: Vec<u8>,20}2122impl<O: Offset> From<MutableUtf8ValuesArray<O>> for Utf8Array<O> {23fn from(other: MutableUtf8ValuesArray<O>) -> Self {24// SAFETY:25// `MutableUtf8ValuesArray` has the same invariants as `Utf8Array` and thus26// `Utf8Array` can be safely created from `MutableUtf8ValuesArray` without checks.27unsafe {28Utf8Array::<O>::new_unchecked(29other.dtype,30other.offsets.into(),31other.values.into(),32None,33)34}35}36}3738impl<O: Offset> From<MutableUtf8ValuesArray<O>> for MutableUtf8Array<O> {39fn from(other: MutableUtf8ValuesArray<O>) -> Self {40// SAFETY:41// `MutableUtf8ValuesArray` has the same invariants as `MutableUtf8Array`42unsafe {43MutableUtf8Array::<O>::new_unchecked(other.dtype, other.offsets, other.values, None)44}45}46}4748impl<O: Offset> Default for MutableUtf8ValuesArray<O> {49fn default() -> Self {50Self::new()51}52}5354impl<O: Offset> MutableUtf8ValuesArray<O> {55/// Returns an empty [`MutableUtf8ValuesArray`].56pub fn new() -> Self {57Self {58dtype: Self::default_dtype(),59offsets: Offsets::new(),60values: Vec::<u8>::new(),61}62}6364/// Returns a [`MutableUtf8ValuesArray`] created from its internal representation.65///66/// # Errors67/// This function returns an error iff:68/// * `offsets.last()` is greater than `values.len()`.69/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.70/// * The `values` between two consecutive `offsets` are not valid utf871/// # Implementation72/// This function is `O(N)` - checking utf8 is `O(N)`73pub fn try_new(74dtype: ArrowDataType,75offsets: Offsets<O>,76values: Vec<u8>,77) -> PolarsResult<Self> {78try_check_utf8(&offsets, &values)?;79if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {80polars_bail!(ComputeError: "MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8")81}8283Ok(Self {84dtype,85offsets,86values,87})88}8990/// Returns a [`MutableUtf8ValuesArray`] created from its internal representation.91///92/// # Panic93/// This function does not panic iff:94/// * `offsets.last()` is greater than `values.len()`95/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is equal to either `Utf8` or `LargeUtf8`.96///97/// # Safety98/// This function is safe iff:99/// * the offsets are monotonically increasing100/// * The `values` between two consecutive `offsets` are not valid utf8101/// # Implementation102/// This function is `O(1)`103pub unsafe fn new_unchecked(104dtype: ArrowDataType,105offsets: Offsets<O>,106values: Vec<u8>,107) -> Self {108try_check_offsets_bounds(&offsets, values.len())109.expect("The length of the values must be equal to the last offset value");110111if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {112panic!(113"MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8"114)115}116117Self {118dtype,119offsets,120values,121}122}123124/// Returns the default [`ArrowDataType`] of this container: [`ArrowDataType::Utf8`] or [`ArrowDataType::LargeUtf8`]125/// depending on the generic [`Offset`].126pub fn default_dtype() -> ArrowDataType {127Utf8Array::<O>::default_dtype()128}129130/// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items.131pub fn with_capacity(capacity: usize) -> Self {132Self::with_capacities(capacity, 0)133}134135/// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items and values.136pub fn with_capacities(capacity: usize, values: usize) -> Self {137Self {138dtype: Self::default_dtype(),139offsets: Offsets::<O>::with_capacity(capacity),140values: Vec::<u8>::with_capacity(values),141}142}143144/// returns its values.145#[inline]146pub fn values(&self) -> &Vec<u8> {147&self.values148}149150/// returns its offsets.151#[inline]152pub fn offsets(&self) -> &Offsets<O> {153&self.offsets154}155156/// Reserves `additional` elements and `additional_values` on the values.157#[inline]158pub fn reserve(&mut self, additional: usize, additional_values: usize) {159self.offsets.reserve(additional + 1);160self.values.reserve(additional_values);161}162163/// Returns the capacity in number of items164pub fn capacity(&self) -> usize {165self.offsets.capacity()166}167168/// Returns the length of this array169#[inline]170pub fn len(&self) -> usize {171self.offsets.len_proxy()172}173174/// Pushes a new item to the array.175/// # Panic176/// This operation panics iff the length of all values (in bytes) exceeds `O` maximum value.177#[inline]178pub fn push<T: AsRef<str>>(&mut self, value: T) {179self.try_push(value).unwrap()180}181182/// Pop the last entry from [`MutableUtf8ValuesArray`].183/// This function returns `None` iff this array is empty.184pub fn pop(&mut self) -> Option<String> {185if self.len() == 0 {186return None;187}188self.offsets.pop()?;189let start = self.offsets.last().to_usize();190let value = self.values.split_off(start);191// SAFETY: utf8 is validated on initialization192Some(unsafe { String::from_utf8_unchecked(value) })193}194195/// Returns the value of the element at index `i`.196/// # Panic197/// This function panics iff `i >= self.len`.198#[inline]199pub fn value(&self, i: usize) -> &str {200assert!(i < self.len());201unsafe { self.value_unchecked(i) }202}203204/// Returns the value of the element at index `i`.205///206/// # Safety207/// This function is safe iff `i < self.len`.208#[inline]209pub unsafe fn value_unchecked(&self, i: usize) -> &str {210// soundness: the invariant of the function211let (start, end) = self.offsets.start_end(i);212213// soundness: the invariant of the struct214let slice = self.values.get_unchecked(start..end);215216// soundness: the invariant of the struct217std::str::from_utf8_unchecked(slice)218}219220/// Returns an iterator of `&str`221pub fn iter(&self) -> ArrayValuesIter<'_, Self> {222ArrayValuesIter::new(self)223}224225/// Shrinks the capacity of the [`MutableUtf8ValuesArray`] to fit its current length.226pub fn shrink_to_fit(&mut self) {227self.values.shrink_to_fit();228self.offsets.shrink_to_fit();229}230231/// Extract the low-end APIs from the [`MutableUtf8ValuesArray`].232pub fn into_inner(self) -> (ArrowDataType, Offsets<O>, Vec<u8>) {233(self.dtype, self.offsets, self.values)234}235}236237impl<O: Offset> MutableArray for MutableUtf8ValuesArray<O> {238fn len(&self) -> usize {239self.len()240}241242fn validity(&self) -> Option<&MutableBitmap> {243None244}245246fn as_box(&mut self) -> Box<dyn Array> {247let array: Utf8Array<O> = std::mem::take(self).into();248array.boxed()249}250251fn as_arc(&mut self) -> Arc<dyn Array> {252let array: Utf8Array<O> = std::mem::take(self).into();253array.arced()254}255256fn dtype(&self) -> &ArrowDataType {257&self.dtype258}259260fn as_any(&self) -> &dyn std::any::Any {261self262}263264fn as_mut_any(&mut self) -> &mut dyn std::any::Any {265self266}267268#[inline]269fn push_null(&mut self) {270self.push::<&str>("")271}272273fn reserve(&mut self, additional: usize) {274self.reserve(additional, 0)275}276277fn shrink_to_fit(&mut self) {278self.shrink_to_fit()279}280}281282impl<O: Offset, P: AsRef<str>> FromIterator<P> for MutableUtf8ValuesArray<O> {283fn from_iter<I: IntoIterator<Item = P>>(iter: I) -> Self {284let (offsets, values) = values_iter(iter.into_iter().map(StrAsBytes));285// soundness: T: AsRef<str> and offsets are monotonically increasing286unsafe { Self::new_unchecked(Self::default_dtype(), offsets, values) }287}288}289290impl<O: Offset> MutableUtf8ValuesArray<O> {291pub(crate) unsafe fn extend_from_trusted_len_iter<I, P>(292&mut self,293validity: &mut MutableBitmap,294iterator: I,295) where296P: AsRef<str>,297I: Iterator<Item = Option<P>>,298{299let iterator = iterator.map(|x| x.map(StrAsBytes));300extend_from_trusted_len_iter(&mut self.offsets, &mut self.values, validity, iterator);301}302303/// Extends the [`MutableUtf8ValuesArray`] from a [`TrustedLen`]304#[inline]305pub fn extend_trusted_len<I, P>(&mut self, iterator: I)306where307P: AsRef<str>,308I: TrustedLen<Item = P>,309{310unsafe { self.extend_trusted_len_unchecked(iterator) }311}312313/// Extends [`MutableUtf8ValuesArray`] from an iterator of trusted len.314///315/// # Safety316/// The iterator must be trusted len.317#[inline]318pub unsafe fn extend_trusted_len_unchecked<I, P>(&mut self, iterator: I)319where320P: AsRef<str>,321I: Iterator<Item = P>,322{323let iterator = iterator.map(StrAsBytes);324extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator);325}326327/// Creates a [`MutableUtf8ValuesArray`] from a [`TrustedLen`]328#[inline]329pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self330where331P: AsRef<str>,332I: TrustedLen<Item = P>,333{334// soundness: I is `TrustedLen`335unsafe { Self::from_trusted_len_iter_unchecked(iterator) }336}337338/// Returns a new [`MutableUtf8ValuesArray`] from an iterator of trusted length.339///340/// # Safety341/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).342/// I.e. that `size_hint().1` correctly reports its length.343#[inline]344pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self345where346P: AsRef<str>,347I: Iterator<Item = P>,348{349let iterator = iterator.map(StrAsBytes);350let (offsets, values) = trusted_len_values_iter(iterator);351352// soundness: P is `str` and offsets are monotonically increasing353Self::new_unchecked(Self::default_dtype(), offsets, values)354}355356/// Returns a new [`MutableUtf8ValuesArray`] from an iterator.357/// # Error358/// This operation errors iff the total length in bytes on the iterator exceeds `O`'s maximum value.359/// (`i32::MAX` or `i64::MAX` respectively).360pub fn try_from_iter<P: AsRef<str>, I: IntoIterator<Item = P>>(iter: I) -> PolarsResult<Self> {361let iterator = iter.into_iter();362let (lower, _) = iterator.size_hint();363let mut array = Self::with_capacity(lower);364for item in iterator {365array.try_push(item)?;366}367Ok(array)368}369370/// Extend with a fallible iterator371pub fn extend_fallible<T, I, E>(&mut self, iter: I) -> std::result::Result<(), E>372where373E: std::error::Error,374I: IntoIterator<Item = std::result::Result<T, E>>,375T: AsRef<str>,376{377let mut iter = iter.into_iter();378self.reserve(iter.size_hint().0, 0);379iter.try_for_each(|x| {380self.push(x?);381Ok(())382})383}384}385386impl<O: Offset, T: AsRef<str>> Extend<T> for MutableUtf8ValuesArray<O> {387fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {388extend_from_values_iter(389&mut self.offsets,390&mut self.values,391iter.into_iter().map(StrAsBytes),392);393}394}395396impl<O: Offset, T: AsRef<str>> TryExtend<T> for MutableUtf8ValuesArray<O> {397fn try_extend<I: IntoIterator<Item = T>>(&mut self, iter: I) -> PolarsResult<()> {398let mut iter = iter.into_iter();399self.reserve(iter.size_hint().0, 0);400iter.try_for_each(|x| self.try_push(x))401}402}403404impl<O: Offset, T: AsRef<str>> TryPush<T> for MutableUtf8ValuesArray<O> {405#[inline]406fn try_push(&mut self, value: T) -> PolarsResult<()> {407let bytes = value.as_ref().as_bytes();408self.values.extend_from_slice(bytes);409self.offsets.try_push(bytes.len())410}411}412413impl<O: Offset> TryExtendFromSelf for MutableUtf8ValuesArray<O> {414fn try_extend_from_self(&mut self, other: &Self) -> PolarsResult<()> {415self.values.extend_from_slice(&other.values);416self.offsets.try_extend_from_self(&other.offsets)417}418}419420421