Path: blob/main/crates/polars-arrow/src/array/utf8/mod.rs
6939 views
use either::Either;12use super::specification::try_check_utf8;3use super::{Array, GenericBinaryArray, Splitable};4use crate::array::BinaryArray;5use crate::array::iterator::NonNullValuesIter;6use crate::bitmap::Bitmap;7use crate::bitmap::utils::{BitmapIter, ZipValidity};8use crate::buffer::Buffer;9use crate::datatypes::ArrowDataType;10use crate::offset::{Offset, Offsets, OffsetsBuffer};11use crate::trusted_len::TrustedLen;1213mod ffi;14pub(super) mod fmt;15mod from;16mod iterator;17mod mutable;18mod mutable_values;19pub use iterator::*;20pub use mutable::*;21pub use mutable_values::MutableUtf8ValuesArray;22use polars_error::*;2324// Auxiliary struct to allow presenting &str as [u8] to a generic function25pub(super) struct StrAsBytes<P>(P);26impl<T: AsRef<str>> AsRef<[u8]> for StrAsBytes<T> {27#[inline(always)]28fn as_ref(&self) -> &[u8] {29self.0.as_ref().as_bytes()30}31}3233/// A [`Utf8Array`] is arrow's semantic equivalent of an immutable `Vec<Option<String>>`.34/// Cloning and slicing this struct is `O(1)`.35/// # Example36/// ```37/// use polars_arrow::bitmap::Bitmap;38/// use polars_arrow::buffer::Buffer;39/// use polars_arrow::array::Utf8Array;40/// # fn main() {41/// let array = Utf8Array::<i32>::from([Some("hi"), None, Some("there")]);42/// assert_eq!(array.value(0), "hi");43/// assert_eq!(array.iter().collect::<Vec<_>>(), vec![Some("hi"), None, Some("there")]);44/// assert_eq!(array.values_iter().collect::<Vec<_>>(), vec!["hi", "", "there"]);45/// // the underlying representation46/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true])));47/// assert_eq!(array.values(), &Buffer::from(b"hithere".to_vec()));48/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 2 + 5]));49/// # }50/// ```51///52/// # Generic parameter53/// The generic parameter [`Offset`] can only be `i32` or `i64` and tradeoffs maximum array length with54/// memory usage:55/// * the sum of lengths of all elements cannot exceed `Offset::MAX`56/// * the total size of the underlying data is `array.len() * size_of::<Offset>() + sum of lengths of all elements`57///58/// # Safety59/// The following invariants hold:60/// * Two consecutive `offsets` cast (`as`) to `usize` are valid slices of `values`.61/// * A slice of `values` taken from two consecutive `offsets` is valid `utf8`.62/// * `len` is equal to `validity.len()`, when defined.63#[derive(Clone)]64pub struct Utf8Array<O: Offset> {65dtype: ArrowDataType,66offsets: OffsetsBuffer<O>,67values: Buffer<u8>,68validity: Option<Bitmap>,69}7071// constructors72impl<O: Offset> Utf8Array<O> {73/// Returns a [`Utf8Array`] created from its internal representation.74///75/// # Errors76/// This function returns an error iff:77/// * The last offset is greater than the values' length.78/// * the validity's length is not equal to `offsets.len_proxy()`.79/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.80/// * The `values` between two consecutive `offsets` are not valid utf881/// # Implementation82/// This function is `O(N)` - checking utf8 is `O(N)`83pub fn try_new(84dtype: ArrowDataType,85offsets: OffsetsBuffer<O>,86values: Buffer<u8>,87validity: Option<Bitmap>,88) -> PolarsResult<Self> {89try_check_utf8(&offsets, &values)?;90if validity91.as_ref()92.is_some_and(|validity| validity.len() != offsets.len_proxy())93{94polars_bail!(ComputeError: "validity mask length must match the number of values");95}9697if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {98polars_bail!(ComputeError: "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8")99}100101Ok(Self {102dtype,103offsets,104values,105validity,106})107}108109/// Returns a [`Utf8Array`] from a slice of `&str`.110///111/// A convenience method that uses [`Self::from_trusted_len_values_iter`].112pub fn from_slice<T: AsRef<str>, P: AsRef<[T]>>(slice: P) -> Self {113Self::from_trusted_len_values_iter(slice.as_ref().iter())114}115116/// Returns a new [`Utf8Array`] from a slice of `&str`.117///118/// A convenience method that uses [`Self::from_trusted_len_iter`].119// Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.120pub fn from<T: AsRef<str>, P: AsRef<[Option<T>]>>(slice: P) -> Self {121MutableUtf8Array::<O>::from(slice).into()122}123124/// Returns an iterator of `Option<&str>`125pub fn iter(&self) -> ZipValidity<&str, Utf8ValuesIter<'_, O>, BitmapIter<'_>> {126ZipValidity::new_with_validity(self.values_iter(), self.validity())127}128129/// Returns an iterator of `&str`130pub fn values_iter(&self) -> Utf8ValuesIter<'_, O> {131Utf8ValuesIter::new(self)132}133134/// Returns an iterator of the non-null values `&str.135#[inline]136pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, Utf8Array<O>> {137NonNullValuesIter::new(self, self.validity())138}139140/// Returns the length of this array141#[inline]142pub fn len(&self) -> usize {143self.offsets.len_proxy()144}145146/// Returns the value of the element at index `i`, ignoring the array's validity.147/// # Panic148/// This function panics iff `i >= self.len`.149#[inline]150pub fn value(&self, i: usize) -> &str {151assert!(i < self.len());152unsafe { self.value_unchecked(i) }153}154155/// Returns the value of the element at index `i`, ignoring the array's validity.156///157/// # Safety158/// This function is safe iff `i < self.len`.159#[inline]160pub unsafe fn value_unchecked(&self, i: usize) -> &str {161// soundness: the invariant of the function162let (start, end) = self.offsets.start_end_unchecked(i);163164// soundness: the invariant of the struct165let slice = self.values.get_unchecked(start..end);166167// soundness: the invariant of the struct168std::str::from_utf8_unchecked(slice)169}170171/// Returns the element at index `i` or `None` if it is null172/// # Panics173/// iff `i >= self.len()`174#[inline]175pub fn get(&self, i: usize) -> Option<&str> {176if !self.is_null(i) {177// soundness: Array::is_null panics if i >= self.len178unsafe { Some(self.value_unchecked(i)) }179} else {180None181}182}183184/// Returns the [`ArrowDataType`] of this array.185#[inline]186pub fn dtype(&self) -> &ArrowDataType {187&self.dtype188}189190/// Returns the values of this [`Utf8Array`].191#[inline]192pub fn values(&self) -> &Buffer<u8> {193&self.values194}195196/// Returns the offsets of this [`Utf8Array`].197#[inline]198pub fn offsets(&self) -> &OffsetsBuffer<O> {199&self.offsets200}201202/// The optional validity.203#[inline]204pub fn validity(&self) -> Option<&Bitmap> {205self.validity.as_ref()206}207208/// Slices this [`Utf8Array`].209/// # Implementation210/// This function is `O(1)`.211/// # Panics212/// iff `offset + length > self.len()`.213pub fn slice(&mut self, offset: usize, length: usize) {214assert!(215offset + length <= self.len(),216"the offset of the new array cannot exceed the arrays' length"217);218unsafe { self.slice_unchecked(offset, length) }219}220221/// Slices this [`Utf8Array`].222/// # Implementation223/// This function is `O(1)`224///225/// # Safety226/// The caller must ensure that `offset + length <= self.len()`.227pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {228self.validity = self229.validity230.take()231.map(|bitmap| bitmap.sliced_unchecked(offset, length))232.filter(|bitmap| bitmap.unset_bits() > 0);233self.offsets.slice_unchecked(offset, length + 1);234}235236impl_sliced!();237impl_mut_validity!();238impl_into_array!();239240/// Returns its internal representation241#[must_use]242pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {243let Self {244dtype,245offsets,246values,247validity,248} = self;249(dtype, offsets, values, validity)250}251252/// Try to convert this `Utf8Array` to a `MutableUtf8Array`253#[must_use]254pub fn into_mut(self) -> Either<Self, MutableUtf8Array<O>> {255use Either::*;256if let Some(bitmap) = self.validity {257match bitmap.into_mut() {258// SAFETY: invariants are preserved259Left(bitmap) => Left(unsafe {260Utf8Array::new_unchecked(self.dtype, self.offsets, self.values, Some(bitmap))261}),262Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {263(Left(values), Left(offsets)) => {264// SAFETY: invariants are preserved265Left(unsafe {266Utf8Array::new_unchecked(267self.dtype,268offsets,269values,270Some(mutable_bitmap.into()),271)272})273},274(Left(values), Right(offsets)) => {275// SAFETY: invariants are preserved276Left(unsafe {277Utf8Array::new_unchecked(278self.dtype,279offsets.into(),280values,281Some(mutable_bitmap.into()),282)283})284},285(Right(values), Left(offsets)) => {286// SAFETY: invariants are preserved287Left(unsafe {288Utf8Array::new_unchecked(289self.dtype,290offsets,291values.into(),292Some(mutable_bitmap.into()),293)294})295},296(Right(values), Right(offsets)) => Right(unsafe {297MutableUtf8Array::new_unchecked(298self.dtype,299offsets,300values,301Some(mutable_bitmap),302)303}),304},305}306} else {307match (self.values.into_mut(), self.offsets.into_mut()) {308(Left(values), Left(offsets)) => {309Left(unsafe { Utf8Array::new_unchecked(self.dtype, offsets, values, None) })310},311(Left(values), Right(offsets)) => Left(unsafe {312Utf8Array::new_unchecked(self.dtype, offsets.into(), values, None)313}),314(Right(values), Left(offsets)) => Left(unsafe {315Utf8Array::new_unchecked(self.dtype, offsets, values.into(), None)316}),317(Right(values), Right(offsets)) => Right(unsafe {318MutableUtf8Array::new_unchecked(self.dtype, offsets, values, None)319}),320}321}322}323324/// Returns a new empty [`Utf8Array`].325///326/// The array is guaranteed to have no elements nor validity.327#[inline]328pub fn new_empty(dtype: ArrowDataType) -> Self {329unsafe { Self::new_unchecked(dtype, OffsetsBuffer::new(), Buffer::new(), None) }330}331332/// Returns a new [`Utf8Array`] whose all slots are null / `None`.333#[inline]334pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {335Self::new(336dtype,337Offsets::new_zeroed(length).into(),338Buffer::new(),339Some(Bitmap::new_zeroed(length)),340)341}342343/// Returns a default [`ArrowDataType`] of this array, which depends on the generic parameter `O`: `DataType::Utf8` or `DataType::LargeUtf8`344pub fn default_dtype() -> ArrowDataType {345if O::IS_LARGE {346ArrowDataType::LargeUtf8347} else {348ArrowDataType::Utf8349}350}351352/// Creates a new [`Utf8Array`] without checking for offsets monotinicity nor utf8-validity353///354/// # Panic355/// This function panics (in debug mode only) iff:356/// * The last offset is greater than the values' length.357/// * the validity's length is not equal to `offsets.len_proxy()`.358/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.359///360/// # Safety361/// This function is unsound iff:362/// * The `values` between two consecutive `offsets` are not valid utf8363/// # Implementation364/// This function is `O(1)`365pub unsafe fn new_unchecked(366dtype: ArrowDataType,367offsets: OffsetsBuffer<O>,368values: Buffer<u8>,369validity: Option<Bitmap>,370) -> Self {371debug_assert!(372offsets.last().to_usize() <= values.len(),373"offsets must not exceed the values length"374);375debug_assert!(376validity377.as_ref()378.is_none_or(|validity| validity.len() == offsets.len_proxy()),379"validity mask length must match the number of values"380);381debug_assert!(382dtype.to_physical_type() == Self::default_dtype().to_physical_type(),383"Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8"384);385386Self {387dtype,388offsets,389values,390validity,391}392}393394/// Creates a new [`Utf8Array`].395/// # Panics396/// This function panics iff:397/// * `offsets.last()` is greater than `values.len()`.398/// * the validity's length is not equal to `offsets.len_proxy()`.399/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.400/// * The `values` between two consecutive `offsets` are not valid utf8401/// # Implementation402/// This function is `O(N)` - checking utf8 is `O(N)`403pub fn new(404dtype: ArrowDataType,405offsets: OffsetsBuffer<O>,406values: Buffer<u8>,407validity: Option<Bitmap>,408) -> Self {409Self::try_new(dtype, offsets, values, validity).unwrap()410}411412/// Returns a (non-null) [`Utf8Array`] created from a [`TrustedLen`] of `&str`.413/// # Implementation414/// This function is `O(N)`415#[inline]416pub fn from_trusted_len_values_iter<T: AsRef<str>, I: TrustedLen<Item = T>>(417iterator: I,418) -> Self {419MutableUtf8Array::<O>::from_trusted_len_values_iter(iterator).into()420}421422/// Creates a new [`Utf8Array`] from a [`Iterator`] of `&str`.423pub fn from_iter_values<T: AsRef<str>, I: Iterator<Item = T>>(iterator: I) -> Self {424MutableUtf8Array::<O>::from_iter_values(iterator).into()425}426427/// Creates a [`Utf8Array`] from an iterator of trusted length.428///429/// # Safety430/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).431/// I.e. that `size_hint().1` correctly reports its length.432#[inline]433pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self434where435P: AsRef<str>,436I: Iterator<Item = Option<P>>,437{438MutableUtf8Array::<O>::from_trusted_len_iter_unchecked(iterator).into()439}440441/// Creates a [`Utf8Array`] from an iterator of trusted length.442#[inline]443pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self444where445P: AsRef<str>,446I: TrustedLen<Item = Option<P>>,447{448MutableUtf8Array::<O>::from_trusted_len_iter(iterator).into()449}450451/// Creates a [`Utf8Array`] from an falible iterator of trusted length.452///453/// # Safety454/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).455/// I.e. that `size_hint().1` correctly reports its length.456#[inline]457pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(458iterator: I,459) -> std::result::Result<Self, E>460where461P: AsRef<str>,462I: IntoIterator<Item = std::result::Result<Option<P>, E>>,463{464MutableUtf8Array::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())465}466467/// Creates a [`Utf8Array`] from an fallible iterator of trusted length.468#[inline]469pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> std::result::Result<Self, E>470where471P: AsRef<str>,472I: TrustedLen<Item = std::result::Result<Option<P>, E>>,473{474MutableUtf8Array::<O>::try_from_trusted_len_iter(iter).map(|x| x.into())475}476477/// Applies a function `f` to the validity of this array.478///479/// This is an API to leverage clone-on-write480/// # Panics481/// This function panics if the function `f` modifies the length of the [`Bitmap`].482pub fn apply_validity<F: FnOnce(Bitmap) -> Bitmap>(&mut self, f: F) {483if let Some(validity) = std::mem::take(&mut self.validity) {484self.set_validity(Some(f(validity)))485}486}487488// Convert this [`Utf8Array`] to a [`BinaryArray`].489pub fn to_binary(&self) -> BinaryArray<O> {490unsafe {491BinaryArray::new_unchecked(492BinaryArray::<O>::default_dtype(),493self.offsets.clone(),494self.values.clone(),495self.validity.clone(),496)497}498}499}500501impl<O: Offset> Splitable for Utf8Array<O> {502#[inline(always)]503fn check_bound(&self, offset: usize) -> bool {504offset <= self.len()505}506507unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {508let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };509let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };510511(512Self {513dtype: self.dtype.clone(),514offsets: lhs_offsets,515values: self.values.clone(),516validity: lhs_validity,517},518Self {519dtype: self.dtype.clone(),520offsets: rhs_offsets,521values: self.values.clone(),522validity: rhs_validity,523},524)525}526}527528impl<O: Offset> Array for Utf8Array<O> {529impl_common_array!();530531fn validity(&self) -> Option<&Bitmap> {532self.validity.as_ref()533}534535#[inline]536fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {537Box::new(self.clone().with_validity(validity))538}539}540541unsafe impl<O: Offset> GenericBinaryArray<O> for Utf8Array<O> {542#[inline]543fn values(&self) -> &[u8] {544self.values()545}546547#[inline]548fn offsets(&self) -> &[O] {549self.offsets().buffer()550}551}552553impl<O: Offset> Default for Utf8Array<O> {554fn default() -> Self {555let dtype = if O::IS_LARGE {556ArrowDataType::LargeUtf8557} else {558ArrowDataType::Utf8559};560Utf8Array::new(dtype, Default::default(), Default::default(), None)561}562}563564565