// Path: blob/main/crates/polars-arrow/src/array/binary/mod.rs
// 6939 views
use either::Either;12use super::specification::try_check_offsets_bounds;3use super::{Array, GenericBinaryArray, Splitable};4use crate::array::iterator::NonNullValuesIter;5use crate::bitmap::Bitmap;6use crate::bitmap::utils::{BitmapIter, ZipValidity};7use crate::buffer::Buffer;8use crate::datatypes::ArrowDataType;9use crate::offset::{Offset, Offsets, OffsetsBuffer};10use crate::trusted_len::TrustedLen;1112mod builder;13pub use builder::*;14mod ffi;15pub(super) mod fmt;16mod iterator;17pub use iterator::*;18mod from;19mod mutable_values;20pub use mutable_values::*;21mod mutable;22pub use mutable::*;23use polars_error::{PolarsResult, polars_bail};24#[cfg(feature = "proptest")]25pub mod proptest;2627/// A [`BinaryArray`] is Arrow's semantically equivalent of an immutable `Vec<Option<Vec<u8>>>`.28/// It implements [`Array`].29///30/// The size of this struct is `O(1)`, as all data is stored behind an [`std::sync::Arc`].31/// # Example32/// ```33/// use polars_arrow::array::BinaryArray;34/// use polars_arrow::bitmap::Bitmap;35/// use polars_arrow::buffer::Buffer;36///37/// let array = BinaryArray::<i32>::from([Some([1, 2].as_ref()), None, Some([3].as_ref())]);38/// assert_eq!(array.value(0), &[1, 2]);39/// assert_eq!(array.iter().collect::<Vec<_>>(), vec![Some([1, 2].as_ref()), None, Some([3].as_ref())]);40/// assert_eq!(array.values_iter().collect::<Vec<_>>(), vec![[1, 2].as_ref(), &[], &[3]]);41/// // the underlying representation:42/// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3]));43/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 3]));44/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true])));45/// ```46///47/// # Generic parameter48/// The generic parameter [`Offset`] can only be `i32` or `i64` and tradeoffs maximum array length with49/// memory usage:50/// * the sum of lengths of all elements cannot exceed `Offset::MAX`51/// * the total size of the underlying data is `array.len() * size_of::<Offset>() + sum of 
lengths of all elements`52///53/// # Safety54/// The following invariants hold:55/// * Two consecutive `offsets` cast (`as`) to `usize` are valid slices of `values`.56/// * `len` is equal to `validity.len()`, when defined.57#[derive(Clone)]58pub struct BinaryArray<O: Offset> {59dtype: ArrowDataType,60offsets: OffsetsBuffer<O>,61values: Buffer<u8>,62validity: Option<Bitmap>,63}6465impl<O: Offset> BinaryArray<O> {66/// Returns a [`BinaryArray`] created from its internal representation.67///68/// # Errors69/// This function returns an error iff:70/// * The last offset is not equal to the values' length.71/// * the validity's length is not equal to `offsets.len()`.72/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`.73/// # Implementation74/// This function is `O(1)`75pub fn try_new(76dtype: ArrowDataType,77offsets: OffsetsBuffer<O>,78values: Buffer<u8>,79validity: Option<Bitmap>,80) -> PolarsResult<Self> {81try_check_offsets_bounds(&offsets, values.len())?;8283if validity84.as_ref()85.is_some_and(|validity| validity.len() != offsets.len_proxy())86{87polars_bail!(ComputeError: "validity mask length must match the number of values")88}8990if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {91polars_bail!(ComputeError: "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary")92}9394Ok(Self {95dtype,96offsets,97values,98validity,99})100}101102/// Creates a new [`BinaryArray`] without checking invariants.103///104/// # Safety105///106/// The invariants must be valid (see try_new).107pub unsafe fn new_unchecked(108dtype: ArrowDataType,109offsets: OffsetsBuffer<O>,110values: Buffer<u8>,111validity: Option<Bitmap>,112) -> Self {113Self {114dtype,115offsets,116values,117validity,118}119}120121/// Creates a new [`BinaryArray`] from slices of `&[u8]`.122pub fn from_slice<T: AsRef<[u8]>, P: AsRef<[T]>>(slice: P) -> Self 
{123Self::from_trusted_len_values_iter(slice.as_ref().iter())124}125126/// Creates a new [`BinaryArray`] from a slice of optional `&[u8]`.127// Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.128pub fn from<T: AsRef<[u8]>, P: AsRef<[Option<T>]>>(slice: P) -> Self {129MutableBinaryArray::<O>::from(slice).into()130}131132/// Returns an iterator of `Option<&[u8]>` over every element of this array.133pub fn iter(&self) -> ZipValidity<&[u8], BinaryValueIter<'_, O>, BitmapIter<'_>> {134ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())135}136137/// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity138pub fn values_iter(&self) -> BinaryValueIter<'_, O> {139BinaryValueIter::new(self)140}141142/// Returns an iterator of the non-null values.143#[inline]144pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryArray<O>> {145NonNullValuesIter::new(self, self.validity())146}147148/// Returns the length of this array149#[inline]150pub fn len(&self) -> usize {151self.offsets.len_proxy()152}153154/// Returns the element at index `i`155/// # Panics156/// iff `i >= self.len()`157#[inline]158pub fn value(&self, i: usize) -> &[u8] {159assert!(i < self.len());160unsafe { self.value_unchecked(i) }161}162163/// Returns the element at index `i`164///165/// # Safety166/// Assumes that the `i < self.len`.167#[inline]168pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {169// soundness: the invariant of the function170let (start, end) = self.offsets.start_end_unchecked(i);171172// soundness: the invariant of the struct173self.values.get_unchecked(start..end)174}175176/// Returns the element at index `i` or `None` if it is null177/// # Panics178/// iff `i >= self.len()`179#[inline]180pub fn get(&self, i: usize) -> Option<&[u8]> {181if !self.is_null(i) {182// soundness: Array::is_null panics if i >= self.len183unsafe { Some(self.value_unchecked(i)) }184} else {185None186}187}188189/// 
Returns the [`ArrowDataType`] of this array.190#[inline]191pub fn dtype(&self) -> &ArrowDataType {192&self.dtype193}194195/// Returns the values of this [`BinaryArray`].196#[inline]197pub fn values(&self) -> &Buffer<u8> {198&self.values199}200201/// Returns the offsets of this [`BinaryArray`].202#[inline]203pub fn offsets(&self) -> &OffsetsBuffer<O> {204&self.offsets205}206207/// The optional validity.208#[inline]209pub fn validity(&self) -> Option<&Bitmap> {210self.validity.as_ref()211}212213/// Slices this [`BinaryArray`].214/// # Implementation215/// This function is `O(1)`.216/// # Panics217/// iff `offset + length > self.len()`.218pub fn slice(&mut self, offset: usize, length: usize) {219assert!(220offset + length <= self.len(),221"the offset of the new Buffer cannot exceed the existing length"222);223unsafe { self.slice_unchecked(offset, length) }224}225226/// Slices this [`BinaryArray`].227/// # Implementation228/// This function is `O(1)`.229///230/// # Safety231/// The caller must ensure that `offset + length <= self.len()`.232pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {233self.validity = self234.validity235.take()236.map(|bitmap| bitmap.sliced_unchecked(offset, length))237.filter(|bitmap| bitmap.unset_bits() > 0);238self.offsets.slice_unchecked(offset, length + 1);239}240241impl_sliced!();242impl_mut_validity!();243impl_into_array!();244245/// Returns its internal representation246#[must_use]247pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {248let Self {249dtype,250offsets,251values,252validity,253} = self;254(dtype, offsets, values, validity)255}256257/// Try to convert this `BinaryArray` to a `MutableBinaryArray`258#[must_use]259pub fn into_mut(self) -> Either<Self, MutableBinaryArray<O>> {260use Either::*;261if let Some(bitmap) = self.validity {262match bitmap.into_mut() {263// SAFETY: invariants are preserved264Left(bitmap) => 
Left(BinaryArray::new(265self.dtype,266self.offsets,267self.values,268Some(bitmap),269)),270Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {271(Left(values), Left(offsets)) => Left(BinaryArray::new(272self.dtype,273offsets,274values,275Some(mutable_bitmap.into()),276)),277(Left(values), Right(offsets)) => Left(BinaryArray::new(278self.dtype,279offsets.into(),280values,281Some(mutable_bitmap.into()),282)),283(Right(values), Left(offsets)) => Left(BinaryArray::new(284self.dtype,285offsets,286values.into(),287Some(mutable_bitmap.into()),288)),289(Right(values), Right(offsets)) => Right(290MutableBinaryArray::try_new(291self.dtype,292offsets,293values,294Some(mutable_bitmap),295)296.unwrap(),297),298},299}300} else {301match (self.values.into_mut(), self.offsets.into_mut()) {302(Left(values), Left(offsets)) => {303Left(BinaryArray::new(self.dtype, offsets, values, None))304},305(Left(values), Right(offsets)) => {306Left(BinaryArray::new(self.dtype, offsets.into(), values, None))307},308(Right(values), Left(offsets)) => {309Left(BinaryArray::new(self.dtype, offsets, values.into(), None))310},311(Right(values), Right(offsets)) => {312Right(MutableBinaryArray::try_new(self.dtype, offsets, values, None).unwrap())313},314}315}316}317318/// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero.319pub fn new_empty(dtype: ArrowDataType) -> Self {320Self::new(dtype, OffsetsBuffer::new(), Buffer::new(), None)321}322323/// Creates an null [`BinaryArray`], i.e. 
whose `.null_count() == .len()`.324#[inline]325pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {326unsafe {327Self::new_unchecked(328dtype,329Offsets::new_zeroed(length).into(),330Buffer::new(),331Some(Bitmap::new_zeroed(length)),332)333}334}335336/// Returns the default [`ArrowDataType`], `DataType::Binary` or `DataType::LargeBinary`337pub fn default_dtype() -> ArrowDataType {338if O::IS_LARGE {339ArrowDataType::LargeBinary340} else {341ArrowDataType::Binary342}343}344345/// Alias for unwrapping [`Self::try_new`]346pub fn new(347dtype: ArrowDataType,348offsets: OffsetsBuffer<O>,349values: Buffer<u8>,350validity: Option<Bitmap>,351) -> Self {352Self::try_new(dtype, offsets, values, validity).unwrap()353}354355/// Returns a [`BinaryArray`] from an iterator of trusted length.356///357/// The [`BinaryArray`] is guaranteed to not have a validity358#[inline]359pub fn from_trusted_len_values_iter<T: AsRef<[u8]>, I: TrustedLen<Item = T>>(360iterator: I,361) -> Self {362MutableBinaryArray::<O>::from_trusted_len_values_iter(iterator).into()363}364365/// Returns a new [`BinaryArray`] from a [`Iterator`] of `&[u8]`.366///367/// The [`BinaryArray`] is guaranteed to not have a validity368pub fn from_iter_values<T: AsRef<[u8]>, I: Iterator<Item = T>>(iterator: I) -> Self {369MutableBinaryArray::<O>::from_iter_values(iterator).into()370}371372/// Creates a [`BinaryArray`] from an iterator of trusted length.373///374/// # Safety375/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).376/// I.e. 
that `size_hint().1` correctly reports its length.377#[inline]378pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self379where380P: AsRef<[u8]>,381I: Iterator<Item = Option<P>>,382{383MutableBinaryArray::<O>::from_trusted_len_iter_unchecked(iterator).into()384}385386/// Creates a [`BinaryArray`] from a [`TrustedLen`]387#[inline]388pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self389where390P: AsRef<[u8]>,391I: TrustedLen<Item = Option<P>>,392{393// soundness: I is `TrustedLen`394unsafe { Self::from_trusted_len_iter_unchecked(iterator) }395}396397/// Creates a [`BinaryArray`] from an falible iterator of trusted length.398///399/// # Safety400/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).401/// I.e. that `size_hint().1` correctly reports its length.402#[inline]403pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(iterator: I) -> Result<Self, E>404where405P: AsRef<[u8]>,406I: IntoIterator<Item = Result<Option<P>, E>>,407{408MutableBinaryArray::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())409}410411/// Creates a [`BinaryArray`] from an fallible iterator of trusted length.412#[inline]413pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>414where415P: AsRef<[u8]>,416I: TrustedLen<Item = Result<Option<P>, E>>,417{418// soundness: I: TrustedLen419unsafe { Self::try_from_trusted_len_iter_unchecked(iter) }420}421}422423impl<O: Offset> Array for BinaryArray<O> {424impl_common_array!();425426fn validity(&self) -> Option<&Bitmap> {427self.validity.as_ref()428}429430#[inline]431fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {432Box::new(self.clone().with_validity(validity))433}434}435436unsafe impl<O: Offset> GenericBinaryArray<O> for BinaryArray<O> {437#[inline]438fn values(&self) -> &[u8] {439self.values()440}441442#[inline]443fn offsets(&self) -> &[O] {444self.offsets().buffer()445}446}447448impl<O: Offset> Splitable for 
BinaryArray<O> {449#[inline(always)]450fn check_bound(&self, offset: usize) -> bool {451offset <= self.len()452}453454unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {455let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };456let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };457458(459Self {460dtype: self.dtype.clone(),461offsets: lhs_offsets,462values: self.values.clone(),463validity: lhs_validity,464},465Self {466dtype: self.dtype.clone(),467offsets: rhs_offsets,468values: self.values.clone(),469validity: rhs_validity,470},471)472}473}474475476