Path: blob/main/crates/polars-arrow/src/array/binview/mod.rs
6939 views
#![allow(unsafe_op_in_unsafe_fn)]1//! See thread: https://lists.apache.org/thread/w88tpz76ox8h3rxkjl4so6rg3f1rv7wt23mod builder;4pub use builder::*;5mod ffi;6pub(super) mod fmt;7mod iterator;8mod mutable;9#[cfg(feature = "proptest")]10pub mod proptest;11mod view;1213use std::any::Any;14use std::fmt::Debug;15use std::marker::PhantomData;16use std::sync::Arc;1718use polars_error::*;19use polars_utils::relaxed_cell::RelaxedCell;2021use crate::array::Array;22use crate::bitmap::Bitmap;23use crate::buffer::Buffer;24use crate::datatypes::ArrowDataType;2526mod private {27pub trait Sealed: Send + Sync {}2829impl Sealed for str {}30impl Sealed for [u8] {}31}32pub use iterator::BinaryViewValueIter;33pub use mutable::MutableBinaryViewArray;34use polars_utils::aliases::{InitHashMaps, PlHashMap};35use private::Sealed;3637use crate::array::binview::view::{validate_binary_views, validate_views_utf8_only};38use crate::array::iterator::NonNullValuesIter;39use crate::bitmap::utils::{BitmapIter, ZipValidity};40pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>;41pub type Utf8ViewArray = BinaryViewArrayGeneric<str>;42pub type BinaryViewArrayBuilder = BinaryViewArrayGenericBuilder<[u8]>;43pub type Utf8ViewArrayBuilder = BinaryViewArrayGenericBuilder<str>;44pub use view::{View, validate_utf8_views};4546use super::Splitable;4748pub type MutablePlString = MutableBinaryViewArray<str>;49pub type MutablePlBinary = MutableBinaryViewArray<[u8]>;5051static BIN_VIEW_TYPE: ArrowDataType = ArrowDataType::BinaryView;52static UTF8_VIEW_TYPE: ArrowDataType = ArrowDataType::Utf8View;5354// Growth parameters of view array buffers.55const DEFAULT_BLOCK_SIZE: usize = 8 * 1024;56const MAX_EXP_BLOCK_SIZE: usize = 16 * 1024 * 1024;5758pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> {59const IS_UTF8: bool;60const DATA_TYPE: ArrowDataType;61type Owned: Debug + Clone + Sync + Send + AsRef<Self>;6263/// # Safety64/// The caller must ensure that `slice` is a valid view.65unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self;66fn from_bytes(slice: &[u8]) -> Option<&Self>;6768fn to_bytes(&self) -> &[u8];6970#[allow(clippy::wrong_self_convention)]71fn into_owned(&self) -> Self::Owned;7273fn dtype() -> &'static ArrowDataType;74}7576impl ViewType for str {77const IS_UTF8: bool = true;78const DATA_TYPE: ArrowDataType = ArrowDataType::Utf8View;79type Owned = String;8081#[inline(always)]82unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self {83std::str::from_utf8_unchecked(slice)84}85#[inline(always)]86fn from_bytes(slice: &[u8]) -> Option<&Self> {87std::str::from_utf8(slice).ok()88}8990#[inline(always)]91fn to_bytes(&self) -> &[u8] {92self.as_bytes()93}9495fn into_owned(&self) -> Self::Owned {96self.to_string()97}98fn dtype() -> &'static ArrowDataType {99&UTF8_VIEW_TYPE100}101}102103impl ViewType for [u8] {104const IS_UTF8: bool = false;105const DATA_TYPE: ArrowDataType = ArrowDataType::BinaryView;106type Owned = Vec<u8>;107108#[inline(always)]109unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self {110slice111}112#[inline(always)]113fn from_bytes(slice: &[u8]) -> Option<&Self> {114Some(slice)115}116117#[inline(always)]118fn to_bytes(&self) -> &[u8] {119self120}121122fn into_owned(&self) -> Self::Owned {123self.to_vec()124}125126fn dtype() -> &'static ArrowDataType {127&BIN_VIEW_TYPE128}129}130131pub struct BinaryViewArrayGeneric<T: ViewType + ?Sized> {132dtype: ArrowDataType,133views: Buffer<View>,134buffers: Arc<[Buffer<u8>]>,135validity: Option<Bitmap>,136phantom: PhantomData<T>,137/// Total bytes length if we would concatenate them all.138total_bytes_len: RelaxedCell<u64>,139/// Total bytes in the buffer (excluding remaining capacity)140total_buffer_len: usize,141}142143impl<T: ViewType + ?Sized> PartialEq for BinaryViewArrayGeneric<T> {144fn eq(&self, other: &Self) -> bool {145self.len() == other.len() && self.into_iter().zip(other).all(|(l, r)| l == r)146}147}148149impl<T: ViewType + ?Sized> Clone for BinaryViewArrayGeneric<T> {150fn clone(&self) -> Self {151Self {152dtype: self.dtype.clone(),153views: self.views.clone(),154buffers: self.buffers.clone(),155validity: self.validity.clone(),156phantom: Default::default(),157total_bytes_len: self.total_bytes_len.clone(),158total_buffer_len: self.total_buffer_len,159}160}161}162163unsafe impl<T: ViewType + ?Sized> Send for BinaryViewArrayGeneric<T> {}164unsafe impl<T: ViewType + ?Sized> Sync for BinaryViewArrayGeneric<T> {}165166const UNKNOWN_LEN: u64 = u64::MAX;167168impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {169/// # Safety170/// The caller must ensure171/// - the data is valid utf8 (if required)172/// - The offsets match the buffers.173pub unsafe fn new_unchecked(174dtype: ArrowDataType,175views: Buffer<View>,176buffers: Arc<[Buffer<u8>]>,177validity: Option<Bitmap>,178total_bytes_len: usize,179total_buffer_len: usize,180) -> Self {181// Verify the invariants182#[cfg(debug_assertions)]183{184if let Some(validity) = validity.as_ref() {185assert_eq!(validity.len(), views.len());186}187188// @TODO: Enable this. This is currently bugged with concatenate.189// let mut actual_total_buffer_len = 0;190// let mut actual_total_bytes_len = 0;191//192// for buffer in buffers.iter() {193// actual_total_buffer_len += buffer.len();194// }195196for (i, view) in views.iter().enumerate() {197let is_valid = validity.as_ref().is_none_or(|v| v.get_bit(i));198199if !is_valid {200continue;201}202203// actual_total_bytes_len += view.length as usize;204if view.length > View::MAX_INLINE_SIZE {205assert!((view.buffer_idx as usize) < (buffers.len()));206assert!(207view.offset as usize + view.length as usize208<= buffers[view.buffer_idx as usize].len()209);210}211}212213// assert_eq!(actual_total_buffer_len, total_buffer_len);214// if (total_bytes_len as u64) != UNKNOWN_LEN {215// assert_eq!(actual_total_bytes_len, total_bytes_len);216// }217}218219Self {220dtype,221views,222buffers,223validity,224phantom: Default::default(),225total_bytes_len: RelaxedCell::from(total_bytes_len as u64),226total_buffer_len,227}228}229230/// Create a new BinaryViewArray but initialize a statistics compute.231///232/// # Safety233/// The caller must ensure the invariants234pub unsafe fn new_unchecked_unknown_md(235dtype: ArrowDataType,236views: Buffer<View>,237buffers: Arc<[Buffer<u8>]>,238validity: Option<Bitmap>,239total_buffer_len: Option<usize>,240) -> Self {241let total_bytes_len = UNKNOWN_LEN as usize;242let total_buffer_len =243total_buffer_len.unwrap_or_else(|| buffers.iter().map(|b| b.len()).sum());244Self::new_unchecked(245dtype,246views,247buffers,248validity,249total_bytes_len,250total_buffer_len,251)252}253254pub fn data_buffers(&self) -> &Arc<[Buffer<u8>]> {255&self.buffers256}257258pub fn variadic_buffer_lengths(&self) -> Vec<i64> {259self.buffers.iter().map(|buf| buf.len() as i64).collect()260}261262pub fn views(&self) -> &Buffer<View> {263&self.views264}265266pub fn into_views(self) -> Vec<View> {267self.views.make_mut()268}269270pub fn into_inner(271self,272) -> (273Buffer<View>,274Arc<[Buffer<u8>]>,275Option<Bitmap>,276usize,277usize,278) {279let views = self.views;280let buffers = self.buffers;281let validity = self.validity;282283(284views,285buffers,286validity,287self.total_bytes_len.load() as usize,288self.total_buffer_len,289)290}291292/// Apply a function over the views. This can be used to update views in operations like slicing.293///294/// # Safety295/// Update the views. All invariants of the views apply.296pub unsafe fn apply_views<F: FnMut(View, &T) -> View>(&self, mut update_view: F) -> Self {297let arr = self.clone();298let (views, buffers, validity, total_bytes_len, total_buffer_len) = arr.into_inner();299300let mut views = views.make_mut();301for v in views.iter_mut() {302let str_slice = T::from_bytes_unchecked(v.get_slice_unchecked(&buffers));303*v = update_view(*v, str_slice);304}305Self::new_unchecked(306self.dtype.clone(),307views.into(),308buffers,309validity,310total_bytes_len,311total_buffer_len,312)313}314315pub fn try_new(316dtype: ArrowDataType,317views: Buffer<View>,318buffers: Arc<[Buffer<u8>]>,319validity: Option<Bitmap>,320) -> PolarsResult<Self> {321if T::IS_UTF8 {322validate_utf8_views(views.as_ref(), buffers.as_ref())?;323} else {324validate_binary_views(views.as_ref(), buffers.as_ref())?;325}326327if let Some(validity) = &validity {328polars_ensure!(validity.len()== views.len(), ComputeError: "validity mask length must match the number of values" )329}330331unsafe {332Ok(Self::new_unchecked_unknown_md(333dtype, views, buffers, validity, None,334))335}336}337338/// Creates an empty [`BinaryViewArrayGeneric`], i.e. whose `.len` is zero.339#[inline]340pub fn new_empty(dtype: ArrowDataType) -> Self {341unsafe { Self::new_unchecked(dtype, Buffer::new(), Arc::from([]), None, 0, 0) }342}343344/// Returns a new null [`BinaryViewArrayGeneric`] of `length`.345#[inline]346pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {347let validity = Some(Bitmap::new_zeroed(length));348unsafe { Self::new_unchecked(dtype, Buffer::zeroed(length), Arc::from([]), validity, 0, 0) }349}350351/// Returns the element at index `i`352/// # Panics353/// iff `i >= self.len()`354#[inline]355pub fn value(&self, i: usize) -> &T {356assert!(i < self.len());357unsafe { self.value_unchecked(i) }358}359360/// Returns the element at index `i`361///362/// # Safety363/// Assumes that the `i < self.len`.364#[inline]365pub unsafe fn value_unchecked(&self, i: usize) -> &T {366let v = self.views.get_unchecked(i);367T::from_bytes_unchecked(v.get_slice_unchecked(&self.buffers))368}369370/// Returns the element at index `i`, or None if it is null.371/// # Panics372/// iff `i >= self.len()`373#[inline]374pub fn get(&self, i: usize) -> Option<&T> {375assert!(i < self.len());376unsafe { self.get_unchecked(i) }377}378379/// Returns the element at index `i`, or None if it is null.380///381/// # Safety382/// Assumes that the `i < self.len`.383#[inline]384pub unsafe fn get_unchecked(&self, i: usize) -> Option<&T> {385if self386.validity387.as_ref()388.is_none_or(|v| v.get_bit_unchecked(i))389{390let v = self.views.get_unchecked(i);391Some(T::from_bytes_unchecked(392v.get_slice_unchecked(&self.buffers),393))394} else {395None396}397}398399/// Returns an iterator of `Option<&T>` over every element of this array.400pub fn iter(&self) -> ZipValidity<&T, BinaryViewValueIter<'_, T>, BitmapIter<'_>> {401ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())402}403404/// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity405pub fn values_iter(&self) -> BinaryViewValueIter<'_, T> {406BinaryViewValueIter::new(self)407}408409pub fn len_iter(&self) -> impl Iterator<Item = u32> + '_ {410self.views.iter().map(|v| v.length)411}412413/// Returns an iterator of the non-null values.414pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryViewArrayGeneric<T>> {415NonNullValuesIter::new(self, self.validity())416}417418/// Returns an iterator of the non-null values.419pub fn non_null_views_iter(&self) -> NonNullValuesIter<'_, Buffer<View>> {420NonNullValuesIter::new(self.views(), self.validity())421}422423impl_sliced!();424impl_mut_validity!();425impl_into_array!();426427pub fn from_slice<S: AsRef<T>, P: AsRef<[Option<S>]>>(slice: P) -> Self {428let mutable = MutableBinaryViewArray::from_iterator(429slice.as_ref().iter().map(|opt_v| opt_v.as_ref()),430);431mutable.into()432}433434pub fn from_slice_values<S: AsRef<T>, P: AsRef<[S]>>(slice: P) -> Self {435let mutable =436MutableBinaryViewArray::from_values_iter(slice.as_ref().iter().map(|v| v.as_ref()));437mutable.into()438}439440/// Get the total length of bytes that it would take to concatenate all binary/str values in this array.441pub fn total_bytes_len(&self) -> usize {442let total = self.total_bytes_len.load();443if total == UNKNOWN_LEN {444let total = self.len_iter().map(|v| v as usize).sum::<usize>();445self.total_bytes_len.store(total as u64);446total447} else {448total as usize449}450}451452/// Get the length of bytes that are stored in the variadic buffers.453pub fn total_buffer_len(&self) -> usize {454self.total_buffer_len455}456457fn total_unshared_buffer_len(&self) -> usize {458// XXX: it is O(n), not O(1).459// Given this function is only called in `maybe_gc()`,460// it may not be worthy to add an extra field for this.461self.buffers462.iter()463.map(|buf| {464if buf.storage_refcount() > 1 {4650466} else {467buf.len()468}469})470.sum()471}472473#[inline(always)]474pub fn len(&self) -> usize {475self.views.len()476}477478/// Garbage collect479pub fn gc(self) -> Self {480if self.buffers.is_empty() {481return self;482}483let mut mutable = MutableBinaryViewArray::with_capacity(self.len());484let buffers = self.buffers.as_ref();485486for view in self.views.as_ref() {487unsafe { mutable.push_view_unchecked(*view, buffers) }488}489mutable.freeze().with_validity(self.validity)490}491492pub fn deshare(&self) -> Self {493if Arc::strong_count(&self.buffers) == 1494&& self.buffers.iter().all(|b| b.storage_refcount() == 1)495{496return self.clone();497}498self.clone().gc()499}500501pub fn is_sliced(&self) -> bool {502!std::ptr::eq(self.views.as_ptr(), self.views.storage_ptr())503}504505pub fn maybe_gc(self) -> Self {506const GC_MINIMUM_SAVINGS: usize = 16 * 1024; // At least 16 KiB.507508if self.total_buffer_len <= GC_MINIMUM_SAVINGS {509return self;510}511512if Arc::strong_count(&self.buffers) != 1 {513// There are multiple holders of this `buffers`.514// If we allow gc in this case,515// it may end up copying the same content multiple times.516return self;517}518519// Subtract the maximum amount of inlined strings to get a lower bound520// on the number of buffer bytes needed (assuming no dedup).521let total_bytes_len = self.total_bytes_len();522let buffer_req_lower_bound = total_bytes_len.saturating_sub(self.len() * 12);523524let lower_bound_mem_usage_post_gc = self.len() * 16 + buffer_req_lower_bound;525// Use unshared buffer len. Shared buffer won't be freed; no savings.526let cur_mem_usage = self.len() * 16 + self.total_unshared_buffer_len();527let savings_upper_bound = cur_mem_usage.saturating_sub(lower_bound_mem_usage_post_gc);528529if savings_upper_bound >= GC_MINIMUM_SAVINGS530&& cur_mem_usage >= 4 * lower_bound_mem_usage_post_gc531{532self.gc()533} else {534self535}536}537538pub fn make_mut(self) -> MutableBinaryViewArray<T> {539let views = self.views.make_mut();540let completed_buffers = self.buffers.to_vec();541let validity = self.validity.map(|bitmap| bitmap.make_mut());542543// We need to know the total_bytes_len if we are going to mutate it.544let mut total_bytes_len = self.total_bytes_len.load();545if total_bytes_len == UNKNOWN_LEN {546total_bytes_len = views.iter().map(|view| view.length as u64).sum();547}548let total_bytes_len = total_bytes_len as usize;549550MutableBinaryViewArray {551views,552completed_buffers,553in_progress_buffer: vec![],554validity,555phantom: Default::default(),556total_bytes_len,557total_buffer_len: self.total_buffer_len,558stolen_buffers: PlHashMap::new(),559}560}561}562563impl BinaryViewArray {564/// Validate the underlying bytes on UTF-8.565pub fn validate_utf8(&self) -> PolarsResult<()> {566// SAFETY: views are correct567unsafe { validate_views_utf8_only(&self.views, &self.buffers, 0) }568}569570/// Convert [`BinaryViewArray`] to [`Utf8ViewArray`].571pub fn to_utf8view(&self) -> PolarsResult<Utf8ViewArray> {572self.validate_utf8()?;573unsafe { Ok(self.to_utf8view_unchecked()) }574}575576/// Convert [`BinaryViewArray`] to [`Utf8ViewArray`] without checking UTF-8.577///578/// # Safety579/// The caller must ensure the underlying data is valid UTF-8.580pub unsafe fn to_utf8view_unchecked(&self) -> Utf8ViewArray {581Utf8ViewArray::new_unchecked(582ArrowDataType::Utf8View,583self.views.clone(),584self.buffers.clone(),585self.validity.clone(),586self.total_bytes_len.load() as usize,587self.total_buffer_len,588)589}590}591592impl Utf8ViewArray {593pub fn to_binview(&self) -> BinaryViewArray {594// SAFETY: same invariants.595unsafe {596BinaryViewArray::new_unchecked(597ArrowDataType::BinaryView,598self.views.clone(),599self.buffers.clone(),600self.validity.clone(),601self.total_bytes_len.load() as usize,602self.total_buffer_len,603)604}605}606}607608impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {609fn as_any(&self) -> &dyn Any {610self611}612613fn as_any_mut(&mut self) -> &mut dyn Any {614self615}616617#[inline(always)]618fn len(&self) -> usize {619BinaryViewArrayGeneric::len(self)620}621622fn dtype(&self) -> &ArrowDataType {623T::dtype()624}625626fn validity(&self) -> Option<&Bitmap> {627self.validity.as_ref()628}629630fn split_at_boxed(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {631let (lhs, rhs) = Splitable::split_at(self, offset);632(Box::new(lhs), Box::new(rhs))633}634635unsafe fn split_at_boxed_unchecked(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {636let (lhs, rhs) = unsafe { Splitable::split_at_unchecked(self, offset) };637(Box::new(lhs), Box::new(rhs))638}639640fn slice(&mut self, offset: usize, length: usize) {641assert!(642offset + length <= self.len(),643"the offset of the new Buffer cannot exceed the existing length"644);645unsafe { self.slice_unchecked(offset, length) }646}647648unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {649debug_assert!(offset + length <= self.len());650self.validity = self651.validity652.take()653.map(|bitmap| bitmap.sliced_unchecked(offset, length))654.filter(|bitmap| bitmap.unset_bits() > 0);655self.views.slice_unchecked(offset, length);656self.total_bytes_len.store(UNKNOWN_LEN)657}658659fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {660debug_assert!(661validity.as_ref().is_none_or(|v| v.len() == self.len()),662"{} != {}",663validity.as_ref().unwrap().len(),664self.len()665);666667let mut new = self.clone();668new.validity = validity;669Box::new(new)670}671672fn to_boxed(&self) -> Box<dyn Array> {673Box::new(self.clone())674}675}676677impl<T: ViewType + ?Sized> Splitable for BinaryViewArrayGeneric<T> {678fn check_bound(&self, offset: usize) -> bool {679offset <= self.len()680}681682unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {683let (lhs_views, rhs_views) = unsafe { self.views.split_at_unchecked(offset) };684let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };685686unsafe {687(688Self::new_unchecked(689self.dtype.clone(),690lhs_views,691self.buffers.clone(),692lhs_validity,693if offset == 0 { 0 } else { UNKNOWN_LEN as _ },694self.total_buffer_len(),695),696Self::new_unchecked(697self.dtype.clone(),698rhs_views,699self.buffers.clone(),700rhs_validity,701if offset == self.len() {7020703} else {704UNKNOWN_LEN as _705},706self.total_buffer_len(),707),708)709}710}711}712713714