Path: blob/main/crates/polars-arrow/src/array/binview/mod.rs
8402 views
#![allow(unsafe_op_in_unsafe_fn)]1//! See thread: https://lists.apache.org/thread/w88tpz76ox8h3rxkjl4so6rg3f1rv7wt23mod builder;4pub use builder::*;5mod ffi;6pub(super) mod fmt;7mod iterator;8mod mutable;9#[cfg(feature = "proptest")]10pub mod proptest;11mod view;1213use std::any::Any;14use std::fmt::Debug;15use std::marker::PhantomData;1617use polars_buffer::Buffer;18use polars_error::*;19use polars_utils::relaxed_cell::RelaxedCell;2021use crate::array::Array;22use crate::bitmap::Bitmap;23use crate::datatypes::ArrowDataType;2425mod private {26pub trait Sealed: Send + Sync {}2728impl Sealed for str {}29impl Sealed for [u8] {}30}31pub use iterator::BinaryViewValueIter;32pub use mutable::MutableBinaryViewArray;33use polars_utils::aliases::{InitHashMaps, PlHashMap};34use private::Sealed;3536use crate::array::binview::view::{validate_binary_views, validate_views_utf8_only};37use crate::array::iterator::NonNullValuesIter;38use crate::bitmap::utils::{BitmapIter, ZipValidity};39pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>;40pub type Utf8ViewArray = BinaryViewArrayGeneric<str>;41pub type BinaryViewArrayBuilder = BinaryViewArrayGenericBuilder<[u8]>;42pub type Utf8ViewArrayBuilder = BinaryViewArrayGenericBuilder<str>;43pub use view::{View, validate_utf8_views};4445use super::Splitable;4647pub type MutablePlString = MutableBinaryViewArray<str>;48pub type MutablePlBinary = MutableBinaryViewArray<[u8]>;4950static BIN_VIEW_TYPE: ArrowDataType = ArrowDataType::BinaryView;51static UTF8_VIEW_TYPE: ArrowDataType = ArrowDataType::Utf8View;5253// Growth parameters of view array buffers.54const DEFAULT_BLOCK_SIZE: usize = 8 * 1024;55const MAX_EXP_BLOCK_SIZE: usize = 16 * 1024 * 1024;5657pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> {58const IS_UTF8: bool;59const DATA_TYPE: ArrowDataType;60type Owned: Debug + Clone + Sync + Send + AsRef<Self>;6162/// # Safety63/// The caller must ensure that `slice` is a valid view.64unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self;65fn from_bytes(slice: &[u8]) -> Option<&Self>;6667fn to_bytes(&self) -> &[u8];6869#[allow(clippy::wrong_self_convention)]70fn into_owned(&self) -> Self::Owned;7172fn dtype() -> &'static ArrowDataType;73}7475impl ViewType for str {76const IS_UTF8: bool = true;77const DATA_TYPE: ArrowDataType = ArrowDataType::Utf8View;78type Owned = String;7980#[inline(always)]81unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self {82std::str::from_utf8_unchecked(slice)83}84#[inline(always)]85fn from_bytes(slice: &[u8]) -> Option<&Self> {86std::str::from_utf8(slice).ok()87}8889#[inline(always)]90fn to_bytes(&self) -> &[u8] {91self.as_bytes()92}9394fn into_owned(&self) -> Self::Owned {95self.to_string()96}97fn dtype() -> &'static ArrowDataType {98&UTF8_VIEW_TYPE99}100}101102impl ViewType for [u8] {103const IS_UTF8: bool = false;104const DATA_TYPE: ArrowDataType = ArrowDataType::BinaryView;105type Owned = Vec<u8>;106107#[inline(always)]108unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self {109slice110}111#[inline(always)]112fn from_bytes(slice: &[u8]) -> Option<&Self> {113Some(slice)114}115116#[inline(always)]117fn to_bytes(&self) -> &[u8] {118self119}120121fn into_owned(&self) -> Self::Owned {122self.to_vec()123}124125fn dtype() -> &'static ArrowDataType {126&BIN_VIEW_TYPE127}128}129130pub struct BinaryViewArrayGeneric<T: ViewType + ?Sized> {131dtype: ArrowDataType,132views: Buffer<View>,133buffers: Buffer<Buffer<u8>>,134validity: Option<Bitmap>,135phantom: PhantomData<T>,136/// Total bytes length if we would concatenate them all.137total_bytes_len: RelaxedCell<u64>,138/// Total bytes in the buffer (excluding remaining capacity)139total_buffer_len: usize,140}141142impl<T: ViewType + ?Sized> PartialEq for BinaryViewArrayGeneric<T> {143fn eq(&self, other: &Self) -> bool {144self.len() == other.len() && self.into_iter().zip(other).all(|(l, r)| l == r)145}146}147148impl<T: ViewType + ?Sized> Clone for BinaryViewArrayGeneric<T> {149fn clone(&self) -> Self {150Self {151dtype: self.dtype.clone(),152views: self.views.clone(),153buffers: self.buffers.clone(),154validity: self.validity.clone(),155phantom: Default::default(),156total_bytes_len: self.total_bytes_len.clone(),157total_buffer_len: self.total_buffer_len,158}159}160}161162unsafe impl<T: ViewType + ?Sized> Send for BinaryViewArrayGeneric<T> {}163unsafe impl<T: ViewType + ?Sized> Sync for BinaryViewArrayGeneric<T> {}164165const UNKNOWN_LEN: u64 = u64::MAX;166167impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {168/// # Safety169/// The caller must ensure170/// - the data is valid utf8 (if required)171/// - The offsets match the buffers.172pub unsafe fn new_unchecked(173dtype: ArrowDataType,174views: Buffer<View>,175buffers: Buffer<Buffer<u8>>,176validity: Option<Bitmap>,177total_bytes_len: Option<usize>,178total_buffer_len: usize,179) -> Self {180// Verify the invariants181#[cfg(debug_assertions)]182{183if let Some(validity) = validity.as_ref() {184assert_eq!(validity.len(), views.len());185}186187// @TODO: Enable this. There are still some bugs but disabled temporarily to get some fixes in.188// let mut actual_total_buffer_len = 0;189// let mut actual_total_bytes_len = 0;190191// for buffer in buffers.iter() {192// actual_total_buffer_len += buffer.len();193// }194195for (i, view) in views.iter().enumerate() {196let is_valid = validity.as_ref().is_none_or(|v| v.get_bit(i));197198if !is_valid {199continue;200}201202// actual_total_bytes_len += view.length as usize;203if view.length > View::MAX_INLINE_SIZE {204assert!((view.buffer_idx as usize) < (buffers.len()));205assert!(206view.offset as usize + view.length as usize207<= buffers[view.buffer_idx as usize].len()208);209}210}211212// assert_eq!(actual_total_buffer_len, total_buffer_len);213// if let Some(len) = total_bytes_len {214// assert_eq!(actual_total_bytes_len, len);215// }216}217218Self {219dtype,220views,221buffers,222validity,223phantom: Default::default(),224total_bytes_len: RelaxedCell::from(225total_bytes_len.map(|l| l as u64).unwrap_or(UNKNOWN_LEN),226),227total_buffer_len,228}229}230231/// Create a new BinaryViewArray but initialize a statistics compute.232///233/// # Safety234/// The caller must ensure the invariants235pub unsafe fn new_unchecked_unknown_md(236dtype: ArrowDataType,237views: Buffer<View>,238buffers: Buffer<Buffer<u8>>,239validity: Option<Bitmap>,240total_buffer_len: Option<usize>,241) -> Self {242let total_bytes_len = None;243let total_buffer_len =244total_buffer_len.unwrap_or_else(|| buffers.iter().map(|b| b.len()).sum());245Self::new_unchecked(246dtype,247views,248buffers,249validity,250total_bytes_len,251total_buffer_len,252)253}254255pub fn data_buffers(&self) -> &Buffer<Buffer<u8>> {256&self.buffers257}258259pub fn data_buffers_mut(&mut self) -> &mut Buffer<Buffer<u8>> {260&mut self.buffers261}262263pub fn variadic_buffer_lengths(&self) -> Vec<i64> {264self.buffers.iter().map(|buf| buf.len() as i64).collect()265}266267pub fn views(&self) -> &Buffer<View> {268&self.views269}270271pub fn into_views(self) -> Vec<View> {272self.views.to_vec()273}274275pub fn into_inner(276self,277) -> (278Buffer<View>,279Buffer<Buffer<u8>>,280Option<Bitmap>,281Option<usize>,282usize,283) {284let total_bytes_len = self.try_total_bytes_len();285let views = self.views;286let buffers = self.buffers;287let validity = self.validity;288289(290views,291buffers,292validity,293total_bytes_len,294self.total_buffer_len,295)296}297298/// Apply a function over the views. This can be used to update views in operations like slicing.299///300/// # Safety301/// All invariants of the views must be maintained.302pub unsafe fn apply_views<F: FnMut(View, &T) -> View>(&self, mut update_view: F) -> Self {303let arr = self.clone();304let (views, buffers, validity, _total_bytes_len, total_buffer_len) = arr.into_inner();305306let mut total_bytes_len = 0;307let mut views = views.to_vec();308for v in views.iter_mut() {309let str_slice = T::from_bytes_unchecked(v.get_slice_unchecked(&buffers));310*v = update_view(*v, str_slice);311total_bytes_len += v.length as usize;312}313314let len_valid = validity.is_none();315Self::new_unchecked(316self.dtype.clone(),317views.into(),318buffers,319validity,320len_valid.then_some(total_bytes_len),321total_buffer_len,322)323}324325/// Apply a function to the views as a mutable slice.326///327/// # Safety328/// All invariants of the views must be maintained.329pub unsafe fn with_views_mut<F: FnOnce(&mut [View])>(&mut self, f: F) {330self.total_bytes_len.store(UNKNOWN_LEN);331if let Some(views) = self.views.get_mut_slice() {332f(views)333} else {334let mut views = self.views.as_slice().to_vec();335f(&mut views);336self.views = Buffer::from(views);337}338}339340pub fn try_new(341dtype: ArrowDataType,342views: Buffer<View>,343buffers: Buffer<Buffer<u8>>,344validity: Option<Bitmap>,345) -> PolarsResult<Self> {346if T::IS_UTF8 {347validate_utf8_views(views.as_ref(), buffers.as_ref())?;348} else {349validate_binary_views(views.as_ref(), buffers.as_ref())?;350}351352if let Some(validity) = &validity {353polars_ensure!(validity.len()== views.len(), ComputeError: "validity mask length must match the number of values" )354}355356unsafe {357Ok(Self::new_unchecked_unknown_md(358dtype, views, buffers, validity, None,359))360}361}362363/// Creates an empty [`BinaryViewArrayGeneric`], i.e. whose `.len` is zero.364#[inline]365pub fn new_empty(dtype: ArrowDataType) -> Self {366unsafe { Self::new_unchecked(dtype, Buffer::new(), Buffer::new(), None, Some(0), 0) }367}368369/// Returns a new null [`BinaryViewArrayGeneric`] of `length`.370#[inline]371pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {372let validity = Some(Bitmap::new_zeroed(length));373unsafe {374Self::new_unchecked(375dtype,376Buffer::zeroed(length),377Buffer::new(),378validity,379Some(0),3800,381)382}383}384385/// Returns the element at index `i`386/// # Panics387/// iff `i >= self.len()`388#[inline]389pub fn value(&self, i: usize) -> &T {390assert!(i < self.len());391unsafe { self.value_unchecked(i) }392}393394/// Returns the element at index `i`395///396/// # Safety397/// Assumes that the `i < self.len`.398#[inline]399pub unsafe fn value_unchecked(&self, i: usize) -> &T {400let v = self.views.get_unchecked(i);401T::from_bytes_unchecked(v.get_slice_unchecked(&self.buffers))402}403404/// Returns the element at index `i`, or None if it is null.405/// # Panics406/// iff `i >= self.len()`407#[inline]408pub fn get(&self, i: usize) -> Option<&T> {409assert!(i < self.len());410unsafe { self.get_unchecked(i) }411}412413/// Returns the element at index `i`, or None if it is null.414///415/// # Safety416/// Assumes that the `i < self.len`.417#[inline]418pub unsafe fn get_unchecked(&self, i: usize) -> Option<&T> {419if self420.validity421.as_ref()422.is_none_or(|v| v.get_bit_unchecked(i))423{424let v = self.views.get_unchecked(i);425Some(T::from_bytes_unchecked(426v.get_slice_unchecked(&self.buffers),427))428} else {429None430}431}432433/// Returns an iterator of `Option<&T>` over every element of this array.434pub fn iter(&self) -> ZipValidity<&T, BinaryViewValueIter<'_, T>, BitmapIter<'_>> {435ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())436}437438/// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity439pub fn values_iter(&self) -> BinaryViewValueIter<'_, T> {440BinaryViewValueIter::new(self)441}442443pub fn len_iter(&self) -> impl Iterator<Item = u32> + '_ {444self.views.iter().map(|v| v.length)445}446447/// Returns an iterator of the non-null values.448pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryViewArrayGeneric<T>> {449NonNullValuesIter::new(self, self.validity())450}451452/// Returns an iterator of the non-null values.453pub fn non_null_views_iter(&self) -> NonNullValuesIter<'_, Buffer<View>> {454NonNullValuesIter::new(self.views(), self.validity())455}456457impl_sliced!();458impl_into_array!();459460/// Returns this array with a new validity.461/// # Panic462/// Panics iff `validity.len() != self.len()`.463#[must_use]464#[inline]465pub fn with_validity(mut self, validity: Option<Bitmap>) -> Self {466self.set_validity(validity);467self468}469470/// Sets the validity of this array.471/// # Panics472/// This function panics iff `values.len() != self.len()`.473#[inline]474pub fn set_validity(&mut self, validity: Option<Bitmap>) {475if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) {476panic!("validity must be equal to the array's length")477}478self.total_bytes_len.store(UNKNOWN_LEN);479self.validity = validity;480}481482/// Takes the validity of this array, leaving it without a validity mask.483#[inline]484pub fn take_validity(&mut self) -> Option<Bitmap> {485self.total_bytes_len.store(UNKNOWN_LEN);486self.validity.take()487}488489pub fn from_slice<S: AsRef<T>, P: AsRef<[Option<S>]>>(slice: P) -> Self {490let mutable = MutableBinaryViewArray::from_iterator(491slice.as_ref().iter().map(|opt_v| opt_v.as_ref()),492);493mutable.into()494}495496pub fn from_slice_values<S: AsRef<T>, P: AsRef<[S]>>(slice: P) -> Self {497let mutable =498MutableBinaryViewArray::from_values_iter(slice.as_ref().iter().map(|v| v.as_ref()));499mutable.into()500}501502/// Get the total length of bytes that it would take to concatenate all binary/str values in this array.503pub fn total_bytes_len(&self) -> usize {504let total = self.total_bytes_len.load();505if total == UNKNOWN_LEN {506let total = ZipValidity::new_with_validity(self.len_iter(), self.validity.as_ref())507.map(|v| v.unwrap_or(0) as usize)508.sum::<usize>();509self.total_bytes_len.store(total as u64);510total511} else {512total as usize513}514}515516/// Like total_bytes_len() but if unavailable will not force a computation.517pub fn try_total_bytes_len(&self) -> Option<usize> {518let b = self.total_bytes_len.load();519(b != UNKNOWN_LEN).then_some(b as usize)520}521522/// Get the length of bytes that are stored in the variadic buffers.523pub fn total_buffer_len(&self) -> usize {524self.total_buffer_len525}526527fn total_unshared_buffer_len(&self) -> usize {528// XXX: it is O(n), not O(1).529// Given this function is only called in `maybe_gc()`,530// it may not be worthy to add an extra field for this.531self.buffers532.iter()533.map(|buf| {534if buf.storage_refcount() > 1 {5350536} else {537buf.len()538}539})540.sum()541}542543#[inline(always)]544pub fn len(&self) -> usize {545self.views.len()546}547548/// Garbage collect549pub fn gc(self) -> Self {550if self.buffers.is_empty() {551return self;552}553let mut mutable = MutableBinaryViewArray::with_capacity(self.len());554let buffers = self.buffers.as_ref();555556for view in self.views.as_ref() {557unsafe { mutable.push_view_unchecked(*view, buffers) }558}559mutable.freeze().with_validity(self.validity)560}561562pub fn deshare(&self) -> Self {563if self.buffers.storage_refcount() == 1564&& self.buffers.iter().all(|b| b.storage_refcount() == 1)565{566return self.clone();567}568self.clone().gc()569}570571pub fn is_sliced(&self) -> bool {572!std::ptr::eq(self.views.as_ptr(), self.views.storage_ptr())573}574575pub fn maybe_gc(self) -> Self {576const GC_MINIMUM_SAVINGS: usize = 16 * 1024; // At least 16 KiB.577578if self.total_buffer_len <= GC_MINIMUM_SAVINGS {579return self;580}581582if self.buffers.storage_refcount() != 1 {583// There are multiple holders of this `buffers`.584// If we allow gc in this case,585// it may end up copying the same content multiple times.586return self;587}588589// Subtract the maximum amount of inlined strings to get a lower bound590// on the number of buffer bytes needed (assuming no dedup).591let total_bytes_len = self.total_bytes_len();592let buffer_req_lower_bound = total_bytes_len.saturating_sub(self.len() * 12);593594let lower_bound_mem_usage_post_gc = self.len() * 16 + buffer_req_lower_bound;595// Use unshared buffer len. Shared buffer won't be freed; no savings.596let cur_mem_usage = self.len() * 16 + self.total_unshared_buffer_len();597let savings_upper_bound = cur_mem_usage.saturating_sub(lower_bound_mem_usage_post_gc);598599if savings_upper_bound >= GC_MINIMUM_SAVINGS600&& cur_mem_usage >= 4 * lower_bound_mem_usage_post_gc601{602self.gc()603} else {604self605}606}607608pub fn make_mut(self) -> MutableBinaryViewArray<T> {609let views = self.views.to_vec();610let completed_buffers = self.buffers.to_vec();611let validity = self.validity.map(|bitmap| bitmap.make_mut());612613// We need to know the total_bytes_len if we are going to mutate it.614let mut total_bytes_len = self.total_bytes_len.load();615if total_bytes_len == UNKNOWN_LEN {616total_bytes_len = views.iter().map(|view| view.length as u64).sum();617}618let total_bytes_len = total_bytes_len as usize;619620MutableBinaryViewArray {621views,622completed_buffers,623in_progress_buffer: vec![],624validity,625phantom: Default::default(),626total_bytes_len,627total_buffer_len: self.total_buffer_len,628stolen_buffers: PlHashMap::new(),629}630}631}632633impl BinaryViewArray {634/// Validate the underlying bytes on UTF-8.635pub fn validate_utf8(&self) -> PolarsResult<()> {636// SAFETY: views are correct637unsafe { validate_views_utf8_only(&self.views, &self.buffers, 0) }638}639640/// Convert [`BinaryViewArray`] to [`Utf8ViewArray`].641pub fn to_utf8view(&self) -> PolarsResult<Utf8ViewArray> {642self.validate_utf8()?;643unsafe { Ok(self.to_utf8view_unchecked()) }644}645646/// Convert [`BinaryViewArray`] to [`Utf8ViewArray`] without checking UTF-8.647///648/// # Safety649/// The caller must ensure the underlying data is valid UTF-8.650pub unsafe fn to_utf8view_unchecked(&self) -> Utf8ViewArray {651Utf8ViewArray::new_unchecked(652ArrowDataType::Utf8View,653self.views.clone(),654self.buffers.clone(),655self.validity.clone(),656self.try_total_bytes_len(),657self.total_buffer_len,658)659}660}661662impl Utf8ViewArray {663pub fn to_binview(&self) -> BinaryViewArray {664// SAFETY: same invariants.665unsafe {666BinaryViewArray::new_unchecked(667ArrowDataType::BinaryView,668self.views.clone(),669self.buffers.clone(),670self.validity.clone(),671self.try_total_bytes_len(),672self.total_buffer_len,673)674}675}676}677678impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {679fn as_any(&self) -> &dyn Any {680self681}682683fn as_any_mut(&mut self) -> &mut dyn Any {684self685}686687#[inline(always)]688fn len(&self) -> usize {689BinaryViewArrayGeneric::len(self)690}691692#[inline(always)]693fn dtype(&self) -> &ArrowDataType {694&self.dtype695}696697#[inline(always)]698fn dtype_mut(&mut self) -> &mut ArrowDataType {699&mut self.dtype700}701702fn validity(&self) -> Option<&Bitmap> {703self.validity.as_ref()704}705706fn split_at_boxed(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {707let (lhs, rhs) = Splitable::split_at(self, offset);708(Box::new(lhs), Box::new(rhs))709}710711unsafe fn split_at_boxed_unchecked(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {712let (lhs, rhs) = unsafe { Splitable::split_at_unchecked(self, offset) };713(Box::new(lhs), Box::new(rhs))714}715716fn slice(&mut self, offset: usize, length: usize) {717assert!(718offset + length <= self.len(),719"the offset of the new Buffer cannot exceed the existing length"720);721unsafe { self.slice_unchecked(offset, length) }722}723724unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {725debug_assert!(offset + length <= self.len());726self.validity = self727.validity728.take()729.map(|bitmap| bitmap.sliced_unchecked(offset, length))730.filter(|bitmap| bitmap.unset_bits() > 0);731self.views.slice_in_place_unchecked(offset..offset + length);732self.total_bytes_len.store(UNKNOWN_LEN)733}734735fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {736debug_assert!(737validity.as_ref().is_none_or(|v| v.len() == self.len()),738"{} != {}",739validity.as_ref().unwrap().len(),740self.len()741);742743let mut new = self.clone();744new.validity = validity;745Box::new(new)746}747748fn to_boxed(&self) -> Box<dyn Array> {749Box::new(self.clone())750}751}752753impl<T: ViewType + ?Sized> Splitable for BinaryViewArrayGeneric<T> {754fn check_bound(&self, offset: usize) -> bool {755offset <= self.len()756}757758unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {759let (lhs_views, rhs_views) = unsafe { self.views.split_at_unchecked(offset) };760let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };761762unsafe {763(764Self::new_unchecked(765self.dtype.clone(),766lhs_views,767self.buffers.clone(),768lhs_validity,769(offset == 0).then_some(0),770self.total_buffer_len(),771),772Self::new_unchecked(773self.dtype.clone(),774rhs_views,775self.buffers.clone(),776rhs_validity,777(offset == self.len()).then_some(0),778self.total_buffer_len(),779),780)781}782}783}784785786