Path: blob/main/crates/polars-arrow/src/array/binview/view.rs
8406 views
use std::cmp::Ordering;1use std::fmt::{self, Display, Formatter};23use bytemuck::{Pod, Zeroable};4use polars_error::*;5use polars_utils::min_max::MinMax;6use polars_utils::nulls::IsNull;7use polars_utils::total_ord::{TotalEq, TotalOrd};89use crate::datatypes::PrimitiveType;10use crate::types::{Bytes16Alignment4, NativeType};1112// We use this instead of u128 because we want alignment of <= 8 bytes.13/// A reference to a set of bytes.14///15/// If `length <= 12`, these bytes are inlined over the `prefix`, `buffer_idx` and `offset` fields.16/// If `length > 12`, these fields specify a slice of a buffer.17#[derive(Copy, Clone, Default)]18#[repr(C)]19pub struct View {20/// The length of the string/bytes.21pub length: u32,22/// First 4 bytes of string/bytes data.23pub prefix: u32,24/// The buffer index.25pub buffer_idx: u32,26/// The offset into the buffer.27pub offset: u32,28}2930impl fmt::Debug for View {31fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {32if self.length <= Self::MAX_INLINE_SIZE {33fmt.debug_struct("View")34.field("length", &self.length)35.field("content", &unsafe {36std::slice::from_raw_parts(37(self as *const _ as *const u8).add(4),38self.length as usize,39)40})41.finish()42} else {43fmt.debug_struct("View")44.field("length", &self.length)45.field("prefix", &self.prefix.to_be_bytes())46.field("buffer_idx", &self.buffer_idx)47.field("offset", &self.offset)48.finish()49}50}51}5253impl View {54pub const MAX_INLINE_SIZE: u32 = 12;5556#[inline(always)]57pub fn is_inline(&self) -> bool {58self.length <= Self::MAX_INLINE_SIZE59}6061#[inline(always)]62pub fn as_u128(self) -> u128 {63unsafe { std::mem::transmute(self) }64}6566/// Create a new inline view without verifying the length67///68/// # Safety69///70/// It needs to hold that `bytes.len() <= View::MAX_INLINE_SIZE`.71#[inline]72pub unsafe fn new_inline_unchecked(bytes: &[u8]) -> Self {73debug_assert!(bytes.len() <= u32::MAX as usize);74debug_assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);7576let mut view = Self {77length: bytes.len() as u32,78..Default::default()79};8081let view_ptr = &mut view as *mut _ as *mut u8;8283// SAFETY:84// - bytes length <= 12,85// - size_of::<View> == 1686// - View is laid out as [length, prefix, buffer_idx, offset] (using repr(C))87// - By grabbing the view_ptr and adding 4, we have provenance over prefix, buffer_idx and88// offset. (i.e. the same could not be achieved with &mut self.prefix as *mut _ as *mut u8)89unsafe {90let inline_data_ptr = view_ptr.add(4);91core::ptr::copy_nonoverlapping(bytes.as_ptr(), inline_data_ptr, bytes.len());92}93view94}9596/// Create a new inline view97///98/// # Panics99///100/// Panics if the `bytes.len() > View::MAX_INLINE_SIZE`.101#[inline]102pub fn new_inline(bytes: &[u8]) -> Self {103assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);104unsafe { Self::new_inline_unchecked(bytes) }105}106107/// Create a new inline view108///109/// # Safety110///111/// It needs to hold that `bytes.len() > View::MAX_INLINE_SIZE`.112#[inline]113pub unsafe fn new_noninline_unchecked(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {114debug_assert!(bytes.len() <= u32::MAX as usize);115debug_assert!(bytes.len() as u32 > View::MAX_INLINE_SIZE);116117// SAFETY: The invariant of this function guarantees that this is safe.118let prefix = unsafe { u32::from_le_bytes(bytes[0..4].try_into().unwrap_unchecked()) };119Self {120length: bytes.len() as u32,121prefix,122buffer_idx,123offset,124}125}126127#[inline]128pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {129assert!(bytes.len() <= u32::MAX as usize);130131// SAFETY: We verify the invariant with the outer if statement132unsafe {133if bytes.len() as u32 <= Self::MAX_INLINE_SIZE {134Self::new_inline_unchecked(bytes)135} else {136Self::new_noninline_unchecked(bytes, buffer_idx, offset)137}138}139}140141#[inline]142pub fn new_with_buffers(143bytes: &[u8],144buffer_idx_offset: u32,145buffers: &mut Vec<Vec<u8>>,146) -> Self {147unsafe {148if bytes.len() <= Self::MAX_INLINE_SIZE as usize {149Self::new_inline_unchecked(bytes)150} else {151assert!(bytes.len() <= u32::MAX as usize);152let num_buffers = buffers.len();153if let Some(buf) = buffers.last_mut() {154if buf.len() + bytes.len() <= buf.capacity() {155let buf_idx = buffer_idx_offset + num_buffers as u32 - 1;156let offset = buf.len() as u32;157buf.extend_from_slice(bytes);158return Self::new_noninline_unchecked(bytes, buf_idx, offset);159}160}161162Self::new_with_buffers_slow(bytes, buffer_idx_offset, buffers)163}164}165}166167/// # Safety168///169/// It needs to hold that `bytes.len() > View::MAX_INLINE_SIZE`.170#[cold]171unsafe fn new_with_buffers_slow(172bytes: &[u8],173buffer_idx_offset: u32,174buffers: &mut Vec<Vec<u8>>,175) -> Self {176// Resize last buffer instead of making new buffer.177const RESIZE_LIMIT: usize = 8192;178179if buffers.is_empty() {180buffers.push(bytes.to_vec());181return View::new_noninline_unchecked(bytes, buffer_idx_offset, 0);182}183184let num_buffers = buffers.len();185let old_cap = buffers.last().unwrap().capacity();186let new_cap = (old_cap + old_cap).clamp(bytes.len(), u32::MAX as usize);187if new_cap <= RESIZE_LIMIT {188let buffer_idx = buffer_idx_offset + num_buffers as u32 - 1;189let buf = buffers.last_mut().unwrap();190let offset = buf.len() as u32;191buf.reserve(new_cap - buf.len());192buf.extend_from_slice(bytes);193View::new_noninline_unchecked(bytes, buffer_idx, offset)194} else {195let buffer_idx = buffer_idx_offset + num_buffers as u32;196let mut buf = Vec::with_capacity(new_cap);197buf.extend_from_slice(bytes);198buffers.push(buf);199View::new_noninline_unchecked(bytes, buffer_idx, 0)200}201}202203/// Constructs a byteslice from this view.204///205/// # Safety206/// Assumes that this view is valid for the given buffers.207#[inline]208pub unsafe fn get_slice_unchecked<'a, B: AsRef<[u8]>>(&'a self, buffers: &'a [B]) -> &'a [u8] {209unsafe {210if self.length <= Self::MAX_INLINE_SIZE {211self.get_inlined_slice_unchecked()212} else {213self.get_external_slice_unchecked(buffers)214}215}216}217218/// Construct a byte slice from an inline view, if it is inline.219#[inline]220pub fn get_inlined_slice(&self) -> Option<&[u8]> {221if self.length <= Self::MAX_INLINE_SIZE {222unsafe { Some(self.get_inlined_slice_unchecked()) }223} else {224None225}226}227228/// Construct a byte slice from an inline view.229///230/// # Safety231/// Assumes that this view is inlinable.232#[inline]233pub unsafe fn get_inlined_slice_unchecked(&self) -> &[u8] {234debug_assert!(self.length <= View::MAX_INLINE_SIZE);235let ptr = self as *const View as *const u8;236unsafe { std::slice::from_raw_parts(ptr.add(4), self.length as usize) }237}238239/// Construct a byte slice from an external view.240///241/// # Safety242/// Assumes that this view is in the external buffers.243#[inline]244pub unsafe fn get_external_slice_unchecked<'a, B: AsRef<[u8]>>(245&self,246buffers: &'a [B],247) -> &'a [u8] {248debug_assert!(self.length > View::MAX_INLINE_SIZE);249let data = buffers.get_unchecked(self.buffer_idx as usize);250let offset = self.offset as usize;251data.as_ref()252.get_unchecked(offset..offset + self.length as usize)253}254255/// Extend a `Vec<View>` with inline views slices of `src` with `width`.256///257/// This tries to use SIMD to optimize the copying and can be massively faster than doing a258/// `views.extend(src.chunks_exact(width).map(View::new_inline))`.259///260/// # Panics261///262/// This function panics if `src.len()` is not divisible by `width`, `width >263/// View::MAX_INLINE_SIZE` or `width == 0`.264pub fn extend_with_inlinable_strided(views: &mut Vec<Self>, src: &[u8], width: u8) {265macro_rules! dispatch {266($n:ident = $match:ident in [$($v:literal),+ $(,)?] => $block:block, otherwise = $otherwise:expr) => {267match $match {268$(269$v => {270const $n: usize = $v;271272$block273}274)+275_ => $otherwise,276}277}278}279280let width = width as usize;281282assert!(width > 0);283assert!(width <= View::MAX_INLINE_SIZE as usize);284285assert_eq!(src.len() % width, 0);286287let num_values = src.len() / width;288289views.reserve(num_values);290291#[allow(unused_mut)]292let mut src = src;293294dispatch! {295N = width in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] => {296#[cfg(feature = "simd")]297{298macro_rules! repeat_with {299($i:ident = [$($v:literal),+ $(,)?] => $block:block) => {300$({301const $i: usize = $v;302303$block304})+305}306}307308use std::simd::*;309310// SAFETY: This is always allowed, since views.len() is always in the Vec311// buffer.312let mut dst = unsafe { views.as_mut_ptr().add(views.len()).cast::<u8>() };313314let length_mask = u8x16::from_array([N as u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);315316const BLOCKS_PER_LOAD: usize = 16 / N;317const BYTES_PER_LOOP: usize = N * BLOCKS_PER_LOAD;318319let num_loops = (src.len() / BYTES_PER_LOOP).saturating_sub(1);320321for _ in 0..num_loops {322// SAFETY: The num_loops calculates how many times we can do this.323let loaded = u8x16::from_array(unsafe {324src.get_unchecked(..16).try_into().unwrap()325});326src = unsafe { src.get_unchecked(BYTES_PER_LOOP..) };327328// This way we can reuse the same load for multiple views.329repeat_with!(330I = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] => {331if I < BLOCKS_PER_LOAD {332let zero = u8x16::default();333const SWIZZLE: [usize; 16] = const {334let mut swizzle = [16usize; 16];335336let mut i = 0;337while i < N {338let idx = i + I * N;339if idx < 16 {340swizzle[4+i] = idx;341}342i += 1;343}344345swizzle346};347348let scattered = simd_swizzle!(loaded, zero, SWIZZLE);349let view_bytes = (scattered | length_mask).to_array();350351// SAFETY: dst has the capacity reserved and view_bytes is 16352// bytes long.353unsafe {354core::ptr::copy_nonoverlapping(view_bytes.as_ptr(), dst, 16);355dst = dst.add(16);356}357}358}359);360}361362unsafe {363views.set_len(views.len() + num_loops * BLOCKS_PER_LOAD);364}365}366367views.extend(src.chunks_exact(N).map(|slice| unsafe {368View::new_inline_unchecked(slice)369}));370},371otherwise = unreachable!()372}373}374}375376impl IsNull for View {377const HAS_NULLS: bool = false;378type Inner = Self;379380fn is_null(&self) -> bool {381false382}383384fn unwrap_inner(self) -> Self::Inner {385self386}387}388389impl Display for View {390fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {391write!(f, "{self:?}")392}393}394395unsafe impl Zeroable for View {}396397unsafe impl Pod for View {}398399impl PartialEq for View {400fn eq(&self, other: &Self) -> bool {401self.as_u128() == other.as_u128()402}403}404405// These are 'implemented' because we want to implement NativeType406// for View, that should probably not be done.407impl TotalOrd for View {408fn tot_cmp(&self, _other: &Self) -> Ordering {409unimplemented!()410}411}412413impl TotalEq for View {414fn tot_eq(&self, other: &Self) -> bool {415self.eq(other)416}417}418419impl MinMax for View {420fn nan_min_lt(&self, _other: &Self) -> bool {421unimplemented!()422}423424fn nan_max_lt(&self, _other: &Self) -> bool {425unimplemented!()426}427}428429impl NativeType for View {430const PRIMITIVE: PrimitiveType = PrimitiveType::UInt128;431432type Bytes = [u8; 16];433type AlignedBytes = Bytes16Alignment4;434435#[inline]436fn to_le_bytes(&self) -> Self::Bytes {437self.as_u128().to_le_bytes()438}439440#[inline]441fn to_be_bytes(&self) -> Self::Bytes {442self.as_u128().to_be_bytes()443}444445#[inline]446fn from_le_bytes(bytes: Self::Bytes) -> Self {447Self::from(u128::from_le_bytes(bytes))448}449450#[inline]451fn from_be_bytes(bytes: Self::Bytes) -> Self {452Self::from(u128::from_be_bytes(bytes))453}454}455456impl From<u128> for View {457#[inline]458fn from(value: u128) -> Self {459unsafe { std::mem::transmute(value) }460}461}462463impl From<View> for u128 {464#[inline]465fn from(value: View) -> Self {466value.as_u128()467}468}469470pub fn validate_views<B: AsRef<[u8]>, F>(471views: &[View],472buffers: &[B],473validate_bytes: F,474) -> PolarsResult<()>475where476F: Fn(&[u8]) -> PolarsResult<()>,477{478for view in views {479if let Some(inline_slice) = view.get_inlined_slice() {480if view.length < View::MAX_INLINE_SIZE && view.as_u128() >> (32 + view.length * 8) != 0481{482polars_bail!(ComputeError: "view contained non-zero padding in prefix");483}484485validate_bytes(inline_slice)?;486} else {487let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| {488polars_err!(OutOfBounds: "view index out of bounds\n\nGot: {} buffers and index: {}", buffers.len(), view.buffer_idx)489})?;490491let start = view.offset as usize;492let end = start + view.length as usize;493let b = data494.as_ref()495.get(start..end)496.ok_or_else(|| polars_err!(OutOfBounds: "buffer slice out of bounds"))?;497498polars_ensure!(b.starts_with(&view.prefix.to_le_bytes()), ComputeError: "prefix does not match string data");499validate_bytes(b)?;500}501}502503Ok(())504}505506pub fn validate_binary_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {507validate_views(views, buffers, |_| Ok(()))508}509510fn validate_utf8(b: &[u8]) -> PolarsResult<()> {511match simdutf8::basic::from_utf8(b) {512Ok(_) => Ok(()),513Err(_) => Err(polars_err!(ComputeError: "invalid utf8")),514}515}516517pub fn validate_utf8_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {518validate_views(views, buffers, validate_utf8)519}520521/// Checks the views for valid UTF-8. Assumes the first num_trusted_buffers are522/// valid UTF-8 without checking.523/// # Safety524/// The views and buffers must uphold the invariants of BinaryView otherwise we will go OOB.525pub unsafe fn validate_views_utf8_only<B: AsRef<[u8]>>(526views: &[View],527buffers: &[B],528mut num_trusted_buffers: usize,529) -> PolarsResult<()> {530unsafe {531while num_trusted_buffers < buffers.len()532&& buffers[num_trusted_buffers].as_ref().is_ascii()533{534num_trusted_buffers += 1;535}536537// Fast path if all buffers are ASCII (or there are no buffers).538if num_trusted_buffers >= buffers.len() {539for view in views {540if let Some(inlined_slice) = view.get_inlined_slice() {541validate_utf8(inlined_slice)?;542}543}544} else {545for view in views {546if view.length <= View::MAX_INLINE_SIZE547|| view.buffer_idx as usize >= num_trusted_buffers548{549validate_utf8(view.get_slice_unchecked(buffers))?;550}551}552}553554Ok(())555}556}557558559