Path: blob/main/crates/polars-arrow/src/array/binview/view.rs
6939 views
use std::cmp::Ordering;1use std::fmt::{self, Display, Formatter};23use bytemuck::{Pod, Zeroable};4use polars_error::*;5use polars_utils::min_max::MinMax;6use polars_utils::nulls::IsNull;7use polars_utils::total_ord::{TotalEq, TotalOrd};89use crate::datatypes::PrimitiveType;10use crate::types::{Bytes16Alignment4, NativeType};1112// We use this instead of u128 because we want alignment of <= 8 bytes.13/// A reference to a set of bytes.14///15/// If `length <= 12`, these bytes are inlined over the `prefix`, `buffer_idx` and `offset` fields.16/// If `length > 12`, these fields specify a slice of a buffer.17#[derive(Copy, Clone, Default)]18#[repr(C)]19pub struct View {20/// The length of the string/bytes.21pub length: u32,22/// First 4 bytes of string/bytes data.23pub prefix: u32,24/// The buffer index.25pub buffer_idx: u32,26/// The offset into the buffer.27pub offset: u32,28}2930impl fmt::Debug for View {31fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {32if self.length <= Self::MAX_INLINE_SIZE {33fmt.debug_struct("View")34.field("length", &self.length)35.field("content", &unsafe {36std::slice::from_raw_parts(37(self as *const _ as *const u8).add(4),38self.length as usize,39)40})41.finish()42} else {43fmt.debug_struct("View")44.field("length", &self.length)45.field("prefix", &self.prefix.to_be_bytes())46.field("buffer_idx", &self.buffer_idx)47.field("offset", &self.offset)48.finish()49}50}51}5253impl View {54pub const MAX_INLINE_SIZE: u32 = 12;5556#[inline(always)]57pub fn is_inline(&self) -> bool {58self.length <= Self::MAX_INLINE_SIZE59}6061#[inline(always)]62pub fn as_u128(self) -> u128 {63unsafe { std::mem::transmute(self) }64}6566/// Create a new inline view without verifying the length67///68/// # Safety69///70/// It needs to hold that `bytes.len() <= View::MAX_INLINE_SIZE`.71#[inline]72pub unsafe fn new_inline_unchecked(bytes: &[u8]) -> Self {73debug_assert!(bytes.len() <= u32::MAX as usize);74debug_assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);7576let mut view = Self {77length: bytes.len() as u32,78..Default::default()79};8081let view_ptr = &mut view as *mut _ as *mut u8;8283// SAFETY:84// - bytes length <= 12,85// - size_of::<View> == 1686// - View is laid out as [length, prefix, buffer_idx, offset] (using repr(C))87// - By grabbing the view_ptr and adding 4, we have provenance over prefix, buffer_idx and88// offset. (i.e. the same could not be achieved with &mut self.prefix as *mut _ as *mut u8)89unsafe {90let inline_data_ptr = view_ptr.add(4);91core::ptr::copy_nonoverlapping(bytes.as_ptr(), inline_data_ptr, bytes.len());92}93view94}9596/// Create a new inline view97///98/// # Panics99///100/// Panics if the `bytes.len() > View::MAX_INLINE_SIZE`.101#[inline]102pub fn new_inline(bytes: &[u8]) -> Self {103assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);104unsafe { Self::new_inline_unchecked(bytes) }105}106107/// Create a new inline view108///109/// # Safety110///111/// It needs to hold that `bytes.len() > View::MAX_INLINE_SIZE`.112#[inline]113pub unsafe fn new_noninline_unchecked(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {114debug_assert!(bytes.len() <= u32::MAX as usize);115debug_assert!(bytes.len() as u32 > View::MAX_INLINE_SIZE);116117// SAFETY: The invariant of this function guarantees that this is safe.118let prefix = unsafe { u32::from_le_bytes(bytes[0..4].try_into().unwrap_unchecked()) };119Self {120length: bytes.len() as u32,121prefix,122buffer_idx,123offset,124}125}126127#[inline]128pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {129debug_assert!(bytes.len() <= u32::MAX as usize);130131// SAFETY: We verify the invariant with the outer if statement132unsafe {133if bytes.len() as u32 <= Self::MAX_INLINE_SIZE {134Self::new_inline_unchecked(bytes)135} else {136Self::new_noninline_unchecked(bytes, buffer_idx, offset)137}138}139}140141/// Constructs a byteslice from this view.142///143/// # Safety144/// Assumes that this view is valid for the given buffers.145#[inline]146pub unsafe fn get_slice_unchecked<'a, B: AsRef<[u8]>>(&'a self, buffers: &'a [B]) -> &'a [u8] {147unsafe {148if self.length <= Self::MAX_INLINE_SIZE {149self.get_inlined_slice_unchecked()150} else {151self.get_external_slice_unchecked(buffers)152}153}154}155156/// Construct a byte slice from an inline view, if it is inline.157#[inline]158pub fn get_inlined_slice(&self) -> Option<&[u8]> {159if self.length <= Self::MAX_INLINE_SIZE {160unsafe { Some(self.get_inlined_slice_unchecked()) }161} else {162None163}164}165166/// Construct a byte slice from an inline view.167///168/// # Safety169/// Assumes that this view is inlinable.170#[inline]171pub unsafe fn get_inlined_slice_unchecked(&self) -> &[u8] {172debug_assert!(self.length <= View::MAX_INLINE_SIZE);173let ptr = self as *const View as *const u8;174unsafe { std::slice::from_raw_parts(ptr.add(4), self.length as usize) }175}176177/// Construct a byte slice from an external view.178///179/// # Safety180/// Assumes that this view is in the external buffers.181#[inline]182pub unsafe fn get_external_slice_unchecked<'a, B: AsRef<[u8]>>(183&self,184buffers: &'a [B],185) -> &'a [u8] {186debug_assert!(self.length > View::MAX_INLINE_SIZE);187let data = buffers.get_unchecked(self.buffer_idx as usize);188let offset = self.offset as usize;189data.as_ref()190.get_unchecked(offset..offset + self.length as usize)191}192193/// Extend a `Vec<View>` with inline views slices of `src` with `width`.194///195/// This tries to use SIMD to optimize the copying and can be massively faster than doing a196/// `views.extend(src.chunks_exact(width).map(View::new_inline))`.197///198/// # Panics199///200/// This function panics if `src.len()` is not divisible by `width`, `width >201/// View::MAX_INLINE_SIZE` or `width == 0`.202pub fn extend_with_inlinable_strided(views: &mut Vec<Self>, src: &[u8], width: u8) {203macro_rules! dispatch {204($n:ident = $match:ident in [$($v:literal),+ $(,)?] => $block:block, otherwise = $otherwise:expr) => {205match $match {206$(207$v => {208const $n: usize = $v;209210$block211}212)+213_ => $otherwise,214}215}216}217218let width = width as usize;219220assert!(width > 0);221assert!(width <= View::MAX_INLINE_SIZE as usize);222223assert_eq!(src.len() % width, 0);224225let num_values = src.len() / width;226227views.reserve(num_values);228229#[allow(unused_mut)]230let mut src = src;231232dispatch! {233N = width in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] => {234#[cfg(feature = "simd")]235{236macro_rules! repeat_with {237($i:ident = [$($v:literal),+ $(,)?] => $block:block) => {238$({239const $i: usize = $v;240241$block242})+243}244}245246use std::simd::*;247248// SAFETY: This is always allowed, since views.len() is always in the Vec249// buffer.250let mut dst = unsafe { views.as_mut_ptr().add(views.len()).cast::<u8>() };251252let length_mask = u8x16::from_array([N as u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);253254const BLOCKS_PER_LOAD: usize = 16 / N;255const BYTES_PER_LOOP: usize = N * BLOCKS_PER_LOAD;256257let num_loops = (src.len() / BYTES_PER_LOOP).saturating_sub(1);258259for _ in 0..num_loops {260// SAFETY: The num_loops calculates how many times we can do this.261let loaded = u8x16::from_array(unsafe {262src.get_unchecked(..16).try_into().unwrap()263});264src = unsafe { src.get_unchecked(BYTES_PER_LOOP..) };265266// This way we can reuse the same load for multiple views.267repeat_with!(268I = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] => {269if I < BLOCKS_PER_LOAD {270let zero = u8x16::default();271const SWIZZLE: [usize; 16] = const {272let mut swizzle = [16usize; 16];273274let mut i = 0;275while i < N {276let idx = i + I * N;277if idx < 16 {278swizzle[4+i] = idx;279}280i += 1;281}282283swizzle284};285286let scattered = simd_swizzle!(loaded, zero, SWIZZLE);287let view_bytes = (scattered | length_mask).to_array();288289// SAFETY: dst has the capacity reserved and view_bytes is 16290// bytes long.291unsafe {292core::ptr::copy_nonoverlapping(view_bytes.as_ptr(), dst, 16);293dst = dst.add(16);294}295}296}297);298}299300unsafe {301views.set_len(views.len() + num_loops * BLOCKS_PER_LOAD);302}303}304305views.extend(src.chunks_exact(N).map(|slice| unsafe {306View::new_inline_unchecked(slice)307}));308},309otherwise = unreachable!()310}311}312}313314impl IsNull for View {315const HAS_NULLS: bool = false;316type Inner = Self;317318fn is_null(&self) -> bool {319false320}321322fn unwrap_inner(self) -> Self::Inner {323self324}325}326327impl Display for View {328fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {329write!(f, "{self:?}")330}331}332333unsafe impl Zeroable for View {}334335unsafe impl Pod for View {}336337impl PartialEq for View {338fn eq(&self, other: &Self) -> bool {339self.as_u128() == other.as_u128()340}341}342343// These are 'implemented' because we want to implement NativeType344// for View, that should probably not be done.345impl TotalOrd for View {346fn tot_cmp(&self, _other: &Self) -> Ordering {347unimplemented!()348}349}350351impl TotalEq for View {352fn tot_eq(&self, other: &Self) -> bool {353self.eq(other)354}355}356357impl MinMax for View {358fn nan_min_lt(&self, _other: &Self) -> bool {359unimplemented!()360}361362fn nan_max_lt(&self, _other: &Self) -> bool {363unimplemented!()364}365}366367impl NativeType for View {368const PRIMITIVE: PrimitiveType = PrimitiveType::UInt128;369370type Bytes = [u8; 16];371type AlignedBytes = Bytes16Alignment4;372373#[inline]374fn to_le_bytes(&self) -> Self::Bytes {375self.as_u128().to_le_bytes()376}377378#[inline]379fn to_be_bytes(&self) -> Self::Bytes {380self.as_u128().to_be_bytes()381}382383#[inline]384fn from_le_bytes(bytes: Self::Bytes) -> Self {385Self::from(u128::from_le_bytes(bytes))386}387388#[inline]389fn from_be_bytes(bytes: Self::Bytes) -> Self {390Self::from(u128::from_be_bytes(bytes))391}392}393394impl From<u128> for View {395#[inline]396fn from(value: u128) -> Self {397unsafe { std::mem::transmute(value) }398}399}400401impl From<View> for u128 {402#[inline]403fn from(value: View) -> Self {404value.as_u128()405}406}407408pub fn validate_views<B: AsRef<[u8]>, F>(409views: &[View],410buffers: &[B],411validate_bytes: F,412) -> PolarsResult<()>413where414F: Fn(&[u8]) -> PolarsResult<()>,415{416for view in views {417if let Some(inline_slice) = view.get_inlined_slice() {418if view.length < View::MAX_INLINE_SIZE && view.as_u128() >> (32 + view.length * 8) != 0419{420polars_bail!(ComputeError: "view contained non-zero padding in prefix");421}422423validate_bytes(inline_slice)?;424} else {425let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| {426polars_err!(OutOfBounds: "view index out of bounds\n\nGot: {} buffers and index: {}", buffers.len(), view.buffer_idx)427})?;428429let start = view.offset as usize;430let end = start + view.length as usize;431let b = data432.as_ref()433.get(start..end)434.ok_or_else(|| polars_err!(OutOfBounds: "buffer slice out of bounds"))?;435436polars_ensure!(b.starts_with(&view.prefix.to_le_bytes()), ComputeError: "prefix does not match string data");437validate_bytes(b)?;438}439}440441Ok(())442}443444pub fn validate_binary_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {445validate_views(views, buffers, |_| Ok(()))446}447448fn validate_utf8(b: &[u8]) -> PolarsResult<()> {449match simdutf8::basic::from_utf8(b) {450Ok(_) => Ok(()),451Err(_) => Err(polars_err!(ComputeError: "invalid utf8")),452}453}454455pub fn validate_utf8_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {456validate_views(views, buffers, validate_utf8)457}458459/// Checks the views for valid UTF-8. Assumes the first num_trusted_buffers are460/// valid UTF-8 without checking.461/// # Safety462/// The views and buffers must uphold the invariants of BinaryView otherwise we will go OOB.463pub unsafe fn validate_views_utf8_only<B: AsRef<[u8]>>(464views: &[View],465buffers: &[B],466mut num_trusted_buffers: usize,467) -> PolarsResult<()> {468unsafe {469while num_trusted_buffers < buffers.len()470&& buffers[num_trusted_buffers].as_ref().is_ascii()471{472num_trusted_buffers += 1;473}474475// Fast path if all buffers are ASCII (or there are no buffers).476if num_trusted_buffers >= buffers.len() {477for view in views {478if let Some(inlined_slice) = view.get_inlined_slice() {479validate_utf8(inlined_slice)?;480}481}482} else {483for view in views {484if view.length <= View::MAX_INLINE_SIZE485|| view.buffer_idx as usize >= num_trusted_buffers486{487validate_utf8(view.get_slice_unchecked(buffers))?;488}489}490}491492Ok(())493}494}495496497