Path: blob/main/crates/polars-arrow/src/array/specification.rs
6939 views
use polars_error::{PolarsResult, polars_bail, polars_err};12use crate::array::DictionaryKey;3use crate::offset::{Offset, Offsets, OffsetsBuffer};45/// Helper trait to support `Offset` and `OffsetBuffer`6pub trait OffsetsContainer<O> {7fn last(&self) -> usize;8fn as_slice(&self) -> &[O];9}1011impl<O: Offset> OffsetsContainer<O> for OffsetsBuffer<O> {12#[inline]13fn last(&self) -> usize {14self.last().to_usize()15}1617#[inline]18fn as_slice(&self) -> &[O] {19self.buffer()20}21}2223impl<O: Offset> OffsetsContainer<O> for Offsets<O> {24#[inline]25fn last(&self) -> usize {26self.last().to_usize()27}2829#[inline]30fn as_slice(&self) -> &[O] {31self.as_slice()32}33}3435pub(crate) fn try_check_offsets_bounds<O: Offset>(36offsets: &[O],37values_len: usize,38) -> PolarsResult<()> {39if offsets.last().unwrap().to_usize() > values_len {40polars_bail!(ComputeError: "offsets must not exceed the values length")41} else {42Ok(())43}44}4546/// # Error47/// * any offset is larger or equal to `values_len`.48/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or49pub fn try_check_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> PolarsResult<()> {50if offsets.len() == 1 {51return Ok(());52}53assert!(offsets.len() > 1);54let end = offsets.last().unwrap().to_usize();55let start = offsets.first().unwrap().to_usize();5657try_check_offsets_bounds(offsets, values.len())?;58let values_range = &values[start..end];5960if values_range.is_ascii() {61Ok(())62} else {63simdutf8::basic::from_utf8(values_range)?;6465// offsets can be == values.len()66// find first offset from the end that is smaller67// Example:68// values.len() = 1069// offsets = [0, 5, 10, 10]70let last = offsets71.iter()72.enumerate()73.skip(1)74.rev()75.find_map(|(i, offset)| (offset.to_usize() < values.len()).then(|| i));7677let last = if let Some(last) = last {78// following the example: last = 1 (offset = 5)79last80} else {81// given `l = values.len()`, this branch is hit iff either:82// * `offsets = [0, l, l, ...]`, which was covered by `from_utf8(values)` above83// * `offsets = [0]`, which never happens because offsets.as_slice().len() == 1 is short-circuited above84return Ok(());85};8687// truncate to relevant offsets. Note: `=last` because last was computed skipping the first item88// following the example: starts = [0, 5]89let starts = unsafe { offsets.get_unchecked(..=last) };9091let mut any_invalid = false;92for start in starts {93let start = start.to_usize();9495// SAFETY: `try_check_offsets_bounds` just checked for bounds96let b = *unsafe { values.get_unchecked(start) };9798// A valid code-point iff it does not start with 0b10xxxxxx99// Bit-magic taken from `std::str::is_char_boundary`100any_invalid |= (b as i8) < -0x40;101}102if any_invalid {103polars_bail!(ComputeError: "non-valid char boundary detected")104}105Ok(())106}107}108109/// Check dictionary indexes without checking usize conversion.110/// # Safety111/// The caller must ensure that `K::as_usize` always succeeds.112pub(crate) unsafe fn check_indexes_unchecked<K: DictionaryKey>(113keys: &[K],114len: usize,115) -> PolarsResult<()> {116let mut invalid = false;117118// this loop is auto-vectorized119keys.iter().for_each(|k| invalid |= k.as_usize() > len);120121if invalid {122let key = keys.iter().map(|k| k.as_usize()).max().unwrap();123polars_bail!(ComputeError: "one of the dictionary keys is {key} but it must be < than the length of the dictionary values, which is {len}")124} else {125Ok(())126}127}128129pub fn check_indexes<K>(keys: &[K], len: usize) -> PolarsResult<()>130where131K: std::fmt::Debug + Copy + TryInto<usize>,132{133keys.iter().try_for_each(|key| {134let key: usize = (*key)135.try_into()136.map_err(|_| polars_err!(ComputeError: "The dictionary key must fit in a `usize`, but {key:?} does not")137)?;138if key >= len {139polars_bail!(ComputeError: "one of the dictionary keys is {key} but it must be < than the length of the dictionary values, which is {len}")140} else {141Ok(())142}143})144}145146#[cfg(test)]147mod tests {148use proptest::prelude::*;149150use super::*;151152pub(crate) fn binary_strategy() -> impl Strategy<Value = Vec<u8>> {153prop::collection::vec(any::<u8>(), 1..100)154}155156proptest! {157// a bit expensive, feel free to run it when changing the code above158// #![proptest_config(ProptestConfig::with_cases(100000))]159#[test]160#[cfg_attr(miri, ignore)] // miri and proptest do not work well161fn check_utf8_validation(values in binary_strategy()) {162163for offset in 0..values.len() - 1 {164let offsets: OffsetsBuffer<i32> = vec![0, offset as i32, values.len() as i32].try_into().unwrap();165166let mut is_valid = std::str::from_utf8(&values[..offset]).is_ok();167is_valid &= std::str::from_utf8(&values[offset..]).is_ok();168169assert_eq!(try_check_utf8::<i32>(&offsets, &values).is_ok(), is_valid)170}171}172}173}174175176