// Path: blob/main/crates/polars-compute/src/cast/binary_to.rs
// 6939 views
use std::sync::Arc;12use arrow::array::*;3use arrow::buffer::Buffer;4use arrow::datatypes::ArrowDataType;5use arrow::offset::{Offset, Offsets};6use arrow::types::NativeType;7use polars_error::PolarsResult;89use super::CastOptionsImpl;1011pub(super) trait Parse {12fn parse(val: &[u8]) -> Option<Self>13where14Self: Sized;15}1617macro_rules! impl_parse {18($primitive_type:ident) => {19impl Parse for $primitive_type {20fn parse(val: &[u8]) -> Option<Self> {21atoi_simd::parse_skipped(val).ok()22}23}24};25}26impl_parse!(i8);27impl_parse!(i16);28impl_parse!(i32);29impl_parse!(i64);3031impl_parse!(u8);32impl_parse!(u16);33impl_parse!(u32);34impl_parse!(u64);3536#[cfg(feature = "dtype-i128")]37impl_parse!(i128);3839impl Parse for f32 {40fn parse(val: &[u8]) -> Option<Self>41where42Self: Sized,43{44fast_float2::parse(val).ok()45}46}47impl Parse for f64 {48fn parse(val: &[u8]) -> Option<Self>49where50Self: Sized,51{52fast_float2::parse(val).ok()53}54}5556/// Conversion of binary57pub fn binary_to_large_binary(58from: &BinaryArray<i32>,59to_dtype: ArrowDataType,60) -> BinaryArray<i64> {61let values = from.values().clone();62BinaryArray::<i64>::new(63to_dtype,64from.offsets().into(),65values,66from.validity().cloned(),67)68}6970/// Conversion of binary71pub fn binary_large_to_binary(72from: &BinaryArray<i64>,73to_dtype: ArrowDataType,74) -> PolarsResult<BinaryArray<i32>> {75let values = from.values().clone();76let offsets = from.offsets().try_into()?;77Ok(BinaryArray::<i32>::new(78to_dtype,79offsets,80values,81from.validity().cloned(),82))83}8485/// Conversion to utf886pub fn binary_to_utf8<O: Offset>(87from: &BinaryArray<O>,88to_dtype: ArrowDataType,89) -> PolarsResult<Utf8Array<O>> {90Utf8Array::<O>::try_new(91to_dtype,92from.offsets().clone(),93from.values().clone(),94from.validity().cloned(),95)96}9798/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null.99pub(super) fn binary_to_primitive<O: Offset, T>(100from: &BinaryArray<O>,101to: 
&ArrowDataType,102) -> PrimitiveArray<T>103where104T: NativeType + Parse,105{106let iter = from.iter().map(|x| x.and_then::<T, _>(|x| T::parse(x)));107108PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())109}110111pub(super) fn binary_to_primitive_dyn<O: Offset, T>(112from: &dyn Array,113to: &ArrowDataType,114options: CastOptionsImpl,115) -> PolarsResult<Box<dyn Array>>116where117T: NativeType + Parse,118{119let from = from.as_any().downcast_ref().unwrap();120if options.partial {121unimplemented!()122} else {123Ok(Box::new(binary_to_primitive::<O, T>(from, to)))124}125}126127/// Cast [`BinaryArray`] to [`DictionaryArray`], also known as packing.128/// # Errors129/// This function errors if the maximum key is smaller than the number of distinct elements130/// in the array.131pub fn binary_to_dictionary<O: Offset, K: DictionaryKey>(132from: &BinaryArray<O>,133) -> PolarsResult<DictionaryArray<K>> {134let mut array = MutableDictionaryArray::<K, MutableBinaryArray<O>>::new();135array.reserve(from.len());136array.try_extend(from.iter())?;137138Ok(array.into())139}140141pub(super) fn binary_to_dictionary_dyn<O: Offset, K: DictionaryKey>(142from: &dyn Array,143) -> PolarsResult<Box<dyn Array>> {144let values = from.as_any().downcast_ref().unwrap();145binary_to_dictionary::<O, K>(values).map(|x| Box::new(x) as Box<dyn Array>)146}147148fn fixed_size_to_offsets<O: Offset>(values_len: usize, fixed_size: usize) -> Offsets<O> {149let offsets = (0..(values_len + 1))150.step_by(fixed_size)151.map(|v| O::from_as_usize(v))152.collect();153// SAFETY:154// * every element is `>= 0`155// * element at position `i` is >= than element at position `i-1`.156unsafe { Offsets::new_unchecked(offsets) }157}158159/// Conversion of `FixedSizeBinary` to `Binary`.160pub fn fixed_size_binary_binary<O: Offset>(161from: &FixedSizeBinaryArray,162to_dtype: ArrowDataType,163) -> BinaryArray<O> {164let values = from.values().clone();165let offsets = fixed_size_to_offsets(values.len(), 
from.size());166BinaryArray::<O>::new(to_dtype, offsets.into(), values, from.validity().cloned())167}168169pub fn fixed_size_binary_to_binview(from: &FixedSizeBinaryArray) -> BinaryViewArray {170let datatype = <[u8] as ViewType>::DATA_TYPE;171172// Fast path: all the views are inlineable173if from.size() <= View::MAX_INLINE_SIZE as usize {174// @NOTE: There is something with the code-generation of `View::new_inline_unchecked` that175// prevents it from properly SIMD-ing this loop. It insists on memcpying while it should176// know that the size is really small. Dispatching over the `from.size()` and making it177// constant does make loop SIMD, but it does not actually speed anything up and the code it178// generates is still horrible.179//180// This is really slow, and I don't think it has to be.181182// SAFETY: We checked that slice.len() <= View::MAX_INLINE_SIZE before183let mut views = Vec::new();184View::extend_with_inlinable_strided(185&mut views,186from.values().as_slice(),187from.size() as u8,188);189let views = Buffer::from(views);190return BinaryViewArray::try_new(datatype, views, Arc::default(), from.validity().cloned())191.unwrap();192}193194const MAX_BYTES_PER_BUFFER: usize = u32::MAX as usize;195196let size = from.size();197let num_bytes = from.len() * size;198let num_buffers = num_bytes.div_ceil(MAX_BYTES_PER_BUFFER);199assert!(num_buffers < u32::MAX as usize);200201let num_elements_per_buffer = MAX_BYTES_PER_BUFFER / size;202// This is NOT equal to MAX_BYTES_PER_BUFFER because of integer division203let split_point = num_elements_per_buffer * size;204205// This is zero-copy for the buffer since split just increases the data since206let mut buffer = from.values().clone();207let mut buffers = Vec::with_capacity(num_buffers);208209if let Some(num_buffers) = num_buffers.checked_sub(1) {210for _ in 0..num_buffers {211let slice;212(slice, buffer) = buffer.split_at(split_point);213buffers.push(slice);214}215buffers.push(buffer);216}217218let mut iter = 
from.values_iter();219let iter = iter.by_ref();220let mut views = Vec::with_capacity(from.len());221for buffer_idx in 0..num_buffers {222views.extend(223iter.take(num_elements_per_buffer)224.enumerate()225.map(|(i, slice)| {226// SAFETY: We checked that slice.len() > View::MAX_INLINE_SIZE before227unsafe {228View::new_noninline_unchecked(slice, buffer_idx as u32, (i * size) as u32)229}230}),231);232}233let views = views.into();234235BinaryViewArray::try_new(datatype, views, buffers.into(), from.validity().cloned()).unwrap()236}237238/// Conversion of binary239pub fn binary_to_list<O: Offset>(from: &BinaryArray<O>, to_dtype: ArrowDataType) -> ListArray<O> {240let values = from.values().clone();241let values = PrimitiveArray::new(ArrowDataType::UInt8, values, None);242ListArray::<O>::new(243to_dtype,244from.offsets().clone(),245values.boxed(),246from.validity().cloned(),247)248}249250251