Path: blob/main/crates/polars-compute/src/cast/binary_to.rs
8422 views
use arrow::array::*;1use arrow::datatypes::ArrowDataType;2use arrow::offset::{Offset, Offsets};3use arrow::types::NativeType;4use num_traits::AsPrimitive;5use polars_buffer::Buffer;6use polars_error::PolarsResult;7#[cfg(feature = "dtype-f16")]8use polars_utils::float16::pf16;910use super::CastOptionsImpl;1112pub(super) trait Parse {13fn parse(val: &[u8]) -> Option<Self>14where15Self: Sized;16}1718macro_rules! impl_parse {19($primitive_type:ident) => {20impl Parse for $primitive_type {21fn parse(val: &[u8]) -> Option<Self> {22atoi_simd::parse_skipped(val).ok()23}24}25};26}27impl_parse!(i8);28impl_parse!(i16);29impl_parse!(i32);30impl_parse!(i64);31#[cfg(feature = "dtype-i128")]32impl_parse!(i128);3334impl_parse!(u8);35impl_parse!(u16);36impl_parse!(u32);37impl_parse!(u64);38#[cfg(feature = "dtype-u128")]39impl_parse!(u128);4041#[cfg(feature = "dtype-f16")]42impl Parse for pf16 {43fn parse(val: &[u8]) -> Option<Self>44where45Self: Sized,46{47fast_float2::parse(val).ok().map(|f: f32| f.as_())48}49}5051impl Parse for f32 {52fn parse(val: &[u8]) -> Option<Self>53where54Self: Sized,55{56fast_float2::parse(val).ok()57}58}59impl Parse for f64 {60fn parse(val: &[u8]) -> Option<Self>61where62Self: Sized,63{64fast_float2::parse(val).ok()65}66}6768/// Conversion of binary69pub fn binary_to_large_binary(70from: &BinaryArray<i32>,71to_dtype: ArrowDataType,72) -> BinaryArray<i64> {73let values = from.values().clone();74BinaryArray::<i64>::new(75to_dtype,76from.offsets().into(),77values,78from.validity().cloned(),79)80}8182/// Conversion of binary83pub fn binary_large_to_binary(84from: &BinaryArray<i64>,85to_dtype: ArrowDataType,86) -> PolarsResult<BinaryArray<i32>> {87let values = from.values().clone();88let offsets = from.offsets().try_into()?;89Ok(BinaryArray::<i32>::new(90to_dtype,91offsets,92values,93from.validity().cloned(),94))95}9697/// Conversion to utf898pub fn binary_to_utf8<O: Offset>(99from: &BinaryArray<O>,100to_dtype: ArrowDataType,101) -> PolarsResult<Utf8Array<O>> {102Utf8Array::<O>::try_new(103to_dtype,104from.offsets().clone(),105from.values().clone(),106from.validity().cloned(),107)108}109110/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null.111pub(super) fn binary_to_primitive<O: Offset, T>(112from: &BinaryArray<O>,113to: &ArrowDataType,114) -> PrimitiveArray<T>115where116T: NativeType + Parse,117{118let iter = from.iter().map(|x| x.and_then::<T, _>(|x| T::parse(x)));119120PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())121}122123pub(super) fn binary_to_primitive_dyn<O: Offset, T>(124from: &dyn Array,125to: &ArrowDataType,126options: CastOptionsImpl,127) -> PolarsResult<Box<dyn Array>>128where129T: NativeType + Parse,130{131let from = from.as_any().downcast_ref().unwrap();132if options.partial {133unimplemented!()134} else {135Ok(Box::new(binary_to_primitive::<O, T>(from, to)))136}137}138139/// Cast [`BinaryArray`] to [`DictionaryArray`], also known as packing.140/// # Errors141/// This function errors if the maximum key is smaller than the number of distinct elements142/// in the array.143pub fn binary_to_dictionary<O: Offset, K: DictionaryKey>(144from: &BinaryArray<O>,145) -> PolarsResult<DictionaryArray<K>> {146let mut array = MutableDictionaryArray::<K, MutableBinaryArray<O>>::empty_with_value_dtype(147from.dtype().clone(),148);149array.reserve(from.len());150array.try_extend(from.iter())?;151152Ok(array.into())153}154155pub(super) fn binary_to_dictionary_dyn<O: Offset, K: DictionaryKey>(156from: &dyn Array,157) -> PolarsResult<Box<dyn Array>> {158let values = from.as_any().downcast_ref().unwrap();159binary_to_dictionary::<O, K>(values).map(|x| Box::new(x) as Box<dyn Array>)160}161162fn fixed_size_to_offsets<O: Offset>(values_len: usize, fixed_size: usize) -> Offsets<O> {163let offsets = (0..(values_len + 1))164.step_by(fixed_size)165.map(|v| O::from_as_usize(v))166.collect();167// SAFETY:168// * every element is `>= 0`169// * element at position `i` is >= than element at position `i-1`.170unsafe { Offsets::new_unchecked(offsets) }171}172173/// Conversion of `FixedSizeBinary` to `Binary`.174pub fn fixed_size_binary_binary<O: Offset>(175from: &FixedSizeBinaryArray,176to_dtype: ArrowDataType,177) -> BinaryArray<O> {178let values = from.values().clone();179let offsets = fixed_size_to_offsets(values.len(), from.size());180BinaryArray::<O>::new(to_dtype, offsets.into(), values, from.validity().cloned())181}182183pub fn fixed_size_binary_to_binview(from: &FixedSizeBinaryArray) -> BinaryViewArray {184let datatype = <[u8] as ViewType>::DATA_TYPE;185186// Fast path: all the views are inlineable187if from.size() <= View::MAX_INLINE_SIZE as usize {188// @NOTE: There is something with the code-generation of `View::new_inline_unchecked` that189// prevents it from properly SIMD-ing this loop. It insists on memcpying while it should190// know that the size is really small. Dispatching over the `from.size()` and making it191// constant does make loop SIMD, but it does not actually speed anything up and the code it192// generates is still horrible.193//194// This is really slow, and I don't think it has to be.195196// SAFETY: We checked that slice.len() <= View::MAX_INLINE_SIZE before197let mut views = Vec::new();198View::extend_with_inlinable_strided(199&mut views,200from.values().as_slice(),201from.size() as u8,202);203let views = Buffer::from(views);204return BinaryViewArray::try_new(datatype, views, Buffer::new(), from.validity().cloned())205.unwrap();206}207208const MAX_BYTES_PER_BUFFER: usize = u32::MAX as usize;209210let size = from.size();211let num_bytes = from.len() * size;212let num_buffers = num_bytes.div_ceil(MAX_BYTES_PER_BUFFER);213assert!(num_buffers < u32::MAX as usize);214215let num_elements_per_buffer = MAX_BYTES_PER_BUFFER / size;216// This is NOT equal to MAX_BYTES_PER_BUFFER because of integer division217let split_point = num_elements_per_buffer * size;218219// This is zero-copy for the buffer since split just increases the data since220let mut buffer = from.values().clone();221let mut buffers = Vec::with_capacity(num_buffers);222223if let Some(num_buffers) = num_buffers.checked_sub(1) {224for _ in 0..num_buffers {225let slice;226(slice, buffer) = buffer.split_at(split_point);227buffers.push(slice);228}229buffers.push(buffer);230}231232let mut iter = from.values_iter();233let iter = iter.by_ref();234let mut views = Vec::with_capacity(from.len());235for buffer_idx in 0..num_buffers {236views.extend(237iter.take(num_elements_per_buffer)238.enumerate()239.map(|(i, slice)| {240// SAFETY: We checked that slice.len() > View::MAX_INLINE_SIZE before241unsafe {242View::new_noninline_unchecked(slice, buffer_idx as u32, (i * size) as u32)243}244}),245);246}247let views = views.into();248249BinaryViewArray::try_new(datatype, views, buffers.into(), from.validity().cloned()).unwrap()250}251252/// Conversion of binary253pub fn binary_to_list<O: Offset>(from: &BinaryArray<O>, to_dtype: ArrowDataType) -> ListArray<O> {254let values = from.values().clone();255let values = PrimitiveArray::new(ArrowDataType::UInt8, values, None);256ListArray::<O>::new(257to_dtype,258from.offsets().clone(),259values.boxed(),260from.validity().cloned(),261)262}263264265