// Path: blob/main/crates/polars-compute/src/cast/binview_to.rs
// 8440 views
use std::ptr::copy_nonoverlapping;12use arrow::array::*;3use arrow::bitmap::MutableBitmap;4use arrow::datatypes::{ArrowDataType, Field, TimeUnit};5use arrow::offset::Offset;6use arrow::types::NativeType;7use bytemuck::cast_slice_mut;8use chrono::Datelike;9use num_traits::FromBytes;10use polars_error::{PolarsResult, polars_err};1112use super::CastOptionsImpl;13use super::binary_to::Parse;14use super::temporal::EPOCH_DAYS_FROM_CE;15#[cfg(feature = "dtype-decimal")]16use crate::decimal::str_to_dec128;1718pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z";1920/// Cast [`BinaryViewArray`] to [`DictionaryArray`], also known as packing.21/// # Errors22/// This function errors if the maximum key is smaller than the number of distinct elements23/// in the array.24pub(super) fn binview_to_dictionary<K: DictionaryKey>(25from: &BinaryViewArray,26) -> PolarsResult<DictionaryArray<K>> {27let mut array = MutableDictionaryArray::<K, MutableBinaryViewArray<[u8]>>::new();28array.reserve(from.len());29array.try_extend(from.iter())?;3031Ok(array.into())32}3334pub(super) fn utf8view_to_dictionary<K: DictionaryKey>(35from: &Utf8ViewArray,36) -> PolarsResult<DictionaryArray<K>> {37let mut array = MutableDictionaryArray::<K, MutableBinaryViewArray<str>>::new();38array.reserve(from.len());39array.try_extend(from.iter())?;4041Ok(array.into())42}4344pub(super) fn view_to_binary<O: Offset>(array: &BinaryViewArray) -> BinaryArray<O> {45let len: usize = Array::len(array);46let mut mutable = MutableBinaryValuesArray::<O>::with_capacities(len, array.total_bytes_len());47for slice in array.values_iter() {48mutable.push(slice)49}50let out: BinaryArray<O> = mutable.into();51out.with_validity(array.validity().cloned())52}5354pub fn utf8view_to_utf8<O: Offset>(array: &Utf8ViewArray) -> Utf8Array<O> {55let array = array.to_binview();56let out = view_to_binary::<O>(&array);5758let dtype = Utf8Array::<O>::default_dtype();59unsafe 
{60Utf8Array::new_unchecked(61dtype,62out.offsets().clone(),63out.values().clone(),64out.validity().cloned(),65)66}67}6869/// Parses a [`Utf8ViewArray`] with text representations of numbers into a70/// [`PrimitiveArray`], making any unparsable value a Null.71pub(super) fn utf8view_to_primitive<T>(72from: &Utf8ViewArray,73to: &ArrowDataType,74) -> PrimitiveArray<T>75where76T: NativeType + Parse,77{78let iter = from79.iter()80.map(|x| x.and_then::<T, _>(|x| T::parse(x.as_bytes())));8182PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())83}8485/// Parses a `&dyn` [`Array`] of UTF-8 encoded string representations of numbers86/// into a [`PrimitiveArray`], making any unparsable value a Null.87pub(super) fn utf8view_to_primitive_dyn<T>(88from: &dyn Array,89to: &ArrowDataType,90options: CastOptionsImpl,91) -> PolarsResult<Box<dyn Array>>92where93T: NativeType + Parse,94{95let from = from.as_any().downcast_ref().unwrap();96if options.partial {97unimplemented!()98} else {99Ok(Box::new(utf8view_to_primitive::<T>(from, to)))100}101}102103#[cfg(feature = "dtype-decimal")]104pub fn binview_to_decimal(105array: &BinaryViewArray,106precision: usize,107scale: usize,108) -> PrimitiveArray<i128> {109PrimitiveArray::<i128>::from_trusted_len_iter(110array111.iter()112.map(|val| val.and_then(|val| str_to_dec128(val, precision, scale, false))),113)114.to(ArrowDataType::Decimal(precision, scale))115}116117pub(super) fn utf8view_to_naive_timestamp_dyn(118from: &dyn Array,119time_unit: TimeUnit,120) -> PolarsResult<Box<dyn Array>> {121let from = from.as_any().downcast_ref().unwrap();122Ok(Box::new(utf8view_to_naive_timestamp(from, time_unit)))123}124125/// [`super::temporal::utf8view_to_timestamp`] applied for RFC3339 formatting126pub fn utf8view_to_naive_timestamp(127from: &Utf8ViewArray,128time_unit: TimeUnit,129) -> PrimitiveArray<i64> {130super::temporal::utf8view_to_naive_timestamp(from, RFC3339, time_unit)131}132133pub(super) fn utf8view_to_date32(from: &Utf8ViewArray) -> 
PrimitiveArray<i32> {134let iter = from.iter().map(|x| {135x.and_then(|x| {136x.parse::<chrono::NaiveDate>()137.ok()138.map(|x| x.num_days_from_ce() - EPOCH_DAYS_FROM_CE)139})140});141PrimitiveArray::<i32>::from_trusted_len_iter(iter).to(ArrowDataType::Date32)142}143144pub(super) fn utf8view_to_date32_dyn(from: &dyn Array) -> PolarsResult<Box<dyn Array>> {145let from = from.as_any().downcast_ref().unwrap();146Ok(Box::new(utf8view_to_date32(from)))147}148149/// Casts a [`BinaryViewArray`] containing binary-encoded numbers to a150/// [`PrimitiveArray`], making any uncastable value a Null.151pub(super) fn binview_to_primitive<T>(152from: &BinaryViewArray,153to: &ArrowDataType,154is_little_endian: bool,155) -> PrimitiveArray<T>156where157T: FromBytes + NativeType,158for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,159{160let iter = from.iter().map(|x| {161x.and_then::<T, _>(|x| {162if is_little_endian {163Some(<T as FromBytes>::from_le_bytes(x.try_into().ok()?))164} else {165Some(<T as FromBytes>::from_be_bytes(x.try_into().ok()?))166}167})168});169170PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())171}172173/// Casts a `&dyn` [`Array`] containing binary-encoded numbers to a174/// [`PrimitiveArray`], making any uncastable value a Null.175/// # Panics176/// Panics if `Array` is not a `BinaryViewArray`177pub fn binview_to_primitive_dyn<T>(178from: &dyn Array,179to: &ArrowDataType,180is_little_endian: bool,181) -> PolarsResult<Box<dyn Array>>182where183T: FromBytes + NativeType,184for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,185{186let from = from.as_any().downcast_ref().unwrap();187Ok(Box::new(binview_to_primitive::<T>(188from,189to,190is_little_endian,191)))192}193194/// Casts a [`BinaryViewArray`] to a [`FixedSizeListArray`], making any un-castable value a Null.195///196/// # Arguments197///198/// * `from`: The array to reinterpret.199/// * `array_width`: The number of items in each `Array`.200pub(super) fn 
try_binview_to_fixed_size_list<T, const IS_LITTLE_ENDIAN: bool>(201from: &BinaryViewArray,202array_width: usize,203) -> PolarsResult<FixedSizeListArray>204where205T: FromBytes + NativeType,206for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,207{208let element_size = std::mem::size_of::<T>();209// The maximum number of primitives in the result:210let primitive_length = from.len().checked_mul(array_width).ok_or_else(|| {211polars_err!(212InvalidOperation:213"array chunk length * number of items ({} * {}) is too large",214from.len(),215array_width216)217})?;218// The size of each array, in bytes:219let row_size_bytes = element_size.checked_mul(array_width).ok_or_else(|| {220polars_err!(221InvalidOperation:222"array size in bytes ({} * {}) is too large",223element_size,224array_width225)226})?;227228let mut out: Vec<T> = vec![T::zeroed(); primitive_length];229let (out_u8_ptr, out_len_bytes) = {230let out_u8_slice = cast_slice_mut::<_, u8>(out.as_mut());231(out_u8_slice.as_mut_ptr(), out_u8_slice.len())232};233assert_eq!(out_len_bytes, row_size_bytes * from.len());234let mut validity = MutableBitmap::from_len_set(from.len());235236for (index, value) in from.iter().enumerate() {237if let Some(value) = value238&& value.len() == row_size_bytes239{240if cfg!(target_endian = "little") && IS_LITTLE_ENDIAN {241// Fast path, we can just copy the data with no need to242// reinterpret.243let write_index = index * row_size_bytes;244debug_assert!(value.is_empty() || write_index < out_len_bytes);245debug_assert!(value.is_empty() || (write_index + value.len() - 1 < out_len_bytes));246// # Safety247// - The start index is smaller than `out`'s capacity.248// - The end index is smaller than `out`'s capacity.249unsafe {250copy_nonoverlapping(value.as_ptr(), out_u8_ptr.add(write_index), value.len());251}252} else {253// Slow path, reinterpret items one by one.254for j in 0..array_width {255let jth_range = (j * element_size)..((j + 1) * 
element_size);256debug_assert!(value.get(jth_range.clone()).is_some());257// # Safety258// We made sure the range is smaller than `value` length.259let jth_bytes = unsafe { value.get_unchecked(jth_range) };260// # Safety261// We just made sure that the slice has length `element_size`262let byte_array = unsafe { jth_bytes.try_into().unwrap_unchecked() };263let jth_value = if IS_LITTLE_ENDIAN {264<T as FromBytes>::from_le_bytes(byte_array)265} else {266<T as FromBytes>::from_be_bytes(byte_array)267};268269let write_index = array_width * index + j;270debug_assert!(write_index < out.len());271// # Safety272// - The target index is smaller than the vector's pre-allocated capacity.273unsafe {274*out.get_unchecked_mut(write_index) = jth_value;275}276}277}278} else {279validity.set(index, false);280};281}282283FixedSizeListArray::try_new(284ArrowDataType::FixedSizeList(285Box::new(Field::new("".into(), T::PRIMITIVE.into(), true)),286array_width,287),288from.len(),289Box::new(PrimitiveArray::<T>::from_vec(out)),290validity.into(),291)292}293294/// Casts a `dyn` [`Array`] to a [`FixedSizeListArray`], making any un-castable value a Null.295///296/// # Arguments297///298/// * `from`: The array to reinterpret.299/// * `array_width`: The number of items in each `Array`.300///301/// # Panics302/// Panics if `from` is not `BinaryViewArray`.303pub fn binview_to_fixed_size_list_dyn<T>(304from: &dyn Array,305array_width: usize,306is_little_endian: bool,307) -> PolarsResult<Box<dyn Array>>308where309T: FromBytes + NativeType,310for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,311{312let from = from.as_any().downcast_ref().unwrap();313314let result = if is_little_endian {315try_binview_to_fixed_size_list::<T, true>(from, array_width)316} else {317try_binview_to_fixed_size_list::<T, false>(from, array_width)318}?;319Ok(Box::new(result))320}321322323