Path: blob/main/crates/polars-compute/src/cast/binview_to.rs
6939 views
use std::ptr::copy_nonoverlapping;12use arrow::array::*;3use arrow::bitmap::MutableBitmap;4#[cfg(feature = "dtype-decimal")]5use arrow::compute::decimal::deserialize_decimal;6use arrow::datatypes::{ArrowDataType, Field, TimeUnit};7use arrow::offset::Offset;8use arrow::types::NativeType;9use bytemuck::cast_slice_mut;10use chrono::Datelike;11use num_traits::FromBytes;12use polars_error::{PolarsResult, polars_err};1314use super::CastOptionsImpl;15use super::binary_to::Parse;16use super::temporal::EPOCH_DAYS_FROM_CE;1718pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z";1920/// Cast [`BinaryViewArray`] to [`DictionaryArray`], also known as packing.21/// # Errors22/// This function errors if the maximum key is smaller than the number of distinct elements23/// in the array.24pub(super) fn binview_to_dictionary<K: DictionaryKey>(25from: &BinaryViewArray,26) -> PolarsResult<DictionaryArray<K>> {27let mut array = MutableDictionaryArray::<K, MutableBinaryViewArray<[u8]>>::new();28array.reserve(from.len());29array.try_extend(from.iter())?;3031Ok(array.into())32}3334pub(super) fn utf8view_to_dictionary<K: DictionaryKey>(35from: &Utf8ViewArray,36) -> PolarsResult<DictionaryArray<K>> {37let mut array = MutableDictionaryArray::<K, MutableBinaryViewArray<str>>::new();38array.reserve(from.len());39array.try_extend(from.iter())?;4041Ok(array.into())42}4344pub(super) fn view_to_binary<O: Offset>(array: &BinaryViewArray) -> BinaryArray<O> {45let len: usize = Array::len(array);46let mut mutable = MutableBinaryValuesArray::<O>::with_capacities(len, array.total_bytes_len());47for slice in array.values_iter() {48mutable.push(slice)49}50let out: BinaryArray<O> = mutable.into();51out.with_validity(array.validity().cloned())52}5354pub fn utf8view_to_utf8<O: Offset>(array: &Utf8ViewArray) -> Utf8Array<O> {55let array = array.to_binview();56let out = view_to_binary::<O>(&array);5758let dtype = Utf8Array::<O>::default_dtype();59unsafe {60Utf8Array::new_unchecked(61dtype,62out.offsets().clone(),63out.values().clone(),64out.validity().cloned(),65)66}67}6869/// Parses a [`Utf8ViewArray`] with text representations of numbers into a70/// [`PrimitiveArray`], making any unparsable value a Null.71pub(super) fn utf8view_to_primitive<T>(72from: &Utf8ViewArray,73to: &ArrowDataType,74) -> PrimitiveArray<T>75where76T: NativeType + Parse,77{78let iter = from79.iter()80.map(|x| x.and_then::<T, _>(|x| T::parse(x.as_bytes())));8182PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())83}8485/// Parses a `&dyn` [`Array`] of UTF-8 encoded string representations of numbers86/// into a [`PrimitiveArray`], making any unparsable value a Null.87pub(super) fn utf8view_to_primitive_dyn<T>(88from: &dyn Array,89to: &ArrowDataType,90options: CastOptionsImpl,91) -> PolarsResult<Box<dyn Array>>92where93T: NativeType + Parse,94{95let from = from.as_any().downcast_ref().unwrap();96if options.partial {97unimplemented!()98} else {99Ok(Box::new(utf8view_to_primitive::<T>(from, to)))100}101}102103#[cfg(feature = "dtype-decimal")]104pub fn binview_to_decimal(105array: &BinaryViewArray,106precision: Option<usize>,107scale: usize,108) -> PrimitiveArray<i128> {109let precision = precision.map(|p| p as u8);110PrimitiveArray::<i128>::from_trusted_len_iter(111array112.iter()113.map(|val| val.and_then(|val| deserialize_decimal(val, precision, scale as u8))),114)115.to(ArrowDataType::Decimal(116precision.unwrap_or(38).into(),117scale,118))119}120121pub(super) fn utf8view_to_naive_timestamp_dyn(122from: &dyn Array,123time_unit: TimeUnit,124) -> PolarsResult<Box<dyn Array>> {125let from = from.as_any().downcast_ref().unwrap();126Ok(Box::new(utf8view_to_naive_timestamp(from, time_unit)))127}128129/// [`super::temporal::utf8view_to_timestamp`] applied for RFC3339 formatting130pub fn utf8view_to_naive_timestamp(131from: &Utf8ViewArray,132time_unit: TimeUnit,133) -> PrimitiveArray<i64> {134super::temporal::utf8view_to_naive_timestamp(from, RFC3339, time_unit)135}136137pub(super) fn utf8view_to_date32(from: &Utf8ViewArray) -> PrimitiveArray<i32> {138let iter = from.iter().map(|x| {139x.and_then(|x| {140x.parse::<chrono::NaiveDate>()141.ok()142.map(|x| x.num_days_from_ce() - EPOCH_DAYS_FROM_CE)143})144});145PrimitiveArray::<i32>::from_trusted_len_iter(iter).to(ArrowDataType::Date32)146}147148pub(super) fn utf8view_to_date32_dyn(from: &dyn Array) -> PolarsResult<Box<dyn Array>> {149let from = from.as_any().downcast_ref().unwrap();150Ok(Box::new(utf8view_to_date32(from)))151}152153/// Casts a [`BinaryViewArray`] containing binary-encoded numbers to a154/// [`PrimitiveArray`], making any uncastable value a Null.155pub(super) fn binview_to_primitive<T>(156from: &BinaryViewArray,157to: &ArrowDataType,158is_little_endian: bool,159) -> PrimitiveArray<T>160where161T: FromBytes + NativeType,162for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,163{164let iter = from.iter().map(|x| {165x.and_then::<T, _>(|x| {166if is_little_endian {167Some(<T as FromBytes>::from_le_bytes(x.try_into().ok()?))168} else {169Some(<T as FromBytes>::from_be_bytes(x.try_into().ok()?))170}171})172});173174PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())175}176177/// Casts a `&dyn` [`Array`] containing binary-encoded numbers to a178/// [`PrimitiveArray`], making any uncastable value a Null.179/// # Panics180/// Panics if `Array` is not a `BinaryViewArray`181pub fn binview_to_primitive_dyn<T>(182from: &dyn Array,183to: &ArrowDataType,184is_little_endian: bool,185) -> PolarsResult<Box<dyn Array>>186where187T: FromBytes + NativeType,188for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,189{190let from = from.as_any().downcast_ref().unwrap();191Ok(Box::new(binview_to_primitive::<T>(192from,193to,194is_little_endian,195)))196}197198/// Casts a [`BinaryViewArray`] to a [`FixedSizeListArray`], making any un-castable value a Null.199///200/// # Arguments201///202/// * `from`: The array to reinterpret.203/// * `array_width`: The number of items in each `Array`.204pub(super) fn try_binview_to_fixed_size_list<T, const IS_LITTLE_ENDIAN: bool>(205from: &BinaryViewArray,206array_width: usize,207) -> PolarsResult<FixedSizeListArray>208where209T: FromBytes + NativeType,210for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,211{212let element_size = std::mem::size_of::<T>();213// The maximum number of primitives in the result:214let primitive_length = from.len().checked_mul(array_width).ok_or_else(|| {215polars_err!(216InvalidOperation:217"array chunk length * number of items ({} * {}) is too large",218from.len(),219array_width220)221})?;222// The size of each array, in bytes:223let row_size_bytes = element_size.checked_mul(array_width).ok_or_else(|| {224polars_err!(225InvalidOperation:226"array size in bytes ({} * {}) is too large",227element_size,228array_width229)230})?;231232let mut out: Vec<T> = vec![T::zeroed(); primitive_length];233let (out_u8_ptr, out_len_bytes) = {234let out_u8_slice = cast_slice_mut::<_, u8>(out.as_mut());235(out_u8_slice.as_mut_ptr(), out_u8_slice.len())236};237assert_eq!(out_len_bytes, row_size_bytes * from.len());238let mut validity = MutableBitmap::from_len_set(from.len());239240for (index, value) in from.iter().enumerate() {241if let Some(value) = value242&& value.len() == row_size_bytes243{244if cfg!(target_endian = "little") && IS_LITTLE_ENDIAN {245// Fast path, we can just copy the data with no need to246// reinterpret.247let write_index = index * row_size_bytes;248debug_assert!(value.is_empty() || write_index < out_len_bytes);249debug_assert!(value.is_empty() || (write_index + value.len() - 1 < out_len_bytes));250// # Safety251// - The start index is smaller than `out`'s capacity.252// - The end index is smaller than `out`'s capacity.253unsafe {254copy_nonoverlapping(value.as_ptr(), out_u8_ptr.add(write_index), value.len());255}256} else {257// Slow path, reinterpret items one by one.258for j in 0..array_width {259let jth_range = (j * element_size)..((j + 1) * element_size);260debug_assert!(value.get(jth_range.clone()).is_some());261// # Safety262// We made sure the range is smaller than `value` length.263let jth_bytes = unsafe { value.get_unchecked(jth_range) };264// # Safety265// We just made sure that the slice has length `element_size`266let byte_array = unsafe { jth_bytes.try_into().unwrap_unchecked() };267let jth_value = if IS_LITTLE_ENDIAN {268<T as FromBytes>::from_le_bytes(byte_array)269} else {270<T as FromBytes>::from_be_bytes(byte_array)271};272273let write_index = array_width * index + j;274debug_assert!(write_index < out.len());275// # Safety276// - The target index is smaller than the vector's pre-allocated capacity.277unsafe {278*out.get_unchecked_mut(write_index) = jth_value;279}280}281}282} else {283validity.set(index, false);284};285}286287FixedSizeListArray::try_new(288ArrowDataType::FixedSizeList(289Box::new(Field::new("".into(), T::PRIMITIVE.into(), true)),290array_width,291),292from.len(),293Box::new(PrimitiveArray::<T>::from_vec(out)),294validity.into(),295)296}297298/// Casts a `dyn` [`Array`] to a [`FixedSizeListArray`], making any un-castable value a Null.299///300/// # Arguments301///302/// * `from`: The array to reinterpret.303/// * `array_width`: The number of items in each `Array`.304///305/// # Panics306/// Panics if `from` is not `BinaryViewArray`.307pub fn binview_to_fixed_size_list_dyn<T>(308from: &dyn Array,309array_width: usize,310is_little_endian: bool,311) -> PolarsResult<Box<dyn Array>>312where313T: FromBytes + NativeType,314for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,315{316let from = from.as_any().downcast_ref().unwrap();317318let result = if is_little_endian {319try_binview_to_fixed_size_list::<T, true>(from, array_width)320} else {321try_binview_to_fixed_size_list::<T, false>(from, array_width)322}?;323Ok(Box::new(result))324}325326327