// Path: blob/main/crates/polars-core/src/chunked_array/cast.rs
// 8415 views
//! Implementations of the ChunkCast Trait.12use std::borrow::Cow;34use polars_compute::cast::CastOptionsImpl;5#[cfg(feature = "serde-lazy")]6use serde::{Deserialize, Serialize};78use super::flags::StatisticsFlags;9#[cfg(feature = "dtype-datetime")]10use crate::prelude::DataType::Datetime;11use crate::prelude::*;12use crate::utils::{handle_array_casting_failures, handle_casting_failures};1314#[derive(Copy, Clone, Debug, Default, PartialEq, Hash, Eq)]15#[cfg_attr(feature = "serde-lazy", derive(Serialize, Deserialize))]16#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]17#[repr(u8)]18pub enum CastOptions {19/// Raises on overflow20#[default]21Strict,22/// Overflow is replaced with null23NonStrict,24/// Allows wrapping overflow25Overflowing,26}2728impl CastOptions {29pub fn is_strict(&self) -> bool {30matches!(self, CastOptions::Strict)31}32}3334impl From<CastOptions> for CastOptionsImpl {35fn from(value: CastOptions) -> Self {36let wrapped = match value {37CastOptions::Strict | CastOptions::NonStrict => false,38CastOptions::Overflowing => true,39};40CastOptionsImpl {41wrapped,42partial: false,43}44}45}4647pub(crate) fn cast_chunks(48chunks: &[ArrayRef],49dtype: &DataType,50options: CastOptions,51) -> PolarsResult<Vec<ArrayRef>> {52let check_nulls = matches!(options, CastOptions::Strict);53let options = options.into();5455let arrow_dtype = dtype.try_to_arrow(CompatLevel::newest())?;56chunks57.iter()58.map(|arr| {59let out = polars_compute::cast::cast(arr.as_ref(), &arrow_dtype, options);60if check_nulls {61out.and_then(|new| {62if arr.null_count() != new.null_count() {63handle_array_casting_failures(&**arr, &*new)?;64}65Ok(new)66})67} else {68out69}70})71.collect::<PolarsResult<Vec<_>>>()72}7374fn cast_impl_inner(75name: PlSmallStr,76chunks: &[ArrayRef],77dtype: &DataType,78options: CastOptions,79) -> PolarsResult<Series> {80let chunks = match dtype {81#[cfg(feature = "dtype-decimal")]82DataType::Decimal(_, _) => {83let mut chunks = cast_chunks(chunks, 
dtype, options)?;84// @NOTE: We cannot cast here as that will lower the scale.85for chunk in chunks.iter_mut() {86*chunk = std::mem::take(87chunk88.as_any_mut()89.downcast_mut::<PrimitiveArray<i128>>()90.unwrap(),91)92.to(ArrowDataType::Int128)93.to_boxed();94}95chunks96},97_ => cast_chunks(chunks, &dtype.to_physical(), options)?,98};99100let out = Series::try_from((name, chunks))?;101use DataType::*;102let out = match dtype {103Date => out.into_date(),104Datetime(tu, tz) => match tz {105#[cfg(feature = "timezones")]106Some(tz) => {107TimeZone::validate_time_zone(tz)?;108out.into_datetime(*tu, Some(tz.clone()))109},110_ => out.into_datetime(*tu, None),111},112Duration(tu) => out.into_duration(*tu),113#[cfg(feature = "dtype-time")]114Time => out.into_time(),115#[cfg(feature = "dtype-decimal")]116Decimal(precision, scale) => out.into_decimal(*precision, *scale)?,117_ => out,118};119120Ok(out)121}122123fn cast_impl(124name: PlSmallStr,125chunks: &[ArrayRef],126dtype: &DataType,127options: CastOptions,128) -> PolarsResult<Series> {129cast_impl_inner(name, chunks, dtype, options)130}131132#[cfg(feature = "dtype-struct")]133fn cast_single_to_struct(134name: PlSmallStr,135chunks: &[ArrayRef],136fields: &[Field],137options: CastOptions,138) -> PolarsResult<Series> {139polars_ensure!(fields.len() == 1, InvalidOperation: "must specify one field in the struct");140let mut new_fields = Vec::with_capacity(fields.len());141// cast to first field dtype142let mut fields = fields.iter();143let fld = fields.next().unwrap();144let s = cast_impl_inner(fld.name.clone(), chunks, &fld.dtype, options)?;145let length = s.len();146new_fields.push(s);147148for fld in fields {149new_fields.push(Series::full_null(fld.name.clone(), length, &fld.dtype));150}151152StructChunked::from_series(name, length, new_fields.iter()).map(|ca| ca.into_series())153}154155impl<T> ChunkedArray<T>156where157T: PolarsNumericType,158{159fn cast_impl(&self, dtype: &DataType, options: CastOptions) -> 
PolarsResult<Series> {160if self.dtype() == dtype {161// SAFETY: chunks are correct dtype162let mut out = unsafe {163Series::from_chunks_and_dtype_unchecked(164self.name().clone(),165self.chunks.clone(),166dtype,167)168};169out.set_sorted_flag(self.is_sorted_flag());170return Ok(out);171}172match dtype {173// LEGACY174// TODO @ cat-rework: remove after exposing to/from physical functions.175#[cfg(feature = "dtype-categorical")]176DataType::Categorical(cats, _mapping) => {177let s = self.cast_with_options(&cats.physical().dtype(), options)?;178with_match_categorical_physical_type!(cats.physical(), |$C| {179// SAFETY: we are guarded by the type system.180type PhysCa = ChunkedArray<<$C as PolarsCategoricalType>::PolarsPhysical>;181let ca: &PhysCa = s.as_ref().as_ref();182Ok(CategoricalChunked::<$C>::from_cats_and_dtype(ca.clone(), dtype.clone())183.into_series())184})185},186187// LEGACY188// TODO @ cat-rework: remove after exposing to/from physical functions.189#[cfg(feature = "dtype-categorical")]190DataType::Enum(fcats, _mapping) => {191let s = self.cast_with_options(&fcats.physical().dtype(), options)?;192with_match_categorical_physical_type!(fcats.physical(), |$C| {193// SAFETY: we are guarded by the type system.194type PhysCa = ChunkedArray<<$C as PolarsCategoricalType>::PolarsPhysical>;195let ca: &PhysCa = s.as_ref().as_ref();196Ok(CategoricalChunked::<$C>::from_cats_and_dtype(ca.clone(), dtype.clone()).into_series())197})198},199200#[cfg(feature = "dtype-struct")]201DataType::Struct(fields) => {202cast_single_to_struct(self.name().clone(), &self.chunks, fields, options)203},204_ => cast_impl_inner(self.name().clone(), &self.chunks, dtype, options).map(|mut s| {205// maintain sorted if data types206// - remain signed207// - unsigned -> signed208// this may still fail with overflow?209let to_signed = dtype.is_signed_integer();210let unsigned2unsigned =211self.dtype().is_unsigned_integer() && dtype.is_unsigned_integer();212let allowed = to_signed || 
unsigned2unsigned;213214if (allowed)215&& (s.null_count() == self.null_count())216// physical to logicals217|| (self.dtype().to_physical() == dtype.to_physical())218{219let is_sorted = self.is_sorted_flag();220s.set_sorted_flag(is_sorted)221}222s223}),224}225}226}227228impl<T> ChunkCast for ChunkedArray<T>229where230T: PolarsNumericType,231{232fn cast_with_options(&self, dtype: &DataType, options: CastOptions) -> PolarsResult<Series> {233self.cast_impl(dtype, options)234}235236unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult<Series> {237match dtype {238// LEGACY239// TODO @ cat-rework: remove after exposing to/from physical functions.240#[cfg(feature = "dtype-categorical")]241DataType::Categorical(cats, _mapping) => {242polars_ensure!(self.dtype() == &cats.physical().dtype(), ComputeError: "cannot cast numeric types to 'Categorical'");243with_match_categorical_physical_type!(cats.physical(), |$C| {244// SAFETY: we are guarded by the type system.245type PhysCa = ChunkedArray<<$C as PolarsCategoricalType>::PolarsPhysical>;246let ca = unsafe { &*(self as *const ChunkedArray<T> as *const PhysCa) };247Ok(CategoricalChunked::<$C>::from_cats_and_dtype_unchecked(ca.clone(), dtype.clone())248.into_series())249})250},251252// LEGACY253// TODO @ cat-rework: remove after exposing to/from physical functions.254#[cfg(feature = "dtype-categorical")]255DataType::Enum(fcats, _mapping) => {256polars_ensure!(self.dtype() == &fcats.physical().dtype(), ComputeError: "cannot cast numeric types to 'Enum'");257with_match_categorical_physical_type!(fcats.physical(), |$C| {258// SAFETY: we are guarded by the type system.259type PhysCa = ChunkedArray<<$C as PolarsCategoricalType>::PolarsPhysical>;260let ca = unsafe { &*(self as *const ChunkedArray<T> as *const PhysCa) };261Ok(CategoricalChunked::<$C>::from_cats_and_dtype_unchecked(ca.clone(), dtype.clone()).into_series())262})263},264265_ => self.cast_impl(dtype, CastOptions::Overflowing),266}267}268}269270impl ChunkCast for 
StringChunked {271fn cast_with_options(&self, dtype: &DataType, options: CastOptions) -> PolarsResult<Series> {272match dtype {273#[cfg(feature = "dtype-categorical")]274DataType::Categorical(cats, _mapping) => {275with_match_categorical_physical_type!(cats.physical(), |$C| {276Ok(CategoricalChunked::<$C>::from_str_iter(self.name().clone(), dtype.clone(), self.iter())?277.into_series())278})279},280#[cfg(feature = "dtype-categorical")]281DataType::Enum(fcats, _mapping) => {282let ret = with_match_categorical_physical_type!(fcats.physical(), |$C| {283CategoricalChunked::<$C>::from_str_iter(self.name().clone(), dtype.clone(), self.iter())?284.into_series()285});286287if options.is_strict() && self.null_count() != ret.null_count() {288handle_casting_failures(&self.clone().into_series(), &ret)?;289}290291Ok(ret)292},293#[cfg(feature = "dtype-struct")]294DataType::Struct(fields) => {295cast_single_to_struct(self.name().clone(), &self.chunks, fields, options)296},297#[cfg(feature = "dtype-decimal")]298DataType::Decimal(precision, scale) => {299let chunks = self.downcast_iter().map(|arr| {300polars_compute::cast::binview_to_decimal(&arr.to_binview(), *precision, *scale)301.to(ArrowDataType::Int128)302});303let ca = Int128Chunked::from_chunk_iter(self.name().clone(), chunks);304Ok(ca.into_decimal_unchecked(*precision, *scale).into_series())305},306#[cfg(feature = "dtype-date")]307DataType::Date => {308let result = cast_chunks(&self.chunks, dtype, options)?;309let out = Series::try_from((self.name().clone(), result))?;310Ok(out)311},312#[cfg(feature = "dtype-datetime")]313DataType::Datetime(time_unit, time_zone) => match time_zone {314#[cfg(feature = "timezones")]315Some(time_zone) => {316TimeZone::validate_time_zone(time_zone)?;317let result = cast_chunks(318&self.chunks,319&Datetime(time_unit.to_owned(), Some(time_zone.clone())),320options,321)?;322Series::try_from((self.name().clone(), result))323},324_ => {325let result =326cast_chunks(&self.chunks, 
&Datetime(time_unit.to_owned(), None), options)?;327Series::try_from((self.name().clone(), result))328},329},330_ => cast_impl(self.name().clone(), &self.chunks, dtype, options),331}332}333334unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult<Series> {335self.cast_with_options(dtype, CastOptions::Overflowing)336}337}338339impl BinaryChunked {340/// # Safety341/// String is not validated342pub unsafe fn to_string_unchecked(&self) -> StringChunked {343let chunks = self344.downcast_iter()345.map(|arr| unsafe { arr.to_utf8view_unchecked() }.boxed())346.collect();347let field = Arc::new(Field::new(self.name().clone(), DataType::String));348349let mut ca = StringChunked::new_with_compute_len(field, chunks);350351use StatisticsFlags as F;352ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);353ca354}355}356357impl StringChunked {358pub fn as_binary(&self) -> BinaryChunked {359let chunks = self360.downcast_iter()361.map(|arr| arr.to_binview().boxed())362.collect();363let field = Arc::new(Field::new(self.name().clone(), DataType::Binary));364365let mut ca = BinaryChunked::new_with_compute_len(field, chunks);366367use StatisticsFlags as F;368ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);369ca370}371}372373impl ChunkCast for BinaryChunked {374fn cast_with_options(&self, dtype: &DataType, options: CastOptions) -> PolarsResult<Series> {375match dtype {376#[cfg(feature = "dtype-struct")]377DataType::Struct(fields) => {378cast_single_to_struct(self.name().clone(), &self.chunks, fields, options)379},380_ => cast_impl(self.name().clone(), &self.chunks, dtype, options),381}382}383384unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult<Series> {385match dtype {386DataType::String => unsafe { Ok(self.to_string_unchecked().into_series()) },387_ => self.cast_with_options(dtype, CastOptions::Overflowing),388}389}390}391392impl ChunkCast for BinaryOffsetChunked {393fn cast_with_options(&self, dtype: &DataType, 
options: CastOptions) -> PolarsResult<Series> {394match dtype {395#[cfg(feature = "dtype-struct")]396DataType::Struct(fields) => {397cast_single_to_struct(self.name().clone(), &self.chunks, fields, options)398},399_ => cast_impl(self.name().clone(), &self.chunks, dtype, options),400}401}402403unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult<Series> {404self.cast_with_options(dtype, CastOptions::Overflowing)405}406}407408impl ChunkCast for BooleanChunked {409fn cast_with_options(&self, dtype: &DataType, options: CastOptions) -> PolarsResult<Series> {410match dtype {411#[cfg(feature = "dtype-struct")]412DataType::Struct(fields) => {413cast_single_to_struct(self.name().clone(), &self.chunks, fields, options)414},415#[cfg(feature = "dtype-categorical")]416DataType::Categorical(_, _) | DataType::Enum(_, _) => {417polars_bail!(InvalidOperation: "cannot cast Boolean to Categorical");418},419_ => cast_impl(self.name().clone(), &self.chunks, dtype, options),420}421}422423unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult<Series> {424self.cast_with_options(dtype, CastOptions::Overflowing)425}426}427428/// We cannot cast anything to or from List/LargeList429/// So this implementation casts the inner type430impl ChunkCast for ListChunked {431fn cast_with_options(&self, dtype: &DataType, options: CastOptions) -> PolarsResult<Series> {432let ca = self433.trim_lists_to_normalized_offsets()434.map_or(Cow::Borrowed(self), Cow::Owned);435let ca = ca.propagate_nulls().map_or(ca, Cow::Owned);436437use DataType::*;438match dtype {439List(child_type) => {440match (ca.inner_dtype(), &**child_type) {441(old, new) if old == new => Ok(ca.into_owned().into_series()),442// TODO @ cat-rework: can we implement this now?443#[cfg(feature = "dtype-categorical")]444(dt, Categorical(_, _) | Enum(_, _))445if !matches!(dt, Categorical(_, _) | Enum(_, _) | String | Null) =>446{447polars_bail!(InvalidOperation: "cannot cast List inner type: '{:?}' to Categorical", 
dt)448},449_ => {450// ensure the inner logical type bubbles up451let (arr, child_type) = cast_list(ca.as_ref(), child_type, options)?;452// SAFETY: we just cast so the dtype matches.453// we must take this path to correct for physical types.454unsafe {455Ok(Series::from_chunks_and_dtype_unchecked(456ca.name().clone(),457vec![arr],458&List(Box::new(child_type)),459))460}461},462}463},464#[cfg(feature = "dtype-array")]465Array(child_type, width) => {466let physical_type = dtype.to_physical();467468// cast to the physical type to avoid logical chunks.469let chunks = cast_chunks(ca.chunks(), &physical_type, options)?;470// SAFETY: we just cast so the dtype matches.471// we must take this path to correct for physical types.472unsafe {473Ok(Series::from_chunks_and_dtype_unchecked(474ca.name().clone(),475chunks,476&Array(child_type.clone(), *width),477))478}479},480#[cfg(feature = "dtype-u8")]481Binary => {482polars_ensure!(483matches!(self.inner_dtype(), UInt8),484InvalidOperation: "cannot cast List type (inner: '{:?}', to: '{:?}')",485self.inner_dtype(),486dtype,487);488let chunks = cast_chunks(self.chunks(), &DataType::Binary, options)?;489490// SAFETY: we just cast so the dtype matches.491unsafe {492Ok(Series::from_chunks_and_dtype_unchecked(493self.name().clone(),494chunks,495&DataType::Binary,496))497}498},499_ => {500polars_bail!(501InvalidOperation: "cannot cast List type (inner: '{:?}', to: '{:?}')",502ca.inner_dtype(),503dtype,504)505},506}507}508509unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult<Series> {510use DataType::*;511match dtype {512List(child_type) => cast_list_unchecked(self, child_type),513_ => self.cast_with_options(dtype, CastOptions::Overflowing),514}515}516}517518/// We cannot cast anything to or from List/LargeList519/// So this implementation casts the inner type520#[cfg(feature = "dtype-array")]521impl ChunkCast for ArrayChunked {522fn cast_with_options(&self, dtype: &DataType, options: CastOptions) -> PolarsResult<Series> 
{523let ca = self524.trim_lists_to_normalized_offsets()525.map_or(Cow::Borrowed(self), Cow::Owned);526let ca = ca.propagate_nulls().map_or(ca, Cow::Owned);527528use DataType::*;529match dtype {530Array(child_type, width) => {531polars_ensure!(532*width == ca.width(),533InvalidOperation: "cannot cast Array to a different width"534);535536match (ca.inner_dtype(), &**child_type) {537(old, new) if old == new => Ok(ca.into_owned().into_series()),538// TODO @ cat-rework: can we implement this now?539#[cfg(feature = "dtype-categorical")]540(dt, Categorical(_, _) | Enum(_, _)) if !matches!(dt, String) => {541polars_bail!(InvalidOperation: "cannot cast Array inner type: '{:?}' to dtype: {:?}", dt, child_type)542},543_ => {544// ensure the inner logical type bubbles up545let (arr, child_type) =546cast_fixed_size_list(ca.as_ref(), child_type, options)?;547// SAFETY: we just cast so the dtype matches.548// we must take this path to correct for physical types.549unsafe {550Ok(Series::from_chunks_and_dtype_unchecked(551ca.name().clone(),552vec![arr],553&Array(Box::new(child_type), *width),554))555}556},557}558},559List(child_type) => {560let physical_type = dtype.to_physical();561// cast to the physical type to avoid logical chunks.562let chunks = cast_chunks(ca.chunks(), &physical_type, options)?;563// SAFETY: we just cast so the dtype matches.564// we must take this path to correct for physical types.565unsafe {566Ok(Series::from_chunks_and_dtype_unchecked(567ca.name().clone(),568chunks,569&List(child_type.clone()),570))571}572},573_ => {574polars_bail!(575InvalidOperation: "cannot cast Array type (inner: '{:?}', to: '{:?}')",576ca.inner_dtype(),577dtype,578)579},580}581}582583unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult<Series> {584self.cast_with_options(dtype, CastOptions::Overflowing)585}586}587588// Returns inner data type. 
This is needed because a cast can instantiate the dtype inner589// values for instance with categoricals590fn cast_list(591ca: &ListChunked,592child_type: &DataType,593options: CastOptions,594) -> PolarsResult<(ArrayRef, DataType)> {595// We still rechunk because we must bubble up a single data-type596// TODO!: consider a version that works on chunks and merges the data-types and arrays.597let ca = ca.rechunk();598let arr = ca.downcast_as_array();599// SAFETY: inner dtype is passed correctly600let s = unsafe {601Series::from_chunks_and_dtype_unchecked(602PlSmallStr::EMPTY,603vec![arr.values().clone()],604ca.inner_dtype(),605)606};607let new_inner = s.cast_with_options(child_type, options)?;608609let inner_dtype = new_inner.dtype().clone();610debug_assert_eq!(&inner_dtype, child_type);611612let new_values = new_inner.array_ref(0).clone();613614let dtype = ListArray::<i64>::default_datatype(new_values.dtype().clone());615let new_arr = ListArray::<i64>::new(616dtype,617arr.offsets().clone(),618new_values,619arr.validity().cloned(),620);621Ok((new_arr.boxed(), inner_dtype))622}623624unsafe fn cast_list_unchecked(ca: &ListChunked, child_type: &DataType) -> PolarsResult<Series> {625// TODO! 
add chunked, but this must correct for list offsets.626let ca = ca.rechunk();627let arr = ca.downcast_as_array();628// SAFETY: inner dtype is passed correctly629let s = unsafe {630Series::from_chunks_and_dtype_unchecked(631PlSmallStr::EMPTY,632vec![arr.values().clone()],633ca.inner_dtype(),634)635};636let new_inner = s.cast_unchecked(child_type)?;637let new_values = new_inner.array_ref(0).clone();638639let dtype = ListArray::<i64>::default_datatype(new_values.dtype().clone());640let new_arr = ListArray::<i64>::new(641dtype,642arr.offsets().clone(),643new_values,644arr.validity().cloned(),645);646Ok(ListChunked::from_chunks_and_dtype_unchecked(647ca.name().clone(),648vec![Box::new(new_arr)],649DataType::List(Box::new(child_type.clone())),650)651.into_series())652}653654// Returns inner data type. This is needed because a cast can instantiate the dtype inner655// values for instance with categoricals656#[cfg(feature = "dtype-array")]657fn cast_fixed_size_list(658ca: &ArrayChunked,659child_type: &DataType,660options: CastOptions,661) -> PolarsResult<(ArrayRef, DataType)> {662let ca = ca.rechunk();663let arr = ca.downcast_as_array();664// SAFETY: inner dtype is passed correctly665let s = unsafe {666Series::from_chunks_and_dtype_unchecked(667PlSmallStr::EMPTY,668vec![arr.values().clone()],669ca.inner_dtype(),670)671};672let new_inner = s.cast_with_options(child_type, options)?;673674let inner_dtype = new_inner.dtype().clone();675debug_assert_eq!(&inner_dtype, child_type);676677let new_values = new_inner.array_ref(0).clone();678679let dtype = FixedSizeListArray::default_datatype(new_values.dtype().clone(), ca.width());680let new_arr = FixedSizeListArray::new(dtype, ca.len(), new_values, arr.validity().cloned());681Ok((Box::new(new_arr), inner_dtype))682}683684#[cfg(test)]685mod test {686use crate::chunked_array::cast::CastOptions;687use crate::prelude::*;688689#[test]690fn test_cast_list() -> PolarsResult<()> {691let mut builder = 
ListPrimitiveChunkedBuilder::<Int32Type>::new(692PlSmallStr::from_static("a"),69310,69410,695DataType::Int32,696);697builder.append_opt_slice(Some(&[1i32, 2, 3]));698builder.append_opt_slice(Some(&[1i32, 2, 3]));699let ca = builder.finish();700701let new = ca.cast_with_options(702&DataType::List(DataType::Float64.into()),703CastOptions::Strict,704)?;705706assert_eq!(new.dtype(), &DataType::List(DataType::Float64.into()));707Ok(())708}709710#[test]711#[cfg(feature = "dtype-categorical")]712fn test_cast_noop() {713// check if we can cast categorical twice without panic714let ca = StringChunked::new(PlSmallStr::from_static("foo"), &["bar", "ham"]);715let cats = Categories::global();716let out = ca717.cast_with_options(718&DataType::from_categories(cats.clone()),719CastOptions::Strict,720)721.unwrap();722let out = out.cast(&DataType::from_categories(cats)).unwrap();723assert!(matches!(out.dtype(), &DataType::Categorical(_, _)))724}725}726727728