Path: blob/main/crates/polars-plan/src/plans/aexpr/function_expr/strings.rs
7889 views
#[cfg(feature = "dtype-decimal")]1use polars_compute::decimal::DEC128_MAX_PREC;2#[cfg(feature = "dtype-struct")]3use polars_utils::format_pl_smallstr;45use super::*;67#[cfg(all(feature = "regex", feature = "timezones"))]8polars_utils::regex_cache::cached_regex! {9pub static TZ_AWARE_RE = r"(%z)|(%:z)|(%::z)|(%:::z)|(%#z)|(^%\+$)";10}1112#[cfg_attr(feature = "ir_serde", derive(serde::Serialize, serde::Deserialize))]13#[derive(Clone, PartialEq, Debug, Eq, Hash)]14pub enum IRStringFunction {15Format {16format: PlSmallStr,17insertions: Arc<[usize]>,18},19#[cfg(feature = "concat_str")]20ConcatHorizontal {21delimiter: PlSmallStr,22ignore_nulls: bool,23},24#[cfg(feature = "concat_str")]25ConcatVertical {26delimiter: PlSmallStr,27ignore_nulls: bool,28},29#[cfg(feature = "regex")]30Contains {31literal: bool,32strict: bool,33},34CountMatches(bool),35EndsWith,36Extract(usize),37ExtractAll,38#[cfg(feature = "extract_groups")]39ExtractGroups {40dtype: DataType,41pat: PlSmallStr,42},43#[cfg(feature = "regex")]44Find {45literal: bool,46strict: bool,47},48#[cfg(feature = "string_to_integer")]49ToInteger {50dtype: Option<DataType>,51strict: bool,52},53LenBytes,54LenChars,55Lowercase,56#[cfg(feature = "extract_jsonpath")]57JsonDecode(DataType),58#[cfg(feature = "extract_jsonpath")]59JsonPathMatch,60#[cfg(feature = "regex")]61Replace {62// negative is replace all63// how many matches to replace64n: i64,65literal: bool,66},67#[cfg(feature = "string_normalize")]68Normalize {69form: UnicodeForm,70},71#[cfg(feature = "string_reverse")]72Reverse,73#[cfg(feature = "string_pad")]74PadStart {75fill_char: char,76},77#[cfg(feature = "string_pad")]78PadEnd {79fill_char: char,80},81Slice,82Head,83Tail,84#[cfg(feature = "string_encoding")]85HexEncode,86#[cfg(feature = "binary_encoding")]87HexDecode(bool),88#[cfg(feature = "string_encoding")]89Base64Encode,90#[cfg(feature = "binary_encoding")]91Base64Decode(bool),92StartsWith,93StripChars,94StripCharsStart,95StripCharsEnd,96StripPrefix,97StripSuffix,98#[cfg(feature = "dtype-struct")]99SplitExact {100n: usize,101inclusive: bool,102},103#[cfg(feature = "dtype-struct")]104SplitN(usize),105#[cfg(feature = "temporal")]106// DataType can only be Date/Datetime/Time107Strptime(DataType, StrptimeOptions),108Split(bool),109#[cfg(feature = "dtype-decimal")]110ToDecimal {111scale: usize,112},113#[cfg(feature = "nightly")]114Titlecase,115Uppercase,116#[cfg(feature = "string_pad")]117ZFill,118#[cfg(feature = "find_many")]119ContainsAny {120ascii_case_insensitive: bool,121},122#[cfg(feature = "find_many")]123ReplaceMany {124ascii_case_insensitive: bool,125leftmost: bool,126},127#[cfg(feature = "find_many")]128ExtractMany {129ascii_case_insensitive: bool,130overlapping: bool,131leftmost: bool,132},133#[cfg(feature = "find_many")]134FindMany {135ascii_case_insensitive: bool,136overlapping: bool,137leftmost: bool,138},139#[cfg(feature = "regex")]140EscapeRegex,141}142143impl IRStringFunction {144pub(super) fn get_field(&self, mapper: FieldsMapper) -> PolarsResult<Field> {145use IRStringFunction::*;146match self {147Format { .. } => mapper.with_dtype(DataType::String),148#[cfg(feature = "concat_str")]149ConcatVertical { .. } | ConcatHorizontal { .. } => mapper.with_dtype(DataType::String),150#[cfg(feature = "regex")]151Contains { .. } => mapper.with_dtype(DataType::Boolean),152CountMatches(_) => mapper.with_dtype(DataType::UInt32),153EndsWith | StartsWith => mapper.with_dtype(DataType::Boolean),154Extract(_) => mapper.with_same_dtype(),155ExtractAll => mapper.with_dtype(DataType::List(Box::new(DataType::String))),156#[cfg(feature = "extract_groups")]157ExtractGroups { dtype, .. } => mapper.with_dtype(dtype.clone()),158#[cfg(feature = "string_to_integer")]159ToInteger { dtype, .. } => mapper.with_dtype(dtype.clone().unwrap_or(DataType::Int64)),160#[cfg(feature = "regex")]161Find { .. } => mapper.with_dtype(DataType::UInt32),162#[cfg(feature = "extract_jsonpath")]163JsonDecode(dtype) => mapper.with_dtype(dtype.clone()),164#[cfg(feature = "extract_jsonpath")]165JsonPathMatch => mapper.with_dtype(DataType::String),166LenBytes => mapper.with_dtype(DataType::UInt32),167LenChars => mapper.with_dtype(DataType::UInt32),168#[cfg(feature = "regex")]169Replace { .. } => mapper.with_same_dtype(),170#[cfg(feature = "string_normalize")]171Normalize { .. } => mapper.with_same_dtype(),172#[cfg(feature = "string_reverse")]173Reverse => mapper.with_same_dtype(),174#[cfg(feature = "temporal")]175Strptime(dtype, options) => match dtype {176#[cfg(feature = "dtype-datetime")]177DataType::Datetime(time_unit, time_zone) => {178let mut time_zone = time_zone.clone();179#[cfg(all(feature = "regex", feature = "timezones"))]180if options181.format182.as_ref()183.is_some_and(|format| TZ_AWARE_RE.is_match(format.as_str()))184&& time_zone.is_none()185{186time_zone = Some(time_zone.unwrap_or(TimeZone::UTC));187}188mapper.with_dtype(DataType::Datetime(*time_unit, time_zone))189},190_ => mapper.with_dtype(dtype.clone()),191},192Split(_) => mapper.with_dtype(DataType::List(Box::new(DataType::String))),193#[cfg(feature = "nightly")]194Titlecase => mapper.with_same_dtype(),195#[cfg(feature = "dtype-decimal")]196ToDecimal { scale } => mapper.with_dtype(DataType::Decimal(DEC128_MAX_PREC, *scale)),197#[cfg(feature = "string_encoding")]198HexEncode => mapper.with_same_dtype(),199#[cfg(feature = "binary_encoding")]200HexDecode(_) => mapper.with_dtype(DataType::Binary),201#[cfg(feature = "string_encoding")]202Base64Encode => mapper.with_same_dtype(),203#[cfg(feature = "binary_encoding")]204Base64Decode(_) => mapper.with_dtype(DataType::Binary),205Uppercase | Lowercase | StripChars | StripCharsStart | StripCharsEnd | StripPrefix206| StripSuffix | Slice | Head | Tail => mapper.with_same_dtype(),207#[cfg(feature = "string_pad")]208PadStart { .. } | PadEnd { .. } | ZFill => mapper.with_same_dtype(),209#[cfg(feature = "dtype-struct")]210SplitExact { n, .. } => mapper.with_dtype(DataType::Struct(211(0..n + 1)212.map(|i| Field::new(format_pl_smallstr!("field_{i}"), DataType::String))213.collect(),214)),215#[cfg(feature = "dtype-struct")]216SplitN(n) => mapper.with_dtype(DataType::Struct(217(0..*n)218.map(|i| Field::new(format_pl_smallstr!("field_{i}"), DataType::String))219.collect(),220)),221#[cfg(feature = "find_many")]222ContainsAny { .. } => mapper.with_dtype(DataType::Boolean),223#[cfg(feature = "find_many")]224ReplaceMany { .. } => mapper.with_same_dtype(),225#[cfg(feature = "find_many")]226ExtractMany { .. } => mapper.with_dtype(DataType::List(Box::new(DataType::String))),227#[cfg(feature = "find_many")]228FindMany { .. } => mapper.with_dtype(DataType::List(Box::new(DataType::UInt32))),229#[cfg(feature = "regex")]230EscapeRegex => mapper.with_same_dtype(),231}232}233234pub fn function_options(&self) -> FunctionOptions {235use IRStringFunction as S;236match self {237S::Format { .. } => FunctionOptions::elementwise(),238#[cfg(feature = "concat_str")]239S::ConcatHorizontal { .. } => FunctionOptions::elementwise()240.with_flags(|f| f | FunctionFlags::INPUT_WILDCARD_EXPANSION),241#[cfg(feature = "concat_str")]242S::ConcatVertical { .. } => FunctionOptions::aggregation(),243#[cfg(feature = "regex")]244S::Contains { .. } => {245FunctionOptions::elementwise().with_supertyping(Default::default())246},247S::CountMatches(_) => FunctionOptions::elementwise(),248S::EndsWith | S::StartsWith | S::Extract(_) => {249FunctionOptions::elementwise().with_supertyping(Default::default())250},251S::ExtractAll => FunctionOptions::elementwise(),252#[cfg(feature = "extract_groups")]253S::ExtractGroups { .. } => FunctionOptions::elementwise(),254#[cfg(feature = "string_to_integer")]255S::ToInteger { .. } => FunctionOptions::elementwise(),256#[cfg(feature = "regex")]257S::Find { .. } => FunctionOptions::elementwise().with_supertyping(Default::default()),258#[cfg(feature = "extract_jsonpath")]259S::JsonDecode { .. } => FunctionOptions::elementwise(),260#[cfg(feature = "extract_jsonpath")]261S::JsonPathMatch => FunctionOptions::elementwise(),262S::LenBytes | S::LenChars => FunctionOptions::elementwise(),263#[cfg(feature = "regex")]264S::Replace { .. } => {265FunctionOptions::elementwise().with_supertyping(Default::default())266},267#[cfg(feature = "string_normalize")]268S::Normalize { .. } => FunctionOptions::elementwise(),269#[cfg(feature = "string_reverse")]270S::Reverse => FunctionOptions::elementwise(),271#[cfg(feature = "temporal")]272S::Strptime(_, options) if options.format.is_some() => FunctionOptions::elementwise(),273#[cfg(feature = "temporal")]274S::Strptime(_, _) => FunctionOptions::elementwise_with_infer(),275S::Split(_) => FunctionOptions::elementwise(),276#[cfg(feature = "nightly")]277S::Titlecase => FunctionOptions::elementwise(),278#[cfg(feature = "dtype-decimal")]279S::ToDecimal { .. } => FunctionOptions::elementwise(),280#[cfg(feature = "string_encoding")]281S::HexEncode | S::Base64Encode => FunctionOptions::elementwise(),282#[cfg(feature = "binary_encoding")]283S::HexDecode(_) | S::Base64Decode(_) => FunctionOptions::elementwise(),284S::Uppercase | S::Lowercase => FunctionOptions::elementwise(),285S::StripChars286| S::StripCharsStart287| S::StripCharsEnd288| S::StripPrefix289| S::StripSuffix290| S::Head291| S::Tail => FunctionOptions::elementwise(),292S::Slice => FunctionOptions::elementwise(),293#[cfg(feature = "string_pad")]294S::PadStart { .. } | S::PadEnd { .. } | S::ZFill => FunctionOptions::elementwise(),295#[cfg(feature = "dtype-struct")]296S::SplitExact { .. } => FunctionOptions::elementwise(),297#[cfg(feature = "dtype-struct")]298S::SplitN(_) => FunctionOptions::elementwise(),299#[cfg(feature = "find_many")]300S::ContainsAny { .. } => FunctionOptions::elementwise(),301#[cfg(feature = "find_many")]302S::ReplaceMany { .. } => FunctionOptions::elementwise(),303#[cfg(feature = "find_many")]304S::ExtractMany { .. } => FunctionOptions::elementwise(),305#[cfg(feature = "find_many")]306S::FindMany { .. } => FunctionOptions::elementwise(),307#[cfg(feature = "regex")]308S::EscapeRegex => FunctionOptions::elementwise(),309}310}311}312313impl Display for IRStringFunction {314fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {315use IRStringFunction::*;316let s = match self {317Format { .. } => "format",318#[cfg(feature = "regex")]319Contains { .. } => "contains",320CountMatches(_) => "count_matches",321EndsWith => "ends_with",322Extract(_) => "extract",323#[cfg(feature = "concat_str")]324ConcatHorizontal { .. } => "concat_horizontal",325#[cfg(feature = "concat_str")]326ConcatVertical { .. } => "concat_vertical",327ExtractAll => "extract_all",328#[cfg(feature = "extract_groups")]329ExtractGroups { .. } => "extract_groups",330#[cfg(feature = "string_to_integer")]331ToInteger { .. } => "to_integer",332#[cfg(feature = "regex")]333Find { .. } => "find",334Head => "head",335Tail => "tail",336#[cfg(feature = "extract_jsonpath")]337JsonDecode(..) => "json_decode",338#[cfg(feature = "extract_jsonpath")]339JsonPathMatch => "json_path_match",340LenBytes => "len_bytes",341Lowercase => "to_lowercase",342LenChars => "len_chars",343#[cfg(feature = "string_pad")]344PadEnd { .. } => "pad_end",345#[cfg(feature = "string_pad")]346PadStart { .. } => "pad_start",347#[cfg(feature = "regex")]348Replace { .. } => "replace",349#[cfg(feature = "string_normalize")]350Normalize { .. } => "normalize",351#[cfg(feature = "string_reverse")]352Reverse => "reverse",353#[cfg(feature = "string_encoding")]354HexEncode => "hex_encode",355#[cfg(feature = "binary_encoding")]356HexDecode(_) => "hex_decode",357#[cfg(feature = "string_encoding")]358Base64Encode => "base64_encode",359#[cfg(feature = "binary_encoding")]360Base64Decode(_) => "base64_decode",361Slice => "slice",362StartsWith => "starts_with",363StripChars => "strip_chars",364StripCharsStart => "strip_chars_start",365StripCharsEnd => "strip_chars_end",366StripPrefix => "strip_prefix",367StripSuffix => "strip_suffix",368#[cfg(feature = "dtype-struct")]369SplitExact { inclusive, .. } => {370if *inclusive {371"split_exact_inclusive"372} else {373"split_exact"374}375},376#[cfg(feature = "dtype-struct")]377SplitN(_) => "splitn",378#[cfg(feature = "temporal")]379Strptime(_, _) => "strptime",380Split(inclusive) => {381if *inclusive {382"split_inclusive"383} else {384"split"385}386},387#[cfg(feature = "nightly")]388Titlecase => "to_titlecase",389#[cfg(feature = "dtype-decimal")]390ToDecimal { .. } => "to_decimal",391Uppercase => "to_uppercase",392#[cfg(feature = "string_pad")]393ZFill => "zfill",394#[cfg(feature = "find_many")]395ContainsAny { .. } => "contains_any",396#[cfg(feature = "find_many")]397ReplaceMany { .. } => "replace_many",398#[cfg(feature = "find_many")]399ExtractMany { .. } => "extract_many",400#[cfg(feature = "find_many")]401FindMany { .. } => "extract_many",402#[cfg(feature = "regex")]403EscapeRegex => "escape_regex",404};405write!(f, "str.{s}")406}407}408409impl From<IRStringFunction> for IRFunctionExpr {410fn from(str: IRStringFunction) -> Self {411IRFunctionExpr::StringExpr(str)412}413}414415416