Path: blob/main/crates/polars-plan/src/plans/aexpr/function_expr/strings.rs
8393 views
#[cfg(feature = "dtype-decimal")]1use polars_compute::decimal::DEC128_MAX_PREC;2#[cfg(feature = "dtype-struct")]3use polars_utils::format_pl_smallstr;45use super::*;67#[cfg(all(feature = "regex", feature = "timezones"))]8polars_utils::regex_cache::cached_regex! {9pub static TZ_AWARE_RE = r"(%z)|(%:z)|(%::z)|(%:::z)|(%#z)|(^%\+$)";10}1112#[cfg_attr(feature = "ir_serde", derive(serde::Serialize, serde::Deserialize))]13#[derive(Clone, PartialEq, Debug, Eq, Hash)]14pub enum IRStringFunction {15Format {16format: PlSmallStr,17insertions: Arc<[usize]>,18},19#[cfg(feature = "concat_str")]20ConcatHorizontal {21delimiter: PlSmallStr,22ignore_nulls: bool,23},24#[cfg(feature = "concat_str")]25ConcatVertical {26delimiter: PlSmallStr,27ignore_nulls: bool,28},29#[cfg(feature = "regex")]30Contains {31literal: bool,32strict: bool,33},34CountMatches(bool),35EndsWith,36Extract(usize),37ExtractAll,38#[cfg(feature = "extract_groups")]39ExtractGroups {40dtype: DataType,41pat: PlSmallStr,42},43#[cfg(feature = "regex")]44Find {45literal: bool,46strict: bool,47},48#[cfg(feature = "string_to_integer")]49ToInteger {50dtype: Option<DataType>,51strict: bool,52},53LenBytes,54LenChars,55Lowercase,56#[cfg(feature = "extract_jsonpath")]57JsonDecode(DataType),58#[cfg(feature = "extract_jsonpath")]59JsonPathMatch,60#[cfg(feature = "regex")]61Replace {62// negative is replace all63// how many matches to replace64n: i64,65literal: bool,66},67#[cfg(feature = "string_normalize")]68Normalize {69form: UnicodeForm,70},71#[cfg(feature = "string_reverse")]72Reverse,73#[cfg(feature = "string_pad")]74PadStart {75fill_char: char,76},77#[cfg(feature = "string_pad")]78PadEnd {79fill_char: char,80},81Slice,82Head,83Tail,84#[cfg(feature = "string_encoding")]85HexEncode,86#[cfg(feature = "binary_encoding")]87HexDecode(bool),88#[cfg(feature = "string_encoding")]89Base64Encode,90#[cfg(feature = "binary_encoding")]91Base64Decode(bool),92StartsWith,93StripChars,94StripCharsStart,95StripCharsEnd,96StripPrefix,97StripSuffix,98#[cfg(feature = "dtype-struct")]99SplitExact {100n: usize,101inclusive: bool,102},103#[cfg(feature = "dtype-struct")]104SplitN(usize),105#[cfg(feature = "temporal")]106// DataType can only be Date/Datetime/Time107Strptime(DataType, StrptimeOptions),108Split(bool),109#[cfg(feature = "regex")]110SplitRegex {111inclusive: bool,112strict: bool,113},114#[cfg(feature = "dtype-decimal")]115ToDecimal {116scale: usize,117},118#[cfg(feature = "nightly")]119Titlecase,120Uppercase,121#[cfg(feature = "string_pad")]122ZFill,123#[cfg(feature = "find_many")]124ContainsAny {125ascii_case_insensitive: bool,126},127#[cfg(feature = "find_many")]128ReplaceMany {129ascii_case_insensitive: bool,130leftmost: bool,131},132#[cfg(feature = "find_many")]133ExtractMany {134ascii_case_insensitive: bool,135overlapping: bool,136leftmost: bool,137},138#[cfg(feature = "find_many")]139FindMany {140ascii_case_insensitive: bool,141overlapping: bool,142leftmost: bool,143},144#[cfg(feature = "regex")]145EscapeRegex,146}147148impl IRStringFunction {149pub(super) fn get_field(&self, mapper: FieldsMapper) -> PolarsResult<Field> {150use IRStringFunction::*;151match self {152Format { .. } => mapper.with_dtype(DataType::String),153#[cfg(feature = "concat_str")]154ConcatVertical { .. } | ConcatHorizontal { .. } => mapper.with_dtype(DataType::String),155#[cfg(feature = "regex")]156Contains { .. } => mapper.with_dtype(DataType::Boolean),157CountMatches(_) => mapper.with_dtype(DataType::UInt32),158EndsWith | StartsWith => mapper.with_dtype(DataType::Boolean),159Extract(_) => mapper.with_same_dtype(),160ExtractAll => mapper.with_dtype(DataType::List(Box::new(DataType::String))),161#[cfg(feature = "extract_groups")]162ExtractGroups { dtype, .. } => mapper.with_dtype(dtype.clone()),163#[cfg(feature = "string_to_integer")]164ToInteger { dtype, .. } => mapper.with_dtype(dtype.clone().unwrap_or(DataType::Int64)),165#[cfg(feature = "regex")]166Find { .. } => mapper.with_dtype(DataType::UInt32),167#[cfg(feature = "extract_jsonpath")]168JsonDecode(dtype) => mapper.with_dtype(dtype.clone()),169#[cfg(feature = "extract_jsonpath")]170JsonPathMatch => mapper.with_dtype(DataType::String),171LenBytes => mapper.with_dtype(DataType::UInt32),172LenChars => mapper.with_dtype(DataType::UInt32),173#[cfg(feature = "regex")]174Replace { .. } => mapper.with_same_dtype(),175#[cfg(feature = "string_normalize")]176Normalize { .. } => mapper.with_same_dtype(),177#[cfg(feature = "string_reverse")]178Reverse => mapper.with_same_dtype(),179#[cfg(feature = "temporal")]180Strptime(dtype, options) => match dtype {181#[cfg(feature = "dtype-datetime")]182DataType::Datetime(time_unit, time_zone) => {183let mut time_zone = time_zone.clone();184#[cfg(all(feature = "regex", feature = "timezones"))]185if options186.format187.as_ref()188.is_some_and(|format| TZ_AWARE_RE.is_match(format.as_str()))189&& time_zone.is_none()190{191time_zone = Some(time_zone.unwrap_or(TimeZone::UTC));192}193mapper.with_dtype(DataType::Datetime(*time_unit, time_zone))194},195_ => mapper.with_dtype(dtype.clone()),196},197Split(_) => mapper.with_dtype(DataType::List(DataType::String.into())),198#[cfg(feature = "regex")]199SplitRegex { .. } => mapper.with_dtype(DataType::List(DataType::String.into())),200#[cfg(feature = "nightly")]201Titlecase => mapper.with_same_dtype(),202#[cfg(feature = "dtype-decimal")]203ToDecimal { scale } => mapper.with_dtype(DataType::Decimal(DEC128_MAX_PREC, *scale)),204#[cfg(feature = "string_encoding")]205HexEncode => mapper.with_same_dtype(),206#[cfg(feature = "binary_encoding")]207HexDecode(_) => mapper.with_dtype(DataType::Binary),208#[cfg(feature = "string_encoding")]209Base64Encode => mapper.with_same_dtype(),210#[cfg(feature = "binary_encoding")]211Base64Decode(_) => mapper.with_dtype(DataType::Binary),212Uppercase | Lowercase | StripChars | StripCharsStart | StripCharsEnd | StripPrefix213| StripSuffix | Slice | Head | Tail => mapper.with_same_dtype(),214#[cfg(feature = "string_pad")]215PadStart { .. } | PadEnd { .. } | ZFill => mapper.with_same_dtype(),216#[cfg(feature = "dtype-struct")]217SplitExact { n, .. } => mapper.with_dtype(DataType::Struct(218(0..n + 1)219.map(|i| Field::new(format_pl_smallstr!("field_{i}"), DataType::String))220.collect(),221)),222#[cfg(feature = "dtype-struct")]223SplitN(n) => mapper.with_dtype(DataType::Struct(224(0..*n)225.map(|i| Field::new(format_pl_smallstr!("field_{i}"), DataType::String))226.collect(),227)),228#[cfg(feature = "find_many")]229ContainsAny { .. } => mapper.with_dtype(DataType::Boolean),230#[cfg(feature = "find_many")]231ReplaceMany { .. } => mapper.with_same_dtype(),232#[cfg(feature = "find_many")]233ExtractMany { .. } => mapper.with_dtype(DataType::List(Box::new(DataType::String))),234#[cfg(feature = "find_many")]235FindMany { .. } => mapper.with_dtype(DataType::List(Box::new(DataType::UInt32))),236#[cfg(feature = "regex")]237EscapeRegex => mapper.with_same_dtype(),238}239}240241pub fn function_options(&self) -> FunctionOptions {242use IRStringFunction as S;243match self {244S::Format { .. } => FunctionOptions::elementwise(),245#[cfg(feature = "concat_str")]246S::ConcatHorizontal { .. } => FunctionOptions::elementwise()247.with_flags(|f| f | FunctionFlags::INPUT_WILDCARD_EXPANSION),248#[cfg(feature = "concat_str")]249S::ConcatVertical { .. } => FunctionOptions::aggregation(),250#[cfg(feature = "regex")]251S::Contains { .. } => {252FunctionOptions::elementwise().with_supertyping(Default::default())253},254S::CountMatches(_) => FunctionOptions::elementwise(),255S::EndsWith | S::StartsWith | S::Extract(_) => {256FunctionOptions::elementwise().with_supertyping(Default::default())257},258S::ExtractAll => FunctionOptions::elementwise(),259#[cfg(feature = "extract_groups")]260S::ExtractGroups { .. } => FunctionOptions::elementwise(),261#[cfg(feature = "string_to_integer")]262S::ToInteger { .. } => FunctionOptions::elementwise(),263#[cfg(feature = "regex")]264S::Find { .. } => FunctionOptions::elementwise().with_supertyping(Default::default()),265#[cfg(feature = "extract_jsonpath")]266S::JsonDecode { .. } => FunctionOptions::elementwise(),267#[cfg(feature = "extract_jsonpath")]268S::JsonPathMatch => FunctionOptions::elementwise(),269S::LenBytes | S::LenChars => FunctionOptions::elementwise(),270#[cfg(feature = "regex")]271S::Replace { .. } => {272FunctionOptions::elementwise().with_supertyping(Default::default())273},274#[cfg(feature = "string_normalize")]275S::Normalize { .. } => FunctionOptions::elementwise(),276#[cfg(feature = "string_reverse")]277S::Reverse => FunctionOptions::elementwise(),278#[cfg(feature = "temporal")]279S::Strptime(_, options) if options.format.is_some() => FunctionOptions::elementwise(),280#[cfg(feature = "temporal")]281S::Strptime(_, _) => FunctionOptions::elementwise_with_infer(),282S::Split(_) => FunctionOptions::elementwise(),283#[cfg(feature = "nightly")]284S::Titlecase => FunctionOptions::elementwise(),285#[cfg(feature = "dtype-decimal")]286S::ToDecimal { .. } => FunctionOptions::elementwise(),287#[cfg(feature = "string_encoding")]288S::HexEncode | S::Base64Encode => FunctionOptions::elementwise(),289#[cfg(feature = "binary_encoding")]290S::HexDecode(_) | S::Base64Decode(_) => FunctionOptions::elementwise(),291S::Uppercase | S::Lowercase => FunctionOptions::elementwise(),292S::StripChars293| S::StripCharsStart294| S::StripCharsEnd295| S::StripPrefix296| S::StripSuffix297| S::Head298| S::Tail => FunctionOptions::elementwise(),299S::Slice => FunctionOptions::elementwise(),300#[cfg(feature = "string_pad")]301S::PadStart { .. } | S::PadEnd { .. } | S::ZFill => FunctionOptions::elementwise(),302#[cfg(feature = "dtype-struct")]303S::SplitExact { .. } => FunctionOptions::elementwise(),304#[cfg(feature = "dtype-struct")]305S::SplitN(_) => FunctionOptions::elementwise(),306#[cfg(feature = "regex")]307S::SplitRegex { .. } => FunctionOptions::elementwise(),308#[cfg(feature = "find_many")]309S::ContainsAny { .. } => FunctionOptions::elementwise(),310#[cfg(feature = "find_many")]311S::ReplaceMany { .. } => FunctionOptions::elementwise(),312#[cfg(feature = "find_many")]313S::ExtractMany { .. } => FunctionOptions::elementwise(),314#[cfg(feature = "find_many")]315S::FindMany { .. } => FunctionOptions::elementwise(),316#[cfg(feature = "regex")]317S::EscapeRegex => FunctionOptions::elementwise(),318}319}320}321322impl Display for IRStringFunction {323fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {324use IRStringFunction::*;325let s = match self {326Format { .. } => "format",327#[cfg(feature = "regex")]328Contains { .. } => "contains",329CountMatches(_) => "count_matches",330EndsWith => "ends_with",331Extract(_) => "extract",332#[cfg(feature = "concat_str")]333ConcatHorizontal { .. } => "concat_horizontal",334#[cfg(feature = "concat_str")]335ConcatVertical { .. } => "concat_vertical",336ExtractAll => "extract_all",337#[cfg(feature = "extract_groups")]338ExtractGroups { .. } => "extract_groups",339#[cfg(feature = "string_to_integer")]340ToInteger { .. } => "to_integer",341#[cfg(feature = "regex")]342Find { .. } => "find",343Head => "head",344Tail => "tail",345#[cfg(feature = "extract_jsonpath")]346JsonDecode(..) => "json_decode",347#[cfg(feature = "extract_jsonpath")]348JsonPathMatch => "json_path_match",349LenBytes => "len_bytes",350Lowercase => "to_lowercase",351LenChars => "len_chars",352#[cfg(feature = "string_pad")]353PadEnd { .. } => "pad_end",354#[cfg(feature = "string_pad")]355PadStart { .. } => "pad_start",356#[cfg(feature = "regex")]357Replace { .. } => "replace",358#[cfg(feature = "string_normalize")]359Normalize { .. } => "normalize",360#[cfg(feature = "string_reverse")]361Reverse => "reverse",362#[cfg(feature = "string_encoding")]363HexEncode => "hex_encode",364#[cfg(feature = "binary_encoding")]365HexDecode(_) => "hex_decode",366#[cfg(feature = "string_encoding")]367Base64Encode => "base64_encode",368#[cfg(feature = "binary_encoding")]369Base64Decode(_) => "base64_decode",370Slice => "slice",371StartsWith => "starts_with",372StripChars => "strip_chars",373StripCharsStart => "strip_chars_start",374StripCharsEnd => "strip_chars_end",375StripPrefix => "strip_prefix",376StripSuffix => "strip_suffix",377#[cfg(feature = "dtype-struct")]378SplitExact { inclusive, .. } => {379if *inclusive {380"split_exact_inclusive"381} else {382"split_exact"383}384},385#[cfg(feature = "dtype-struct")]386SplitN(_) => "splitn",387#[cfg(feature = "temporal")]388Strptime(_, _) => "strptime",389Split(inclusive) => {390if *inclusive {391"split_inclusive"392} else {393"split"394}395},396#[cfg(feature = "regex")]397SplitRegex { inclusive, .. } => {398if *inclusive {399"split_regex_inclusive"400} else {401"split_regex"402}403},404#[cfg(feature = "nightly")]405Titlecase => "to_titlecase",406#[cfg(feature = "dtype-decimal")]407ToDecimal { .. } => "to_decimal",408Uppercase => "to_uppercase",409#[cfg(feature = "string_pad")]410ZFill => "zfill",411#[cfg(feature = "find_many")]412ContainsAny { .. } => "contains_any",413#[cfg(feature = "find_many")]414ReplaceMany { .. } => "replace_many",415#[cfg(feature = "find_many")]416ExtractMany { .. } => "extract_many",417#[cfg(feature = "find_many")]418FindMany { .. } => "extract_many",419#[cfg(feature = "regex")]420EscapeRegex => "escape_regex",421};422write!(f, "str.{s}")423}424}425426impl From<IRStringFunction> for IRFunctionExpr {427fn from(str: IRStringFunction) -> Self {428IRFunctionExpr::StringExpr(str)429}430}431432433