// Path: crates/polars-ops/src/chunked_array/strings/namespace.rs
use arrow::array::ValueSize;1use arrow::legacy::kernels::string::*;2#[cfg(feature = "string_encoding")]3use base64::Engine as _;4#[cfg(feature = "string_encoding")]5use base64::engine::general_purpose;6#[cfg(feature = "string_to_integer")]7use num_traits::Num;8use polars_core::prelude::arity::*;9use polars_utils::regex_cache::{compile_regex, with_regex_cache};1011use super::*;12#[cfg(feature = "binary_encoding")]13use crate::chunked_array::binary::BinaryNameSpaceImpl;14#[cfg(feature = "string_normalize")]15use crate::prelude::strings::normalize::UnicodeForm;1617// We need this to infer the right lifetimes for the match closure.18#[inline(always)]19fn infer_re_match<F>(f: F) -> F20where21F: for<'a, 'b> FnMut(Option<&'a str>, Option<&'b str>) -> Option<bool>,22{23f24}2526#[cfg(feature = "string_to_integer")]27// This is a helper function used in the `to_integer` method of the StringNameSpaceImpl trait.28fn parse_integer<T>(29ca: &ChunkedArray<StringType>,30base: &UInt32Chunked,31strict: bool,32) -> PolarsResult<Series>33where34T: PolarsIntegerType,35T::Native: Num,36ChunkedArray<T>: IntoSeries,37<<T as polars_core::datatypes::PolarsNumericType>::Native as num_traits::Num>::FromStrRadixErr:38std::fmt::Display,39{40let f = |opt_s: Option<&str>, opt_base: Option<u32>| -> PolarsResult<Option<T::Native>> {41let (Some(s), Some(base)) = (opt_s, opt_base) else {42return Ok(None);43};4445if !(2..=36).contains(&base) {46polars_bail!(ComputeError: "`to_integer` called with invalid base '{base}'");47}4849Ok(T::Native::from_str_radix(s, base).ok())50};51let out: ChunkedArray<T> = broadcast_try_binary_elementwise(ca, base, f)?;52if strict && ca.null_count() != out.null_count() {53let failure_mask = ca.is_not_null() & out.is_null() & base.is_not_null();54let n_failures = failure_mask.num_trues();55if n_failures == 0 {56return Ok(out.into_series());57}5859let some_failures = if ca.len() == 1 {60ca.clone()61} else {62let all_failures = ca.filter(&failure_mask)?;63// `.unique()` does 
not necessarily preserve the original order.64let unique_failures_args = all_failures.arg_unique()?;65all_failures.take(&unique_failures_args.slice(0, 10))?66};67let some_error_msg = match base.len() {681 => {69// we can ensure that base is not null.70let base = base.get(0).unwrap();71some_failures72.get(0)73.and_then(|s| T::Native::from_str_radix(s, base).err())74.map_or_else(75|| unreachable!("failed to extract ParseIntError"),76|e| format!("{e}"),77)78},79_ => {80let base_failures = base.filter(&failure_mask)?;81some_failures82.get(0)83.zip(base_failures.get(0))84.and_then(|(s, base)| T::Native::from_str_radix(s, base).err())85.map_or_else(86|| unreachable!("failed to extract ParseIntError"),87|e| format!("{e}"),88)89},90};91polars_bail!(92ComputeError:93"strict integer parsing failed for {} value(s): {}; error message for the \94first shown value: '{}' (consider non-strict parsing)",95n_failures,96some_failures.into_series().fmt_list(),97some_error_msg98);99}100101Ok(out.into_series())102}103104pub trait StringNameSpaceImpl: AsString {105#[cfg(not(feature = "binary_encoding"))]106fn hex_decode(&self) -> PolarsResult<StringChunked> {107panic!("activate 'binary_encoding' feature")108}109110#[cfg(feature = "binary_encoding")]111fn hex_decode(&self, strict: bool) -> PolarsResult<BinaryChunked> {112let ca = self.as_string();113ca.as_binary().hex_decode(strict)114}115116#[must_use]117#[cfg(feature = "string_encoding")]118fn hex_encode(&self) -> StringChunked {119let ca = self.as_string();120ca.apply_values(|s| hex::encode(s).into())121}122123#[cfg(not(feature = "binary_encoding"))]124fn base64_decode(&self) -> PolarsResult<StringChunked> {125panic!("activate 'binary_encoding' feature")126}127128#[cfg(feature = "binary_encoding")]129fn base64_decode(&self, strict: bool) -> PolarsResult<BinaryChunked> {130let ca = self.as_string();131ca.as_binary().base64_decode(strict)132}133134#[must_use]135#[cfg(feature = "string_encoding")]136fn base64_encode(&self) -> 
StringChunked {137let ca = self.as_string();138ca.apply_values(|s| general_purpose::STANDARD.encode(s).into())139}140141#[cfg(feature = "string_to_integer")]142// Parse a string number with base _radix_ into a decimal dtype143fn to_integer(144&self,145base: &UInt32Chunked,146dtype: Option<DataType>,147strict: bool,148) -> PolarsResult<Series> {149let ca = self.as_string();150151polars_ensure!(152ca.len() == base.len() || ca.len() == 1 || base.len() == 1,153length_mismatch = "str.to_integer",154ca.len(),155base.len()156);157158match dtype.unwrap_or(DataType::Int64) {159DataType::Int8 => parse_integer::<Int8Type>(ca, base, strict),160DataType::Int16 => parse_integer::<Int16Type>(ca, base, strict),161DataType::Int32 => parse_integer::<Int32Type>(ca, base, strict),162DataType::Int64 => parse_integer::<Int64Type>(ca, base, strict),163DataType::Int128 => parse_integer::<Int128Type>(ca, base, strict),164DataType::UInt8 => parse_integer::<UInt8Type>(ca, base, strict),165DataType::UInt16 => parse_integer::<UInt16Type>(ca, base, strict),166DataType::UInt32 => parse_integer::<UInt32Type>(ca, base, strict),167DataType::UInt64 => parse_integer::<UInt64Type>(ca, base, strict),168DataType::UInt128 => parse_integer::<UInt128Type>(ca, base, strict),169dtype => polars_bail!(InvalidOperation: "Invalid dtype {:?}", dtype),170}171}172173fn contains_chunked(174&self,175pat: &StringChunked,176literal: bool,177strict: bool,178) -> PolarsResult<BooleanChunked> {179let ca = self.as_string();180match (ca.len(), pat.len()) {181(_, 1) => match pat.get(0) {182Some(pat) => {183if literal {184ca.contains_literal(pat)185} else {186ca.contains(pat, strict)187}188},189None => Ok(BooleanChunked::full_null(ca.name().clone(), ca.len())),190},191(1, _) if ca.null_count() == 1 => Ok(BooleanChunked::full_null(192ca.name().clone(),193ca.len().max(pat.len()),194)),195_ => {196if literal {197Ok(broadcast_binary_elementwise_values(ca, pat, |src, pat| {198src.contains(pat)199}))200} else if strict 
{201with_regex_cache(|reg_cache| {202broadcast_try_binary_elementwise(ca, pat, |opt_src, opt_pat| {203match (opt_src, opt_pat) {204(Some(src), Some(pat)) => {205let reg = reg_cache.compile(pat)?;206Ok(Some(reg.is_match(src)))207},208_ => Ok(None),209}210})211})212} else {213with_regex_cache(|reg_cache| {214Ok(broadcast_binary_elementwise(215ca,216pat,217infer_re_match(|src, pat| {218let reg = reg_cache.compile(pat?).ok()?;219Some(reg.is_match(src?))220}),221))222})223}224},225}226}227228fn find_chunked(229&self,230pat: &StringChunked,231literal: bool,232strict: bool,233) -> PolarsResult<UInt32Chunked> {234let ca = self.as_string();235if pat.len() == 1 {236return if let Some(pat) = pat.get(0) {237if literal {238ca.find_literal(pat)239} else {240ca.find(pat, strict)241}242} else {243Ok(UInt32Chunked::full_null(ca.name().clone(), ca.len()))244};245} else if ca.len() == 1 && ca.null_count() == 1 {246return Ok(UInt32Chunked::full_null(247ca.name().clone(),248ca.len().max(pat.len()),249));250}251if literal {252Ok(broadcast_binary_elementwise(253ca,254pat,255|src: Option<&str>, pat: Option<&str>| src?.find(pat?).map(|idx| idx as u32),256))257} else {258with_regex_cache(|reg_cache| {259let matcher = |src: Option<&str>, pat: Option<&str>| -> PolarsResult<Option<u32>> {260if let (Some(src), Some(pat)) = (src, pat) {261let re = reg_cache.compile(pat)?;262return Ok(re.find(src).map(|m| m.start() as u32));263}264Ok(None)265};266broadcast_try_binary_elementwise(ca, pat, matcher)267})268}269}270271/// Get the length of the string values as number of chars.272fn str_len_chars(&self) -> UInt32Chunked {273let ca = self.as_string();274ca.apply_kernel_cast(&string_len_chars)275}276277/// Get the length of the string values as number of bytes.278fn str_len_bytes(&self) -> UInt32Chunked {279let ca = self.as_string();280ca.apply_kernel_cast(&utf8view_len_bytes)281}282283/// Pad the start of the string until it reaches the given length.284///285/// Padding is done using the specified 
`fill_char`.286/// Strings with length equal to or greater than the given length are287/// returned as-is.288#[cfg(feature = "string_pad")]289fn pad_start(&self, length: &UInt64Chunked, fill_char: char) -> StringChunked {290let ca = self.as_string();291pad::pad_start(ca, length, fill_char)292}293294/// Pad the end of the string until it reaches the given length.295///296/// Padding is done using the specified `fill_char`.297/// Strings with length equal to or greater than the given length are298/// returned as-is.299#[cfg(feature = "string_pad")]300fn pad_end(&self, length: &UInt64Chunked, fill_char: char) -> StringChunked {301let ca = self.as_string();302pad::pad_end(ca, length, fill_char)303}304305/// Pad the start of the string with zeros until it reaches the given length.306///307/// A sign prefix (`-`) is handled by inserting the padding after the sign308/// character rather than before.309/// Strings with length equal to or greater than the given length are310/// returned as-is.311#[cfg(feature = "string_pad")]312fn zfill(&self, length: &UInt64Chunked) -> StringChunked {313let ca = self.as_string();314pad::zfill(ca, length)315}316317/// Check if strings contain a regex pattern.318fn contains(&self, pat: &str, strict: bool) -> PolarsResult<BooleanChunked> {319let ca = self.as_string();320let res_reg = polars_utils::regex_cache::compile_regex(pat);321let opt_reg = if strict { Some(res_reg?) 
} else { res_reg.ok() };322let out: BooleanChunked = if let Some(reg) = opt_reg {323unary_elementwise_values(ca, |s| reg.is_match(s))324} else {325BooleanChunked::full_null(ca.name().clone(), ca.len())326};327Ok(out)328}329330/// Check if strings contain a given literal331fn contains_literal(&self, lit: &str) -> PolarsResult<BooleanChunked> {332// note: benchmarking shows that the regex engine is actually333// faster at finding literal matches than str::contains.334// ref: https://github.com/pola-rs/polars/pull/6811335self.contains(regex::escape(lit).as_str(), true)336}337338/// Return the index position of a literal substring in the target string.339fn find_literal(&self, lit: &str) -> PolarsResult<UInt32Chunked> {340self.find(regex::escape(lit).as_str(), true)341}342343/// Return the index position of a regular expression substring in the target string.344fn find(&self, pat: &str, strict: bool) -> PolarsResult<UInt32Chunked> {345let ca = self.as_string();346match polars_utils::regex_cache::compile_regex(pat) {347Ok(rx) => Ok(unary_elementwise(ca, |opt_s| {348opt_s.and_then(|s| rx.find(s)).map(|m| m.start() as u32)349})),350Err(_) if !strict => Ok(UInt32Chunked::full_null(ca.name().clone(), ca.len())),351Err(e) => Err(PolarsError::ComputeError(352format!("Invalid regular expression: {e}").into(),353)),354}355}356357/// Replace the leftmost regex-matched (sub)string with another string358fn replace<'a>(&'a self, pat: &str, val: &str) -> PolarsResult<StringChunked> {359let reg = polars_utils::regex_cache::compile_regex(pat)?;360let f = |s: &'a str| reg.replace(s, val);361let ca = self.as_string();362Ok(ca.apply_values(f))363}364365/// Replace the leftmost literal (sub)string with another string366fn replace_literal<'a>(367&'a self,368pat: &str,369val: &str,370n: usize,371) -> PolarsResult<StringChunked> {372let ca = self.as_string();373if ca.is_empty() {374return Ok(ca.clone());375}376377// amortize allocation378let mut buf = String::new();379380let f = move |s: &'a 
str| {381buf.clear();382let mut changed = false;383384// See: str.replacen385let mut last_end = 0;386for (start, part) in s.match_indices(pat).take(n) {387changed = true;388buf.push_str(unsafe { s.get_unchecked(last_end..start) });389buf.push_str(val);390last_end = start + part.len();391}392buf.push_str(unsafe { s.get_unchecked(last_end..s.len()) });393394if changed {395// extend lifetime396// lifetime is bound to 'a397let slice = buf.as_str();398unsafe { std::mem::transmute::<&str, &'a str>(slice) }399} else {400s401}402};403Ok(ca.apply_mut(f))404}405406/// Replace all regex-matched (sub)strings with another string407fn replace_all(&self, pat: &str, val: &str) -> PolarsResult<StringChunked> {408let ca = self.as_string();409let reg = polars_utils::regex_cache::compile_regex(pat)?;410Ok(ca.apply_values(|s| reg.replace_all(s, val)))411}412413/// Replace all matching literal (sub)strings with another string414fn replace_literal_all<'a>(&'a self, pat: &str, val: &str) -> PolarsResult<StringChunked> {415let ca = self.as_string();416if ca.is_empty() {417return Ok(ca.clone());418}419420// Amortize allocation.421let mut buf = String::new();422423let f = move |s: &'a str| {424buf.clear();425let mut changed = false;426427// See: str.replace.428let mut last_end = 0;429for (start, part) in s.match_indices(pat) {430changed = true;431buf.push_str(unsafe { s.get_unchecked(last_end..start) });432buf.push_str(val);433last_end = start + part.len();434}435buf.push_str(unsafe { s.get_unchecked(last_end..s.len()) });436437if changed {438// Extend lifetime, lifetime is bound to 'a.439let slice = buf.as_str();440unsafe { std::mem::transmute::<&str, &'a str>(slice) }441} else {442s443}444};445446Ok(ca.apply_mut(f))447}448449/// Extract the nth capture group from pattern.450fn extract(&self, pat: &StringChunked, group_index: usize) -> PolarsResult<StringChunked> {451let ca = self.as_string();452super::extract::extract_group(ca, pat, group_index)453}454455/// Extract each successive 
non-overlapping regex match in an individual string as an array.456fn extract_all(&self, pat: &str) -> PolarsResult<ListChunked> {457let ca = self.as_string();458let reg = polars_utils::regex_cache::compile_regex(pat)?;459460let mut builder =461ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size());462for arr in ca.downcast_iter() {463for opt_s in arr {464match opt_s {465None => builder.append_null(),466Some(s) => builder.append_values_iter(reg.find_iter(s).map(|m| m.as_str())),467}468}469}470Ok(builder.finish())471}472473fn strip_chars(&self, pat: &Column) -> PolarsResult<StringChunked> {474let ca = self.as_string();475if pat.dtype() == &DataType::Null {476Ok(unary_elementwise(ca, |opt_s| opt_s.map(|s| s.trim())))477} else {478Ok(strip_chars(ca, pat.str()?))479}480}481482fn strip_chars_start(&self, pat: &Column) -> PolarsResult<StringChunked> {483let ca = self.as_string();484if pat.dtype() == &DataType::Null {485Ok(unary_elementwise(ca, |opt_s| opt_s.map(|s| s.trim_start())))486} else {487Ok(strip_chars_start(ca, pat.str()?))488}489}490491fn strip_chars_end(&self, pat: &Column) -> PolarsResult<StringChunked> {492let ca = self.as_string();493if pat.dtype() == &DataType::Null {494Ok(unary_elementwise(ca, |opt_s| opt_s.map(|s| s.trim_end())))495} else {496Ok(strip_chars_end(ca, pat.str()?))497}498}499500fn strip_prefix(&self, prefix: &StringChunked) -> StringChunked {501let ca = self.as_string();502strip_prefix(ca, prefix)503}504505fn strip_suffix(&self, suffix: &StringChunked) -> StringChunked {506let ca = self.as_string();507strip_suffix(ca, suffix)508}509510#[cfg(feature = "dtype-struct")]511fn split_exact(&self, by: &StringChunked, n: usize) -> PolarsResult<StructChunked> {512let ca = self.as_string();513514split_to_struct(ca, by, n + 1, str::split, false)515}516517#[cfg(feature = "dtype-struct")]518fn split_exact_inclusive(&self, by: &StringChunked, n: usize) -> PolarsResult<StructChunked> {519let ca = 
self.as_string();520521split_to_struct(ca, by, n + 1, str::split_inclusive, false)522}523524#[cfg(feature = "dtype-struct")]525fn splitn(&self, by: &StringChunked, n: usize) -> PolarsResult<StructChunked> {526let ca = self.as_string();527528split_to_struct(ca, by, n, |s, by| s.splitn(n, by), true)529}530531fn split(&self, by: &StringChunked) -> PolarsResult<ListChunked> {532let ca = self.as_string();533split_helper(ca, by, str::split)534}535536fn split_inclusive(&self, by: &StringChunked) -> PolarsResult<ListChunked> {537let ca = self.as_string();538split_helper(ca, by, str::split_inclusive)539}540541/// Extract each successive non-overlapping regex match in an individual string as an array.542fn extract_all_many(&self, pat: &StringChunked) -> PolarsResult<ListChunked> {543let ca = self.as_string();544polars_ensure!(545ca.len() == pat.len(),546ComputeError: "pattern's length: {} does not match that of the argument series: {}",547pat.len(), ca.len(),548);549550let mut builder =551ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size());552with_regex_cache(|re_cache| {553binary_elementwise_for_each(ca, pat, |opt_s, opt_pat| match (opt_s, opt_pat) {554(_, None) | (None, _) => builder.append_null(),555(Some(s), Some(pat)) => {556let re = re_cache.compile(pat).unwrap();557builder.append_values_iter(re.find_iter(s).map(|m| m.as_str()));558},559});560});561Ok(builder.finish())562}563564#[cfg(feature = "extract_groups")]565/// Extract all capture groups from pattern and return as a struct.566fn extract_groups(&self, pat: &str, dtype: &DataType) -> PolarsResult<Series> {567let ca = self.as_string();568super::extract::extract_groups(ca, pat, dtype)569}570571/// Count all successive non-overlapping regex matches.572fn count_matches(&self, pat: &str, literal: bool) -> PolarsResult<UInt32Chunked> {573let ca = self.as_string();574if literal {575Ok(unary_elementwise(ca, |opt_s| {576opt_s.map(|s| s.matches(pat).count() as u32)577}))578} else {579let re = 
compile_regex(pat)?;580Ok(unary_elementwise(ca, |opt_s| {581opt_s.map(|s| re.find_iter(s).count() as u32)582}))583}584}585586/// Count all successive non-overlapping regex matches.587fn count_matches_many(588&self,589pat: &StringChunked,590literal: bool,591) -> PolarsResult<UInt32Chunked> {592let ca = self.as_string();593polars_ensure!(594ca.len() == pat.len(),595ComputeError: "pattern's length: {} does not match that of the argument series: {}",596pat.len(), ca.len(),597);598599let out: UInt32Chunked = if literal {600broadcast_binary_elementwise(ca, pat, |s: Option<&str>, p: Option<&str>| {601Some(s?.matches(p?).count() as u32)602})603} else {604with_regex_cache(|re_cache| {605let op = move |opt_s: Option<&str>,606opt_pat: Option<&str>|607-> PolarsResult<Option<u32>> {608match (opt_s, opt_pat) {609(Some(s), Some(pat)) => {610let reg = re_cache.compile(pat)?;611Ok(Some(reg.find_iter(s).count() as u32))612},613_ => Ok(None),614}615};616broadcast_try_binary_elementwise(ca, pat, op)617})?618};619620Ok(out.with_name(ca.name().clone()))621}622623/// Modify the strings to their lowercase equivalent.624#[must_use]625fn to_lowercase(&self) -> StringChunked {626let ca = self.as_string();627case::to_lowercase(ca)628}629630/// Modify the strings to their uppercase equivalent.631#[must_use]632fn to_uppercase(&self) -> StringChunked {633let ca = self.as_string();634case::to_uppercase(ca)635}636637/// Modify the strings to their titlecase equivalent.638#[must_use]639#[cfg(feature = "nightly")]640fn to_titlecase(&self) -> StringChunked {641let ca = self.as_string();642case::to_titlecase(ca)643}644645/// Concat with the values from a second StringChunked.646#[must_use]647fn concat(&self, other: &StringChunked) -> StringChunked {648let ca = self.as_string();649ca + other650}651652/// Normalizes the string values653#[must_use]654#[cfg(feature = "string_normalize")]655fn str_normalize(&self, form: UnicodeForm) -> StringChunked {656let ca = self.as_string();657normalize::normalize(ca, 
form)658}659660/// Reverses the string values661#[must_use]662#[cfg(feature = "string_reverse")]663fn str_reverse(&self) -> StringChunked {664let ca = self.as_string();665reverse::reverse(ca)666}667668/// Slice the string values.669///670/// Determines a substring starting from `offset` and with length `length` of each of the elements in `array`.671/// `offset` can be negative, in which case the start counts from the end of the string.672fn str_slice(&self, offset: &Column, length: &Column) -> PolarsResult<StringChunked> {673let ca = self.as_string();674let offset = offset.cast(&DataType::Int64)?;675// We strict cast, otherwise negative value will be treated as a valid length.676let length = length.strict_cast(&DataType::UInt64)?;677678Ok(substring::substring(ca, offset.i64()?, length.u64()?))679}680681/// Slice the first `n` values of the string.682///683/// Determines a substring starting at the beginning of the string up to offset `n` of each684/// element in `array`. `n` can be negative, in which case the slice ends `n` characters from685/// the end of the string.686fn str_head(&self, n: &Column) -> PolarsResult<StringChunked> {687let ca = self.as_string();688let n = n.strict_cast(&DataType::Int64)?;689690substring::head(ca, n.i64()?)691}692693/// Slice the last `n` values of the string.694///695/// Determines a substring starting at offset `n` of each element in `array`. `n` can be696/// negative, in which case the slice begins `n` characters from the start of the string.697fn str_tail(&self, n: &Column) -> PolarsResult<StringChunked> {698let ca = self.as_string();699let n = n.strict_cast(&DataType::Int64)?;700701substring::tail(ca, n.i64()?)702}703#[cfg(feature = "strings")]704/// Escapes all regular expression meta characters in the string.705fn str_escape_regex(&self) -> StringChunked {706let ca = self.as_string();707escape_regex::escape_regex(ca)708}709}710711impl StringNameSpaceImpl for StringChunked {}712713714