// Path: blob/main/crates/polars-ops/src/chunked_array/strings/namespace.rs (6939 views)
use arrow::array::ValueSize;1use arrow::legacy::kernels::string::*;2#[cfg(feature = "string_encoding")]3use base64::Engine as _;4#[cfg(feature = "string_encoding")]5use base64::engine::general_purpose;6#[cfg(feature = "string_to_integer")]7use num_traits::Num;8use polars_core::prelude::arity::*;9use polars_utils::regex_cache::{compile_regex, with_regex_cache};1011use super::*;12#[cfg(feature = "binary_encoding")]13use crate::chunked_array::binary::BinaryNameSpaceImpl;14#[cfg(feature = "string_normalize")]15use crate::prelude::strings::normalize::UnicodeForm;1617// We need this to infer the right lifetimes for the match closure.18#[inline(always)]19fn infer_re_match<F>(f: F) -> F20where21F: for<'a, 'b> FnMut(Option<&'a str>, Option<&'b str>) -> Option<bool>,22{23f24}2526#[cfg(feature = "string_to_integer")]27// This is a helper function used in the `to_integer` method of the StringNameSpaceImpl trait.28fn parse_integer<T>(29ca: &ChunkedArray<StringType>,30base: &UInt32Chunked,31strict: bool,32) -> PolarsResult<Series>33where34T: PolarsIntegerType,35T::Native: Num,36ChunkedArray<T>: IntoSeries,37<<T as polars_core::datatypes::PolarsNumericType>::Native as num_traits::Num>::FromStrRadixErr:38std::fmt::Display,39{40let f = |opt_s: Option<&str>, opt_base: Option<u32>| -> PolarsResult<Option<T::Native>> {41let (Some(s), Some(base)) = (opt_s, opt_base) else {42return Ok(None);43};4445if !(2..=36).contains(&base) {46polars_bail!(ComputeError: "`to_integer` called with invalid base '{base}'");47}4849Ok(T::Native::from_str_radix(s, base).ok())50};51let out: ChunkedArray<T> = broadcast_try_binary_elementwise(ca, base, f)?;52if strict && ca.null_count() != out.null_count() {53let failure_mask = ca.is_not_null() & out.is_null() & base.is_not_null();54let n_failures = failure_mask.num_trues();55if n_failures == 0 {56return Ok(out.into_series());57}5859let some_failures = if ca.len() == 1 {60ca.clone()61} else {62let all_failures = ca.filter(&failure_mask)?;63// `.unique()` does 
not necessarily preserve the original order.64let unique_failures_args = all_failures.arg_unique()?;65all_failures.take(&unique_failures_args.slice(0, 10))?66};67let some_error_msg = match base.len() {681 => {69// we can ensure that base is not null.70let base = base.get(0).unwrap();71some_failures72.get(0)73.and_then(|s| T::Native::from_str_radix(s, base).err())74.map_or_else(75|| unreachable!("failed to extract ParseIntError"),76|e| format!("{e}"),77)78},79_ => {80let base_failures = base.filter(&failure_mask)?;81some_failures82.get(0)83.zip(base_failures.get(0))84.and_then(|(s, base)| T::Native::from_str_radix(s, base).err())85.map_or_else(86|| unreachable!("failed to extract ParseIntError"),87|e| format!("{e}"),88)89},90};91polars_bail!(92ComputeError:93"strict integer parsing failed for {} value(s): {}; error message for the \94first shown value: '{}' (consider non-strict parsing)",95n_failures,96some_failures.into_series().fmt_list(),97some_error_msg98);99}100101Ok(out.into_series())102}103104pub trait StringNameSpaceImpl: AsString {105#[cfg(not(feature = "binary_encoding"))]106fn hex_decode(&self) -> PolarsResult<StringChunked> {107panic!("activate 'binary_encoding' feature")108}109110#[cfg(feature = "binary_encoding")]111fn hex_decode(&self, strict: bool) -> PolarsResult<BinaryChunked> {112let ca = self.as_string();113ca.as_binary().hex_decode(strict)114}115116#[must_use]117#[cfg(feature = "string_encoding")]118fn hex_encode(&self) -> StringChunked {119let ca = self.as_string();120ca.apply_values(|s| hex::encode(s).into())121}122123#[cfg(not(feature = "binary_encoding"))]124fn base64_decode(&self) -> PolarsResult<StringChunked> {125panic!("activate 'binary_encoding' feature")126}127128#[cfg(feature = "binary_encoding")]129fn base64_decode(&self, strict: bool) -> PolarsResult<BinaryChunked> {130let ca = self.as_string();131ca.as_binary().base64_decode(strict)132}133134#[must_use]135#[cfg(feature = "string_encoding")]136fn base64_encode(&self) -> 
StringChunked {137let ca = self.as_string();138ca.apply_values(|s| general_purpose::STANDARD.encode(s).into())139}140141#[cfg(feature = "string_to_integer")]142// Parse a string number with base _radix_ into a decimal dtype143fn to_integer(144&self,145base: &UInt32Chunked,146dtype: Option<DataType>,147strict: bool,148) -> PolarsResult<Series> {149let ca = self.as_string();150151polars_ensure!(152ca.len() == base.len() || ca.len() == 1 || base.len() == 1,153length_mismatch = "str.to_integer",154ca.len(),155base.len()156);157158match dtype.unwrap_or(DataType::Int64) {159DataType::Int8 => parse_integer::<Int8Type>(ca, base, strict),160DataType::Int16 => parse_integer::<Int16Type>(ca, base, strict),161DataType::Int32 => parse_integer::<Int32Type>(ca, base, strict),162DataType::Int64 => parse_integer::<Int64Type>(ca, base, strict),163DataType::Int128 => parse_integer::<Int128Type>(ca, base, strict),164DataType::UInt8 => parse_integer::<UInt8Type>(ca, base, strict),165DataType::UInt16 => parse_integer::<UInt16Type>(ca, base, strict),166DataType::UInt32 => parse_integer::<UInt32Type>(ca, base, strict),167DataType::UInt64 => parse_integer::<UInt64Type>(ca, base, strict),168dtype => polars_bail!(InvalidOperation: "Invalid dtype {:?}", dtype),169}170}171172fn contains_chunked(173&self,174pat: &StringChunked,175literal: bool,176strict: bool,177) -> PolarsResult<BooleanChunked> {178let ca = self.as_string();179match (ca.len(), pat.len()) {180(_, 1) => match pat.get(0) {181Some(pat) => {182if literal {183ca.contains_literal(pat)184} else {185ca.contains(pat, strict)186}187},188None => Ok(BooleanChunked::full_null(ca.name().clone(), ca.len())),189},190(1, _) if ca.null_count() == 1 => Ok(BooleanChunked::full_null(191ca.name().clone(),192ca.len().max(pat.len()),193)),194_ => {195if literal {196Ok(broadcast_binary_elementwise_values(ca, pat, |src, pat| {197src.contains(pat)198}))199} else if strict {200with_regex_cache(|reg_cache| {201broadcast_try_binary_elementwise(ca, pat, 
|opt_src, opt_pat| {202match (opt_src, opt_pat) {203(Some(src), Some(pat)) => {204let reg = reg_cache.compile(pat)?;205Ok(Some(reg.is_match(src)))206},207_ => Ok(None),208}209})210})211} else {212with_regex_cache(|reg_cache| {213Ok(broadcast_binary_elementwise(214ca,215pat,216infer_re_match(|src, pat| {217let reg = reg_cache.compile(pat?).ok()?;218Some(reg.is_match(src?))219}),220))221})222}223},224}225}226227fn find_chunked(228&self,229pat: &StringChunked,230literal: bool,231strict: bool,232) -> PolarsResult<UInt32Chunked> {233let ca = self.as_string();234if pat.len() == 1 {235return if let Some(pat) = pat.get(0) {236if literal {237ca.find_literal(pat)238} else {239ca.find(pat, strict)240}241} else {242Ok(UInt32Chunked::full_null(ca.name().clone(), ca.len()))243};244} else if ca.len() == 1 && ca.null_count() == 1 {245return Ok(UInt32Chunked::full_null(246ca.name().clone(),247ca.len().max(pat.len()),248));249}250if literal {251Ok(broadcast_binary_elementwise(252ca,253pat,254|src: Option<&str>, pat: Option<&str>| src?.find(pat?).map(|idx| idx as u32),255))256} else {257with_regex_cache(|reg_cache| {258let matcher = |src: Option<&str>, pat: Option<&str>| -> PolarsResult<Option<u32>> {259if let (Some(src), Some(pat)) = (src, pat) {260let re = reg_cache.compile(pat)?;261return Ok(re.find(src).map(|m| m.start() as u32));262}263Ok(None)264};265broadcast_try_binary_elementwise(ca, pat, matcher)266})267}268}269270/// Get the length of the string values as number of chars.271fn str_len_chars(&self) -> UInt32Chunked {272let ca = self.as_string();273ca.apply_kernel_cast(&string_len_chars)274}275276/// Get the length of the string values as number of bytes.277fn str_len_bytes(&self) -> UInt32Chunked {278let ca = self.as_string();279ca.apply_kernel_cast(&utf8view_len_bytes)280}281282/// Pad the start of the string until it reaches the given length.283///284/// Padding is done using the specified `fill_char`.285/// Strings with length equal to or greater than the given length 
are286/// returned as-is.287#[cfg(feature = "string_pad")]288fn pad_start(&self, length: &UInt64Chunked, fill_char: char) -> StringChunked {289let ca = self.as_string();290pad::pad_start(ca, length, fill_char)291}292293/// Pad the end of the string until it reaches the given length.294///295/// Padding is done using the specified `fill_char`.296/// Strings with length equal to or greater than the given length are297/// returned as-is.298#[cfg(feature = "string_pad")]299fn pad_end(&self, length: &UInt64Chunked, fill_char: char) -> StringChunked {300let ca = self.as_string();301pad::pad_end(ca, length, fill_char)302}303304/// Pad the start of the string with zeros until it reaches the given length.305///306/// A sign prefix (`-`) is handled by inserting the padding after the sign307/// character rather than before.308/// Strings with length equal to or greater than the given length are309/// returned as-is.310#[cfg(feature = "string_pad")]311fn zfill(&self, length: &UInt64Chunked) -> StringChunked {312let ca = self.as_string();313pad::zfill(ca, length)314}315316/// Check if strings contain a regex pattern.317fn contains(&self, pat: &str, strict: bool) -> PolarsResult<BooleanChunked> {318let ca = self.as_string();319let res_reg = polars_utils::regex_cache::compile_regex(pat);320let opt_reg = if strict { Some(res_reg?) 
} else { res_reg.ok() };321let out: BooleanChunked = if let Some(reg) = opt_reg {322unary_elementwise_values(ca, |s| reg.is_match(s))323} else {324BooleanChunked::full_null(ca.name().clone(), ca.len())325};326Ok(out)327}328329/// Check if strings contain a given literal330fn contains_literal(&self, lit: &str) -> PolarsResult<BooleanChunked> {331// note: benchmarking shows that the regex engine is actually332// faster at finding literal matches than str::contains.333// ref: https://github.com/pola-rs/polars/pull/6811334self.contains(regex::escape(lit).as_str(), true)335}336337/// Return the index position of a literal substring in the target string.338fn find_literal(&self, lit: &str) -> PolarsResult<UInt32Chunked> {339self.find(regex::escape(lit).as_str(), true)340}341342/// Return the index position of a regular expression substring in the target string.343fn find(&self, pat: &str, strict: bool) -> PolarsResult<UInt32Chunked> {344let ca = self.as_string();345match polars_utils::regex_cache::compile_regex(pat) {346Ok(rx) => Ok(unary_elementwise(ca, |opt_s| {347opt_s.and_then(|s| rx.find(s)).map(|m| m.start() as u32)348})),349Err(_) if !strict => Ok(UInt32Chunked::full_null(ca.name().clone(), ca.len())),350Err(e) => Err(PolarsError::ComputeError(351format!("Invalid regular expression: {e}").into(),352)),353}354}355356/// Replace the leftmost regex-matched (sub)string with another string357fn replace<'a>(&'a self, pat: &str, val: &str) -> PolarsResult<StringChunked> {358let reg = polars_utils::regex_cache::compile_regex(pat)?;359let f = |s: &'a str| reg.replace(s, val);360let ca = self.as_string();361Ok(ca.apply_values(f))362}363364/// Replace the leftmost literal (sub)string with another string365fn replace_literal<'a>(366&'a self,367pat: &str,368val: &str,369n: usize,370) -> PolarsResult<StringChunked> {371let ca = self.as_string();372if ca.is_empty() {373return Ok(ca.clone());374}375376// amortize allocation377let mut buf = String::new();378379let f = move |s: &'a 
str| {380buf.clear();381let mut changed = false;382383// See: str.replacen384let mut last_end = 0;385for (start, part) in s.match_indices(pat).take(n) {386changed = true;387buf.push_str(unsafe { s.get_unchecked(last_end..start) });388buf.push_str(val);389last_end = start + part.len();390}391buf.push_str(unsafe { s.get_unchecked(last_end..s.len()) });392393if changed {394// extend lifetime395// lifetime is bound to 'a396let slice = buf.as_str();397unsafe { std::mem::transmute::<&str, &'a str>(slice) }398} else {399s400}401};402Ok(ca.apply_mut(f))403}404405/// Replace all regex-matched (sub)strings with another string406fn replace_all(&self, pat: &str, val: &str) -> PolarsResult<StringChunked> {407let ca = self.as_string();408let reg = polars_utils::regex_cache::compile_regex(pat)?;409Ok(ca.apply_values(|s| reg.replace_all(s, val)))410}411412/// Replace all matching literal (sub)strings with another string413fn replace_literal_all<'a>(&'a self, pat: &str, val: &str) -> PolarsResult<StringChunked> {414let ca = self.as_string();415if ca.is_empty() {416return Ok(ca.clone());417}418419// Amortize allocation.420let mut buf = String::new();421422let f = move |s: &'a str| {423buf.clear();424let mut changed = false;425426// See: str.replace.427let mut last_end = 0;428for (start, part) in s.match_indices(pat) {429changed = true;430buf.push_str(unsafe { s.get_unchecked(last_end..start) });431buf.push_str(val);432last_end = start + part.len();433}434buf.push_str(unsafe { s.get_unchecked(last_end..s.len()) });435436if changed {437// Extend lifetime, lifetime is bound to 'a.438let slice = buf.as_str();439unsafe { std::mem::transmute::<&str, &'a str>(slice) }440} else {441s442}443};444445Ok(ca.apply_mut(f))446}447448/// Extract the nth capture group from pattern.449fn extract(&self, pat: &StringChunked, group_index: usize) -> PolarsResult<StringChunked> {450let ca = self.as_string();451super::extract::extract_group(ca, pat, group_index)452}453454/// Extract each successive 
non-overlapping regex match in an individual string as an array.455fn extract_all(&self, pat: &str) -> PolarsResult<ListChunked> {456let ca = self.as_string();457let reg = polars_utils::regex_cache::compile_regex(pat)?;458459let mut builder =460ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size());461for arr in ca.downcast_iter() {462for opt_s in arr {463match opt_s {464None => builder.append_null(),465Some(s) => builder.append_values_iter(reg.find_iter(s).map(|m| m.as_str())),466}467}468}469Ok(builder.finish())470}471472fn strip_chars(&self, pat: &Column) -> PolarsResult<StringChunked> {473let ca = self.as_string();474if pat.dtype() == &DataType::Null {475Ok(unary_elementwise(ca, |opt_s| opt_s.map(|s| s.trim())))476} else {477Ok(strip_chars(ca, pat.str()?))478}479}480481fn strip_chars_start(&self, pat: &Column) -> PolarsResult<StringChunked> {482let ca = self.as_string();483if pat.dtype() == &DataType::Null {484Ok(unary_elementwise(ca, |opt_s| opt_s.map(|s| s.trim_start())))485} else {486Ok(strip_chars_start(ca, pat.str()?))487}488}489490fn strip_chars_end(&self, pat: &Column) -> PolarsResult<StringChunked> {491let ca = self.as_string();492if pat.dtype() == &DataType::Null {493Ok(unary_elementwise(ca, |opt_s| opt_s.map(|s| s.trim_end())))494} else {495Ok(strip_chars_end(ca, pat.str()?))496}497}498499fn strip_prefix(&self, prefix: &StringChunked) -> StringChunked {500let ca = self.as_string();501strip_prefix(ca, prefix)502}503504fn strip_suffix(&self, suffix: &StringChunked) -> StringChunked {505let ca = self.as_string();506strip_suffix(ca, suffix)507}508509#[cfg(feature = "dtype-struct")]510fn split_exact(&self, by: &StringChunked, n: usize) -> PolarsResult<StructChunked> {511let ca = self.as_string();512513split_to_struct(ca, by, n + 1, str::split, false)514}515516#[cfg(feature = "dtype-struct")]517fn split_exact_inclusive(&self, by: &StringChunked, n: usize) -> PolarsResult<StructChunked> {518let ca = 
self.as_string();519520split_to_struct(ca, by, n + 1, str::split_inclusive, false)521}522523#[cfg(feature = "dtype-struct")]524fn splitn(&self, by: &StringChunked, n: usize) -> PolarsResult<StructChunked> {525let ca = self.as_string();526527split_to_struct(ca, by, n, |s, by| s.splitn(n, by), true)528}529530fn split(&self, by: &StringChunked) -> PolarsResult<ListChunked> {531let ca = self.as_string();532split_helper(ca, by, str::split)533}534535fn split_inclusive(&self, by: &StringChunked) -> PolarsResult<ListChunked> {536let ca = self.as_string();537split_helper(ca, by, str::split_inclusive)538}539540/// Extract each successive non-overlapping regex match in an individual string as an array.541fn extract_all_many(&self, pat: &StringChunked) -> PolarsResult<ListChunked> {542let ca = self.as_string();543polars_ensure!(544ca.len() == pat.len(),545ComputeError: "pattern's length: {} does not match that of the argument series: {}",546pat.len(), ca.len(),547);548549let mut builder =550ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size());551with_regex_cache(|re_cache| {552binary_elementwise_for_each(ca, pat, |opt_s, opt_pat| match (opt_s, opt_pat) {553(_, None) | (None, _) => builder.append_null(),554(Some(s), Some(pat)) => {555let re = re_cache.compile(pat).unwrap();556builder.append_values_iter(re.find_iter(s).map(|m| m.as_str()));557},558});559});560Ok(builder.finish())561}562563#[cfg(feature = "extract_groups")]564/// Extract all capture groups from pattern and return as a struct.565fn extract_groups(&self, pat: &str, dtype: &DataType) -> PolarsResult<Series> {566let ca = self.as_string();567super::extract::extract_groups(ca, pat, dtype)568}569570/// Count all successive non-overlapping regex matches.571fn count_matches(&self, pat: &str, literal: bool) -> PolarsResult<UInt32Chunked> {572let ca = self.as_string();573if literal {574Ok(unary_elementwise(ca, |opt_s| {575opt_s.map(|s| s.matches(pat).count() as u32)576}))577} else {578let re = 
compile_regex(pat)?;579Ok(unary_elementwise(ca, |opt_s| {580opt_s.map(|s| re.find_iter(s).count() as u32)581}))582}583}584585/// Count all successive non-overlapping regex matches.586fn count_matches_many(587&self,588pat: &StringChunked,589literal: bool,590) -> PolarsResult<UInt32Chunked> {591let ca = self.as_string();592polars_ensure!(593ca.len() == pat.len(),594ComputeError: "pattern's length: {} does not match that of the argument series: {}",595pat.len(), ca.len(),596);597598let out: UInt32Chunked = if literal {599broadcast_binary_elementwise(ca, pat, |s: Option<&str>, p: Option<&str>| {600Some(s?.matches(p?).count() as u32)601})602} else {603with_regex_cache(|re_cache| {604let op = move |opt_s: Option<&str>,605opt_pat: Option<&str>|606-> PolarsResult<Option<u32>> {607match (opt_s, opt_pat) {608(Some(s), Some(pat)) => {609let reg = re_cache.compile(pat)?;610Ok(Some(reg.find_iter(s).count() as u32))611},612_ => Ok(None),613}614};615broadcast_try_binary_elementwise(ca, pat, op)616})?617};618619Ok(out.with_name(ca.name().clone()))620}621622/// Modify the strings to their lowercase equivalent.623#[must_use]624fn to_lowercase(&self) -> StringChunked {625let ca = self.as_string();626case::to_lowercase(ca)627}628629/// Modify the strings to their uppercase equivalent.630#[must_use]631fn to_uppercase(&self) -> StringChunked {632let ca = self.as_string();633case::to_uppercase(ca)634}635636/// Modify the strings to their titlecase equivalent.637#[must_use]638#[cfg(feature = "nightly")]639fn to_titlecase(&self) -> StringChunked {640let ca = self.as_string();641case::to_titlecase(ca)642}643644/// Concat with the values from a second StringChunked.645#[must_use]646fn concat(&self, other: &StringChunked) -> StringChunked {647let ca = self.as_string();648ca + other649}650651/// Normalizes the string values652#[must_use]653#[cfg(feature = "string_normalize")]654fn str_normalize(&self, form: UnicodeForm) -> StringChunked {655let ca = self.as_string();656normalize::normalize(ca, 
form)657}658659/// Reverses the string values660#[must_use]661#[cfg(feature = "string_reverse")]662fn str_reverse(&self) -> StringChunked {663let ca = self.as_string();664reverse::reverse(ca)665}666667/// Slice the string values.668///669/// Determines a substring starting from `offset` and with length `length` of each of the elements in `array`.670/// `offset` can be negative, in which case the start counts from the end of the string.671fn str_slice(&self, offset: &Column, length: &Column) -> PolarsResult<StringChunked> {672let ca = self.as_string();673let offset = offset.cast(&DataType::Int64)?;674// We strict cast, otherwise negative value will be treated as a valid length.675let length = length.strict_cast(&DataType::UInt64)?;676677Ok(substring::substring(ca, offset.i64()?, length.u64()?))678}679680/// Slice the first `n` values of the string.681///682/// Determines a substring starting at the beginning of the string up to offset `n` of each683/// element in `array`. `n` can be negative, in which case the slice ends `n` characters from684/// the end of the string.685fn str_head(&self, n: &Column) -> PolarsResult<StringChunked> {686let ca = self.as_string();687let n = n.strict_cast(&DataType::Int64)?;688689substring::head(ca, n.i64()?)690}691692/// Slice the last `n` values of the string.693///694/// Determines a substring starting at offset `n` of each element in `array`. `n` can be695/// negative, in which case the slice begins `n` characters from the start of the string.696fn str_tail(&self, n: &Column) -> PolarsResult<StringChunked> {697let ca = self.as_string();698let n = n.strict_cast(&DataType::Int64)?;699700substring::tail(ca, n.i64()?)701}702#[cfg(feature = "strings")]703/// Escapes all regular expression meta characters in the string.704fn str_escape_regex(&self) -> StringChunked {705let ca = self.as_string();706escape_regex::escape_regex(ca)707}708}709710impl StringNameSpaceImpl for StringChunked {}711712713