Path: blob/main/crates/polars-ops/src/chunked_array/strings/split.rs
8362 views
use arrow::array::ValueSize;1#[cfg(feature = "dtype-struct")]2use arrow::array::{MutableArray, MutableUtf8Array};3use polars_core::chunked_array::ops::arity::binary_elementwise_for_each;4use polars_core::prelude::*;5use polars_utils::regex_cache::compile_regex;6use regex::Regex;78pub struct SplitNChars<'a> {9s: &'a str,10n: usize,11keep_remainder: bool,12}1314impl<'a> Iterator for SplitNChars<'a> {15type Item = &'a str;1617fn next(&mut self) -> Option<Self::Item> {18let single_char_limit = if self.keep_remainder { 2 } else { 1 };19if self.n >= single_char_limit {20self.n -= 1;21let ch = self.s.chars().next()?;22let first;23(first, self.s) = self.s.split_at(ch.len_utf8());24Some(first)25} else if self.n == 1 && !self.s.is_empty() {26self.n -= 1;27Some(self.s)28} else {29None30}31}32}3334/// Splits a string into substrings consisting of single characters.35///36/// Returns at most n strings, where the last string is the entire remainder37/// of the string if keep_remainder is True, and just the nth character otherwise.38#[cfg(feature = "dtype-struct")]39fn splitn_chars(s: &str, n: usize, keep_remainder: bool) -> SplitNChars<'_> {40SplitNChars {41s,42n,43keep_remainder,44}45}4647/// Splits a string into substrings consisting of single characters.48fn split_chars(s: &str) -> SplitNChars<'_> {49SplitNChars {50s,51n: usize::MAX,52keep_remainder: false,53}54}5556#[cfg(feature = "dtype-struct")]57pub fn split_to_struct<'a, F, I>(58ca: &'a StringChunked,59by: &'a StringChunked,60n: usize,61op: F,62keep_remainder: bool,63) -> PolarsResult<StructChunked>64where65F: Fn(&'a str, &'a str) -> I,66I: Iterator<Item = &'a str>,67{68use polars_utils::format_pl_smallstr;6970let mut arrs = (0..n)71.map(|_| MutableUtf8Array::<i64>::with_capacity(ca.len()))72.collect::<Vec<_>>();7374if by.len() == 1 {75if let Some(by) = by.get(0) {76if by.is_empty() {77ca.for_each(|opt_s| match opt_s {78None => {79for arr in &mut arrs {80arr.push_null()81}82},83Some(s) => {84let mut arr_iter = arrs.iter_mut();85splitn_chars(s, n, keep_remainder)86.zip(&mut arr_iter)87.for_each(|(splitted, arr)| arr.push(Some(splitted)));88// fill the remaining with null89for arr in arr_iter {90arr.push_null()91}92},93});94} else {95ca.for_each(|opt_s| match opt_s {96None => {97for arr in &mut arrs {98arr.push_null()99}100},101Some(s) => {102let mut arr_iter = arrs.iter_mut();103op(s, by)104.zip(&mut arr_iter)105.for_each(|(splitted, arr)| arr.push(Some(splitted)));106// fill the remaining with null107for arr in arr_iter {108arr.push_null()109}110},111});112}113} else {114for arr in &mut arrs {115arr.push_null()116}117}118} else {119binary_elementwise_for_each(ca, by, |opt_s, opt_by| match (opt_s, opt_by) {120(Some(s), Some(by)) => {121let mut arr_iter = arrs.iter_mut();122if by.is_empty() {123splitn_chars(s, n, keep_remainder)124.zip(&mut arr_iter)125.for_each(|(splitted, arr)| arr.push(Some(splitted)));126} else {127op(s, by)128.zip(&mut arr_iter)129.for_each(|(splitted, arr)| arr.push(Some(splitted)));130};131// fill the remaining with null132for arr in arr_iter {133arr.push_null()134}135},136_ => {137for arr in &mut arrs {138arr.push_null()139}140},141})142}143144let fields = arrs145.into_iter()146.enumerate()147.map(|(i, mut arr)| {148Series::try_from((format_pl_smallstr!("field_{i}"), arr.as_box())).unwrap()149})150.collect::<Vec<_>>();151152StructChunked::from_series(ca.name().clone(), ca.len(), fields.iter())153}154155pub fn split_helper<'a, F, I>(156ca: &'a StringChunked,157by: &'a StringChunked,158op: F,159) -> PolarsResult<ListChunked>160where161F: Fn(&'a str, &'a str) -> I,162I: Iterator<Item = &'a str>,163{164Ok(match (ca.len(), by.len()) {165(a, b) if a == b => {166let mut builder =167ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size());168169binary_elementwise_for_each(ca, by, |opt_s, opt_by| match (opt_s, opt_by) {170(Some(s), Some(by)) => {171if by.is_empty() {172builder.append_values_iter(split_chars(s))173} else {174builder.append_values_iter(op(s, by))175}176},177_ => builder.append_null(),178});179180builder.finish()181},182(1, _) => {183if let Some(s) = ca.get(0) {184let mut builder = ListStringChunkedBuilder::new(185by.name().clone(),186by.len(),187by.get_values_size(),188);189190by.for_each(|opt_by| match opt_by {191Some(by) => builder.append_values_iter(op(s, by)),192_ => builder.append_null(),193});194builder.finish()195} else {196ListChunked::full_null_with_dtype(ca.name().clone(), ca.len(), &DataType::String)197}198},199(_, 1) => {200if let Some(by) = by.get(0) {201let mut builder = ListStringChunkedBuilder::new(202ca.name().clone(),203ca.len(),204ca.get_values_size(),205);206207if by.is_empty() {208ca.for_each(|opt_s| match opt_s {209Some(s) => builder.append_values_iter(split_chars(s)),210_ => builder.append_null(),211});212} else {213ca.for_each(|opt_s| match opt_s {214Some(s) => builder.append_values_iter(op(s, by)),215_ => builder.append_null(),216});217}218builder.finish()219} else {220ListChunked::full_null_with_dtype(ca.name().clone(), ca.len(), &DataType::String)221}222},223_ => polars_bail!(length_mismatch = "str.split", ca.len(), by.len()),224})225}226227#[inline]228fn split_inclusive<'a>(re: &'a Regex, s: &'a str) -> impl Iterator<Item = &'a str> + 'a {229let mut it = re.find_iter(s);230let mut last_end: usize = 0;231let mut yielded_any = false;232let mut done_tail = false;233234std::iter::from_fn(move || {235if let Some(m) = it.next() {236let end = m.end();237let out = &s[last_end..end];238last_end = end;239yielded_any = true;240return Some(out);241}242243if done_tail {244return None;245}246done_tail = true;247248if last_end < s.len() {249Some(&s[last_end..])250} else if !yielded_any {251Some(s)252} else {253None254}255})256}257258#[inline]259fn invalid_regex_err(pat: &str) -> PolarsError {260polars_err!(ComputeError: "invalid regex pattern in str.split_regex: {}", pat)261}262263#[inline]264fn append_split_compiled(265builder: &mut ListStringChunkedBuilder,266s: &str,267re: &Regex,268inclusive: bool,269) {270if inclusive {271builder.append_values_iter(split_inclusive(re, s));272} else {273builder.append_values_iter(re.split(s));274}275}276277#[inline]278fn append_split(279builder: &mut ListStringChunkedBuilder,280s: &str,281pat: &str,282inclusive: bool,283strict: bool,284) -> PolarsResult<()> {285if pat.is_empty() {286builder.append_values_iter(split_chars(s));287return Ok(());288}289290match compile_regex(pat) {291Ok(re) => {292append_split_compiled(builder, s, &re, inclusive);293Ok(())294},295Err(_) if strict => Err(invalid_regex_err(pat)),296Err(_) => {297builder.append_null();298Ok(())299},300}301}302303pub fn split_regex_helper(304ca: &StringChunked,305by: &StringChunked,306inclusive: bool,307strict: bool,308) -> PolarsResult<ListChunked> {309use polars_utils::regex_cache::compile_regex;310311Ok(match (ca.len(), by.len()) {312// elementwise: string[i] with pattern[i]313(a, b) if a == b => {314let mut builder =315ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size());316317for (opt_s, opt_pat) in ca.into_iter().zip(by.into_iter()) {318match (opt_s, opt_pat) {319(Some(s), Some(pat)) => append_split(&mut builder, s, pat, inclusive, strict)?,320_ => builder.append_null(),321}322}323324builder.finish()325},326327// scalar string with per-row patterns328(1, _) => {329if let Some(s0) = ca.get(0) {330let mut builder = ListStringChunkedBuilder::new(331by.name().clone(),332by.len(),333by.get_values_size(),334);335336for opt_pat in by.into_iter() {337match opt_pat {338Some(pat) => append_split(&mut builder, s0, pat, inclusive, strict)?,339None => builder.append_null(),340}341}342343builder.finish()344} else {345ListChunked::full_null_with_dtype(ca.name().clone(), by.len(), &DataType::String)346}347},348349// per-row strings with scalar pattern350(_, 1) => {351if let Some(pat0) = by.get(0) {352let mut builder = ListStringChunkedBuilder::new(353ca.name().clone(),354ca.len(),355ca.get_values_size(),356);357358if pat0.is_empty() {359ca.for_each(|opt_s| match opt_s {360Some(s) => builder.append_values_iter(split_chars(s)),361None => builder.append_null(),362});363builder.finish()364} else {365let re = match compile_regex(pat0) {366Ok(re) => re,367Err(_) if strict => return Err(invalid_regex_err(pat0)),368Err(_) => {369return Ok(ListChunked::full_null_with_dtype(370ca.name().clone(),371ca.len(),372&DataType::String,373));374},375};376377ca.for_each(|opt_s| match opt_s {378Some(s) => append_split_compiled(&mut builder, s, &re, inclusive),379None => builder.append_null(),380});381382builder.finish()383}384} else {385ListChunked::full_null_with_dtype(ca.name().clone(), ca.len(), &DataType::String)386}387},388389_ => polars_bail!(length_mismatch = "str.split_regex", ca.len(), by.len()),390})391}392393394