Path: blob/main/crates/polars-ops/src/chunked_array/strings/split.rs
6939 views
use arrow::array::ValueSize;1#[cfg(feature = "dtype-struct")]2use arrow::array::{MutableArray, MutableUtf8Array};3use polars_core::chunked_array::ops::arity::binary_elementwise_for_each;45use super::*;67pub struct SplitNChars<'a> {8s: &'a str,9n: usize,10keep_remainder: bool,11}1213impl<'a> Iterator for SplitNChars<'a> {14type Item = &'a str;1516fn next(&mut self) -> Option<Self::Item> {17let single_char_limit = if self.keep_remainder { 2 } else { 1 };18if self.n >= single_char_limit {19self.n -= 1;20let ch = self.s.chars().next()?;21let first;22(first, self.s) = self.s.split_at(ch.len_utf8());23Some(first)24} else if self.n == 1 && !self.s.is_empty() {25self.n -= 1;26Some(self.s)27} else {28None29}30}31}3233/// Splits a string into substrings consisting of single characters.34///35/// Returns at most n strings, where the last string is the entire remainder36/// of the string if keep_remainder is True, and just the nth character otherwise.37#[cfg(feature = "dtype-struct")]38fn splitn_chars(s: &str, n: usize, keep_remainder: bool) -> SplitNChars<'_> {39SplitNChars {40s,41n,42keep_remainder,43}44}4546/// Splits a string into substrings consisting of single characters.47fn split_chars(s: &str) -> SplitNChars<'_> {48SplitNChars {49s,50n: usize::MAX,51keep_remainder: false,52}53}5455#[cfg(feature = "dtype-struct")]56pub fn split_to_struct<'a, F, I>(57ca: &'a StringChunked,58by: &'a StringChunked,59n: usize,60op: F,61keep_remainder: bool,62) -> PolarsResult<StructChunked>63where64F: Fn(&'a str, &'a str) -> I,65I: Iterator<Item = &'a str>,66{67use polars_utils::format_pl_smallstr;6869let mut arrs = (0..n)70.map(|_| MutableUtf8Array::<i64>::with_capacity(ca.len()))71.collect::<Vec<_>>();7273if by.len() == 1 {74if let Some(by) = by.get(0) {75if by.is_empty() {76ca.for_each(|opt_s| match opt_s {77None => {78for arr in &mut arrs {79arr.push_null()80}81},82Some(s) => {83let mut arr_iter = arrs.iter_mut();84splitn_chars(s, n, keep_remainder)85.zip(&mut arr_iter)86.for_each(|(splitted, arr)| arr.push(Some(splitted)));87// fill the remaining with null88for arr in arr_iter {89arr.push_null()90}91},92});93} else {94ca.for_each(|opt_s| match opt_s {95None => {96for arr in &mut arrs {97arr.push_null()98}99},100Some(s) => {101let mut arr_iter = arrs.iter_mut();102op(s, by)103.zip(&mut arr_iter)104.for_each(|(splitted, arr)| arr.push(Some(splitted)));105// fill the remaining with null106for arr in arr_iter {107arr.push_null()108}109},110});111}112} else {113for arr in &mut arrs {114arr.push_null()115}116}117} else {118binary_elementwise_for_each(ca, by, |opt_s, opt_by| match (opt_s, opt_by) {119(Some(s), Some(by)) => {120let mut arr_iter = arrs.iter_mut();121if by.is_empty() {122splitn_chars(s, n, keep_remainder)123.zip(&mut arr_iter)124.for_each(|(splitted, arr)| arr.push(Some(splitted)));125} else {126op(s, by)127.zip(&mut arr_iter)128.for_each(|(splitted, arr)| arr.push(Some(splitted)));129};130// fill the remaining with null131for arr in arr_iter {132arr.push_null()133}134},135_ => {136for arr in &mut arrs {137arr.push_null()138}139},140})141}142143let fields = arrs144.into_iter()145.enumerate()146.map(|(i, mut arr)| {147Series::try_from((format_pl_smallstr!("field_{i}"), arr.as_box())).unwrap()148})149.collect::<Vec<_>>();150151StructChunked::from_series(ca.name().clone(), ca.len(), fields.iter())152}153154pub fn split_helper<'a, F, I>(155ca: &'a StringChunked,156by: &'a StringChunked,157op: F,158) -> PolarsResult<ListChunked>159where160F: Fn(&'a str, &'a str) -> I,161I: Iterator<Item = &'a str>,162{163Ok(match (ca.len(), by.len()) {164(a, b) if a == b => {165let mut builder =166ListStringChunkedBuilder::new(ca.name().clone(), ca.len(), ca.get_values_size());167168binary_elementwise_for_each(ca, by, |opt_s, opt_by| match (opt_s, opt_by) {169(Some(s), Some(by)) => {170if by.is_empty() {171builder.append_values_iter(split_chars(s))172} else {173builder.append_values_iter(op(s, by))174}175},176_ => builder.append_null(),177});178179builder.finish()180},181(1, _) => {182if let Some(s) = ca.get(0) {183let mut builder = ListStringChunkedBuilder::new(184by.name().clone(),185by.len(),186by.get_values_size(),187);188189by.for_each(|opt_by| match opt_by {190Some(by) => builder.append_values_iter(op(s, by)),191_ => builder.append_null(),192});193builder.finish()194} else {195ListChunked::full_null_with_dtype(ca.name().clone(), ca.len(), &DataType::String)196}197},198(_, 1) => {199if let Some(by) = by.get(0) {200let mut builder = ListStringChunkedBuilder::new(201ca.name().clone(),202ca.len(),203ca.get_values_size(),204);205206if by.is_empty() {207ca.for_each(|opt_s| match opt_s {208Some(s) => builder.append_values_iter(split_chars(s)),209_ => builder.append_null(),210});211} else {212ca.for_each(|opt_s| match opt_s {213Some(s) => builder.append_values_iter(op(s, by)),214_ => builder.append_null(),215});216}217builder.finish()218} else {219ListChunked::full_null_with_dtype(ca.name().clone(), ca.len(), &DataType::String)220}221},222_ => polars_bail!(length_mismatch = "str.split", ca.len(), by.len()),223})224}225226227