Path: blob/main/crates/polars-ops/src/series/ops/strings.rs
7884 views
use std::borrow::Cow;12use arrow::array::builder::StaticArrayBuilder;3use arrow::array::{Array, Utf8ViewArrayBuilder};4use arrow::datatypes::ArrowDataType;5use polars_core::prelude::{Column, DataType, IntoColumn, StringChunked};6use polars_core::scalar::Scalar;7use polars_error::{PolarsResult, polars_ensure};8use polars_utils::pl_str::PlSmallStr;910#[inline(always)]11fn opt_str_to_string(s: Option<&str>) -> &str {12s.unwrap_or("null")13}1415pub fn str_format(cs: &mut [Column], format: &str, insertions: &[usize]) -> PolarsResult<Column> {16assert_eq!(cs.len(), insertions.len());17assert!(!cs.is_empty()); // Checked at IR construction1819let output_name = cs[0].name().clone();20let mut output_length = 1;21for c in cs.iter() {22if c.len() != 1 {23polars_ensure!(24output_length == 1 || output_length == c.len(),25length_mismatch = "format",26output_length,27c.len()28);29output_length = c.len();30}31}3233let mut validity = None;34let mut num_scalar_inputs = 0;35for c in cs.iter_mut() {36if let Some(c_validity) = c.rechunk_validity() {37// Column with only nulls means output is only nulls.38if c.null_count() == c.len() {39return Ok(Column::full_null(40output_name,41output_length,42&DataType::String,43));44}4546match &mut validity {47v @ None => *v = Some(c_validity),48Some(v) => *v = arrow::bitmap::and(v, &c_validity),49}50}5152*c = c.cast(&DataType::String)?;53num_scalar_inputs += usize::from(c.len() == 1);54}5556let mut format = Cow::Borrowed(format);57let mut insertions = Cow::Borrowed(insertions);5859// Fill in any constants into the format string.60if num_scalar_inputs > 0 {61let mut filled_format = String::new();62filled_format.push_str(&format[..*insertions.first().unwrap()]);63insertions = Cow::Owned(64cs.iter()65.enumerate()66.filter_map(|(i, c)| {67let v = if c.len() == 1 {68filled_format.push_str(opt_str_to_string(c.str().unwrap().get(0)));69None70} else {71Some(filled_format.len())72};7374let s = if i == cs.len() - 1 {75&format[insertions[i]..]76} else {77&format[insertions[i]..insertions[i + 1]]78};79filled_format.push_str(s);8081v82})83.collect(),84);85format = filled_format.into();86}8788let format = format.as_ref();89let insertions = insertions.as_ref();9091// If the format string is constant.92if num_scalar_inputs == cs.len() {93let sc = Scalar::from(PlSmallStr::from_str(format));94return Ok(Column::new_scalar(output_name, sc, output_length));95}9697let mut builder = Utf8ViewArrayBuilder::new(ArrowDataType::Utf8View);98builder.reserve(output_length);99100let mut arrays = cs101.iter()102.filter(|c| c.len() != 1)103.map(|c| {104let ca = c.str().unwrap();105let mut iter = ca.downcast_iter();106let arr = iter.next().unwrap();107(iter, arr, 0)108})109.collect::<Vec<_>>();110111// @Performance. There is some smarter stuff that can be done with views and stuff. Don't think112// it is worth the complexity.113114// Amortize the format string allocation.115let mut s = String::new();116for i in 0..output_length {117if validity118.as_ref()119.is_some_and(|v| !unsafe { v.get_bit_unchecked(i) })120{121unsafe { builder.push_inline_view_ignore_validity(Default::default()) };122123for (iter, arr, elem_idx) in arrays.iter_mut() {124*elem_idx += 1;125if i + 1 != output_length && *elem_idx == arr.len() {126*arr = iter.next().unwrap();127*elem_idx = 0;128}129}130131continue;132}133134s.clear();135s.push_str(&format[..insertions[0]]);136137for (j, (iter, arr, elem_idx)) in arrays.iter_mut().enumerate() {138s.push_str(opt_str_to_string(arr.get(*elem_idx)));139let start = insertions[j];140let end = insertions.get(j + 1).copied().unwrap_or(format.len());141s.push_str(&format[start..end]);142143*elem_idx += 1;144if i + 1 != output_length && *elem_idx == arr.len() {145*arr = iter.next().unwrap();146*elem_idx = 0;147}148}149150builder.push_value_ignore_validity(&s);151}152153let array = builder.freeze().with_validity(validity).to_boxed();154Ok(unsafe { StringChunked::from_chunks(output_name, vec![array]) }.into_column())155}156157158