Path: blob/main/crates/polars-ops/src/chunked_array/strings/normalize.rs
6939 views
use polars_core::prelude::{StringChunked, StringChunkedBuilder};1use unicode_normalization::UnicodeNormalization;23#[derive(Clone, Eq, PartialEq, Hash, Debug)]4#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]5#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]6pub enum UnicodeForm {7NFC,8NFKC,9NFD,10NFKD,11}1213pub fn normalize_with<F: Fn(&str, &mut String)>(14ca: &StringChunked,15normalizer: F,16) -> StringChunked {17let mut buffer = String::new();18let mut builder = StringChunkedBuilder::new(ca.name().clone(), ca.len());19for opt_s in ca.iter() {20if let Some(s) = opt_s {21buffer.clear();22normalizer(s, &mut buffer);23builder.append_value(&buffer);24} else {25builder.append_null();26}27}28builder.finish()29}3031pub fn normalize(ca: &StringChunked, form: UnicodeForm) -> StringChunked {32match form {33UnicodeForm::NFC => normalize_with(ca, |s, b| b.extend(s.nfc())),34UnicodeForm::NFKC => normalize_with(ca, |s, b| b.extend(s.nfkc())),35UnicodeForm::NFD => normalize_with(ca, |s, b| b.extend(s.nfd())),36UnicodeForm::NFKD => normalize_with(ca, |s, b| b.extend(s.nfkd())),37}38}394041