Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/chunked_array/strings/normalize.rs
6939 views
1
use polars_core::prelude::{StringChunked, StringChunkedBuilder};
2
use unicode_normalization::UnicodeNormalization;
3
4
/// Selects which Unicode normalization form is applied to string values.
///
/// The four variants correspond to the normalization forms defined by
/// Unicode Standard Annex #15.
#[derive(Clone, Eq, PartialEq, Hash, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum UnicodeForm {
    /// Normalization Form C: canonical decomposition, then canonical composition.
    NFC,
    /// Normalization Form KC: compatibility decomposition, then canonical composition.
    NFKC,
    /// Normalization Form D: canonical decomposition.
    NFD,
    /// Normalization Form KD: compatibility decomposition.
    NFKD,
}
13
14
pub fn normalize_with<F: Fn(&str, &mut String)>(
15
ca: &StringChunked,
16
normalizer: F,
17
) -> StringChunked {
18
let mut buffer = String::new();
19
let mut builder = StringChunkedBuilder::new(ca.name().clone(), ca.len());
20
for opt_s in ca.iter() {
21
if let Some(s) = opt_s {
22
buffer.clear();
23
normalizer(s, &mut buffer);
24
builder.append_value(&buffer);
25
} else {
26
builder.append_null();
27
}
28
}
29
builder.finish()
30
}
31
32
pub fn normalize(ca: &StringChunked, form: UnicodeForm) -> StringChunked {
33
match form {
34
UnicodeForm::NFC => normalize_with(ca, |s, b| b.extend(s.nfc())),
35
UnicodeForm::NFKC => normalize_with(ca, |s, b| b.extend(s.nfkc())),
36
UnicodeForm::NFD => normalize_with(ca, |s, b| b.extend(s.nfd())),
37
UnicodeForm::NFKD => normalize_with(ca, |s, b| b.extend(s.nfkd())),
38
}
39
}
40
41