Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/chunked_array/strings/case.rs
6939 views
1
use polars_core::prelude::StringChunked;
2
3
// Inlined from std.
4
/// Case-converts the leading ASCII portion of `b` into `out`, one fixed-size block at a time.
///
/// `out` is cleared first. Input is consumed in blocks of `BLOCK` bytes
/// (two machine words); each block that contains only ASCII bytes is passed
/// byte-by-byte through `convert` and appended to `out`. Processing stops at
/// the first block containing a non-ASCII byte, or when fewer than `BLOCK`
/// bytes remain.
///
/// On return `out.len()` is the number of leading bytes of `b` that were
/// converted. It is always a multiple of `BLOCK`, and every input byte before
/// that offset is ASCII. The caller is responsible for the remaining tail.
fn convert_while_ascii(b: &[u8], convert: fn(&u8) -> u8, out: &mut Vec<u8>) {
    const WORD: usize = size_of::<usize>();
    const WORDS_PER_BLOCK: usize = 2;
    const BLOCK: usize = WORD * WORDS_PER_BLOCK;
    // The high bit of every byte in a word; any set bit means a non-ASCII byte.
    const HIGH_BITS: usize = usize::from_ne_bytes([0x80; WORD]);

    out.clear();
    out.reserve(b.len());

    // Number of input bytes converted so far (equivalently, initialised bytes in `out`).
    let mut done = 0;

    // SAFETY:
    // - The loop condition guarantees `done..done + BLOCK` is in bounds of `b`.
    // - `out` is empty with capacity of at least `b.len()`, so the same range
    //   is in bounds of its spare capacity.
    // - Word reads stay inside the `BLOCK`-byte source chunk and are unaligned
    //   reads, so no alignment requirement applies.
    // - `set_len(done)` only exposes bytes that were written in the loop.
    unsafe {
        while done + BLOCK <= b.len() {
            let src = b.get_unchecked(done..done + BLOCK);
            let dst = out.spare_capacity_mut().get_unchecked_mut(done..done + BLOCK);

            // OR all words of the block together so a single mask test
            // detects any non-ASCII byte.
            let words = src.as_ptr().cast::<usize>();
            let mut combined = 0usize;
            for k in 0..WORDS_PER_BLOCK {
                combined |= words.add(k).read_unaligned();
            }
            if combined & HIGH_BITS != 0 {
                // Non-ASCII data in this block: keep only what was converted before it.
                break;
            }

            // Convert the whole block; both sides are exactly `BLOCK` long.
            for (slot, byte) in dst.iter_mut().zip(src) {
                slot.write(convert(byte));
            }

            done += BLOCK;
        }
        out.set_len(done);
    }
}
44
45
fn to_lowercase_helper(s: &str, buf: &mut Vec<u8>) {
46
convert_while_ascii(s.as_bytes(), u8::to_ascii_lowercase, buf);
47
48
// SAFETY: we know this is a valid char boundary since
49
// out.len() is only progressed if ASCII bytes are found.
50
let rest = unsafe { s.get_unchecked(buf.len()..) };
51
52
// SAFETY: We have written only valid ASCII to our vec.
53
let mut s = unsafe { String::from_utf8_unchecked(std::mem::take(buf)) };
54
55
for (i, c) in rest[..].char_indices() {
56
if c == 'Σ' {
57
// Σ maps to σ, except at the end of a word where it maps to ς.
58
// This is the only conditional (contextual) but language-independent mapping
59
// in `SpecialCasing.txt`,
60
// so hard-code it rather than have a generic "condition" mechanism.
61
// See https://github.com/rust-lang/rust/issues/26035
62
map_uppercase_sigma(rest, i, &mut s)
63
} else {
64
s.extend(c.to_lowercase());
65
}
66
}
67
68
fn map_uppercase_sigma(from: &str, i: usize, to: &mut String) {
69
// See https://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
70
// for the definition of `Final_Sigma`.
71
debug_assert!('Σ'.len_utf8() == 2);
72
let is_word_final = case_ignorable_then_cased(from[..i].chars().rev())
73
&& !case_ignorable_then_cased(from[i + 2..].chars());
74
to.push_str(if is_word_final { "ς" } else { "σ" });
75
}
76
77
fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
78
#[cfg(feature = "nightly")]
79
use core::unicode::{Case_Ignorable, Cased};
80
81
#[cfg(not(feature = "nightly"))]
82
use super::unicode_internals::{Case_Ignorable, Cased};
83
#[allow(clippy::skip_while_next)]
84
match iter.skip_while(|&c| Case_Ignorable(c)).next() {
85
Some(c) => Cased(c),
86
None => false,
87
}
88
}
89
90
// Put buf back for next iteration.
91
*buf = s.into_bytes();
92
}
93
94
pub(super) fn to_lowercase<'a>(ca: &'a StringChunked) -> StringChunked {
95
// Amortize allocation.
96
let mut buf = Vec::new();
97
let f = |s: &'a str| -> &'a str {
98
to_lowercase_helper(s, &mut buf);
99
// SAFETY: apply_mut will copy value from buf before next iteration.
100
let slice = unsafe { std::str::from_utf8_unchecked(&buf) };
101
unsafe { std::mem::transmute::<&str, &'a str>(slice) }
102
};
103
ca.apply_mut(f)
104
}
105
106
// Inlined from std.
107
pub(super) fn to_uppercase<'a>(ca: &'a StringChunked) -> StringChunked {
108
// Amortize allocation.
109
let mut buf = Vec::new();
110
let f = |s: &'a str| -> &'a str {
111
convert_while_ascii(s.as_bytes(), u8::to_ascii_uppercase, &mut buf);
112
113
// SAFETY: we know this is a valid char boundary since
114
// out.len() is only progressed if ascii bytes are found.
115
let rest = unsafe { s.get_unchecked(buf.len()..) };
116
117
// SAFETY: We have written only valid ASCII to our vec.
118
let mut s = unsafe { String::from_utf8_unchecked(std::mem::take(&mut buf)) };
119
120
for c in rest.chars() {
121
s.extend(c.to_uppercase());
122
}
123
124
// Put buf back for next iteration.
125
buf = s.into_bytes();
126
127
// SAFETY: apply_mut will copy value from buf before next iteration.
128
let slice = unsafe { std::str::from_utf8_unchecked(&buf) };
129
unsafe { std::mem::transmute::<&str, &'a str>(slice) }
130
};
131
ca.apply_mut(f)
132
}
133
134
/// Returns a new `StringChunked` in which every value of `ca` is title-cased.
///
/// Each value is first lowercased with `to_lowercase_helper`. Then the first
/// char of the value, and every char that follows a non-alphanumeric char,
/// is uppercased; all other chars are kept as lowercased.
#[cfg(feature = "nightly")]
pub(super) fn to_titlecase<'a>(ca: &'a StringChunked) -> StringChunked {
    // Output buffer, shared by all values so the allocation is amortized.
    let mut out_bytes = Vec::new();

    // Holds the intermediate lowercased value. This costs a second copy:
    // lowercase into here first, then title-case into `out_bytes`.
    let mut lowered_bytes = Vec::new();

    ca.apply_mut(|value: &'a str| -> &'a str {
        to_lowercase_helper(value, &mut lowered_bytes);
        // SAFETY: `to_lowercase_helper` leaves the bytes of a `String` in the
        // buffer, so they are valid UTF-8.
        let lowered = unsafe { std::str::from_utf8_unchecked(&lowered_bytes) };

        out_bytes.clear();
        // SAFETY: the buffer was just cleared, and the empty string is valid UTF-8.
        let mut titled = unsafe { String::from_utf8_unchecked(std::mem::take(&mut out_bytes)) };

        // True at the start of the value and right after any non-alphanumeric char.
        let mut at_word_start = true;
        for ch in lowered.chars() {
            if at_word_start {
                titled.extend(ch.to_uppercase());
            } else {
                titled.push(ch);
            }
            at_word_start = !ch.is_alphanumeric();
        }

        // Hand the allocation back so the next value can reuse it.
        out_bytes = titled.into_bytes();

        // SAFETY:
        // - `out_bytes` holds the bytes of a `String`, so they are valid UTF-8.
        // - The lifetime is extended to `'a` only to satisfy the closure
        //   signature; this relies on `apply_mut` copying the returned value
        //   out of `out_bytes` before the closure runs again.
        unsafe {
            let result = std::str::from_utf8_unchecked(&out_bytes);
            std::mem::transmute::<&str, &'a str>(result)
        }
    })
}
169
170