CoCalc -- case.rs

GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/chunked_array/strings/case.rs
6939 views
1
use polars_core::prelude::StringChunked;
2

3
// Inlined from std.
4
/// Converts the leading ASCII portion of `b` into `out`, one fixed-size chunk at a time.
///
/// `out` is cleared and given capacity for at least `b.len()` bytes. The input is then
/// walked in chunks of `2 * size_of::<usize>()` bytes; every chunk that is entirely ASCII
/// is mapped through `convert` and appended to `out`. Processing stops at the first chunk
/// that contains a non-ASCII byte, or when fewer bytes than a full chunk remain.
///
/// On return `out.len()` is the number of input bytes consumed. It is always a multiple
/// of the chunk size, and every consumed byte was ASCII. The caller is responsible for
/// handling `b[out.len()..]`.
fn convert_while_ascii(b: &[u8], convert: fn(&u8) -> u8, out: &mut Vec<u8>) {
    // Two machine words per step.
    const CHUNK_LEN: usize = 2 * size_of::<usize>();

    out.clear();
    out.reserve(b.len());

    // `chunks_exact` never yields the short tail, so only whole chunks are considered.
    for chunk in b.chunks_exact(CHUNK_LEN) {
        // A single non-ASCII byte ends the fast path; the whole chunk is left untouched.
        if !chunk.is_ascii() {
            break;
        }

        // Capacity was reserved up front, so this append does not reallocate.
        out.extend(chunk.iter().map(convert));
    }
}
44

45
fn to_lowercase_helper(s: &str, buf: &mut Vec<u8>) {
46
    convert_while_ascii(s.as_bytes(), u8::to_ascii_lowercase, buf);
47

48
    // SAFETY: we know this is a valid char boundary since
49
    // out.len() is only progressed if ASCII bytes are found.
50
    let rest = unsafe { s.get_unchecked(buf.len()..) };
51

52
    // SAFETY: We have written only valid ASCII to our vec.
53
    let mut s = unsafe { String::from_utf8_unchecked(std::mem::take(buf)) };
54

55
    for (i, c) in rest[..].char_indices() {
56
        if c == 'Σ' {
57
            // Σ maps to σ, except at the end of a word where it maps to ς.
58
            // This is the only conditional (contextual) but language-independent mapping
59
            // in `SpecialCasing.txt`,
60
            // so hard-code it rather than have a generic "condition" mechanism.
61
            // See https://github.com/rust-lang/rust/issues/26035
62
            map_uppercase_sigma(rest, i, &mut s)
63
        } else {
64
            s.extend(c.to_lowercase());
65
        }
66
    }
67

68
    fn map_uppercase_sigma(from: &str, i: usize, to: &mut String) {
69
        // See https://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
70
        // for the definition of `Final_Sigma`.
71
        debug_assert!('Σ'.len_utf8() == 2);
72
        let is_word_final = case_ignorable_then_cased(from[..i].chars().rev())
73
            && !case_ignorable_then_cased(from[i + 2..].chars());
74
        to.push_str(if is_word_final { "ς" } else { "σ" });
75
    }
76

77
    fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
78
        #[cfg(feature = "nightly")]
79
        use core::unicode::{Case_Ignorable, Cased};
80

81
        #[cfg(not(feature = "nightly"))]
82
        use super::unicode_internals::{Case_Ignorable, Cased};
83
        #[allow(clippy::skip_while_next)]
84
        match iter.skip_while(|&c| Case_Ignorable(c)).next() {
85
            Some(c) => Cased(c),
86
            None => false,
87
        }
88
    }
89

90
    // Put buf back for next iteration.
91
    *buf = s.into_bytes();
92
}
93

94
pub(super) fn to_lowercase<'a>(ca: &'a StringChunked) -> StringChunked {
95
    // Amortize allocation.
96
    let mut buf = Vec::new();
97
    let f = |s: &'a str| -> &'a str {
98
        to_lowercase_helper(s, &mut buf);
99
        // SAFETY: apply_mut will copy value from buf before next iteration.
100
        let slice = unsafe { std::str::from_utf8_unchecked(&buf) };
101
        unsafe { std::mem::transmute::<&str, &'a str>(slice) }
102
    };
103
    ca.apply_mut(f)
104
}
105

106
// Inlined from std.
107
pub(super) fn to_uppercase<'a>(ca: &'a StringChunked) -> StringChunked {
108
    // Amortize allocation.
109
    let mut buf = Vec::new();
110
    let f = |s: &'a str| -> &'a str {
111
        convert_while_ascii(s.as_bytes(), u8::to_ascii_uppercase, &mut buf);
112

113
        // SAFETY: we know this is a valid char boundary since
114
        // out.len() is only progressed if ascii bytes are found.
115
        let rest = unsafe { s.get_unchecked(buf.len()..) };
116

117
        // SAFETY: We have written only valid ASCII to our vec.
118
        let mut s = unsafe { String::from_utf8_unchecked(std::mem::take(&mut buf)) };
119

120
        for c in rest.chars() {
121
            s.extend(c.to_uppercase());
122
        }
123

124
        // Put buf back for next iteration.
125
        buf = s.into_bytes();
126

127
        // SAFETY: apply_mut will copy value from buf before next iteration.
128
        let slice = unsafe { std::str::from_utf8_unchecked(&buf) };
129
        unsafe { std::mem::transmute::<&str, &'a str>(slice) }
130
    };
131
    ca.apply_mut(f)
132
}
133

134
/// Returns a new `StringChunked` in which every value of `ca` is title-cased.
///
/// Each value is first lowercased in full. Then the first character, and every character
/// that follows a non-alphanumeric character, is uppercased; all other characters are
/// kept as lowercased.
#[cfg(feature = "nightly")]
pub(super) fn to_titlecase<'a>(ca: &'a StringChunked) -> StringChunked {
    // Holds the lowercased intermediate form of the current value.
    // This costs a second copy: lowercase into here first, then build the result.
    let mut lowered_bytes = Vec::new();
    // Holds the final result; reused across values to amortize allocation.
    let mut titled_bytes = Vec::new();

    ca.apply_mut(|value: &'a str| -> &'a str {
        to_lowercase_helper(value, &mut lowered_bytes);
        // SAFETY: `to_lowercase_helper` leaves the bytes of a `String` in the buffer.
        let lowered = unsafe { std::str::from_utf8_unchecked(&lowered_bytes) };

        titled_bytes.clear();
        // SAFETY: the buffer is clear, empty string is valid UTF-8.
        let mut titled =
            unsafe { String::from_utf8_unchecked(std::mem::take(&mut titled_bytes)) };

        // True at the start of the value and right after any non-alphanumeric character.
        let mut at_word_start = true;
        for ch in lowered.chars() {
            match at_word_start {
                true => titled.extend(ch.to_uppercase()),
                false => titled.push(ch),
            }
            at_word_start = !ch.is_alphanumeric();
        }

        // Hand the allocation back so the next value can reuse it.
        titled_bytes = titled.into_bytes();

        // SAFETY: `titled_bytes` holds the bytes of a `String`, hence valid UTF-8. The
        // lifetime is extended to `'a` on the understanding that apply_mut will copy
        // value from the buffer before next iteration.
        unsafe {
            let view = std::str::from_utf8_unchecked(&titled_bytes);
            std::mem::transmute::<&str, &'a str>(view)
        }
    })
}
169

170
Product

Resources

Company