Path: blob/main/crates/polars-ops/src/chunked_array/strings/case.rs
6939 views
use polars_core::prelude::StringChunked;12// Inlined from std.3fn convert_while_ascii(b: &[u8], convert: fn(&u8) -> u8, out: &mut Vec<u8>) {4out.clear();5out.reserve(b.len());67const USIZE_SIZE: usize = size_of::<usize>();8const MAGIC_UNROLL: usize = 2;9const N: usize = USIZE_SIZE * MAGIC_UNROLL;10const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; USIZE_SIZE]);1112let mut i = 0;13unsafe {14while i + N <= b.len() {15// SAFETY: we have checks the sizes `b` and `out`.16let in_chunk = b.get_unchecked(i..i + N);17let out_chunk = out.spare_capacity_mut().get_unchecked_mut(i..i + N);1819let mut bits = 0;20for j in 0..MAGIC_UNROLL {21// Read the bytes 1 usize at a time (unaligned since we haven't checked the alignment).22// SAFETY: in_chunk is valid bytes in the range.23bits |= in_chunk.as_ptr().cast::<usize>().add(j).read_unaligned();24}25// If our chunks aren't ascii, then return only the prior bytes as init.26if bits & NONASCII_MASK != 0 {27break;28}2930// Perform the case conversions on N bytes (gets heavily autovec'd).31for j in 0..N {32// SAFETY: in_chunk and out_chunk are valid bytes in the range.33let out = out_chunk.get_unchecked_mut(j);34out.write(convert(in_chunk.get_unchecked(j)));35}3637// Mark these bytes as initialised.38i += N;39}40out.set_len(i);41}42}4344fn to_lowercase_helper(s: &str, buf: &mut Vec<u8>) {45convert_while_ascii(s.as_bytes(), u8::to_ascii_lowercase, buf);4647// SAFETY: we know this is a valid char boundary since48// out.len() is only progressed if ASCII bytes are found.49let rest = unsafe { s.get_unchecked(buf.len()..) };5051// SAFETY: We have written only valid ASCII to our vec.52let mut s = unsafe { String::from_utf8_unchecked(std::mem::take(buf)) };5354for (i, c) in rest[..].char_indices() {55if c == 'Σ' {56// Σ maps to σ, except at the end of a word where it maps to ς.57// This is the only conditional (contextual) but language-independent mapping58// in `SpecialCasing.txt`,59// so hard-code it rather than have a generic "condition" mechanism.60// See https://github.com/rust-lang/rust/issues/2603561map_uppercase_sigma(rest, i, &mut s)62} else {63s.extend(c.to_lowercase());64}65}6667fn map_uppercase_sigma(from: &str, i: usize, to: &mut String) {68// See https://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G3399269// for the definition of `Final_Sigma`.70debug_assert!('Σ'.len_utf8() == 2);71let is_word_final = case_ignorable_then_cased(from[..i].chars().rev())72&& !case_ignorable_then_cased(from[i + 2..].chars());73to.push_str(if is_word_final { "ς" } else { "σ" });74}7576fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {77#[cfg(feature = "nightly")]78use core::unicode::{Case_Ignorable, Cased};7980#[cfg(not(feature = "nightly"))]81use super::unicode_internals::{Case_Ignorable, Cased};82#[allow(clippy::skip_while_next)]83match iter.skip_while(|&c| Case_Ignorable(c)).next() {84Some(c) => Cased(c),85None => false,86}87}8889// Put buf back for next iteration.90*buf = s.into_bytes();91}9293pub(super) fn to_lowercase<'a>(ca: &'a StringChunked) -> StringChunked {94// Amortize allocation.95let mut buf = Vec::new();96let f = |s: &'a str| -> &'a str {97to_lowercase_helper(s, &mut buf);98// SAFETY: apply_mut will copy value from buf before next iteration.99let slice = unsafe { std::str::from_utf8_unchecked(&buf) };100unsafe { std::mem::transmute::<&str, &'a str>(slice) }101};102ca.apply_mut(f)103}104105// Inlined from std.106pub(super) fn to_uppercase<'a>(ca: &'a StringChunked) -> StringChunked {107// Amortize allocation.108let mut buf = Vec::new();109let f = |s: &'a str| -> &'a str {110convert_while_ascii(s.as_bytes(), u8::to_ascii_uppercase, &mut buf);111112// SAFETY: we know this is a valid char boundary since113// out.len() is only progressed if ascii bytes are found.114let rest = unsafe { s.get_unchecked(buf.len()..) };115116// SAFETY: We have written only valid ASCII to our vec.117let mut s = unsafe { String::from_utf8_unchecked(std::mem::take(&mut buf)) };118119for c in rest.chars() {120s.extend(c.to_uppercase());121}122123// Put buf back for next iteration.124buf = s.into_bytes();125126// SAFETY: apply_mut will copy value from buf before next iteration.127let slice = unsafe { std::str::from_utf8_unchecked(&buf) };128unsafe { std::mem::transmute::<&str, &'a str>(slice) }129};130ca.apply_mut(f)131}132133#[cfg(feature = "nightly")]134pub(super) fn to_titlecase<'a>(ca: &'a StringChunked) -> StringChunked {135// Amortize allocation.136let mut buf = Vec::new();137138// Temporary scratch space.139// We have a double copy as we first convert to lowercase and then copy to `buf`.140let mut scratch = Vec::new();141let f = |s: &'a str| -> &'a str {142to_lowercase_helper(s, &mut scratch);143let lowercased = unsafe { std::str::from_utf8_unchecked(&scratch) };144145// SAFETY: the buffer is clear, empty string is valid UTF-8.146buf.clear();147let mut s = unsafe { String::from_utf8_unchecked(std::mem::take(&mut buf)) };148149let mut next_is_upper = true;150for c in lowercased.chars() {151if next_is_upper {152s.extend(c.to_uppercase());153} else {154s.push(c);155}156next_is_upper = !c.is_alphanumeric();157}158159// Put buf back for next iteration.160buf = s.into_bytes();161162// SAFETY: apply_mut will copy value from buf before next iteration.163let slice = unsafe { std::str::from_utf8_unchecked(&buf) };164unsafe { std::mem::transmute::<&str, &'a str>(slice) }165};166ca.apply_mut(f)167}168169170