// Path: blob/main/crates/polars-json/src/json/write/utf8.rs
// 6939 views
// Adapted from https://github.com/serde-rs/json/blob/f901012df66811354cb1d490ad59480d8fdf77b5/src/ser.rs1use std::io;23use arrow::array::{Array, MutableBinaryViewArray, Utf8ViewArray};45use crate::json::write::new_serializer;67pub fn write_str<W>(writer: &mut W, value: &str) -> io::Result<()>8where9W: io::Write,10{11writer.write_all(b"\"")?;12let bytes = value.as_bytes();1314let mut start = 0;1516for (i, &byte) in bytes.iter().enumerate() {17let escape = ESCAPE[byte as usize];18if escape == 0 {19continue;20}2122if start < i {23writer.write_all(&bytes[start..i])?;24}2526let char_escape = CharEscape::from_escape_table(escape, byte);27write_char_escape(writer, char_escape)?;2829start = i + 1;30}3132if start != bytes.len() {33writer.write_all(&bytes[start..])?;34}35writer.write_all(b"\"")36}3738const BB: u8 = b'b'; // \x0839const TT: u8 = b't'; // \x0940const NN: u8 = b'n'; // \x0A41const FF: u8 = b'f'; // \x0C42const RR: u8 = b'r'; // \x0D43const QU: u8 = b'"'; // \x2244const BS: u8 = b'\\'; // \x5C45const UU: u8 = b'u'; // \x00...\x1F except the ones above46const __: u8 = 0;4748// Lookup table of escape sequences. A value of b'x' at index i means that byte49// i is escaped as "\x" in JSON. 
A value of 0 means that byte i is not escaped.50static ESCAPE: [u8; 256] = [51// 1 2 3 4 5 6 7 8 9 A B C D E F52UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 053UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 154__, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 255__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 356__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 457__, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 558__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 659__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 760__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 861__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 962__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A63__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B64__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C65__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D66__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E67__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F68];6970/// Represents a character escape code in a type-safe manner.71pub enum CharEscape {72/// An escaped quote `"`73Quote,74/// An escaped reverse solidus `\`75ReverseSolidus,76// An escaped solidus `/`77//Solidus,78/// An escaped backspace character (usually escaped as `\b`)79Backspace,80/// An escaped form feed character (usually escaped as `\f`)81FormFeed,82/// An escaped line feed character (usually escaped as `\n`)83LineFeed,84/// An escaped carriage return character (usually escaped as `\r`)85CarriageReturn,86/// An escaped tab character (usually escaped as `\t`)87Tab,88/// An escaped ASCII plane control character (usually escaped as89/// `\u00XX` where `XX` are two hex characters)90AsciiControl(u8),91}9293impl CharEscape {94#[inline]95fn 
from_escape_table(escape: u8, byte: u8) -> CharEscape {96match escape {97self::BB => CharEscape::Backspace,98self::TT => CharEscape::Tab,99self::NN => CharEscape::LineFeed,100self::FF => CharEscape::FormFeed,101self::RR => CharEscape::CarriageReturn,102self::QU => CharEscape::Quote,103self::BS => CharEscape::ReverseSolidus,104self::UU => CharEscape::AsciiControl(byte),105_ => unreachable!(),106}107}108}109110#[inline]111fn write_char_escape<W>(writer: &mut W, char_escape: CharEscape) -> io::Result<()>112where113W: io::Write,114{115use self::CharEscape::*;116117let s = match char_escape {118Quote => b"\\\"",119ReverseSolidus => b"\\\\",120//Solidus => b"\\/",121Backspace => b"\\b",122FormFeed => b"\\f",123LineFeed => b"\\n",124CarriageReturn => b"\\r",125Tab => b"\\t",126AsciiControl(byte) => {127static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";128let bytes = &[129b'\\',130b'u',131b'0',132b'0',133HEX_DIGITS[(byte >> 4) as usize],134HEX_DIGITS[(byte & 0xF) as usize],135];136return writer.write_all(bytes);137},138};139140writer.write_all(s)141}142143pub fn serialize_to_utf8(array: &dyn Array) -> Utf8ViewArray {144let mut values = MutableBinaryViewArray::with_capacity(array.len());145let mut serializer = new_serializer(array, 0, usize::MAX);146147while let Some(v) = serializer.next() {148unsafe { values.push_value(std::str::from_utf8_unchecked(v)) }149}150values.into()151}152153154