Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-json/src/json/write/utf8.rs
6939 views
1
// Adapted from https://github.com/serde-rs/json/blob/f901012df66811354cb1d490ad59480d8fdf77b5/src/ser.rs
2
use std::io;
3
4
use arrow::array::{Array, MutableBinaryViewArray, Utf8ViewArray};
5
6
use crate::json::write::new_serializer;
7
8
pub fn write_str<W>(writer: &mut W, value: &str) -> io::Result<()>
9
where
10
W: io::Write,
11
{
12
writer.write_all(b"\"")?;
13
let bytes = value.as_bytes();
14
15
let mut start = 0;
16
17
for (i, &byte) in bytes.iter().enumerate() {
18
let escape = ESCAPE[byte as usize];
19
if escape == 0 {
20
continue;
21
}
22
23
if start < i {
24
writer.write_all(&bytes[start..i])?;
25
}
26
27
let char_escape = CharEscape::from_escape_table(escape, byte);
28
write_char_escape(writer, char_escape)?;
29
30
start = i + 1;
31
}
32
33
if start != bytes.len() {
34
writer.write_all(&bytes[start..])?;
35
}
36
writer.write_all(b"\"")
37
}
38
39
// Escape codes stored in `ESCAPE`. Each value is the character that follows
// the backslash in the JSON escape sequence for the byte(s) noted on the right.
const BB: u8 = b'b'; // \x08
const TT: u8 = b't'; // \x09
const NN: u8 = b'n'; // \x0A
const FF: u8 = b'f'; // \x0C
const RR: u8 = b'r'; // \x0D
const QU: u8 = b'"'; // \x22
const BS: u8 = b'\\'; // \x5C
const UU: u8 = b'u'; // \x00...\x1F except the ones above
const __: u8 = 0; // byte is written as-is

/// Builds the contents of [`ESCAPE`] at compile time.
const fn build_escape_table() -> [u8; 256] {
    // Start with "no escape" everywhere.
    let mut table = [__; 256];

    // Every byte in \x00...\x1F is escaped as `\u00XX` ...
    let mut byte = 0;
    while byte < 0x20 {
        table[byte] = UU;
        byte += 1;
    }

    // ... unless it has a dedicated short escape.
    table[0x08] = BB;
    table[0x09] = TT;
    table[0x0A] = NN;
    table[0x0C] = FF;
    table[0x0D] = RR;

    // The quote and the backslash are the only escaped bytes above \x1F.
    table[0x22] = QU;
    table[0x5C] = BS;

    table
}

// Lookup table of escape sequences. A value of b'x' at index i means that byte
// i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped.
static ESCAPE: [u8; 256] = build_escape_table();
70
71
/// Represents a character escape code in a type-safe manner.
///
/// There is no variant for the solidus `/`: the escape table in this file
/// leaves that byte unescaped.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CharEscape {
    /// An escaped quote `"`
    Quote,
    /// An escaped reverse solidus `\`
    ReverseSolidus,
    /// An escaped backspace character (usually escaped as `\b`)
    Backspace,
    /// An escaped form feed character (usually escaped as `\f`)
    FormFeed,
    /// An escaped line feed character (usually escaped as `\n`)
    LineFeed,
    /// An escaped carriage return character (usually escaped as `\r`)
    CarriageReturn,
    /// An escaped tab character (usually escaped as `\t`)
    Tab,
    /// An escaped ASCII plane control character (usually escaped as
    /// `\u00XX` where `XX` are two hex characters). The payload is the raw
    /// byte being escaped.
    AsciiControl(u8),
}
93
94
impl CharEscape {
95
#[inline]
96
fn from_escape_table(escape: u8, byte: u8) -> CharEscape {
97
match escape {
98
self::BB => CharEscape::Backspace,
99
self::TT => CharEscape::Tab,
100
self::NN => CharEscape::LineFeed,
101
self::FF => CharEscape::FormFeed,
102
self::RR => CharEscape::CarriageReturn,
103
self::QU => CharEscape::Quote,
104
self::BS => CharEscape::ReverseSolidus,
105
self::UU => CharEscape::AsciiControl(byte),
106
_ => unreachable!(),
107
}
108
}
109
}
110
111
#[inline]
112
fn write_char_escape<W>(writer: &mut W, char_escape: CharEscape) -> io::Result<()>
113
where
114
W: io::Write,
115
{
116
use self::CharEscape::*;
117
118
let s = match char_escape {
119
Quote => b"\\\"",
120
ReverseSolidus => b"\\\\",
121
//Solidus => b"\\/",
122
Backspace => b"\\b",
123
FormFeed => b"\\f",
124
LineFeed => b"\\n",
125
CarriageReturn => b"\\r",
126
Tab => b"\\t",
127
AsciiControl(byte) => {
128
static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
129
let bytes = &[
130
b'\\',
131
b'u',
132
b'0',
133
b'0',
134
HEX_DIGITS[(byte >> 4) as usize],
135
HEX_DIGITS[(byte & 0xF) as usize],
136
];
137
return writer.write_all(bytes);
138
},
139
};
140
141
writer.write_all(s)
142
}
143
144
/// Collects every item produced by the serializer for `array` into a
/// [`Utf8ViewArray`], one string value per item.
///
/// The serializer is obtained from `new_serializer(array, 0, usize::MAX)`.
// NOTE(review): the meaning of the `0` / `usize::MAX` arguments is not visible
// here (presumably an offset and a maximum length) - confirm against
// `new_serializer`.
pub fn serialize_to_utf8(array: &dyn Array) -> Utf8ViewArray {
    let mut serializer = new_serializer(array, 0, usize::MAX);
    // Capacity is reserved for `array.len()` entries up front.
    let mut values = MutableBinaryViewArray::with_capacity(array.len());

    while let Some(json_bytes) = serializer.next() {
        // SAFETY: assumes the serializer only ever yields valid UTF-8 (it is
        // expected to emit JSON text) - TODO confirm against `new_serializer`.
        let json_text = unsafe { std::str::from_utf8_unchecked(json_bytes) };
        values.push_value(json_text);
    }

    values.into()
}
153
154