Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-compute/src/cast/utf8_to.rs
8446 views
1
use arrow::array::*;
2
use arrow::datatypes::ArrowDataType;
3
use arrow::offset::Offset;
4
use arrow::types::NativeType;
5
use polars_buffer::Buffer;
6
use polars_error::PolarsResult;
7
use polars_utils::vec::PushUnchecked;
8
9
/// strftime-style format string describing an RFC 3339 timestamp: date, a literal `T`,
/// time, fractional seconds and a UTC offset written with a colon.
// NOTE(review): the `%.f` / `%:z` specifiers look like chrono's format syntax — confirm at the call sites.
pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z";
10
11
pub(super) fn utf8_to_dictionary_dyn<O: Offset, K: DictionaryKey>(
12
from: &dyn Array,
13
) -> PolarsResult<Box<dyn Array>> {
14
let values = from.as_any().downcast_ref().unwrap();
15
utf8_to_dictionary::<O, K>(values).map(|x| Box::new(x) as Box<dyn Array>)
16
}
17
18
/// Cast [`Utf8Array`] to [`DictionaryArray`], also known as packing.
19
/// # Errors
20
/// This function errors if the maximum key is smaller than the number of distinct elements
21
/// in the array.
22
pub fn utf8_to_dictionary<O: Offset, K: DictionaryKey>(
23
from: &Utf8Array<O>,
24
) -> PolarsResult<DictionaryArray<K>> {
25
let mut array = MutableDictionaryArray::<K, MutableUtf8Array<O>>::empty_with_value_dtype(
26
from.dtype().clone(),
27
);
28
array.reserve(from.len());
29
array.try_extend(from.iter())?;
30
31
Ok(array.into())
32
}
33
34
/// Conversion of utf8
35
pub fn utf8_to_large_utf8(from: &Utf8Array<i32>) -> Utf8Array<i64> {
36
let dtype = Utf8Array::<i64>::default_dtype();
37
let validity = from.validity().cloned();
38
let values = from.values().clone();
39
40
let offsets = from.offsets().into();
41
// SAFETY: sound because `values` fulfills the same invariants as `from.values()`
42
unsafe { Utf8Array::<i64>::new_unchecked(dtype, offsets, values, validity) }
43
}
44
45
/// Conversion of utf8
46
pub fn utf8_large_to_utf8(from: &Utf8Array<i64>) -> PolarsResult<Utf8Array<i32>> {
47
let dtype = Utf8Array::<i32>::default_dtype();
48
let validity = from.validity().cloned();
49
let values = from.values().clone();
50
let offsets = from.offsets().try_into()?;
51
52
// SAFETY: sound because `values` fulfills the same invariants as `from.values()`
53
Ok(unsafe { Utf8Array::<i32>::new_unchecked(dtype, offsets, values, validity) })
54
}
55
56
/// Conversion to binary
57
pub fn utf8_to_binary<O: Offset>(from: &Utf8Array<O>, to_dtype: ArrowDataType) -> BinaryArray<O> {
58
// SAFETY: erasure of an invariant is always safe
59
BinaryArray::<O>::new(
60
to_dtype,
61
from.offsets().clone(),
62
from.values().clone(),
63
from.validity().cloned(),
64
)
65
}
66
67
// Integer type used to decide whether a buffer offset still fits into the current buffer.
// Under `cfg(test)` a deliberately tiny type is used so the overflow path can be triggered
// with small inputs.
#[cfg(test)]
type OffsetType = i8;

// Outside of tests the offset type is `u32`.
#[cfg(not(test))]
type OffsetType = u32;
74
75
// If we don't do this the GC of binview will trigger. As we will split up buffers into multiple
76
// chunks so that we don't overflow the offset u32.
77
fn truncate_buffer(buf: &Buffer<u8>) -> Buffer<u8> {
78
// * 2, as it must be able to hold u32::MAX offset + u32::MAX len.
79
let len = std::cmp::min(buf.len(), ((OffsetType::MAX as u64) * 2) as usize);
80
buf.clone().sliced(..len)
81
}
82
83
pub fn binary_to_binview<O: Offset>(arr: &BinaryArray<O>) -> BinaryViewArray {
84
// Ensure we didn't accidentally set wrong type
85
#[cfg(not(debug_assertions))]
86
let _ = std::mem::transmute::<OffsetType, u32>;
87
88
let mut views = Vec::with_capacity(arr.len());
89
let mut uses_buffer = false;
90
91
let mut base_buffer = arr.values().clone();
92
// Offset into the buffer
93
let mut base_ptr = base_buffer.as_ptr() as usize;
94
95
// Offset into the binview buffers
96
let mut buffer_idx = 0_u32;
97
98
// Binview buffers
99
// Note that the buffer may look far further than u32::MAX, but as we don't clone data
100
let mut buffers = vec![truncate_buffer(&base_buffer)];
101
102
for bytes in arr.values_iter() {
103
let len: u32 = bytes
104
.len()
105
.try_into()
106
.expect("max string/binary length exceeded");
107
108
let mut payload = [0; 16];
109
payload[0..4].copy_from_slice(&len.to_le_bytes());
110
111
if len <= 12 {
112
payload[4..4 + bytes.len()].copy_from_slice(bytes);
113
} else {
114
uses_buffer = true;
115
116
// Copy the parts we know are correct.
117
unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked(0..4)) };
118
payload[0..4].copy_from_slice(&len.to_le_bytes());
119
120
let current_bytes_ptr = bytes.as_ptr() as usize;
121
let offset = current_bytes_ptr - base_ptr;
122
123
// Here we check the overflow of the buffer offset.
124
if let Ok(offset) = OffsetType::try_from(offset) {
125
#[allow(clippy::unnecessary_cast)]
126
let offset = offset as u32;
127
payload[12..16].copy_from_slice(&offset.to_le_bytes());
128
payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes());
129
} else {
130
let len = base_buffer.len() - offset;
131
132
// Set new buffer
133
base_buffer = base_buffer.clone().sliced(offset..offset + len);
134
base_ptr = base_buffer.as_ptr() as usize;
135
136
// And add the (truncated) one to the buffers
137
buffers.push(truncate_buffer(&base_buffer));
138
buffer_idx = buffer_idx.checked_add(1).expect("max buffers exceeded");
139
140
let offset = 0u32;
141
payload[12..16].copy_from_slice(&offset.to_le_bytes());
142
payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes());
143
}
144
}
145
146
let value = View::from_le_bytes(payload);
147
unsafe { views.push_unchecked(value) };
148
}
149
let buffers = if uses_buffer {
150
Buffer::from(buffers)
151
} else {
152
Buffer::new()
153
};
154
unsafe {
155
BinaryViewArray::new_unchecked_unknown_md(
156
ArrowDataType::BinaryView,
157
views.into(),
158
buffers,
159
arr.validity().cloned(),
160
None,
161
)
162
}
163
}
164
165
/// Convert a [`Utf8Array`] into a [`Utf8ViewArray`].
///
/// The array is first viewed as binary, converted with [`binary_to_binview`], and the
/// resulting binary view array is then reinterpreted as utf8 without validation.
pub fn utf8_to_utf8view<O: Offset>(arr: &Utf8Array<O>) -> Utf8ViewArray {
    let binary = arr.to_binary();
    let binview = binary_to_binview(&binary);
    // SAFETY: the bytes originate from a `Utf8Array`, which is expected to hold valid utf8.
    unsafe { binview.to_utf8view_unchecked() }
}
168
169
#[cfg(test)]
mod test {
    use super::*;

    /// With the tiny test `OffsetType`, a handful of long strings is enough to overflow the
    /// buffer offset several times and force the conversion to emit multiple data buffers.
    #[test]
    fn overflowing_utf8_to_binview() {
        // Too long to be inlined, so every occurrence must be referenced through a buffer.
        const LONG: &str =
            "lksafjdlkakjslkjsafkjdalkjfalkdsalkjfaslkfjlkakdsjfkajfksdajfkasjdflkasjdf";

        // Long values interleaved with short ones that are stored inline.
        let values = [
            LONG, "123", LONG, LONG, LONG, "234", LONG, LONG, LONG, LONG, "324",
        ];
        let array = Utf8Array::<i64>::from_slice(values);

        let out = utf8_to_utf8view(&array);
        // Ensure we hit the multiple buffers part.
        assert_eq!(out.data_buffers().len(), 4);
        // Ensure we created a valid binview.
        let out = out.values_iter().collect::<Vec<_>>();
        assert_eq!(out, values);
    }
}
198
199