Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-compute/src/cast/binary_to.rs
6939 views
1
use std::sync::Arc;
2
3
use arrow::array::*;
4
use arrow::buffer::Buffer;
5
use arrow::datatypes::ArrowDataType;
6
use arrow::offset::{Offset, Offsets};
7
use arrow::types::NativeType;
8
use polars_error::PolarsResult;
9
10
use super::CastOptionsImpl;
11
12
pub(super) trait Parse {
13
fn parse(val: &[u8]) -> Option<Self>
14
where
15
Self: Sized;
16
}
17
18
macro_rules! impl_parse {
19
($primitive_type:ident) => {
20
impl Parse for $primitive_type {
21
fn parse(val: &[u8]) -> Option<Self> {
22
atoi_simd::parse_skipped(val).ok()
23
}
24
}
25
};
26
}
27
impl_parse!(i8);
28
impl_parse!(i16);
29
impl_parse!(i32);
30
impl_parse!(i64);
31
32
impl_parse!(u8);
33
impl_parse!(u16);
34
impl_parse!(u32);
35
impl_parse!(u64);
36
37
#[cfg(feature = "dtype-i128")]
38
impl_parse!(i128);
39
40
impl Parse for f32 {
41
fn parse(val: &[u8]) -> Option<Self>
42
where
43
Self: Sized,
44
{
45
fast_float2::parse(val).ok()
46
}
47
}
48
impl Parse for f64 {
49
fn parse(val: &[u8]) -> Option<Self>
50
where
51
Self: Sized,
52
{
53
fast_float2::parse(val).ok()
54
}
55
}
56
57
/// Conversion of binary
58
pub fn binary_to_large_binary(
59
from: &BinaryArray<i32>,
60
to_dtype: ArrowDataType,
61
) -> BinaryArray<i64> {
62
let values = from.values().clone();
63
BinaryArray::<i64>::new(
64
to_dtype,
65
from.offsets().into(),
66
values,
67
from.validity().cloned(),
68
)
69
}
70
71
/// Conversion of binary
72
pub fn binary_large_to_binary(
73
from: &BinaryArray<i64>,
74
to_dtype: ArrowDataType,
75
) -> PolarsResult<BinaryArray<i32>> {
76
let values = from.values().clone();
77
let offsets = from.offsets().try_into()?;
78
Ok(BinaryArray::<i32>::new(
79
to_dtype,
80
offsets,
81
values,
82
from.validity().cloned(),
83
))
84
}
85
86
/// Conversion to utf8
87
pub fn binary_to_utf8<O: Offset>(
88
from: &BinaryArray<O>,
89
to_dtype: ArrowDataType,
90
) -> PolarsResult<Utf8Array<O>> {
91
Utf8Array::<O>::try_new(
92
to_dtype,
93
from.offsets().clone(),
94
from.values().clone(),
95
from.validity().cloned(),
96
)
97
}
98
99
/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null.
100
pub(super) fn binary_to_primitive<O: Offset, T>(
101
from: &BinaryArray<O>,
102
to: &ArrowDataType,
103
) -> PrimitiveArray<T>
104
where
105
T: NativeType + Parse,
106
{
107
let iter = from.iter().map(|x| x.and_then::<T, _>(|x| T::parse(x)));
108
109
PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())
110
}
111
112
pub(super) fn binary_to_primitive_dyn<O: Offset, T>(
113
from: &dyn Array,
114
to: &ArrowDataType,
115
options: CastOptionsImpl,
116
) -> PolarsResult<Box<dyn Array>>
117
where
118
T: NativeType + Parse,
119
{
120
let from = from.as_any().downcast_ref().unwrap();
121
if options.partial {
122
unimplemented!()
123
} else {
124
Ok(Box::new(binary_to_primitive::<O, T>(from, to)))
125
}
126
}
127
128
/// Cast [`BinaryArray`] to [`DictionaryArray`], also known as packing.
129
/// # Errors
130
/// This function errors if the maximum key is smaller than the number of distinct elements
131
/// in the array.
132
pub fn binary_to_dictionary<O: Offset, K: DictionaryKey>(
133
from: &BinaryArray<O>,
134
) -> PolarsResult<DictionaryArray<K>> {
135
let mut array = MutableDictionaryArray::<K, MutableBinaryArray<O>>::new();
136
array.reserve(from.len());
137
array.try_extend(from.iter())?;
138
139
Ok(array.into())
140
}
141
142
pub(super) fn binary_to_dictionary_dyn<O: Offset, K: DictionaryKey>(
143
from: &dyn Array,
144
) -> PolarsResult<Box<dyn Array>> {
145
let values = from.as_any().downcast_ref().unwrap();
146
binary_to_dictionary::<O, K>(values).map(|x| Box::new(x) as Box<dyn Array>)
147
}
148
149
fn fixed_size_to_offsets<O: Offset>(values_len: usize, fixed_size: usize) -> Offsets<O> {
150
let offsets = (0..(values_len + 1))
151
.step_by(fixed_size)
152
.map(|v| O::from_as_usize(v))
153
.collect();
154
// SAFETY:
155
// * every element is `>= 0`
156
// * element at position `i` is >= than element at position `i-1`.
157
unsafe { Offsets::new_unchecked(offsets) }
158
}
159
160
/// Conversion of `FixedSizeBinary` to `Binary`.
161
pub fn fixed_size_binary_binary<O: Offset>(
162
from: &FixedSizeBinaryArray,
163
to_dtype: ArrowDataType,
164
) -> BinaryArray<O> {
165
let values = from.values().clone();
166
let offsets = fixed_size_to_offsets(values.len(), from.size());
167
BinaryArray::<O>::new(to_dtype, offsets.into(), values, from.validity().cloned())
168
}
169
170
pub fn fixed_size_binary_to_binview(from: &FixedSizeBinaryArray) -> BinaryViewArray {
171
let datatype = <[u8] as ViewType>::DATA_TYPE;
172
173
// Fast path: all the views are inlineable
174
if from.size() <= View::MAX_INLINE_SIZE as usize {
175
// @NOTE: There is something with the code-generation of `View::new_inline_unchecked` that
176
// prevents it from properly SIMD-ing this loop. It insists on memcpying while it should
177
// know that the size is really small. Dispatching over the `from.size()` and making it
178
// constant does make loop SIMD, but it does not actually speed anything up and the code it
179
// generates is still horrible.
180
//
181
// This is really slow, and I don't think it has to be.
182
183
// SAFETY: We checked that slice.len() <= View::MAX_INLINE_SIZE before
184
let mut views = Vec::new();
185
View::extend_with_inlinable_strided(
186
&mut views,
187
from.values().as_slice(),
188
from.size() as u8,
189
);
190
let views = Buffer::from(views);
191
return BinaryViewArray::try_new(datatype, views, Arc::default(), from.validity().cloned())
192
.unwrap();
193
}
194
195
const MAX_BYTES_PER_BUFFER: usize = u32::MAX as usize;
196
197
let size = from.size();
198
let num_bytes = from.len() * size;
199
let num_buffers = num_bytes.div_ceil(MAX_BYTES_PER_BUFFER);
200
assert!(num_buffers < u32::MAX as usize);
201
202
let num_elements_per_buffer = MAX_BYTES_PER_BUFFER / size;
203
// This is NOT equal to MAX_BYTES_PER_BUFFER because of integer division
204
let split_point = num_elements_per_buffer * size;
205
206
// This is zero-copy for the buffer since split just increases the data since
207
let mut buffer = from.values().clone();
208
let mut buffers = Vec::with_capacity(num_buffers);
209
210
if let Some(num_buffers) = num_buffers.checked_sub(1) {
211
for _ in 0..num_buffers {
212
let slice;
213
(slice, buffer) = buffer.split_at(split_point);
214
buffers.push(slice);
215
}
216
buffers.push(buffer);
217
}
218
219
let mut iter = from.values_iter();
220
let iter = iter.by_ref();
221
let mut views = Vec::with_capacity(from.len());
222
for buffer_idx in 0..num_buffers {
223
views.extend(
224
iter.take(num_elements_per_buffer)
225
.enumerate()
226
.map(|(i, slice)| {
227
// SAFETY: We checked that slice.len() > View::MAX_INLINE_SIZE before
228
unsafe {
229
View::new_noninline_unchecked(slice, buffer_idx as u32, (i * size) as u32)
230
}
231
}),
232
);
233
}
234
let views = views.into();
235
236
BinaryViewArray::try_new(datatype, views, buffers.into(), from.validity().cloned()).unwrap()
237
}
238
239
/// Conversion of binary
240
pub fn binary_to_list<O: Offset>(from: &BinaryArray<O>, to_dtype: ArrowDataType) -> ListArray<O> {
241
let values = from.values().clone();
242
let values = PrimitiveArray::new(ArrowDataType::UInt8, values, None);
243
ListArray::<O>::new(
244
to_dtype,
245
from.offsets().clone(),
246
values.boxed(),
247
from.validity().cloned(),
248
)
249
}
250
251