Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-compute/src/cast/binary_to.rs
8422 views
1
use arrow::array::*;
2
use arrow::datatypes::ArrowDataType;
3
use arrow::offset::{Offset, Offsets};
4
use arrow::types::NativeType;
5
use num_traits::AsPrimitive;
6
use polars_buffer::Buffer;
7
use polars_error::PolarsResult;
8
#[cfg(feature = "dtype-f16")]
9
use polars_utils::float16::pf16;
10
11
use super::CastOptionsImpl;
12
13
pub(super) trait Parse {
14
fn parse(val: &[u8]) -> Option<Self>
15
where
16
Self: Sized;
17
}
18
19
macro_rules! impl_parse {
20
($primitive_type:ident) => {
21
impl Parse for $primitive_type {
22
fn parse(val: &[u8]) -> Option<Self> {
23
atoi_simd::parse_skipped(val).ok()
24
}
25
}
26
};
27
}
28
impl_parse!(i8);
29
impl_parse!(i16);
30
impl_parse!(i32);
31
impl_parse!(i64);
32
#[cfg(feature = "dtype-i128")]
33
impl_parse!(i128);
34
35
impl_parse!(u8);
36
impl_parse!(u16);
37
impl_parse!(u32);
38
impl_parse!(u64);
39
#[cfg(feature = "dtype-u128")]
40
impl_parse!(u128);
41
42
#[cfg(feature = "dtype-f16")]
impl Parse for pf16 {
    /// Parses `bytes` as an `f32` and converts the result to `pf16`.
    ///
    /// Returns `None` when the bytes do not parse as a float.
    fn parse(bytes: &[u8]) -> Option<Self> {
        let wide: f32 = fast_float2::parse(bytes).ok()?;
        Some(wide.as_())
    }
}
51
52
impl Parse for f32 {
53
fn parse(val: &[u8]) -> Option<Self>
54
where
55
Self: Sized,
56
{
57
fast_float2::parse(val).ok()
58
}
59
}
60
impl Parse for f64 {
61
fn parse(val: &[u8]) -> Option<Self>
62
where
63
Self: Sized,
64
{
65
fast_float2::parse(val).ok()
66
}
67
}
68
69
/// Conversion of binary
70
pub fn binary_to_large_binary(
71
from: &BinaryArray<i32>,
72
to_dtype: ArrowDataType,
73
) -> BinaryArray<i64> {
74
let values = from.values().clone();
75
BinaryArray::<i64>::new(
76
to_dtype,
77
from.offsets().into(),
78
values,
79
from.validity().cloned(),
80
)
81
}
82
83
/// Conversion of binary
84
pub fn binary_large_to_binary(
85
from: &BinaryArray<i64>,
86
to_dtype: ArrowDataType,
87
) -> PolarsResult<BinaryArray<i32>> {
88
let values = from.values().clone();
89
let offsets = from.offsets().try_into()?;
90
Ok(BinaryArray::<i32>::new(
91
to_dtype,
92
offsets,
93
values,
94
from.validity().cloned(),
95
))
96
}
97
98
/// Conversion to utf8
99
pub fn binary_to_utf8<O: Offset>(
100
from: &BinaryArray<O>,
101
to_dtype: ArrowDataType,
102
) -> PolarsResult<Utf8Array<O>> {
103
Utf8Array::<O>::try_new(
104
to_dtype,
105
from.offsets().clone(),
106
from.values().clone(),
107
from.validity().cloned(),
108
)
109
}
110
111
/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null.
112
pub(super) fn binary_to_primitive<O: Offset, T>(
113
from: &BinaryArray<O>,
114
to: &ArrowDataType,
115
) -> PrimitiveArray<T>
116
where
117
T: NativeType + Parse,
118
{
119
let iter = from.iter().map(|x| x.and_then::<T, _>(|x| T::parse(x)));
120
121
PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())
122
}
123
124
pub(super) fn binary_to_primitive_dyn<O: Offset, T>(
125
from: &dyn Array,
126
to: &ArrowDataType,
127
options: CastOptionsImpl,
128
) -> PolarsResult<Box<dyn Array>>
129
where
130
T: NativeType + Parse,
131
{
132
let from = from.as_any().downcast_ref().unwrap();
133
if options.partial {
134
unimplemented!()
135
} else {
136
Ok(Box::new(binary_to_primitive::<O, T>(from, to)))
137
}
138
}
139
140
/// Cast [`BinaryArray`] to [`DictionaryArray`], also known as packing.
141
/// # Errors
142
/// This function errors if the maximum key is smaller than the number of distinct elements
143
/// in the array.
144
pub fn binary_to_dictionary<O: Offset, K: DictionaryKey>(
145
from: &BinaryArray<O>,
146
) -> PolarsResult<DictionaryArray<K>> {
147
let mut array = MutableDictionaryArray::<K, MutableBinaryArray<O>>::empty_with_value_dtype(
148
from.dtype().clone(),
149
);
150
array.reserve(from.len());
151
array.try_extend(from.iter())?;
152
153
Ok(array.into())
154
}
155
156
pub(super) fn binary_to_dictionary_dyn<O: Offset, K: DictionaryKey>(
157
from: &dyn Array,
158
) -> PolarsResult<Box<dyn Array>> {
159
let values = from.as_any().downcast_ref().unwrap();
160
binary_to_dictionary::<O, K>(values).map(|x| Box::new(x) as Box<dyn Array>)
161
}
162
163
fn fixed_size_to_offsets<O: Offset>(values_len: usize, fixed_size: usize) -> Offsets<O> {
164
let offsets = (0..(values_len + 1))
165
.step_by(fixed_size)
166
.map(|v| O::from_as_usize(v))
167
.collect();
168
// SAFETY:
169
// * every element is `>= 0`
170
// * element at position `i` is >= than element at position `i-1`.
171
unsafe { Offsets::new_unchecked(offsets) }
172
}
173
174
/// Conversion of `FixedSizeBinary` to `Binary`.
175
pub fn fixed_size_binary_binary<O: Offset>(
176
from: &FixedSizeBinaryArray,
177
to_dtype: ArrowDataType,
178
) -> BinaryArray<O> {
179
let values = from.values().clone();
180
let offsets = fixed_size_to_offsets(values.len(), from.size());
181
BinaryArray::<O>::new(to_dtype, offsets.into(), values, from.validity().cloned())
182
}
183
184
pub fn fixed_size_binary_to_binview(from: &FixedSizeBinaryArray) -> BinaryViewArray {
185
let datatype = <[u8] as ViewType>::DATA_TYPE;
186
187
// Fast path: all the views are inlineable
188
if from.size() <= View::MAX_INLINE_SIZE as usize {
189
// @NOTE: There is something with the code-generation of `View::new_inline_unchecked` that
190
// prevents it from properly SIMD-ing this loop. It insists on memcpying while it should
191
// know that the size is really small. Dispatching over the `from.size()` and making it
192
// constant does make loop SIMD, but it does not actually speed anything up and the code it
193
// generates is still horrible.
194
//
195
// This is really slow, and I don't think it has to be.
196
197
// SAFETY: We checked that slice.len() <= View::MAX_INLINE_SIZE before
198
let mut views = Vec::new();
199
View::extend_with_inlinable_strided(
200
&mut views,
201
from.values().as_slice(),
202
from.size() as u8,
203
);
204
let views = Buffer::from(views);
205
return BinaryViewArray::try_new(datatype, views, Buffer::new(), from.validity().cloned())
206
.unwrap();
207
}
208
209
const MAX_BYTES_PER_BUFFER: usize = u32::MAX as usize;
210
211
let size = from.size();
212
let num_bytes = from.len() * size;
213
let num_buffers = num_bytes.div_ceil(MAX_BYTES_PER_BUFFER);
214
assert!(num_buffers < u32::MAX as usize);
215
216
let num_elements_per_buffer = MAX_BYTES_PER_BUFFER / size;
217
// This is NOT equal to MAX_BYTES_PER_BUFFER because of integer division
218
let split_point = num_elements_per_buffer * size;
219
220
// This is zero-copy for the buffer since split just increases the data since
221
let mut buffer = from.values().clone();
222
let mut buffers = Vec::with_capacity(num_buffers);
223
224
if let Some(num_buffers) = num_buffers.checked_sub(1) {
225
for _ in 0..num_buffers {
226
let slice;
227
(slice, buffer) = buffer.split_at(split_point);
228
buffers.push(slice);
229
}
230
buffers.push(buffer);
231
}
232
233
let mut iter = from.values_iter();
234
let iter = iter.by_ref();
235
let mut views = Vec::with_capacity(from.len());
236
for buffer_idx in 0..num_buffers {
237
views.extend(
238
iter.take(num_elements_per_buffer)
239
.enumerate()
240
.map(|(i, slice)| {
241
// SAFETY: We checked that slice.len() > View::MAX_INLINE_SIZE before
242
unsafe {
243
View::new_noninline_unchecked(slice, buffer_idx as u32, (i * size) as u32)
244
}
245
}),
246
);
247
}
248
let views = views.into();
249
250
BinaryViewArray::try_new(datatype, views, buffers.into(), from.validity().cloned()).unwrap()
251
}
252
253
/// Conversion of binary
254
pub fn binary_to_list<O: Offset>(from: &BinaryArray<O>, to_dtype: ArrowDataType) -> ListArray<O> {
255
let values = from.values().clone();
256
let values = PrimitiveArray::new(ArrowDataType::UInt8, values, None);
257
ListArray::<O>::new(
258
to_dtype,
259
from.offsets().clone(),
260
values.boxed(),
261
from.validity().cloned(),
262
)
263
}
264
265