Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-compute/src/cast/binview_to.rs
8440 views
1
use std::ptr::copy_nonoverlapping;
2
3
use arrow::array::*;
4
use arrow::bitmap::MutableBitmap;
5
use arrow::datatypes::{ArrowDataType, Field, TimeUnit};
6
use arrow::offset::Offset;
7
use arrow::types::NativeType;
8
use bytemuck::cast_slice_mut;
9
use chrono::Datelike;
10
use num_traits::FromBytes;
11
use polars_error::{PolarsResult, polars_err};
12
13
use super::CastOptionsImpl;
14
use super::binary_to::Parse;
15
use super::temporal::EPOCH_DAYS_FROM_CE;
16
#[cfg(feature = "dtype-decimal")]
17
use crate::decimal::str_to_dec128;
18
19
/// Timestamp format pattern (date, `T` separator, time with optional fractional seconds,
/// and a colon-separated UTC offset) that [`utf8view_to_naive_timestamp`] hands to the
/// temporal parser.
// NOTE(review): the `%.f` / `%:z` specifiers look like chrono's strftime extensions — confirm.
pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z";
20
21
/// Cast [`BinaryViewArray`] to [`DictionaryArray`], also known as packing.
22
/// # Errors
23
/// This function errors if the maximum key is smaller than the number of distinct elements
24
/// in the array.
25
pub(super) fn binview_to_dictionary<K: DictionaryKey>(
26
from: &BinaryViewArray,
27
) -> PolarsResult<DictionaryArray<K>> {
28
let mut array = MutableDictionaryArray::<K, MutableBinaryViewArray<[u8]>>::new();
29
array.reserve(from.len());
30
array.try_extend(from.iter())?;
31
32
Ok(array.into())
33
}
34
35
pub(super) fn utf8view_to_dictionary<K: DictionaryKey>(
36
from: &Utf8ViewArray,
37
) -> PolarsResult<DictionaryArray<K>> {
38
let mut array = MutableDictionaryArray::<K, MutableBinaryViewArray<str>>::new();
39
array.reserve(from.len());
40
array.try_extend(from.iter())?;
41
42
Ok(array.into())
43
}
44
45
pub(super) fn view_to_binary<O: Offset>(array: &BinaryViewArray) -> BinaryArray<O> {
46
let len: usize = Array::len(array);
47
let mut mutable = MutableBinaryValuesArray::<O>::with_capacities(len, array.total_bytes_len());
48
for slice in array.values_iter() {
49
mutable.push(slice)
50
}
51
let out: BinaryArray<O> = mutable.into();
52
out.with_validity(array.validity().cloned())
53
}
54
55
/// Converts a [`Utf8ViewArray`] into an offset-based [`Utf8Array`].
///
/// The input is viewed as binary, converted with [`view_to_binary`], and the resulting
/// offsets, values and validity are reassembled into a UTF-8 array without re-validation.
pub fn utf8view_to_utf8<O: Offset>(array: &Utf8ViewArray) -> Utf8Array<O> {
    let binary = view_to_binary::<O>(&array.to_binview());

    let dtype = Utf8Array::<O>::default_dtype();
    let offsets = binary.offsets().clone();
    let values = binary.values().clone();
    let validity = binary.validity().cloned();

    // SAFETY: the bytes were copied value-by-value from a `Utf8ViewArray`, so each
    // offset-delimited slice is one of its strings.
    // NOTE(review): relies on `Utf8ViewArray` only ever holding valid UTF-8 — confirm.
    unsafe { Utf8Array::new_unchecked(dtype, offsets, values, validity) }
}
69
70
/// Parses a [`Utf8ViewArray`] with text representations of numbers into a
71
/// [`PrimitiveArray`], making any unparsable value a Null.
72
pub(super) fn utf8view_to_primitive<T>(
73
from: &Utf8ViewArray,
74
to: &ArrowDataType,
75
) -> PrimitiveArray<T>
76
where
77
T: NativeType + Parse,
78
{
79
let iter = from
80
.iter()
81
.map(|x| x.and_then::<T, _>(|x| T::parse(x.as_bytes())));
82
83
PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())
84
}
85
86
/// Parses a `&dyn` [`Array`] of UTF-8 encoded string representations of numbers
87
/// into a [`PrimitiveArray`], making any unparsable value a Null.
88
pub(super) fn utf8view_to_primitive_dyn<T>(
89
from: &dyn Array,
90
to: &ArrowDataType,
91
options: CastOptionsImpl,
92
) -> PolarsResult<Box<dyn Array>>
93
where
94
T: NativeType + Parse,
95
{
96
let from = from.as_any().downcast_ref().unwrap();
97
if options.partial {
98
unimplemented!()
99
} else {
100
Ok(Box::new(utf8view_to_primitive::<T>(from, to)))
101
}
102
}
103
104
/// Parses the byte strings of a [`BinaryViewArray`] into a 128-bit decimal
/// [`PrimitiveArray`] with the given `precision` and `scale`.
///
/// Null inputs stay null; values for which `str_to_dec128` yields `None` become null.
#[cfg(feature = "dtype-decimal")]
pub fn binview_to_decimal(
    array: &BinaryViewArray,
    precision: usize,
    scale: usize,
) -> PrimitiveArray<i128> {
    let parsed = array
        .iter()
        .map(|bytes| -> Option<i128> { str_to_dec128(bytes?, precision, scale, false) });

    let dtype = ArrowDataType::Decimal(precision, scale);
    PrimitiveArray::<i128>::from_trusted_len_iter(parsed).to(dtype)
}
117
118
pub(super) fn utf8view_to_naive_timestamp_dyn(
119
from: &dyn Array,
120
time_unit: TimeUnit,
121
) -> PolarsResult<Box<dyn Array>> {
122
let from = from.as_any().downcast_ref().unwrap();
123
Ok(Box::new(utf8view_to_naive_timestamp(from, time_unit)))
124
}
125
126
/// [`super::temporal::utf8view_to_timestamp`] applied for RFC3339 formatting
127
pub fn utf8view_to_naive_timestamp(
128
from: &Utf8ViewArray,
129
time_unit: TimeUnit,
130
) -> PrimitiveArray<i64> {
131
super::temporal::utf8view_to_naive_timestamp(from, RFC3339, time_unit)
132
}
133
134
pub(super) fn utf8view_to_date32(from: &Utf8ViewArray) -> PrimitiveArray<i32> {
135
let iter = from.iter().map(|x| {
136
x.and_then(|x| {
137
x.parse::<chrono::NaiveDate>()
138
.ok()
139
.map(|x| x.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
140
})
141
});
142
PrimitiveArray::<i32>::from_trusted_len_iter(iter).to(ArrowDataType::Date32)
143
}
144
145
pub(super) fn utf8view_to_date32_dyn(from: &dyn Array) -> PolarsResult<Box<dyn Array>> {
146
let from = from.as_any().downcast_ref().unwrap();
147
Ok(Box::new(utf8view_to_date32(from)))
148
}
149
150
/// Casts a [`BinaryViewArray`] containing binary-encoded numbers to a
151
/// [`PrimitiveArray`], making any uncastable value a Null.
152
pub(super) fn binview_to_primitive<T>(
153
from: &BinaryViewArray,
154
to: &ArrowDataType,
155
is_little_endian: bool,
156
) -> PrimitiveArray<T>
157
where
158
T: FromBytes + NativeType,
159
for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,
160
{
161
let iter = from.iter().map(|x| {
162
x.and_then::<T, _>(|x| {
163
if is_little_endian {
164
Some(<T as FromBytes>::from_le_bytes(x.try_into().ok()?))
165
} else {
166
Some(<T as FromBytes>::from_be_bytes(x.try_into().ok()?))
167
}
168
})
169
});
170
171
PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())
172
}
173
174
/// Casts a `&dyn` [`Array`] containing binary-encoded numbers to a
175
/// [`PrimitiveArray`], making any uncastable value a Null.
176
/// # Panics
177
/// Panics if `Array` is not a `BinaryViewArray`
178
pub fn binview_to_primitive_dyn<T>(
179
from: &dyn Array,
180
to: &ArrowDataType,
181
is_little_endian: bool,
182
) -> PolarsResult<Box<dyn Array>>
183
where
184
T: FromBytes + NativeType,
185
for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,
186
{
187
let from = from.as_any().downcast_ref().unwrap();
188
Ok(Box::new(binview_to_primitive::<T>(
189
from,
190
to,
191
is_little_endian,
192
)))
193
}
194
195
/// Casts a [`BinaryViewArray`] to a [`FixedSizeListArray`], making any un-castable value a Null.
196
///
197
/// # Arguments
198
///
199
/// * `from`: The array to reinterpret.
200
/// * `array_width`: The number of items in each `Array`.
201
pub(super) fn try_binview_to_fixed_size_list<T, const IS_LITTLE_ENDIAN: bool>(
202
from: &BinaryViewArray,
203
array_width: usize,
204
) -> PolarsResult<FixedSizeListArray>
205
where
206
T: FromBytes + NativeType,
207
for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,
208
{
209
let element_size = std::mem::size_of::<T>();
210
// The maximum number of primitives in the result:
211
let primitive_length = from.len().checked_mul(array_width).ok_or_else(|| {
212
polars_err!(
213
InvalidOperation:
214
"array chunk length * number of items ({} * {}) is too large",
215
from.len(),
216
array_width
217
)
218
})?;
219
// The size of each array, in bytes:
220
let row_size_bytes = element_size.checked_mul(array_width).ok_or_else(|| {
221
polars_err!(
222
InvalidOperation:
223
"array size in bytes ({} * {}) is too large",
224
element_size,
225
array_width
226
)
227
})?;
228
229
let mut out: Vec<T> = vec![T::zeroed(); primitive_length];
230
let (out_u8_ptr, out_len_bytes) = {
231
let out_u8_slice = cast_slice_mut::<_, u8>(out.as_mut());
232
(out_u8_slice.as_mut_ptr(), out_u8_slice.len())
233
};
234
assert_eq!(out_len_bytes, row_size_bytes * from.len());
235
let mut validity = MutableBitmap::from_len_set(from.len());
236
237
for (index, value) in from.iter().enumerate() {
238
if let Some(value) = value
239
&& value.len() == row_size_bytes
240
{
241
if cfg!(target_endian = "little") && IS_LITTLE_ENDIAN {
242
// Fast path, we can just copy the data with no need to
243
// reinterpret.
244
let write_index = index * row_size_bytes;
245
debug_assert!(value.is_empty() || write_index < out_len_bytes);
246
debug_assert!(value.is_empty() || (write_index + value.len() - 1 < out_len_bytes));
247
// # Safety
248
// - The start index is smaller than `out`'s capacity.
249
// - The end index is smaller than `out`'s capacity.
250
unsafe {
251
copy_nonoverlapping(value.as_ptr(), out_u8_ptr.add(write_index), value.len());
252
}
253
} else {
254
// Slow path, reinterpret items one by one.
255
for j in 0..array_width {
256
let jth_range = (j * element_size)..((j + 1) * element_size);
257
debug_assert!(value.get(jth_range.clone()).is_some());
258
// # Safety
259
// We made sure the range is smaller than `value` length.
260
let jth_bytes = unsafe { value.get_unchecked(jth_range) };
261
// # Safety
262
// We just made sure that the slice has length `element_size`
263
let byte_array = unsafe { jth_bytes.try_into().unwrap_unchecked() };
264
let jth_value = if IS_LITTLE_ENDIAN {
265
<T as FromBytes>::from_le_bytes(byte_array)
266
} else {
267
<T as FromBytes>::from_be_bytes(byte_array)
268
};
269
270
let write_index = array_width * index + j;
271
debug_assert!(write_index < out.len());
272
// # Safety
273
// - The target index is smaller than the vector's pre-allocated capacity.
274
unsafe {
275
*out.get_unchecked_mut(write_index) = jth_value;
276
}
277
}
278
}
279
} else {
280
validity.set(index, false);
281
};
282
}
283
284
FixedSizeListArray::try_new(
285
ArrowDataType::FixedSizeList(
286
Box::new(Field::new("".into(), T::PRIMITIVE.into(), true)),
287
array_width,
288
),
289
from.len(),
290
Box::new(PrimitiveArray::<T>::from_vec(out)),
291
validity.into(),
292
)
293
}
294
295
/// Casts a `dyn` [`Array`] to a [`FixedSizeListArray`], making any un-castable value a Null.
296
///
297
/// # Arguments
298
///
299
/// * `from`: The array to reinterpret.
300
/// * `array_width`: The number of items in each `Array`.
301
///
302
/// # Panics
303
/// Panics if `from` is not `BinaryViewArray`.
304
pub fn binview_to_fixed_size_list_dyn<T>(
305
from: &dyn Array,
306
array_width: usize,
307
is_little_endian: bool,
308
) -> PolarsResult<Box<dyn Array>>
309
where
310
T: FromBytes + NativeType,
311
for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,
312
{
313
let from = from.as_any().downcast_ref().unwrap();
314
315
let result = if is_little_endian {
316
try_binview_to_fixed_size_list::<T, true>(from, array_width)
317
} else {
318
try_binview_to_fixed_size_list::<T, false>(from, array_width)
319
}?;
320
Ok(Box::new(result))
321
}
322
323