Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-compute/src/cast/binview_to.rs
6939 views
1
use std::ptr::copy_nonoverlapping;
2
3
use arrow::array::*;
4
use arrow::bitmap::MutableBitmap;
5
#[cfg(feature = "dtype-decimal")]
6
use arrow::compute::decimal::deserialize_decimal;
7
use arrow::datatypes::{ArrowDataType, Field, TimeUnit};
8
use arrow::offset::Offset;
9
use arrow::types::NativeType;
10
use bytemuck::cast_slice_mut;
11
use chrono::Datelike;
12
use num_traits::FromBytes;
13
use polars_error::{PolarsResult, polars_err};
14
15
use super::CastOptionsImpl;
16
use super::binary_to::Parse;
17
use super::temporal::EPOCH_DAYS_FROM_CE;
18
19
/// Format string handed to the timestamp parser in this module for RFC 3339-style text
/// (date, `T`, time with optional fraction, then a `+HH:MM`-style offset).
// NOTE(review): presumably interpreted with chrono's strftime syntax — confirm against the parser.
pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z";
20
21
/// Cast [`BinaryViewArray`] to [`DictionaryArray`], also known as packing.
22
/// # Errors
23
/// This function errors if the maximum key is smaller than the number of distinct elements
24
/// in the array.
25
pub(super) fn binview_to_dictionary<K: DictionaryKey>(
26
from: &BinaryViewArray,
27
) -> PolarsResult<DictionaryArray<K>> {
28
let mut array = MutableDictionaryArray::<K, MutableBinaryViewArray<[u8]>>::new();
29
array.reserve(from.len());
30
array.try_extend(from.iter())?;
31
32
Ok(array.into())
33
}
34
35
pub(super) fn utf8view_to_dictionary<K: DictionaryKey>(
36
from: &Utf8ViewArray,
37
) -> PolarsResult<DictionaryArray<K>> {
38
let mut array = MutableDictionaryArray::<K, MutableBinaryViewArray<str>>::new();
39
array.reserve(from.len());
40
array.try_extend(from.iter())?;
41
42
Ok(array.into())
43
}
44
45
pub(super) fn view_to_binary<O: Offset>(array: &BinaryViewArray) -> BinaryArray<O> {
46
let len: usize = Array::len(array);
47
let mut mutable = MutableBinaryValuesArray::<O>::with_capacities(len, array.total_bytes_len());
48
for slice in array.values_iter() {
49
mutable.push(slice)
50
}
51
let out: BinaryArray<O> = mutable.into();
52
out.with_validity(array.validity().cloned())
53
}
54
55
/// Converts a [`Utf8ViewArray`] into an offset-based [`Utf8Array`].
///
/// The data is first converted as binary via [`view_to_binary`] and then
/// re-wrapped as UTF-8 without re-validating the bytes.
pub fn utf8view_to_utf8<O: Offset>(array: &Utf8ViewArray) -> Utf8Array<O> {
    let binary = view_to_binary::<O>(&array.to_binview());

    let offsets = binary.offsets().clone();
    let values = binary.values().clone();
    let validity = binary.validity().cloned();

    // SAFETY: offsets and values come from a `BinaryArray` built out of the
    // bytes of a `Utf8ViewArray`.
    // NOTE(review): this relies on `Utf8ViewArray` only holding valid UTF-8 — confirm.
    unsafe { Utf8Array::new_unchecked(Utf8Array::<O>::default_dtype(), offsets, values, validity) }
}
69
70
/// Parses a [`Utf8ViewArray`] with text representations of numbers into a
71
/// [`PrimitiveArray`], making any unparsable value a Null.
72
pub(super) fn utf8view_to_primitive<T>(
73
from: &Utf8ViewArray,
74
to: &ArrowDataType,
75
) -> PrimitiveArray<T>
76
where
77
T: NativeType + Parse,
78
{
79
let iter = from
80
.iter()
81
.map(|x| x.and_then::<T, _>(|x| T::parse(x.as_bytes())));
82
83
PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())
84
}
85
86
/// Parses a `&dyn` [`Array`] of UTF-8 encoded string representations of numbers
87
/// into a [`PrimitiveArray`], making any unparsable value a Null.
88
pub(super) fn utf8view_to_primitive_dyn<T>(
89
from: &dyn Array,
90
to: &ArrowDataType,
91
options: CastOptionsImpl,
92
) -> PolarsResult<Box<dyn Array>>
93
where
94
T: NativeType + Parse,
95
{
96
let from = from.as_any().downcast_ref().unwrap();
97
if options.partial {
98
unimplemented!()
99
} else {
100
Ok(Box::new(utf8view_to_primitive::<T>(from, to)))
101
}
102
}
103
104
/// Parses the decimal text stored in a [`BinaryViewArray`] into a
/// [`PrimitiveArray<i128>`] with a `Decimal` dtype.
///
/// Null inputs stay null, and values rejected by `deserialize_decimal` become
/// null. When `precision` is `None`, the resulting dtype uses a precision of 38.
#[cfg(feature = "dtype-decimal")]
pub fn binview_to_decimal(
    array: &BinaryViewArray,
    precision: Option<usize>,
    scale: usize,
) -> PrimitiveArray<i128> {
    // NOTE(review): both narrowing casts truncate silently for values above
    // `u8::MAX`; presumably callers never pass such values — confirm.
    let precision_u8 = precision.map(|p| p as u8);
    let scale_u8 = scale as u8;

    let parsed = array.iter().map(|maybe_bytes| -> Option<i128> {
        deserialize_decimal(maybe_bytes?, precision_u8, scale_u8)
    });

    let dtype = ArrowDataType::Decimal(precision_u8.unwrap_or(38).into(), scale);
    PrimitiveArray::<i128>::from_trusted_len_iter(parsed).to(dtype)
}
121
122
pub(super) fn utf8view_to_naive_timestamp_dyn(
123
from: &dyn Array,
124
time_unit: TimeUnit,
125
) -> PolarsResult<Box<dyn Array>> {
126
let from = from.as_any().downcast_ref().unwrap();
127
Ok(Box::new(utf8view_to_naive_timestamp(from, time_unit)))
128
}
129
130
/// [`super::temporal::utf8view_to_timestamp`] applied for RFC3339 formatting
131
pub fn utf8view_to_naive_timestamp(
132
from: &Utf8ViewArray,
133
time_unit: TimeUnit,
134
) -> PrimitiveArray<i64> {
135
super::temporal::utf8view_to_naive_timestamp(from, RFC3339, time_unit)
136
}
137
138
pub(super) fn utf8view_to_date32(from: &Utf8ViewArray) -> PrimitiveArray<i32> {
139
let iter = from.iter().map(|x| {
140
x.and_then(|x| {
141
x.parse::<chrono::NaiveDate>()
142
.ok()
143
.map(|x| x.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
144
})
145
});
146
PrimitiveArray::<i32>::from_trusted_len_iter(iter).to(ArrowDataType::Date32)
147
}
148
149
pub(super) fn utf8view_to_date32_dyn(from: &dyn Array) -> PolarsResult<Box<dyn Array>> {
150
let from = from.as_any().downcast_ref().unwrap();
151
Ok(Box::new(utf8view_to_date32(from)))
152
}
153
154
/// Casts a [`BinaryViewArray`] containing binary-encoded numbers to a
155
/// [`PrimitiveArray`], making any uncastable value a Null.
156
pub(super) fn binview_to_primitive<T>(
157
from: &BinaryViewArray,
158
to: &ArrowDataType,
159
is_little_endian: bool,
160
) -> PrimitiveArray<T>
161
where
162
T: FromBytes + NativeType,
163
for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,
164
{
165
let iter = from.iter().map(|x| {
166
x.and_then::<T, _>(|x| {
167
if is_little_endian {
168
Some(<T as FromBytes>::from_le_bytes(x.try_into().ok()?))
169
} else {
170
Some(<T as FromBytes>::from_be_bytes(x.try_into().ok()?))
171
}
172
})
173
});
174
175
PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())
176
}
177
178
/// Casts a `&dyn` [`Array`] containing binary-encoded numbers to a
179
/// [`PrimitiveArray`], making any uncastable value a Null.
180
/// # Panics
181
/// Panics if `Array` is not a `BinaryViewArray`
182
pub fn binview_to_primitive_dyn<T>(
183
from: &dyn Array,
184
to: &ArrowDataType,
185
is_little_endian: bool,
186
) -> PolarsResult<Box<dyn Array>>
187
where
188
T: FromBytes + NativeType,
189
for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,
190
{
191
let from = from.as_any().downcast_ref().unwrap();
192
Ok(Box::new(binview_to_primitive::<T>(
193
from,
194
to,
195
is_little_endian,
196
)))
197
}
198
199
/// Casts a [`BinaryViewArray`] to a [`FixedSizeListArray`], making any un-castable value a Null.
200
///
201
/// # Arguments
202
///
203
/// * `from`: The array to reinterpret.
204
/// * `array_width`: The number of items in each `Array`.
205
pub(super) fn try_binview_to_fixed_size_list<T, const IS_LITTLE_ENDIAN: bool>(
206
from: &BinaryViewArray,
207
array_width: usize,
208
) -> PolarsResult<FixedSizeListArray>
209
where
210
T: FromBytes + NativeType,
211
for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,
212
{
213
let element_size = std::mem::size_of::<T>();
214
// The maximum number of primitives in the result:
215
let primitive_length = from.len().checked_mul(array_width).ok_or_else(|| {
216
polars_err!(
217
InvalidOperation:
218
"array chunk length * number of items ({} * {}) is too large",
219
from.len(),
220
array_width
221
)
222
})?;
223
// The size of each array, in bytes:
224
let row_size_bytes = element_size.checked_mul(array_width).ok_or_else(|| {
225
polars_err!(
226
InvalidOperation:
227
"array size in bytes ({} * {}) is too large",
228
element_size,
229
array_width
230
)
231
})?;
232
233
let mut out: Vec<T> = vec![T::zeroed(); primitive_length];
234
let (out_u8_ptr, out_len_bytes) = {
235
let out_u8_slice = cast_slice_mut::<_, u8>(out.as_mut());
236
(out_u8_slice.as_mut_ptr(), out_u8_slice.len())
237
};
238
assert_eq!(out_len_bytes, row_size_bytes * from.len());
239
let mut validity = MutableBitmap::from_len_set(from.len());
240
241
for (index, value) in from.iter().enumerate() {
242
if let Some(value) = value
243
&& value.len() == row_size_bytes
244
{
245
if cfg!(target_endian = "little") && IS_LITTLE_ENDIAN {
246
// Fast path, we can just copy the data with no need to
247
// reinterpret.
248
let write_index = index * row_size_bytes;
249
debug_assert!(value.is_empty() || write_index < out_len_bytes);
250
debug_assert!(value.is_empty() || (write_index + value.len() - 1 < out_len_bytes));
251
// # Safety
252
// - The start index is smaller than `out`'s capacity.
253
// - The end index is smaller than `out`'s capacity.
254
unsafe {
255
copy_nonoverlapping(value.as_ptr(), out_u8_ptr.add(write_index), value.len());
256
}
257
} else {
258
// Slow path, reinterpret items one by one.
259
for j in 0..array_width {
260
let jth_range = (j * element_size)..((j + 1) * element_size);
261
debug_assert!(value.get(jth_range.clone()).is_some());
262
// # Safety
263
// We made sure the range is smaller than `value` length.
264
let jth_bytes = unsafe { value.get_unchecked(jth_range) };
265
// # Safety
266
// We just made sure that the slice has length `element_size`
267
let byte_array = unsafe { jth_bytes.try_into().unwrap_unchecked() };
268
let jth_value = if IS_LITTLE_ENDIAN {
269
<T as FromBytes>::from_le_bytes(byte_array)
270
} else {
271
<T as FromBytes>::from_be_bytes(byte_array)
272
};
273
274
let write_index = array_width * index + j;
275
debug_assert!(write_index < out.len());
276
// # Safety
277
// - The target index is smaller than the vector's pre-allocated capacity.
278
unsafe {
279
*out.get_unchecked_mut(write_index) = jth_value;
280
}
281
}
282
}
283
} else {
284
validity.set(index, false);
285
};
286
}
287
288
FixedSizeListArray::try_new(
289
ArrowDataType::FixedSizeList(
290
Box::new(Field::new("".into(), T::PRIMITIVE.into(), true)),
291
array_width,
292
),
293
from.len(),
294
Box::new(PrimitiveArray::<T>::from_vec(out)),
295
validity.into(),
296
)
297
}
298
299
/// Casts a `dyn` [`Array`] to a [`FixedSizeListArray`], making any un-castable value a Null.
300
///
301
/// # Arguments
302
///
303
/// * `from`: The array to reinterpret.
304
/// * `array_width`: The number of items in each `Array`.
305
///
306
/// # Panics
307
/// Panics if `from` is not `BinaryViewArray`.
308
pub fn binview_to_fixed_size_list_dyn<T>(
309
from: &dyn Array,
310
array_width: usize,
311
is_little_endian: bool,
312
) -> PolarsResult<Box<dyn Array>>
313
where
314
T: FromBytes + NativeType,
315
for<'a> &'a <T as FromBytes>::Bytes: TryFrom<&'a [u8]>,
316
{
317
let from = from.as_any().downcast_ref().unwrap();
318
319
let result = if is_little_endian {
320
try_binview_to_fixed_size_list::<T, true>(from, array_width)
321
} else {
322
try_binview_to_fixed_size_list::<T, false>(from, array_width)
323
}?;
324
Ok(Box::new(result))
325
}
326
327