Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/array/specification.rs
6939 views
1
use polars_error::{PolarsResult, polars_bail, polars_err};
2
3
use crate::array::DictionaryKey;
4
use crate::offset::{Offset, Offsets, OffsetsBuffer};
5
6
/// Helper trait to support `Offset` and `OffsetBuffer`
7
pub trait OffsetsContainer<O> {
8
fn last(&self) -> usize;
9
fn as_slice(&self) -> &[O];
10
}
11
12
impl<O: Offset> OffsetsContainer<O> for OffsetsBuffer<O> {
13
#[inline]
14
fn last(&self) -> usize {
15
self.last().to_usize()
16
}
17
18
#[inline]
19
fn as_slice(&self) -> &[O] {
20
self.buffer()
21
}
22
}
23
24
impl<O: Offset> OffsetsContainer<O> for Offsets<O> {
25
#[inline]
26
fn last(&self) -> usize {
27
self.last().to_usize()
28
}
29
30
#[inline]
31
fn as_slice(&self) -> &[O] {
32
self.as_slice()
33
}
34
}
35
36
pub(crate) fn try_check_offsets_bounds<O: Offset>(
37
offsets: &[O],
38
values_len: usize,
39
) -> PolarsResult<()> {
40
if offsets.last().unwrap().to_usize() > values_len {
41
polars_bail!(ComputeError: "offsets must not exceed the values length")
42
} else {
43
Ok(())
44
}
45
}
46
47
/// # Error
48
/// * any offset is larger or equal to `values_len`.
49
/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or
50
pub fn try_check_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> PolarsResult<()> {
51
if offsets.len() == 1 {
52
return Ok(());
53
}
54
assert!(offsets.len() > 1);
55
let end = offsets.last().unwrap().to_usize();
56
let start = offsets.first().unwrap().to_usize();
57
58
try_check_offsets_bounds(offsets, values.len())?;
59
let values_range = &values[start..end];
60
61
if values_range.is_ascii() {
62
Ok(())
63
} else {
64
simdutf8::basic::from_utf8(values_range)?;
65
66
// offsets can be == values.len()
67
// find first offset from the end that is smaller
68
// Example:
69
// values.len() = 10
70
// offsets = [0, 5, 10, 10]
71
let last = offsets
72
.iter()
73
.enumerate()
74
.skip(1)
75
.rev()
76
.find_map(|(i, offset)| (offset.to_usize() < values.len()).then(|| i));
77
78
let last = if let Some(last) = last {
79
// following the example: last = 1 (offset = 5)
80
last
81
} else {
82
// given `l = values.len()`, this branch is hit iff either:
83
// * `offsets = [0, l, l, ...]`, which was covered by `from_utf8(values)` above
84
// * `offsets = [0]`, which never happens because offsets.as_slice().len() == 1 is short-circuited above
85
return Ok(());
86
};
87
88
// truncate to relevant offsets. Note: `=last` because last was computed skipping the first item
89
// following the example: starts = [0, 5]
90
let starts = unsafe { offsets.get_unchecked(..=last) };
91
92
let mut any_invalid = false;
93
for start in starts {
94
let start = start.to_usize();
95
96
// SAFETY: `try_check_offsets_bounds` just checked for bounds
97
let b = *unsafe { values.get_unchecked(start) };
98
99
// A valid code-point iff it does not start with 0b10xxxxxx
100
// Bit-magic taken from `std::str::is_char_boundary`
101
any_invalid |= (b as i8) < -0x40;
102
}
103
if any_invalid {
104
polars_bail!(ComputeError: "non-valid char boundary detected")
105
}
106
Ok(())
107
}
108
}
109
110
/// Check dictionary indexes without checking usize conversion.
111
/// # Safety
112
/// The caller must ensure that `K::as_usize` always succeeds.
113
pub(crate) unsafe fn check_indexes_unchecked<K: DictionaryKey>(
114
keys: &[K],
115
len: usize,
116
) -> PolarsResult<()> {
117
let mut invalid = false;
118
119
// this loop is auto-vectorized
120
keys.iter().for_each(|k| invalid |= k.as_usize() > len);
121
122
if invalid {
123
let key = keys.iter().map(|k| k.as_usize()).max().unwrap();
124
polars_bail!(ComputeError: "one of the dictionary keys is {key} but it must be < than the length of the dictionary values, which is {len}")
125
} else {
126
Ok(())
127
}
128
}
129
130
pub fn check_indexes<K>(keys: &[K], len: usize) -> PolarsResult<()>
131
where
132
K: std::fmt::Debug + Copy + TryInto<usize>,
133
{
134
keys.iter().try_for_each(|key| {
135
let key: usize = (*key)
136
.try_into()
137
.map_err(|_| polars_err!(ComputeError: "The dictionary key must fit in a `usize`, but {key:?} does not")
138
)?;
139
if key >= len {
140
polars_bail!(ComputeError: "one of the dictionary keys is {key} but it must be < than the length of the dictionary values, which is {len}")
141
} else {
142
Ok(())
143
}
144
})
145
}
146
147
#[cfg(test)]
148
mod tests {
149
use proptest::prelude::*;
150
151
use super::*;
152
153
pub(crate) fn binary_strategy() -> impl Strategy<Value = Vec<u8>> {
154
prop::collection::vec(any::<u8>(), 1..100)
155
}
156
157
proptest! {
158
// a bit expensive, feel free to run it when changing the code above
159
// #![proptest_config(ProptestConfig::with_cases(100000))]
160
#[test]
161
#[cfg_attr(miri, ignore)] // miri and proptest do not work well
162
fn check_utf8_validation(values in binary_strategy()) {
163
164
for offset in 0..values.len() - 1 {
165
let offsets: OffsetsBuffer<i32> = vec![0, offset as i32, values.len() as i32].try_into().unwrap();
166
167
let mut is_valid = std::str::from_utf8(&values[..offset]).is_ok();
168
is_valid &= std::str::from_utf8(&values[offset..]).is_ok();
169
170
assert_eq!(try_check_utf8::<i32>(&offsets, &values).is_ok(), is_valid)
171
}
172
}
173
}
174
}
175
176