Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/arrow/read/deserialize/binview/predicate.rs
8480 views
1
//! Specialized kernels to do predicate evaluation directly on the `BinView` Parquet data.
2
3
use arrow::array::View;
4
use arrow::bitmap::BitmapBuilder;
5
use polars_utils::aliases::PlIndexSet;
6
7
use crate::parquet::error::ParquetResult;
8
9
/// Create a mask for when a value is equal to the `needle`.
10
pub fn decode_equals(
11
num_expected_values: usize,
12
values: &[u8],
13
needle: &[u8],
14
pred_true_mask: &mut BitmapBuilder,
15
) -> ParquetResult<()> {
16
if needle.len() <= View::MAX_INLINE_SIZE as usize {
17
let needle_view = View::new_inline(needle);
18
decode_matches(
19
num_expected_values,
20
values,
21
|value| {
22
value.len() == needle.len()
23
&& unsafe { View::new_inline_unchecked(value) } == needle_view
24
},
25
pred_true_mask,
26
)
27
} else {
28
decode_matches(
29
num_expected_values,
30
values,
31
|value| value == needle,
32
pred_true_mask,
33
)
34
}
35
}
36
37
pub fn decode_is_in_no_values_non_inlinable(
38
num_expected_values: usize,
39
values: &[u8],
40
needles: &PlIndexSet<Box<[u8]>>,
41
pred_true_mask: &mut BitmapBuilder,
42
) -> ParquetResult<()> {
43
decode_matches(
44
num_expected_values,
45
values,
46
|value| needles.contains(value),
47
pred_true_mask,
48
)
49
}
50
51
pub fn decode_is_in_non_inlinable(
52
num_expected_values: usize,
53
mut values: &[u8],
54
needles: &PlIndexSet<Box<[u8]>>,
55
needle_views: &[View],
56
target: &mut Vec<View>,
57
total_bytes_len: &mut usize,
58
) -> ParquetResult<()> {
59
assert_eq!(needles.len(), needle_views.len());
60
assert!(!needles.is_empty());
61
62
target.reserve(num_expected_values);
63
let mut next_idx = target.len();
64
for _ in 0..num_expected_values {
65
if values.len() < 4 {
66
return Err(super::invalid_input_err());
67
}
68
69
let length;
70
(length, values) = values.split_at(4);
71
let length: &[u8; 4] = unsafe { length.try_into().unwrap_unchecked() };
72
let length = u32::from_le_bytes(*length);
73
74
if values.len() < length as usize {
75
return Err(super::invalid_input_err());
76
}
77
78
let value;
79
(value, values) = values.split_at(length as usize);
80
81
let needle_idx = needles.get_index_of(value);
82
unsafe {
83
// SAFETY: We checked that 0 < needle_views.len() == needles.len().
84
let view = needle_views.get_unchecked(needle_idx.unwrap_or_default());
85
86
// SAFETY: We reserved enough just before the loop.
87
target.as_mut_ptr().add(next_idx).write(*view);
88
}
89
90
if needle_idx.is_some() {
91
*total_bytes_len += value.len();
92
}
93
next_idx += usize::from(needle_idx.is_some());
94
}
95
96
// SAFETY: We wrote all these items. Note, that views are Copy, so erroring or panicked until
97
// this point won't miss Drop calls.
98
unsafe {
99
target.set_len(next_idx);
100
}
101
102
Ok(())
103
}
104
105
pub fn decode_is_in_no_values_inlinable(
106
num_expected_values: usize,
107
values: &[u8],
108
needles: &[View; 4],
109
pred_true_mask: &mut BitmapBuilder,
110
) -> ParquetResult<()> {
111
decode_matches(
112
num_expected_values,
113
values,
114
|value| {
115
let length = value.len() as u32;
116
if length > View::MAX_INLINE_SIZE {
117
return false;
118
}
119
// SAFETY: we made sure length <= View::MAX_INLINE_SIZE.
120
let mut view = unsafe { View::new_inline_unchecked(value) };
121
view.length = length;
122
needles.contains(&view)
123
},
124
pred_true_mask,
125
)
126
}
127
128
pub fn decode_is_in_inlinable(
129
num_expected_values: usize,
130
mut values: &[u8],
131
needles: &[View; 4],
132
target: &mut Vec<View>,
133
total_bytes_len: &mut usize,
134
) -> ParquetResult<()> {
135
target.reserve(num_expected_values);
136
let mut next_idx = target.len();
137
for _ in 0..num_expected_values {
138
if values.len() < 4 {
139
return Err(super::invalid_input_err());
140
}
141
142
let length;
143
(length, values) = values.split_at(4);
144
let length: &[u8; 4] = unsafe { length.try_into().unwrap_unchecked() };
145
let length = u32::from_le_bytes(*length);
146
147
if values.len() < length as usize {
148
return Err(super::invalid_input_err());
149
}
150
151
// Always advance the slice before checking length.
152
let value;
153
(value, values) = values.split_at(length as usize);
154
155
// Non-inlinable views can't match inlinable needles.
156
if length > View::MAX_INLINE_SIZE {
157
continue;
158
}
159
// SAFETY: we made sure length <= View::MAX_INLINE_SIZE.
160
let mut view = unsafe { View::new_inline_unchecked(value) };
161
view.length = length;
162
163
let is_pred_true = needles.contains(&view);
164
// SAFETY: We reserved enough just before the loop.
165
unsafe {
166
target.as_mut_ptr().add(next_idx).write(view);
167
}
168
if is_pred_true {
169
*total_bytes_len += value.len();
170
}
171
next_idx += usize::from(is_pred_true);
172
}
173
174
// SAFETY: We wrote all these items. Note, that views are Copy, so erroring or panicked until
175
// this point won't miss Drop calls.
176
unsafe {
177
target.set_len(next_idx);
178
}
179
180
Ok(())
181
}
182
183
pub fn decode_matches(
184
num_expected_values: usize,
185
mut values: &[u8],
186
is_match: impl Fn(&[u8]) -> bool,
187
pred_true_mask: &mut BitmapBuilder,
188
) -> ParquetResult<()> {
189
pred_true_mask.reserve(num_expected_values);
190
for _ in 0..num_expected_values {
191
if values.len() < 4 {
192
return Err(super::invalid_input_err());
193
}
194
195
let length;
196
(length, values) = values.split_at(4);
197
let length: &[u8; 4] = unsafe { length.try_into().unwrap_unchecked() };
198
let length = u32::from_le_bytes(*length);
199
200
if values.len() < length as usize {
201
return Err(super::invalid_input_err());
202
}
203
204
let value;
205
(value, values) = values.split_at(length as usize);
206
207
let is_match = is_match(value);
208
// SAFETY: We reserved enough just before the loop.
209
unsafe { pred_true_mask.push_unchecked(is_match) };
210
}
211
212
Ok(())
213
}
214
215