Path: blob/main/crates/polars-parquet/src/arrow/read/deserialize/binview/predicate.rs
8480 views
//! Specialized kernels to do predicate evaluation directly on the `BinView` Parquet data.12use arrow::array::View;3use arrow::bitmap::BitmapBuilder;4use polars_utils::aliases::PlIndexSet;56use crate::parquet::error::ParquetResult;78/// Create a mask for when a value is equal to the `needle`.9pub fn decode_equals(10num_expected_values: usize,11values: &[u8],12needle: &[u8],13pred_true_mask: &mut BitmapBuilder,14) -> ParquetResult<()> {15if needle.len() <= View::MAX_INLINE_SIZE as usize {16let needle_view = View::new_inline(needle);17decode_matches(18num_expected_values,19values,20|value| {21value.len() == needle.len()22&& unsafe { View::new_inline_unchecked(value) } == needle_view23},24pred_true_mask,25)26} else {27decode_matches(28num_expected_values,29values,30|value| value == needle,31pred_true_mask,32)33}34}3536pub fn decode_is_in_no_values_non_inlinable(37num_expected_values: usize,38values: &[u8],39needles: &PlIndexSet<Box<[u8]>>,40pred_true_mask: &mut BitmapBuilder,41) -> ParquetResult<()> {42decode_matches(43num_expected_values,44values,45|value| needles.contains(value),46pred_true_mask,47)48}4950pub fn decode_is_in_non_inlinable(51num_expected_values: usize,52mut values: &[u8],53needles: &PlIndexSet<Box<[u8]>>,54needle_views: &[View],55target: &mut Vec<View>,56total_bytes_len: &mut usize,57) -> ParquetResult<()> {58assert_eq!(needles.len(), needle_views.len());59assert!(!needles.is_empty());6061target.reserve(num_expected_values);62let mut next_idx = target.len();63for _ in 0..num_expected_values {64if values.len() < 4 {65return Err(super::invalid_input_err());66}6768let length;69(length, values) = values.split_at(4);70let length: &[u8; 4] = unsafe { length.try_into().unwrap_unchecked() };71let length = u32::from_le_bytes(*length);7273if values.len() < length as usize {74return Err(super::invalid_input_err());75}7677let value;78(value, values) = values.split_at(length as usize);7980let needle_idx = needles.get_index_of(value);81unsafe {82// SAFETY: We checked that 0 < needle_views.len() == needles.len().83let view = needle_views.get_unchecked(needle_idx.unwrap_or_default());8485// SAFETY: We reserved enough just before the loop.86target.as_mut_ptr().add(next_idx).write(*view);87}8889if needle_idx.is_some() {90*total_bytes_len += value.len();91}92next_idx += usize::from(needle_idx.is_some());93}9495// SAFETY: We wrote all these items. Note, that views are Copy, so erroring or panicked until96// this point won't miss Drop calls.97unsafe {98target.set_len(next_idx);99}100101Ok(())102}103104pub fn decode_is_in_no_values_inlinable(105num_expected_values: usize,106values: &[u8],107needles: &[View; 4],108pred_true_mask: &mut BitmapBuilder,109) -> ParquetResult<()> {110decode_matches(111num_expected_values,112values,113|value| {114let length = value.len() as u32;115if length > View::MAX_INLINE_SIZE {116return false;117}118// SAFETY: we made sure length <= View::MAX_INLINE_SIZE.119let mut view = unsafe { View::new_inline_unchecked(value) };120view.length = length;121needles.contains(&view)122},123pred_true_mask,124)125}126127pub fn decode_is_in_inlinable(128num_expected_values: usize,129mut values: &[u8],130needles: &[View; 4],131target: &mut Vec<View>,132total_bytes_len: &mut usize,133) -> ParquetResult<()> {134target.reserve(num_expected_values);135let mut next_idx = target.len();136for _ in 0..num_expected_values {137if values.len() < 4 {138return Err(super::invalid_input_err());139}140141let length;142(length, values) = values.split_at(4);143let length: &[u8; 4] = unsafe { length.try_into().unwrap_unchecked() };144let length = u32::from_le_bytes(*length);145146if values.len() < length as usize {147return Err(super::invalid_input_err());148}149150// Always advance the slice before checking length.151let value;152(value, values) = values.split_at(length as usize);153154// Non-inlinable views can't match inlinable needles.155if length > View::MAX_INLINE_SIZE {156continue;157}158// SAFETY: we made sure length <= View::MAX_INLINE_SIZE.159let mut view = unsafe { View::new_inline_unchecked(value) };160view.length = length;161162let is_pred_true = needles.contains(&view);163// SAFETY: We reserved enough just before the loop.164unsafe {165target.as_mut_ptr().add(next_idx).write(view);166}167if is_pred_true {168*total_bytes_len += value.len();169}170next_idx += usize::from(is_pred_true);171}172173// SAFETY: We wrote all these items. Note, that views are Copy, so erroring or panicked until174// this point won't miss Drop calls.175unsafe {176target.set_len(next_idx);177}178179Ok(())180}181182pub fn decode_matches(183num_expected_values: usize,184mut values: &[u8],185is_match: impl Fn(&[u8]) -> bool,186pred_true_mask: &mut BitmapBuilder,187) -> ParquetResult<()> {188pred_true_mask.reserve(num_expected_values);189for _ in 0..num_expected_values {190if values.len() < 4 {191return Err(super::invalid_input_err());192}193194let length;195(length, values) = values.split_at(4);196let length: &[u8; 4] = unsafe { length.try_into().unwrap_unchecked() };197let length = u32::from_le_bytes(*length);198199if values.len() < length as usize {200return Err(super::invalid_input_err());201}202203let value;204(value, values) = values.split_at(length as usize);205206let is_match = is_match(value);207// SAFETY: We reserved enough just before the loop.208unsafe { pred_true_mask.push_unchecked(is_match) };209}210211Ok(())212}213214215