Path: blob/main/crates/polars-row/src/variable/no_order.rs
6939 views
/// Row encoding for variable width elements without maintaining order.1///2/// Each element is prepended by a sentinel value.3///4/// If the sentinel value is:5/// - 0xFF: the element is None6/// - 0xFE: the element's length is encoded as 4 LE bytes following the sentinel7/// - 0x00 - 0xFD: the element's length is the sentinel value8///9/// After the sentinel value (and possible length), the data is then given.10use std::mem::MaybeUninit;1112use arrow::array::{BinaryViewArray, MutableBinaryViewArray};13use arrow::bitmap::BitmapBuilder;14use polars_utils::slice::Slice2Uninit;1516use crate::row::RowEncodingOptions;1718pub fn len_from_item(value: Option<usize>, opt: RowEncodingOptions) -> usize {19debug_assert!(opt.contains(RowEncodingOptions::NO_ORDER));2021match value {22None => 1,23Some(l) if l < 254 => l + 1,24Some(l) => l + 5,25}26}2728pub unsafe fn len_from_buffer(buffer: &[u8], opt: RowEncodingOptions) -> usize {29debug_assert!(opt.contains(RowEncodingOptions::NO_ORDER));3031let sentinel = *unsafe { buffer.get_unchecked(0) };3233match sentinel {340xFF => 1,350xFE => {365 + u32::from_le_bytes(unsafe { buffer.get_unchecked(1..5) }.try_into().unwrap())37as usize38},39length => 1 + length as usize,40}41}4243pub unsafe fn encode_variable_no_order<'a, I: Iterator<Item = Option<&'a [u8]>>>(44buffer: &mut [MaybeUninit<u8>],45input: I,46opt: RowEncodingOptions,47offsets: &mut [usize],48) {49debug_assert!(opt.contains(RowEncodingOptions::NO_ORDER));5051for (offset, opt_value) in offsets.iter_mut().zip(input) {52let buffer = unsafe { buffer.get_unchecked_mut(*offset..) };53match opt_value {54None => {55*unsafe { buffer.get_unchecked_mut(0) } = MaybeUninit::new(0xFF);56*offset += 1;57},58Some(v) => {59if v.len() >= 254 {60unsafe {61*buffer.get_unchecked_mut(0) = MaybeUninit::new(0xFE);62buffer63.get_unchecked_mut(1..5)64.copy_from_slice((v.len() as u32).to_le_bytes().as_uninit());65buffer66.get_unchecked_mut(5..5 + v.len())67.copy_from_slice(v.as_uninit());68}69*offset += 5 + v.len();70} else {71unsafe {72*buffer.get_unchecked_mut(0) = MaybeUninit::new(v.len() as u8);73buffer74.get_unchecked_mut(1..1 + v.len())75.copy_from_slice(v.as_uninit());76}77*offset += 1 + v.len();78}79},80}81}82}8384pub unsafe fn decode_variable_no_order(85rows: &mut [&[u8]],86opt: RowEncodingOptions,87) -> BinaryViewArray {88debug_assert!(opt.contains(RowEncodingOptions::NO_ORDER));8990let num_rows = rows.len();91let mut array = MutableBinaryViewArray::<[u8]>::with_capacity(num_rows);92let mut validity = BitmapBuilder::new();9394for row in rows.iter_mut() {95let sentinel = *unsafe { row.get_unchecked(0) };96*row = unsafe { row.get_unchecked(1..) };97if sentinel == 0xFF {98validity.reserve(num_rows);99validity.extend_constant(array.len(), true);100validity.push(false);101array.push_value_ignore_validity("");102break;103}104105let length = if sentinel < 0xFE {106sentinel as usize107} else {108let length = u32::from_le_bytes(unsafe { row.get_unchecked(..4) }.try_into().unwrap());109*row = unsafe { row.get_unchecked(4..) };110length as usize111};112113array.push_value_ignore_validity(unsafe { row.get_unchecked(..length) });114*row = unsafe { row.get_unchecked(length..) };115}116117if validity.is_empty() {118return array.into();119}120121for row in rows[array.len()..].iter_mut() {122let sentinel = *unsafe { row.get_unchecked(0) };123*row = unsafe { row.get_unchecked(1..) };124125validity.push(sentinel != 0xFF);126if sentinel == 0xFF {127array.push_value_ignore_validity("");128continue;129}130131let length = if sentinel < 0xFE {132sentinel as usize133} else {134let length = u32::from_le_bytes(unsafe { row.get_unchecked(..4) }.try_into().unwrap());135*row = unsafe { row.get_unchecked(4..) };136length as usize137};138139array.push_value_ignore_validity(unsafe { row.get_unchecked(..length) });140*row = unsafe { row.get_unchecked(length..) };141}142143let array = array.freeze();144array.with_validity(validity.into_opt_validity())145}146147148