Path: blob/main/crates/polars-parquet/src/parquet/encoding/uleb128.rs
6940 views
// Reads an uleb128 encoded integer with at most 56 bits (8 bytes with 7 bits worth of payload each).1/// Returns the integer and the number of bytes that made up this integer.2///3/// If the returned length is bigger than 8 this means the integer required more than 8 bytes and the remaining bytes need to be read sequentially and combined with the return value.4///5/// # Safety6/// `data` needs to contain at least 8 bytes.7#[target_feature(enable = "bmi2")]8#[cfg(target_feature = "bmi2")]9pub unsafe fn decode_uleb_bmi2(data: &[u8]) -> (u64, usize) {10const CONT_MARKER: u64 = 0x80808080_80808080;11debug_assert!(data.len() >= 8);1213unsafe {14let word = data.as_ptr().cast::<u64>().read_unaligned();15// mask indicating continuation bytes16let mask = std::arch::x86_64::_pext_u64(word, CONT_MARKER);17let len = (!mask).trailing_zeros() + 1;18// which payload bits to extract19let ext = std::arch::x86_64::_bzhi_u64(!CONT_MARKER, 8 * len);20let payload = std::arch::x86_64::_pext_u64(word, ext);2122(payload, len as _)23}24}2526pub fn decode(values: &[u8]) -> (u64, usize) {27#[cfg(target_feature = "bmi2")]28{29if polars_utils::cpuid::has_fast_bmi2() && values.len() >= 8 {30let (result, consumed) = unsafe { decode_uleb_bmi2(values) };31if consumed <= 8 {32return (result, consumed);33}34}35}3637let mut result = 0;38let mut shift = 0;3940let mut consumed = 0;41for byte in values {42consumed += 1;4344#[cfg(debug_assertions)]45debug_assert!(!(shift == 63 && *byte > 1));4647result |= u64::from(byte & 0b01111111) << shift;4849if byte & 0b10000000 == 0 {50break;51}5253shift += 7;54}55(result, consumed)56}5758/// Encodes `value` in ULEB128 into `container`. The exact number of bytes written59/// depends on `value`, and cannot be determined upfront. The maximum number of bytes60/// required are 10.61/// # Panic62/// This function may panic if `container.len() < 10` and `value` requires more bytes.63pub fn encode(mut value: u64, container: &mut [u8]) -> usize {64assert!(container.len() >= 10);65let mut consumed = 0;66let mut iter = container.iter_mut();67loop {68let mut byte = (value as u8) & !128;69value >>= 7;70if value != 0 {71byte |= 128;72}73*iter.next().unwrap() = byte;74consumed += 1;75if value == 0 {76break;77}78}79consumed80}8182#[cfg(test)]83mod tests {84use super::*;8586#[test]87fn decode_1() {88let data = vec![0xe5, 0x8e, 0x26, 0xDE, 0xAD, 0xBE, 0xEF];89let (value, len) = decode(&data);90assert_eq!(value, 624_485);91assert_eq!(len, 3);92}9394#[test]95fn decode_2() {96let data = vec![0b00010000, 0b00000001, 0b00000011, 0b00000011];97let (value, len) = decode(&data);98assert_eq!(value, 16);99assert_eq!(len, 1);100}101102#[test]103fn round_trip() {104let original = 123124234u64;105let mut container = [0u8; 10];106let encoded_len = encode(original, &mut container);107let (value, len) = decode(&container);108assert_eq!(value, original);109assert_eq!(len, encoded_len);110}111112#[test]113fn min_value() {114let original = u64::MIN;115let mut container = [0u8; 10];116let encoded_len = encode(original, &mut container);117let (value, len) = decode(&container);118assert_eq!(value, original);119assert_eq!(len, encoded_len);120}121122#[test]123fn max_value() {124let original = u64::MAX;125let mut container = [0u8; 10];126let encoded_len = encode(original, &mut container);127let (value, len) = decode(&container);128assert_eq!(value, original);129assert_eq!(len, encoded_len);130}131}132133134