Path: blob/main/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs
7887 views
// See https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--31mod bitmap;2mod encoder;3#[cfg(feature = "proptest")]4pub mod proptest;56pub use bitmap::{BitmapIter, encode_bool as bitpacked_encode};7pub use encoder::{Encoder, encode};89use super::{bitpacked, uleb128};10use crate::parquet::error::{ParquetError, ParquetResult};1112/// A [`Iterator`] for Hybrid Run-Length Encoding13///14/// The hybrid here means that each second is prepended by a bit that differentiates between two15/// modes.16///17/// 1. Run-Length Encoding in the shape of `[Number of Values, Value]`18/// 2. Bitpacking in the shape of `[Value 1 in n bits, Value 2 in n bits, ...]`19///20/// Note, that this can iterate, but the set of `collect_*` and `translate_and_collect_*` methods21/// should be highly preferred as they are way more efficient and have better error handling.22#[derive(Debug, Clone)]23pub struct HybridRleDecoder<'a> {24data: &'a [u8],25num_bits: usize,26num_values: usize,27}2829pub struct HybridRleChunkIter<'a> {30decoder: HybridRleDecoder<'a>,31}3233#[derive(Debug)]34pub enum HybridRleChunk<'a> {35Rle(u32, usize),36Bitpacked(bitpacked::Decoder<'a, u32>),37}3839impl<'a> Iterator for HybridRleChunkIter<'a> {40type Item = ParquetResult<HybridRleChunk<'a>>;4142fn next(&mut self) -> Option<Self::Item> {43self.decoder.next_chunk().transpose()44}45}4647impl HybridRleChunk<'_> {48#[inline]49pub fn len(&self) -> usize {50match self {51HybridRleChunk::Rle(_, size) => *size,52HybridRleChunk::Bitpacked(decoder) => decoder.len(),53}54}55}5657impl<'a> HybridRleDecoder<'a> {58/// Returns a new [`HybridRleDecoder`]59pub fn new(data: &'a [u8], num_bits: u32, num_values: usize) -> Self {60Self {61data,62num_bits: num_bits as usize,63num_values,64}65}6667pub fn len(&self) -> usize {68self.num_values69}7071pub fn num_bits(&self) -> usize {72self.num_bits73}7475pub fn into_chunk_iter(self) -> HybridRleChunkIter<'a> {76HybridRleChunkIter { decoder: self }77}7879pub fn next_chunk(&mut self) -> ParquetResult<Option<HybridRleChunk<'a>>> {80if self.len() == 0 {81return Ok(None);82}8384if self.num_bits == 0 {85let num_values = self.num_values;86self.num_values = 0;87return Ok(Some(HybridRleChunk::Rle(0, num_values)));88}8990if self.data.is_empty() {91return Ok(None);92}9394let (indicator, consumed) = uleb128::decode(self.data);95self.data = unsafe { self.data.get_unchecked(consumed..) };9697Ok(Some(if indicator & 1 == 1 {98// is bitpacking99let bytes = (indicator as usize >> 1) * self.num_bits;100let bytes = std::cmp::min(bytes, self.data.len());101let Some((packed, remaining)) = self.data.split_at_checked(bytes) else {102return Err(ParquetError::oos("Not enough bytes for bitpacked data"));103};104self.data = remaining;105106let length = std::cmp::min(packed.len() * 8 / self.num_bits, self.num_values);107let decoder = bitpacked::Decoder::<u32>::try_new(packed, self.num_bits, length)?;108109self.num_values -= length;110111HybridRleChunk::Bitpacked(decoder)112} else {113// is rle114let run_length = indicator as usize >> 1;115// repeated-value := value that is repeated, using a fixed-width of round-up-to-next-byte(bit-width)116let rle_bytes = self.num_bits.div_ceil(8);117let Some((pack, remaining)) = self.data.split_at_checked(rle_bytes) else {118return Err(ParquetError::oos("Not enough bytes for RLE encoded data"));119};120self.data = remaining;121122let mut bytes = [0u8; std::mem::size_of::<u32>()];123pack.iter().zip(bytes.iter_mut()).for_each(|(src, dst)| {124*dst = *src;125});126let value = u32::from_le_bytes(bytes);127128let length = std::cmp::min(run_length, self.num_values);129130self.num_values -= length;131132HybridRleChunk::Rle(value, length)133}))134}135136pub fn next_chunk_length(&mut self) -> ParquetResult<Option<usize>> {137if self.len() == 0 {138return Ok(None);139}140141if self.num_bits == 0 {142let num_values = self.num_values;143self.num_values = 0;144return Ok(Some(num_values));145}146147if self.data.is_empty() {148return Ok(None);149}150151let (indicator, consumed) = uleb128::decode(self.data);152self.data = unsafe { self.data.get_unchecked(consumed..) };153154Ok(Some(if indicator & 1 == 1 {155// is bitpacking156let bytes = (indicator as usize >> 1) * self.num_bits;157let bytes = std::cmp::min(bytes, self.data.len());158let Some((packed, remaining)) = self.data.split_at_checked(bytes) else {159return Err(ParquetError::oos("Not enough bytes for bitpacked data"));160};161self.data = remaining;162163let length = std::cmp::min(packed.len() * 8 / self.num_bits, self.num_values);164self.num_values -= length;165166length167} else {168// is rle169let run_length = indicator as usize >> 1;170// repeated-value := value that is repeated, using a fixed-width of round-up-to-next-byte(bit-width)171let rle_bytes = self.num_bits.div_ceil(8);172let Some(remaining) = self.data.get(rle_bytes..) else {173return Err(ParquetError::oos("Not enough bytes for RLE encoded data"));174};175self.data = remaining;176177let length = std::cmp::min(run_length, self.num_values);178self.num_values -= length;179180length181}))182}183184pub fn limit_to(&mut self, length: usize) {185self.num_values = self.num_values.min(length);186}187188pub fn collect(self) -> ParquetResult<Vec<u32>> {189let mut target = Vec::with_capacity(self.len());190191for chunk in self.into_chunk_iter() {192match chunk? {193HybridRleChunk::Rle(value, size) => {194target.resize(target.len() + size, value);195},196HybridRleChunk::Bitpacked(decoder) => {197decoder.collect_into(&mut target);198},199}200}201202Ok(target)203}204}205206#[cfg(test)]207mod tests {208use super::*;209210#[test]211fn roundtrip() -> ParquetResult<()> {212let mut buffer = vec![];213let num_bits = 10u32;214215let data = (0..1000).collect::<Vec<_>>();216217encode::<u32, _, _>(&mut buffer, data.iter().cloned(), num_bits).unwrap();218219let decoder = HybridRleDecoder::new(&buffer, num_bits, data.len());220221let result = decoder.collect()?;222223assert_eq!(result, data);224Ok(())225}226227#[test]228fn pyarrow_integration() -> ParquetResult<()> {229// data encoded from pyarrow representing (0..1000)230let data = vec![231127, 0, 4, 32, 192, 0, 4, 20, 96, 192, 1, 8, 36, 160, 192, 2, 12, 52, 224, 192, 3, 16,23268, 32, 193, 4, 20, 84, 96, 193, 5, 24, 100, 160, 193, 6, 28, 116, 224, 193, 7, 32,233132, 32, 194, 8, 36, 148, 96, 194, 9, 40, 164, 160, 194, 10, 44, 180, 224, 194, 11, 48,234196, 32, 195, 12, 52, 212, 96, 195, 13, 56, 228, 160, 195, 14, 60, 244, 224, 195, 15,23564, 4, 33, 196, 16, 68, 20, 97, 196, 17, 72, 36, 161, 196, 18, 76, 52, 225, 196, 19,23680, 68, 33, 197, 20, 84, 84, 97, 197, 21, 88, 100, 161, 197, 22, 92, 116, 225, 197, 23,23796, 132, 33, 198, 24, 100, 148, 97, 198, 25, 104, 164, 161, 198, 26, 108, 180, 225,238198, 27, 112, 196, 33, 199, 28, 116, 212, 97, 199, 29, 120, 228, 161, 199, 30, 124,239244, 225, 199, 31, 128, 4, 34, 200, 32, 132, 20, 98, 200, 33, 136, 36, 162, 200, 34,240140, 52, 226, 200, 35, 144, 68, 34, 201, 36, 148, 84, 98, 201, 37, 152, 100, 162, 201,24138, 156, 116, 226, 201, 39, 160, 132, 34, 202, 40, 164, 148, 98, 202, 41, 168, 164,242162, 202, 42, 172, 180, 226, 202, 43, 176, 196, 34, 203, 44, 180, 212, 98, 203, 45,243184, 228, 162, 203, 46, 188, 244, 226, 203, 47, 192, 4, 35, 204, 48, 196, 20, 99, 204,24449, 200, 36, 163, 204, 50, 204, 52, 227, 204, 51, 208, 68, 35, 205, 52, 212, 84, 99,245205, 53, 216, 100, 163, 205, 54, 220, 116, 227, 205, 55, 224, 132, 35, 206, 56, 228,246148, 99, 206, 57, 232, 164, 163, 206, 58, 236, 180, 227, 206, 59, 240, 196, 35, 207,24760, 244, 212, 99, 207, 61, 248, 228, 163, 207, 62, 252, 244, 227, 207, 63, 0, 5, 36,248208, 64, 4, 21, 100, 208, 65, 8, 37, 164, 208, 66, 12, 53, 228, 208, 67, 16, 69, 36,249209, 68, 20, 85, 100, 209, 69, 24, 101, 164, 209, 70, 28, 117, 228, 209, 71, 32, 133,25036, 210, 72, 36, 149, 100, 210, 73, 40, 165, 164, 210, 74, 44, 181, 228, 210, 75, 48,251197, 36, 211, 76, 52, 213, 100, 211, 77, 56, 229, 164, 211, 78, 60, 245, 228, 211, 79,25264, 5, 37, 212, 80, 68, 21, 101, 212, 81, 72, 37, 165, 212, 82, 76, 53, 229, 212, 83,25380, 69, 37, 213, 84, 84, 85, 101, 213, 85, 88, 101, 165, 213, 86, 92, 117, 229, 213,25487, 96, 133, 37, 214, 88, 100, 149, 101, 214, 89, 104, 165, 165, 214, 90, 108, 181,255229, 214, 91, 112, 197, 37, 215, 92, 116, 213, 101, 215, 93, 120, 229, 165, 215, 94,256124, 245, 229, 215, 95, 128, 5, 38, 216, 96, 132, 21, 102, 216, 97, 136, 37, 166, 216,25798, 140, 53, 230, 216, 99, 144, 69, 38, 217, 100, 148, 85, 102, 217, 101, 152, 101,258166, 217, 102, 156, 117, 230, 217, 103, 160, 133, 38, 218, 104, 164, 149, 102, 218,259105, 168, 165, 166, 218, 106, 172, 181, 230, 218, 107, 176, 197, 38, 219, 108, 180,260213, 102, 219, 109, 184, 229, 166, 219, 110, 188, 245, 230, 219, 111, 192, 5, 39, 220,261112, 196, 21, 103, 220, 113, 200, 37, 167, 220, 114, 204, 53, 231, 220, 115, 208, 69,26239, 221, 116, 212, 85, 103, 221, 117, 216, 101, 167, 221, 118, 220, 117, 231, 221, 119,263224, 133, 39, 222, 120, 228, 149, 103, 222, 121, 232, 165, 167, 222, 122, 236, 181,264231, 222, 123, 240, 197, 39, 223, 124, 244, 213, 103, 223, 125, 125, 248, 229, 167,265223, 126, 252, 245, 231, 223, 127, 0, 6, 40, 224, 128, 4, 22, 104, 224, 129, 8, 38,266168, 224, 130, 12, 54, 232, 224, 131, 16, 70, 40, 225, 132, 20, 86, 104, 225, 133, 24,267102, 168, 225, 134, 28, 118, 232, 225, 135, 32, 134, 40, 226, 136, 36, 150, 104, 226,268137, 40, 166, 168, 226, 138, 44, 182, 232, 226, 139, 48, 198, 40, 227, 140, 52, 214,269104, 227, 141, 56, 230, 168, 227, 142, 60, 246, 232, 227, 143, 64, 6, 41, 228, 144, 68,27022, 105, 228, 145, 72, 38, 169, 228, 146, 76, 54, 233, 228, 147, 80, 70, 41, 229, 148,27184, 86, 105, 229, 149, 88, 102, 169, 229, 150, 92, 118, 233, 229, 151, 96, 134, 41,272230, 152, 100, 150, 105, 230, 153, 104, 166, 169, 230, 154, 108, 182, 233, 230, 155,273112, 198, 41, 231, 156, 116, 214, 105, 231, 157, 120, 230, 169, 231, 158, 124, 246,274233, 231, 159, 128, 6, 42, 232, 160, 132, 22, 106, 232, 161, 136, 38, 170, 232, 162,275140, 54, 234, 232, 163, 144, 70, 42, 233, 164, 148, 86, 106, 233, 165, 152, 102, 170,276233, 166, 156, 118, 234, 233, 167, 160, 134, 42, 234, 168, 164, 150, 106, 234, 169,277168, 166, 170, 234, 170, 172, 182, 234, 234, 171, 176, 198, 42, 235, 172, 180, 214,278106, 235, 173, 184, 230, 170, 235, 174, 188, 246, 234, 235, 175, 192, 6, 43, 236, 176,279196, 22, 107, 236, 177, 200, 38, 171, 236, 178, 204, 54, 235, 236, 179, 208, 70, 43,280237, 180, 212, 86, 107, 237, 181, 216, 102, 171, 237, 182, 220, 118, 235, 237, 183,281224, 134, 43, 238, 184, 228, 150, 107, 238, 185, 232, 166, 171, 238, 186, 236, 182,282235, 238, 187, 240, 198, 43, 239, 188, 244, 214, 107, 239, 189, 248, 230, 171, 239,283190, 252, 246, 235, 239, 191, 0, 7, 44, 240, 192, 4, 23, 108, 240, 193, 8, 39, 172,284240, 194, 12, 55, 236, 240, 195, 16, 71, 44, 241, 196, 20, 87, 108, 241, 197, 24, 103,285172, 241, 198, 28, 119, 236, 241, 199, 32, 135, 44, 242, 200, 36, 151, 108, 242, 201,28640, 167, 172, 242, 202, 44, 183, 236, 242, 203, 48, 199, 44, 243, 204, 52, 215, 108,287243, 205, 56, 231, 172, 243, 206, 60, 247, 236, 243, 207, 64, 7, 45, 244, 208, 68, 23,288109, 244, 209, 72, 39, 173, 244, 210, 76, 55, 237, 244, 211, 80, 71, 45, 245, 212, 84,28987, 109, 245, 213, 88, 103, 173, 245, 214, 92, 119, 237, 245, 215, 96, 135, 45, 246,290216, 100, 151, 109, 246, 217, 104, 167, 173, 246, 218, 108, 183, 237, 246, 219, 112,291199, 45, 247, 220, 116, 215, 109, 247, 221, 120, 231, 173, 247, 222, 124, 247, 237,292247, 223, 128, 7, 46, 248, 224, 132, 23, 110, 248, 225, 136, 39, 174, 248, 226, 140,29355, 238, 248, 227, 144, 71, 46, 249, 228, 148, 87, 110, 249, 229, 152, 103, 174, 249,294230, 156, 119, 238, 249, 231, 160, 135, 46, 250, 232, 164, 151, 110, 250, 233, 168,295167, 174, 250, 234, 172, 183, 238, 250, 235, 176, 199, 46, 251, 236, 180, 215, 110,296251, 237, 184, 231, 174, 251, 238, 188, 247, 238, 251, 239, 192, 7, 47, 252, 240, 196,29723, 111, 252, 241, 200, 39, 175, 252, 242, 204, 55, 239, 252, 243, 208, 71, 47, 253,298244, 212, 87, 111, 253, 245, 216, 103, 175, 253, 246, 220, 119, 239, 253, 247, 224,299135, 47, 254, 248, 228, 151, 111, 254, 249,300];301let num_bits = 10;302303let decoder = HybridRleDecoder::new(&data, num_bits, 1000);304let result = decoder.collect()?;305306assert_eq!(result, (0..1000).collect::<Vec<_>>());307Ok(())308}309310#[test]311fn small() -> ParquetResult<()> {312let data = vec![3, 2];313314let num_bits = 3;315316let decoder = HybridRleDecoder::new(&data, num_bits, 1);317318let result = decoder.collect()?;319320assert_eq!(result, &[2]);321Ok(())322}323324#[test]325fn zero_bit_width() -> ParquetResult<()> {326let data = vec![3];327328let num_bits = 0;329330let decoder = HybridRleDecoder::new(&data, num_bits, 2);331332let result = decoder.collect()?;333334assert_eq!(result, &[0, 0]);335Ok(())336}337338#[test]339fn empty_values() -> ParquetResult<()> {340let data = [];341342let num_bits = 0;343344let decoder = HybridRleDecoder::new(&data, num_bits, 100);345346let result = decoder.collect()?;347348assert_eq!(result, vec![0; 100]);349Ok(())350}351}352353354