Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs
7887 views
1
// See https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3
2
mod bitmap;
3
mod encoder;
4
#[cfg(feature = "proptest")]
5
pub mod proptest;
6
7
pub use bitmap::{BitmapIter, encode_bool as bitpacked_encode};
8
pub use encoder::{Encoder, encode};
9
10
use super::{bitpacked, uleb128};
11
use crate::parquet::error::{ParquetError, ParquetResult};
12
13
/// A [`Iterator`] for Hybrid Run-Length Encoding
14
///
15
/// The hybrid here means that each second is prepended by a bit that differentiates between two
16
/// modes.
17
///
18
/// 1. Run-Length Encoding in the shape of `[Number of Values, Value]`
19
/// 2. Bitpacking in the shape of `[Value 1 in n bits, Value 2 in n bits, ...]`
20
///
21
/// Note, that this can iterate, but the set of `collect_*` and `translate_and_collect_*` methods
22
/// should be highly preferred as they are way more efficient and have better error handling.
23
#[derive(Debug, Clone)]
24
pub struct HybridRleDecoder<'a> {
25
data: &'a [u8],
26
num_bits: usize,
27
num_values: usize,
28
}
29
30
pub struct HybridRleChunkIter<'a> {
31
decoder: HybridRleDecoder<'a>,
32
}
33
34
#[derive(Debug)]
35
pub enum HybridRleChunk<'a> {
36
Rle(u32, usize),
37
Bitpacked(bitpacked::Decoder<'a, u32>),
38
}
39
40
impl<'a> Iterator for HybridRleChunkIter<'a> {
41
type Item = ParquetResult<HybridRleChunk<'a>>;
42
43
fn next(&mut self) -> Option<Self::Item> {
44
self.decoder.next_chunk().transpose()
45
}
46
}
47
48
impl HybridRleChunk<'_> {
49
#[inline]
50
pub fn len(&self) -> usize {
51
match self {
52
HybridRleChunk::Rle(_, size) => *size,
53
HybridRleChunk::Bitpacked(decoder) => decoder.len(),
54
}
55
}
56
}
57
58
impl<'a> HybridRleDecoder<'a> {
59
/// Returns a new [`HybridRleDecoder`]
60
pub fn new(data: &'a [u8], num_bits: u32, num_values: usize) -> Self {
61
Self {
62
data,
63
num_bits: num_bits as usize,
64
num_values,
65
}
66
}
67
68
pub fn len(&self) -> usize {
69
self.num_values
70
}
71
72
pub fn num_bits(&self) -> usize {
73
self.num_bits
74
}
75
76
pub fn into_chunk_iter(self) -> HybridRleChunkIter<'a> {
77
HybridRleChunkIter { decoder: self }
78
}
79
80
pub fn next_chunk(&mut self) -> ParquetResult<Option<HybridRleChunk<'a>>> {
81
if self.len() == 0 {
82
return Ok(None);
83
}
84
85
if self.num_bits == 0 {
86
let num_values = self.num_values;
87
self.num_values = 0;
88
return Ok(Some(HybridRleChunk::Rle(0, num_values)));
89
}
90
91
if self.data.is_empty() {
92
return Ok(None);
93
}
94
95
let (indicator, consumed) = uleb128::decode(self.data);
96
self.data = unsafe { self.data.get_unchecked(consumed..) };
97
98
Ok(Some(if indicator & 1 == 1 {
99
// is bitpacking
100
let bytes = (indicator as usize >> 1) * self.num_bits;
101
let bytes = std::cmp::min(bytes, self.data.len());
102
let Some((packed, remaining)) = self.data.split_at_checked(bytes) else {
103
return Err(ParquetError::oos("Not enough bytes for bitpacked data"));
104
};
105
self.data = remaining;
106
107
let length = std::cmp::min(packed.len() * 8 / self.num_bits, self.num_values);
108
let decoder = bitpacked::Decoder::<u32>::try_new(packed, self.num_bits, length)?;
109
110
self.num_values -= length;
111
112
HybridRleChunk::Bitpacked(decoder)
113
} else {
114
// is rle
115
let run_length = indicator as usize >> 1;
116
// repeated-value := value that is repeated, using a fixed-width of round-up-to-next-byte(bit-width)
117
let rle_bytes = self.num_bits.div_ceil(8);
118
let Some((pack, remaining)) = self.data.split_at_checked(rle_bytes) else {
119
return Err(ParquetError::oos("Not enough bytes for RLE encoded data"));
120
};
121
self.data = remaining;
122
123
let mut bytes = [0u8; std::mem::size_of::<u32>()];
124
pack.iter().zip(bytes.iter_mut()).for_each(|(src, dst)| {
125
*dst = *src;
126
});
127
let value = u32::from_le_bytes(bytes);
128
129
let length = std::cmp::min(run_length, self.num_values);
130
131
self.num_values -= length;
132
133
HybridRleChunk::Rle(value, length)
134
}))
135
}
136
137
pub fn next_chunk_length(&mut self) -> ParquetResult<Option<usize>> {
138
if self.len() == 0 {
139
return Ok(None);
140
}
141
142
if self.num_bits == 0 {
143
let num_values = self.num_values;
144
self.num_values = 0;
145
return Ok(Some(num_values));
146
}
147
148
if self.data.is_empty() {
149
return Ok(None);
150
}
151
152
let (indicator, consumed) = uleb128::decode(self.data);
153
self.data = unsafe { self.data.get_unchecked(consumed..) };
154
155
Ok(Some(if indicator & 1 == 1 {
156
// is bitpacking
157
let bytes = (indicator as usize >> 1) * self.num_bits;
158
let bytes = std::cmp::min(bytes, self.data.len());
159
let Some((packed, remaining)) = self.data.split_at_checked(bytes) else {
160
return Err(ParquetError::oos("Not enough bytes for bitpacked data"));
161
};
162
self.data = remaining;
163
164
let length = std::cmp::min(packed.len() * 8 / self.num_bits, self.num_values);
165
self.num_values -= length;
166
167
length
168
} else {
169
// is rle
170
let run_length = indicator as usize >> 1;
171
// repeated-value := value that is repeated, using a fixed-width of round-up-to-next-byte(bit-width)
172
let rle_bytes = self.num_bits.div_ceil(8);
173
let Some(remaining) = self.data.get(rle_bytes..) else {
174
return Err(ParquetError::oos("Not enough bytes for RLE encoded data"));
175
};
176
self.data = remaining;
177
178
let length = std::cmp::min(run_length, self.num_values);
179
self.num_values -= length;
180
181
length
182
}))
183
}
184
185
pub fn limit_to(&mut self, length: usize) {
186
self.num_values = self.num_values.min(length);
187
}
188
189
pub fn collect(self) -> ParquetResult<Vec<u32>> {
190
let mut target = Vec::with_capacity(self.len());
191
192
for chunk in self.into_chunk_iter() {
193
match chunk? {
194
HybridRleChunk::Rle(value, size) => {
195
target.resize(target.len() + size, value);
196
},
197
HybridRleChunk::Bitpacked(decoder) => {
198
decoder.collect_into(&mut target);
199
},
200
}
201
}
202
203
Ok(target)
204
}
205
}
206
207
#[cfg(test)]
208
mod tests {
209
use super::*;
210
211
#[test]
212
fn roundtrip() -> ParquetResult<()> {
213
let mut buffer = vec![];
214
let num_bits = 10u32;
215
216
let data = (0..1000).collect::<Vec<_>>();
217
218
encode::<u32, _, _>(&mut buffer, data.iter().cloned(), num_bits).unwrap();
219
220
let decoder = HybridRleDecoder::new(&buffer, num_bits, data.len());
221
222
let result = decoder.collect()?;
223
224
assert_eq!(result, data);
225
Ok(())
226
}
227
228
#[test]
229
fn pyarrow_integration() -> ParquetResult<()> {
230
// data encoded from pyarrow representing (0..1000)
231
let data = vec![
232
127, 0, 4, 32, 192, 0, 4, 20, 96, 192, 1, 8, 36, 160, 192, 2, 12, 52, 224, 192, 3, 16,
233
68, 32, 193, 4, 20, 84, 96, 193, 5, 24, 100, 160, 193, 6, 28, 116, 224, 193, 7, 32,
234
132, 32, 194, 8, 36, 148, 96, 194, 9, 40, 164, 160, 194, 10, 44, 180, 224, 194, 11, 48,
235
196, 32, 195, 12, 52, 212, 96, 195, 13, 56, 228, 160, 195, 14, 60, 244, 224, 195, 15,
236
64, 4, 33, 196, 16, 68, 20, 97, 196, 17, 72, 36, 161, 196, 18, 76, 52, 225, 196, 19,
237
80, 68, 33, 197, 20, 84, 84, 97, 197, 21, 88, 100, 161, 197, 22, 92, 116, 225, 197, 23,
238
96, 132, 33, 198, 24, 100, 148, 97, 198, 25, 104, 164, 161, 198, 26, 108, 180, 225,
239
198, 27, 112, 196, 33, 199, 28, 116, 212, 97, 199, 29, 120, 228, 161, 199, 30, 124,
240
244, 225, 199, 31, 128, 4, 34, 200, 32, 132, 20, 98, 200, 33, 136, 36, 162, 200, 34,
241
140, 52, 226, 200, 35, 144, 68, 34, 201, 36, 148, 84, 98, 201, 37, 152, 100, 162, 201,
242
38, 156, 116, 226, 201, 39, 160, 132, 34, 202, 40, 164, 148, 98, 202, 41, 168, 164,
243
162, 202, 42, 172, 180, 226, 202, 43, 176, 196, 34, 203, 44, 180, 212, 98, 203, 45,
244
184, 228, 162, 203, 46, 188, 244, 226, 203, 47, 192, 4, 35, 204, 48, 196, 20, 99, 204,
245
49, 200, 36, 163, 204, 50, 204, 52, 227, 204, 51, 208, 68, 35, 205, 52, 212, 84, 99,
246
205, 53, 216, 100, 163, 205, 54, 220, 116, 227, 205, 55, 224, 132, 35, 206, 56, 228,
247
148, 99, 206, 57, 232, 164, 163, 206, 58, 236, 180, 227, 206, 59, 240, 196, 35, 207,
248
60, 244, 212, 99, 207, 61, 248, 228, 163, 207, 62, 252, 244, 227, 207, 63, 0, 5, 36,
249
208, 64, 4, 21, 100, 208, 65, 8, 37, 164, 208, 66, 12, 53, 228, 208, 67, 16, 69, 36,
250
209, 68, 20, 85, 100, 209, 69, 24, 101, 164, 209, 70, 28, 117, 228, 209, 71, 32, 133,
251
36, 210, 72, 36, 149, 100, 210, 73, 40, 165, 164, 210, 74, 44, 181, 228, 210, 75, 48,
252
197, 36, 211, 76, 52, 213, 100, 211, 77, 56, 229, 164, 211, 78, 60, 245, 228, 211, 79,
253
64, 5, 37, 212, 80, 68, 21, 101, 212, 81, 72, 37, 165, 212, 82, 76, 53, 229, 212, 83,
254
80, 69, 37, 213, 84, 84, 85, 101, 213, 85, 88, 101, 165, 213, 86, 92, 117, 229, 213,
255
87, 96, 133, 37, 214, 88, 100, 149, 101, 214, 89, 104, 165, 165, 214, 90, 108, 181,
256
229, 214, 91, 112, 197, 37, 215, 92, 116, 213, 101, 215, 93, 120, 229, 165, 215, 94,
257
124, 245, 229, 215, 95, 128, 5, 38, 216, 96, 132, 21, 102, 216, 97, 136, 37, 166, 216,
258
98, 140, 53, 230, 216, 99, 144, 69, 38, 217, 100, 148, 85, 102, 217, 101, 152, 101,
259
166, 217, 102, 156, 117, 230, 217, 103, 160, 133, 38, 218, 104, 164, 149, 102, 218,
260
105, 168, 165, 166, 218, 106, 172, 181, 230, 218, 107, 176, 197, 38, 219, 108, 180,
261
213, 102, 219, 109, 184, 229, 166, 219, 110, 188, 245, 230, 219, 111, 192, 5, 39, 220,
262
112, 196, 21, 103, 220, 113, 200, 37, 167, 220, 114, 204, 53, 231, 220, 115, 208, 69,
263
39, 221, 116, 212, 85, 103, 221, 117, 216, 101, 167, 221, 118, 220, 117, 231, 221, 119,
264
224, 133, 39, 222, 120, 228, 149, 103, 222, 121, 232, 165, 167, 222, 122, 236, 181,
265
231, 222, 123, 240, 197, 39, 223, 124, 244, 213, 103, 223, 125, 125, 248, 229, 167,
266
223, 126, 252, 245, 231, 223, 127, 0, 6, 40, 224, 128, 4, 22, 104, 224, 129, 8, 38,
267
168, 224, 130, 12, 54, 232, 224, 131, 16, 70, 40, 225, 132, 20, 86, 104, 225, 133, 24,
268
102, 168, 225, 134, 28, 118, 232, 225, 135, 32, 134, 40, 226, 136, 36, 150, 104, 226,
269
137, 40, 166, 168, 226, 138, 44, 182, 232, 226, 139, 48, 198, 40, 227, 140, 52, 214,
270
104, 227, 141, 56, 230, 168, 227, 142, 60, 246, 232, 227, 143, 64, 6, 41, 228, 144, 68,
271
22, 105, 228, 145, 72, 38, 169, 228, 146, 76, 54, 233, 228, 147, 80, 70, 41, 229, 148,
272
84, 86, 105, 229, 149, 88, 102, 169, 229, 150, 92, 118, 233, 229, 151, 96, 134, 41,
273
230, 152, 100, 150, 105, 230, 153, 104, 166, 169, 230, 154, 108, 182, 233, 230, 155,
274
112, 198, 41, 231, 156, 116, 214, 105, 231, 157, 120, 230, 169, 231, 158, 124, 246,
275
233, 231, 159, 128, 6, 42, 232, 160, 132, 22, 106, 232, 161, 136, 38, 170, 232, 162,
276
140, 54, 234, 232, 163, 144, 70, 42, 233, 164, 148, 86, 106, 233, 165, 152, 102, 170,
277
233, 166, 156, 118, 234, 233, 167, 160, 134, 42, 234, 168, 164, 150, 106, 234, 169,
278
168, 166, 170, 234, 170, 172, 182, 234, 234, 171, 176, 198, 42, 235, 172, 180, 214,
279
106, 235, 173, 184, 230, 170, 235, 174, 188, 246, 234, 235, 175, 192, 6, 43, 236, 176,
280
196, 22, 107, 236, 177, 200, 38, 171, 236, 178, 204, 54, 235, 236, 179, 208, 70, 43,
281
237, 180, 212, 86, 107, 237, 181, 216, 102, 171, 237, 182, 220, 118, 235, 237, 183,
282
224, 134, 43, 238, 184, 228, 150, 107, 238, 185, 232, 166, 171, 238, 186, 236, 182,
283
235, 238, 187, 240, 198, 43, 239, 188, 244, 214, 107, 239, 189, 248, 230, 171, 239,
284
190, 252, 246, 235, 239, 191, 0, 7, 44, 240, 192, 4, 23, 108, 240, 193, 8, 39, 172,
285
240, 194, 12, 55, 236, 240, 195, 16, 71, 44, 241, 196, 20, 87, 108, 241, 197, 24, 103,
286
172, 241, 198, 28, 119, 236, 241, 199, 32, 135, 44, 242, 200, 36, 151, 108, 242, 201,
287
40, 167, 172, 242, 202, 44, 183, 236, 242, 203, 48, 199, 44, 243, 204, 52, 215, 108,
288
243, 205, 56, 231, 172, 243, 206, 60, 247, 236, 243, 207, 64, 7, 45, 244, 208, 68, 23,
289
109, 244, 209, 72, 39, 173, 244, 210, 76, 55, 237, 244, 211, 80, 71, 45, 245, 212, 84,
290
87, 109, 245, 213, 88, 103, 173, 245, 214, 92, 119, 237, 245, 215, 96, 135, 45, 246,
291
216, 100, 151, 109, 246, 217, 104, 167, 173, 246, 218, 108, 183, 237, 246, 219, 112,
292
199, 45, 247, 220, 116, 215, 109, 247, 221, 120, 231, 173, 247, 222, 124, 247, 237,
293
247, 223, 128, 7, 46, 248, 224, 132, 23, 110, 248, 225, 136, 39, 174, 248, 226, 140,
294
55, 238, 248, 227, 144, 71, 46, 249, 228, 148, 87, 110, 249, 229, 152, 103, 174, 249,
295
230, 156, 119, 238, 249, 231, 160, 135, 46, 250, 232, 164, 151, 110, 250, 233, 168,
296
167, 174, 250, 234, 172, 183, 238, 250, 235, 176, 199, 46, 251, 236, 180, 215, 110,
297
251, 237, 184, 231, 174, 251, 238, 188, 247, 238, 251, 239, 192, 7, 47, 252, 240, 196,
298
23, 111, 252, 241, 200, 39, 175, 252, 242, 204, 55, 239, 252, 243, 208, 71, 47, 253,
299
244, 212, 87, 111, 253, 245, 216, 103, 175, 253, 246, 220, 119, 239, 253, 247, 224,
300
135, 47, 254, 248, 228, 151, 111, 254, 249,
301
];
302
let num_bits = 10;
303
304
let decoder = HybridRleDecoder::new(&data, num_bits, 1000);
305
let result = decoder.collect()?;
306
307
assert_eq!(result, (0..1000).collect::<Vec<_>>());
308
Ok(())
309
}
310
311
#[test]
312
fn small() -> ParquetResult<()> {
313
let data = vec![3, 2];
314
315
let num_bits = 3;
316
317
let decoder = HybridRleDecoder::new(&data, num_bits, 1);
318
319
let result = decoder.collect()?;
320
321
assert_eq!(result, &[2]);
322
Ok(())
323
}
324
325
#[test]
326
fn zero_bit_width() -> ParquetResult<()> {
327
let data = vec![3];
328
329
let num_bits = 0;
330
331
let decoder = HybridRleDecoder::new(&data, num_bits, 2);
332
333
let result = decoder.collect()?;
334
335
assert_eq!(result, &[0, 0]);
336
Ok(())
337
}
338
339
#[test]
340
fn empty_values() -> ParquetResult<()> {
341
let data = [];
342
343
let num_bits = 0;
344
345
let decoder = HybridRleDecoder::new(&data, num_bits, 100);
346
347
let result = decoder.collect()?;
348
349
assert_eq!(result, vec![0; 100]);
350
Ok(())
351
}
352
}
353
354