Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/arrow/read/deserialize/binary/dictionary.rs
8475 views
1
use arrow::array::BinaryArray;
2
3
use crate::parquet::encoding::hybrid_rle::{HybridRleChunk, HybridRleDecoder};
4
use crate::parquet::error::ParquetResult;
5
use crate::read::deserialize::dictionary_encoded::verify_dict_indices;
6
7
pub fn decode_dictionary(
8
values: HybridRleDecoder<'_>,
9
target: &mut Vec<u8>,
10
offsets: &mut Vec<i64>,
11
dict: &BinaryArray<i64>,
12
) -> ParquetResult<()> {
13
assert!(target.is_empty());
14
assert!(offsets.is_empty());
15
16
offsets.reserve(values.len() + 1);
17
offsets.push(0);
18
19
let mut offset = 0;
20
let dict_offsets = dict.offsets();
21
let mut total_buffer_size = 0;
22
for chunk in values.clone().into_chunk_iter() {
23
match chunk? {
24
HybridRleChunk::Rle(item, num_repeats) => {
25
let length = dict_offsets.length_at(item as usize);
26
total_buffer_size += length * num_repeats;
27
let end_offset = offset + (length * num_repeats) as i64;
28
offsets.extend((offset + length as i64..=end_offset).step_by(length));
29
offset = end_offset;
30
},
31
HybridRleChunk::Bitpacked(mut decoder) => {
32
let mut chunked = decoder.chunked();
33
for chunk in &mut chunked {
34
verify_dict_indices(&chunk, dict_offsets.len())?;
35
offsets.extend(chunk.iter().map(|&item| {
36
let length = unsafe { dict_offsets.length_at_unchecked(item as usize) };
37
total_buffer_size += length;
38
offset += length as i64;
39
offset
40
}));
41
}
42
43
if let Some((chunk, size)) = chunked.remainder() {
44
verify_dict_indices(&chunk[..size], dict_offsets.len())?;
45
offsets.extend(chunk[..size].iter().map(|&item| {
46
let length = unsafe { dict_offsets.length_at_unchecked(item as usize) };
47
total_buffer_size += length;
48
offset += length as i64;
49
offset
50
}));
51
}
52
},
53
}
54
}
55
56
target.reserve(total_buffer_size);
57
for chunk in values.into_chunk_iter() {
58
match chunk? {
59
HybridRleChunk::Rle(item, num_repeats) => {
60
let (start, end) = dict_offsets.start_end(item as usize);
61
let item = &dict.values()[start..end];
62
for _ in 0..num_repeats {
63
target.extend_from_slice(item);
64
}
65
},
66
HybridRleChunk::Bitpacked(mut decoder) => {
67
let mut chunked = decoder.chunked();
68
for chunk in &mut chunked {
69
verify_dict_indices(&chunk, dict_offsets.len())?;
70
for item in chunk {
71
let (start, end) =
72
unsafe { dict_offsets.start_end_unchecked(item as usize) };
73
let item = &dict.values()[start..end];
74
target.extend_from_slice(item);
75
}
76
}
77
78
if let Some((chunk, size)) = chunked.remainder() {
79
verify_dict_indices(&chunk[..size], dict_offsets.len())?;
80
for &item in &chunk[..size] {
81
let (start, end) =
82
unsafe { dict_offsets.start_end_unchecked(item as usize) };
83
let item = &dict.values()[start..end];
84
target.extend_from_slice(item);
85
}
86
}
87
},
88
}
89
}
90
91
Ok(())
92
}
93
94