Path: blob/main/crates/polars-parquet/src/arrow/read/deserialize/binary/dictionary.rs
8475 views
use arrow::array::BinaryArray;12use crate::parquet::encoding::hybrid_rle::{HybridRleChunk, HybridRleDecoder};3use crate::parquet::error::ParquetResult;4use crate::read::deserialize::dictionary_encoded::verify_dict_indices;56pub fn decode_dictionary(7values: HybridRleDecoder<'_>,8target: &mut Vec<u8>,9offsets: &mut Vec<i64>,10dict: &BinaryArray<i64>,11) -> ParquetResult<()> {12assert!(target.is_empty());13assert!(offsets.is_empty());1415offsets.reserve(values.len() + 1);16offsets.push(0);1718let mut offset = 0;19let dict_offsets = dict.offsets();20let mut total_buffer_size = 0;21for chunk in values.clone().into_chunk_iter() {22match chunk? {23HybridRleChunk::Rle(item, num_repeats) => {24let length = dict_offsets.length_at(item as usize);25total_buffer_size += length * num_repeats;26let end_offset = offset + (length * num_repeats) as i64;27offsets.extend((offset + length as i64..=end_offset).step_by(length));28offset = end_offset;29},30HybridRleChunk::Bitpacked(mut decoder) => {31let mut chunked = decoder.chunked();32for chunk in &mut chunked {33verify_dict_indices(&chunk, dict_offsets.len())?;34offsets.extend(chunk.iter().map(|&item| {35let length = unsafe { dict_offsets.length_at_unchecked(item as usize) };36total_buffer_size += length;37offset += length as i64;38offset39}));40}4142if let Some((chunk, size)) = chunked.remainder() {43verify_dict_indices(&chunk[..size], dict_offsets.len())?;44offsets.extend(chunk[..size].iter().map(|&item| {45let length = unsafe { dict_offsets.length_at_unchecked(item as usize) };46total_buffer_size += length;47offset += length as i64;48offset49}));50}51},52}53}5455target.reserve(total_buffer_size);56for chunk in values.into_chunk_iter() {57match chunk? {58HybridRleChunk::Rle(item, num_repeats) => {59let (start, end) = dict_offsets.start_end(item as usize);60let item = &dict.values()[start..end];61for _ in 0..num_repeats {62target.extend_from_slice(item);63}64},65HybridRleChunk::Bitpacked(mut decoder) => {66let mut chunked = decoder.chunked();67for chunk in &mut chunked {68verify_dict_indices(&chunk, dict_offsets.len())?;69for item in chunk {70let (start, end) =71unsafe { dict_offsets.start_end_unchecked(item as usize) };72let item = &dict.values()[start..end];73target.extend_from_slice(item);74}75}7677if let Some((chunk, size)) = chunked.remainder() {78verify_dict_indices(&chunk[..size], dict_offsets.len())?;79for &item in &chunk[..size] {80let (start, end) =81unsafe { dict_offsets.start_end_unchecked(item as usize) };82let item = &dict.values()[start..end];83target.extend_from_slice(item);84}85}86},87}88}8990Ok(())91}929394