Book a Demo!
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Policies · Sign Up · Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/arrow/write/binview/basic.rs
6940 views
1
use arrow::array::{Array, BinaryViewArray};
2
use polars_compute::min_max::MinMaxKernel;
3
use polars_error::PolarsResult;
4
5
use crate::parquet::encoding::delta_bitpacked;
6
use crate::parquet::schema::types::PrimitiveType;
7
use crate::parquet::statistics::{BinaryStatistics, ParquetStatistics};
8
use crate::read::schema::is_nullable;
9
use crate::write::binary::encode_non_null_values;
10
use crate::write::utils::invalid_encoding;
11
use crate::write::{EncodeNullability, Encoding, Page, StatisticsOptions, WriteOptions, utils};
12
13
pub(crate) fn encode_plain(
14
array: &BinaryViewArray,
15
options: EncodeNullability,
16
buffer: &mut Vec<u8>,
17
) {
18
if options.is_optional() && array.validity().is_some() {
19
// @NOTE: This capacity might overestimate the amount of bytes since the buffers might
20
// still contain data that is not referenced by any value.
21
let capacity =
22
array.total_bytes_len() + (array.len() - array.null_count()) * size_of::<u32>();
23
buffer.reserve(capacity);
24
25
encode_non_null_values(array.non_null_values_iter(), buffer);
26
// Append the non-null values.
27
} else {
28
// @NOTE: This capacity might overestimate the amount of bytes since the buffers might
29
// still contain data that is not referenced by any value.
30
let capacity = array.total_bytes_len() + array.len() * size_of::<u32>();
31
buffer.reserve(capacity);
32
33
encode_non_null_values(array.values_iter(), buffer);
34
}
35
}
36
37
pub(crate) fn encode_delta(
38
array: &BinaryViewArray,
39
options: EncodeNullability,
40
buffer: &mut Vec<u8>,
41
) {
42
if options.is_optional() && array.validity().is_some() {
43
let lengths = utils::ExactSizedIter::new(
44
array.non_null_views_iter().map(|v| v.length as i64),
45
array.len() - array.null_count(),
46
);
47
delta_bitpacked::encode(lengths, buffer, 1);
48
49
for slice in array.non_null_values_iter() {
50
buffer.extend_from_slice(slice)
51
}
52
} else {
53
let lengths =
54
utils::ExactSizedIter::new(array.views().iter().map(|v| v.length as i64), array.len());
55
delta_bitpacked::encode(lengths, buffer, 1);
56
57
buffer.extend(array.values_iter().flatten());
58
}
59
}
60
61
pub fn array_to_page(
62
array: &BinaryViewArray,
63
options: WriteOptions,
64
type_: PrimitiveType,
65
encoding: Encoding,
66
) -> PolarsResult<Page> {
67
let is_optional = is_nullable(&type_.field_info);
68
let encode_options = EncodeNullability::new(is_optional);
69
70
let mut buffer = vec![];
71
// TODO! reserve capacity
72
utils::write_def_levels(
73
&mut buffer,
74
is_optional,
75
array.validity(),
76
array.len(),
77
options.version,
78
)?;
79
80
let definition_levels_byte_length = buffer.len();
81
82
match encoding {
83
Encoding::Plain => encode_plain(array, encode_options, &mut buffer),
84
Encoding::DeltaLengthByteArray => encode_delta(array, encode_options, &mut buffer),
85
_ => return Err(invalid_encoding(encoding, array.dtype())),
86
}
87
88
let statistics = if options.has_statistics() {
89
Some(build_statistics(array, type_.clone(), &options.statistics))
90
} else {
91
None
92
};
93
94
utils::build_plain_page(
95
buffer,
96
array.len(),
97
array.len(),
98
array.null_count(),
99
0,
100
definition_levels_byte_length,
101
statistics,
102
type_,
103
options,
104
encoding,
105
)
106
.map(Page::Data)
107
}
108
109
pub(crate) fn build_statistics(
110
array: &BinaryViewArray,
111
primitive_type: PrimitiveType,
112
options: &StatisticsOptions,
113
) -> ParquetStatistics {
114
BinaryStatistics {
115
primitive_type,
116
null_count: options.null_count.then_some(array.null_count() as i64),
117
distinct_count: None,
118
max_value: options
119
.max_value
120
.then(|| array.max_propagate_nan_kernel().map(<[u8]>::to_vec))
121
.flatten(),
122
min_value: options
123
.min_value
124
.then(|| array.min_propagate_nan_kernel().map(<[u8]>::to_vec))
125
.flatten(),
126
}
127
.serialize()
128
}
129
130