Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/arrow/write/boolean/basic.rs
6940 views
1
use arrow::array::*;
2
use polars_error::{PolarsResult, polars_bail};
3
4
use super::super::{WriteOptions, utils};
5
use crate::arrow::read::schema::is_nullable;
6
use crate::parquet::encoding::Encoding;
7
use crate::parquet::encoding::hybrid_rle::{self, bitpacked_encode};
8
use crate::parquet::page::DataPage;
9
use crate::parquet::schema::types::PrimitiveType;
10
use crate::parquet::statistics::{BooleanStatistics, ParquetStatistics};
11
use crate::write::{EncodeNullability, StatisticsOptions};
12
13
fn encode(iterator: impl Iterator<Item = bool>, buffer: &mut Vec<u8>) -> PolarsResult<()> {
14
// encode values using bitpacking
15
let len = buffer.len();
16
let mut buffer = std::io::Cursor::new(buffer);
17
buffer.set_position(len as u64);
18
Ok(bitpacked_encode(&mut buffer, iterator)?)
19
}
20
21
pub(super) fn encode_plain(
22
array: &BooleanArray,
23
encode_options: EncodeNullability,
24
buffer: &mut Vec<u8>,
25
) -> PolarsResult<()> {
26
if encode_options.is_optional() && array.validity().is_some() {
27
encode(array.non_null_values_iter(), buffer)
28
} else {
29
encode(array.values().iter(), buffer)
30
}
31
}
32
33
pub(super) fn encode_hybrid_rle(
34
array: &BooleanArray,
35
encode_options: EncodeNullability,
36
buffer: &mut Vec<u8>,
37
) -> PolarsResult<()> {
38
buffer.extend_from_slice(&[0; 4]);
39
let start = buffer.len();
40
41
if encode_options.is_optional() && array.validity().is_some() {
42
hybrid_rle::encode(buffer, array.non_null_values_iter(), 1)?;
43
} else {
44
hybrid_rle::encode(buffer, array.values().iter(), 1)?;
45
}
46
47
let length = buffer.len() - start;
48
49
// write the first 4 bytes as length
50
let length = (length as i32).to_le_bytes();
51
(0..4).for_each(|i| buffer[start - 4 + i] = length[i]);
52
53
Ok(())
54
}
55
56
pub fn array_to_page(
57
array: &BooleanArray,
58
options: WriteOptions,
59
type_: PrimitiveType,
60
encoding: Encoding,
61
) -> PolarsResult<DataPage> {
62
let is_optional = is_nullable(&type_.field_info);
63
let encode_nullability = EncodeNullability::new(is_optional);
64
65
let validity = array.validity();
66
67
let mut buffer = vec![];
68
utils::write_def_levels(
69
&mut buffer,
70
is_optional,
71
validity,
72
array.len(),
73
options.version,
74
)?;
75
76
let definition_levels_byte_length = buffer.len();
77
78
match encoding {
79
Encoding::Plain => encode_plain(array, encode_nullability, &mut buffer)?,
80
Encoding::Rle => encode_hybrid_rle(array, encode_nullability, &mut buffer)?,
81
other => polars_bail!(nyi = "Encoding boolean as {other:?}"),
82
}
83
84
let statistics = if options.has_statistics() {
85
Some(build_statistics(array, &options.statistics))
86
} else {
87
None
88
};
89
90
utils::build_plain_page(
91
buffer,
92
array.len(),
93
array.len(),
94
array.null_count(),
95
0,
96
definition_levels_byte_length,
97
statistics,
98
type_,
99
options,
100
encoding,
101
)
102
}
103
104
pub(super) fn build_statistics(
105
array: &BooleanArray,
106
options: &StatisticsOptions,
107
) -> ParquetStatistics {
108
use polars_compute::min_max::MinMaxKernel;
109
use polars_compute::unique::GenericUniqueKernel;
110
111
BooleanStatistics {
112
null_count: options.null_count.then(|| array.null_count() as i64),
113
distinct_count: options
114
.distinct_count
115
.then(|| array.n_unique_non_null().try_into().ok())
116
.flatten(),
117
max_value: options
118
.max_value
119
.then(|| array.max_propagate_nan_kernel())
120
.flatten(),
121
min_value: options
122
.min_value
123
.then(|| array.min_propagate_nan_kernel())
124
.flatten(),
125
}
126
.serialize()
127
}
128
129