GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// Copied from https://github.com/apache/arrow-rs/blob/6859efa690d4c9530cf8a24053bc6ed81025a164/parquet/src/util/bit_pack.rs

// This implements bit unpacking. For example, for `u8` and `num_bits=3`:
// 0b001_101_110 -> 0b0000_0001, 0b0000_0101, 0b0000_0110
//
// This file is a bit insane. It unrolls all the possible num_bits vs. type combinations. These
// are very heavily used functions in Parquet, so they have been extensively unrolled and
// optimized. Attempts have been made to introduce SIMD here, but those attempts have not paid off
// in comparison to auto-vectorization.
//
// Generally, there are two code-size vs. runtime tradeoffs taken here in favor of
// runtime.
//
// 1. Each individual function is unrolled to the point where all constants are known to
//    the compiler. In microbenchmarks, this increases performance by around 4.5
//    to 5 times.
// 2. All functions are compiled separately and dispatch is done using a
//    jumptable. In microbenchmarks, this increases performance by around 2 to 2.5
//    times.

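// For orientation, a minimal non-unrolled sketch of the same unpacking for `u32` follows. It is
// not part of the upstream file; the name `unpack32_reference` is made up for illustration. It
// assumes the little-endian, LSB-first bit layout described above and computes, one bit at a
// time, what the unrolled macro below produces in bulk.
#[allow(dead_code)]
fn unpack32_reference(input: &[u8], output: &mut [u32; 32], num_bits: usize) {
    assert!(num_bits <= 32);
    assert!(input.len() >= num_bits * 4);
    for (i, out) in output.iter_mut().enumerate() {
        let mut value: u32 = 0;
        for bit in 0..num_bits {
            // Global position of this bit in the packed, little-endian bit stream.
            let pos = i * num_bits + bit;
            let is_set = (input[pos / 8] >> (pos % 8)) & 1;
            value |= (is_set as u32) << bit;
        }
        *out = value;
    }
}
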
/// Macro that generates an unpack function taking the number of bits as a const generic
macro_rules! unpack_impl {
    ($t:ty, $bytes:literal, $bits:tt) => {
        pub fn unpack<const NUM_BITS: usize>(input: &[u8], output: &mut [$t; $bits]) {
            if NUM_BITS == 0 {
                for out in output {
                    *out = 0;
                }
                return;
            }

            assert!(NUM_BITS <= $bytes * 8);

            let mask = match NUM_BITS {
                $bits => <$t>::MAX,
                _ => ((1 << NUM_BITS) - 1),
            };

            assert!(input.len() >= NUM_BITS * $bytes);

            let r = |output_idx: usize| {
                <$t>::from_le_bytes(
                    input[output_idx * $bytes..output_idx * $bytes + $bytes]
                        .try_into()
                        .unwrap(),
                )
            };

            // @NOTE
            // I was surprised too, but this macro vs. a for loop saves around 4.5 - 5x on
            // performance in a microbenchmark, although the code it generates is completely
            // insane. There should be something we can do here to generate less, saner and
            // faster code.
            seq_macro!(i in 0..$bits {
                let start_bit = i * NUM_BITS;
                let end_bit = start_bit + NUM_BITS;

                let start_bit_offset = start_bit % $bits;
                let end_bit_offset = end_bit % $bits;
                let start_byte = start_bit / $bits;
                let end_byte = end_bit / $bits;
                if start_byte != end_byte && end_bit_offset != 0 {
                    let val = r(start_byte);
                    let a = val >> start_bit_offset;
                    let val = r(end_byte);
                    let b = val << (NUM_BITS - end_bit_offset);

                    output[i] = a | (b & mask);
                } else {
                    let val = r(start_byte);
                    output[i] = (val >> start_bit_offset) & mask;
                }
            });
        }
    };
}

/// Macro that generates unpack functions that accept num_bits as a parameter
macro_rules! unpack {
    ($name:ident, $t:ty, $bytes:literal, $bits:tt) => {
        mod $name {
            unpack_impl!($t, $bytes, $bits);
        }

        /// Unpack packed `input` into `output` with a bit width of `num_bits`
        pub fn $name(input: &[u8], output: &mut [$t; $bits], num_bits: usize) {
            // This will get optimised into a jump table.
            //
            // @NOTE
            // This jumptable approach saves around 2 - 2.5x on performance over no jumptable
            // and no generics.
            seq_macro!(i in 0..=$bits {
                if i == num_bits {
                    return $name::unpack::<i>(input, output);
                }
            });
            unreachable!("invalid num_bits {}", num_bits);
        }
    };
}
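
// Illustration only (not part of the upstream file): for `unpack32`, the `seq_macro!` call above
// conceptually expands to a chain of constant comparisons,
//
//     if num_bits == 0 { return unpack32::unpack::<0>(input, output); }
//     if num_bits == 1 { return unpack32::unpack::<1>(input, output); }
//     // ... up to num_bits == 32 ...
//
// which, per the note above, the compiler turns into a jump table over the monomorphised
// `unpack::<NUM_BITS>` functions.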

unpack!(unpack16, u16, 2, 16);
unpack!(unpack32, u32, 4, 32);
unpack!(unpack64, u64, 8, 64);

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic() {
        let input = [0xFF; 4096];

        for i in 0..=32 {
            let mut output = [0; 32];
            unpack32(&input, &mut output, i);
            for (idx, out) in output.iter().enumerate() {
                assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}");
            }
        }

        for i in 0..=64 {
            let mut output = [0; 64];
            unpack64(&input, &mut output, i);
            for (idx, out) in output.iter().enumerate() {
                assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}");
            }
        }
    }
}
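
// A minimal usage sketch (not part of the upstream file): it exercises `unpack32` on the 3-bit
// example from the header comment, assuming the little-endian, LSB-first packing described there.
// The module and test names are made up for illustration.
#[cfg(test)]
mod usage_example {
    use super::*;

    #[test]
    fn unpack32_three_bit_example() {
        // With `num_bits = 3`, `unpack32` reads `3 * 4 = 12` bytes and produces 32 values.
        let mut packed = [0u8; 12];
        // The first byte packs the values 6 (0b110), 5 (0b101), and 1 (0b001), LSB first.
        packed[0] = 0b01_101_110;
        let mut out = [0u32; 32];
        unpack32(&packed, &mut out, 3);
        assert_eq!(out[0], 0b110);
        assert_eq!(out[1], 0b101);
        assert_eq!(out[2], 0b001);
        // The remaining values decode from zeroed bytes and stay zero.
        assert!(out[3..].iter().all(|&v| v == 0));
    }
}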