GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// Copied from https://github.com/apache/arrow-rs/blob/6859efa690d4c9530cf8a24053bc6ed81025a164/parquet/src/util/bit_pack.rs

// This implements bit unpacking. For example, for `u8` and `num_bits=3`:
// 0b001_101_110 -> 0b0000_0001, 0b0000_0101, 0b0000_0110
//
// This file is a bit insane. It unrolls all the possible num_bits vs. type combinations. These
// are very heavily used functions in Parquet, so they have been extensively unrolled and
// optimized. Attempts have been made to introduce SIMD here, but those attempts have not paid off
// in comparison to auto-vectorization.
//
// Generally, there are two code-size vs. runtime tradeoffs taken here in favor of
// runtime.
//
// 1. Each individual function is unrolled to the point where all constants are known to
//    the compiler. In microbenchmarks, this increases performance by around 4.5
//    to 5 times.
// 2. All functions are compiled separately and dispatch is done using a
//    jumptable. In microbenchmarks, this increases performance by around 2 to 2.5
//    times.

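// For orientation, a minimal non-unrolled sketch of the same unpacking for `u32` follows. It is
// not part of the upstream file; the name `unpack32_reference` is made up for illustration. It
// assumes the little-endian, LSB-first bit layout described above and computes, one bit at a
// time, what the unrolled macro below produces in bulk.
#[allow(dead_code)]
fn unpack32_reference(input: &[u8], output: &mut [u32; 32], num_bits: usize) {
    assert!(num_bits <= 32);
    assert!(input.len() >= num_bits * 4);
    for (i, out) in output.iter_mut().enumerate() {
        let mut value: u32 = 0;
        for bit in 0..num_bits {
            // Global position of this bit in the packed, little-endian bit stream.
            let pos = i * num_bits + bit;
            let is_set = (input[pos / 8] >> (pos % 8)) & 1;
            value |= (is_set as u32) << bit;
        }
        *out = value;
    }
}
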
/// Macro that generates an unpack function taking the number of bits as a const generic
macro_rules! unpack_impl {
    ($t:ty, $bytes:literal, $bits:tt) => {
        pub fn unpack<const NUM_BITS: usize>(input: &[u8], output: &mut [$t; $bits]) {
            if NUM_BITS == 0 {
                for out in output {
                    *out = 0;
                }
                return;
            }

            assert!(NUM_BITS <= $bytes * 8);

            let mask = match NUM_BITS {
                $bits => <$t>::MAX,
                _ => ((1 << NUM_BITS) - 1),
            };

            assert!(input.len() >= NUM_BITS * $bytes);

            let r = |output_idx: usize| {
                <$t>::from_le_bytes(
                    input[output_idx * $bytes..output_idx * $bytes + $bytes]
                        .try_into()
                        .unwrap(),
                )
            };

            // @NOTE
            // I was surprised too, but this macro vs. a for loop saves around 4.5 - 5x on
            // performance in a microbenchmark, although the code it generates is completely
            // insane. There should be something we can do here to generate less, saner and
            // faster code.
            seq_macro!(i in 0..$bits {
                let start_bit = i * NUM_BITS;
                let end_bit = start_bit + NUM_BITS;

                let start_bit_offset = start_bit % $bits;
                let end_bit_offset = end_bit % $bits;
                let start_byte = start_bit / $bits;
                let end_byte = end_bit / $bits;
                if start_byte != end_byte && end_bit_offset != 0 {
                    let val = r(start_byte);
                    let a = val >> start_bit_offset;
                    let val = r(end_byte);
                    let b = val << (NUM_BITS - end_bit_offset);

                    output[i] = a | (b & mask);
                } else {
                    let val = r(start_byte);
                    output[i] = (val >> start_bit_offset) & mask;
                }
            });
        }
    };
}

/// Macro that generates unpack functions that accept num_bits as a parameter
macro_rules! unpack {
    ($name:ident, $t:ty, $bytes:literal, $bits:tt) => {
        mod $name {
            unpack_impl!($t, $bytes, $bits);
        }

        /// Unpack packed `input` into `output` with a bit width of `num_bits`
        pub fn $name(input: &[u8], output: &mut [$t; $bits], num_bits: usize) {
            // This will get optimised into a jump table.
            //
            // @NOTE
            // This jumptable approach saves around 2 - 2.5x on performance over no jumptable
            // and no generics.
            seq_macro!(i in 0..=$bits {
                if i == num_bits {
                    return $name::unpack::<i>(input, output);
                }
            });
            unreachable!("invalid num_bits {}", num_bits);
        }
    };
}
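
// Illustration only (not part of the upstream file): for `unpack32`, the `seq_macro!` call above
// conceptually expands to a chain of constant comparisons,
//
//     if num_bits == 0 { return unpack32::unpack::<0>(input, output); }
//     if num_bits == 1 { return unpack32::unpack::<1>(input, output); }
//     // ... up to num_bits == 32 ...
//
// which, per the note above, the compiler turns into a jump table over the monomorphised
// `unpack::<NUM_BITS>` functions.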

unpack!(unpack16, u16, 2, 16);
unpack!(unpack32, u32, 4, 32);
unpack!(unpack64, u64, 8, 64);

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic() {
        let input = [0xFF; 4096];

        for i in 0..=32 {
            let mut output = [0; 32];
            unpack32(&input, &mut output, i);
            for (idx, out) in output.iter().enumerate() {
                assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}");
            }
        }

        for i in 0..=64 {
            let mut output = [0; 64];
            unpack64(&input, &mut output, i);
            for (idx, out) in output.iter().enumerate() {
                assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}");
            }
        }
    }
}
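
// A minimal usage sketch (not part of the upstream file): it exercises `unpack32` on the 3-bit
// example from the header comment, assuming the little-endian, LSB-first packing described there.
// The module and test names are made up for illustration.
#[cfg(test)]
mod usage_example {
    use super::*;

    #[test]
    fn unpack32_three_bit_example() {
        // With `num_bits = 3`, `unpack32` reads `3 * 4 = 12` bytes and produces 32 values.
        let mut packed = [0u8; 12];
        // The first byte packs the values 6 (0b110), 5 (0b101), and 1 (0b001), LSB first.
        packed[0] = 0b01_101_110;
        let mut out = [0u32; 32];
        unpack32(&packed, &mut out, 3);
        assert_eq!(out[0], 0b110);
        assert_eq!(out[1], 0b101);
        assert_eq!(out[2], 0b001);
        // The remaining values decode from zeroed bytes and stay zero.
        assert!(out[3..].iter().all(|&v| v == 0));
    }
}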