// Path: crates/polars-parquet/src/parquet/encoding/bitpacked/unpack.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//
// Copied from https://github.com/apache/arrow-rs/blob/6859efa690d4c9530cf8a24053bc6ed81025a164/parquet/src/util/bit_pack.rs

// This implements bit unpacking. For example, for `u8` and `num_bits=3`:
// 0b001_101_110 -> 0b0000_0001, 0b0000_0101, 0b0000_0110
//
// This file is a bit insane. It unrolls all the possible num_bits vs. type combinations. These are
// very highly used functions in Parquet and therefore this has been extensively unrolled and
// optimized. Attempts have been done to introduce SIMD here, but those attempts have not paid off
// in comparison to auto-vectorization.
//
// Generally, there are two code-size vs. runtime tradeoffs taken here in favor of
// runtime.
//
// 1. Each individual function is unrolled to a point where all constants are known to
//    the compiler. In microbenchmarks, this increases the performance by around 4.5
//    to 5 times.
// 2. All functions are compiled separately and dispatch is done using a
//    jumptable. In microbenchmarks, this increases the performance by around 2 to 2.5
//    times.

/// Macro that generates an unpack function taking the number of bits as a const generic.
///
/// Expands to `pub fn unpack<const NUM_BITS: usize>(input: &[u8], output: &mut [$t; $bits])`
/// which decodes `$bits` little-endian bit-packed values of `NUM_BITS` bits each from `input`.
macro_rules! unpack_impl {
    ($t:ty, $bytes:literal, $bits:tt) => {
        pub fn unpack<const NUM_BITS: usize>(input: &[u8], output: &mut [$t; $bits]) {
            // A width of zero packs no data: every value decodes to zero.
            if NUM_BITS == 0 {
                for out in output {
                    *out = 0;
                }
                return;
            }

            // A value cannot be wider than the output element type.
            assert!(NUM_BITS <= $bytes * 8);

            // Mask with the low NUM_BITS bits set. The full-width case is special-cased
            // because `1 << $bits` would overflow `$t`.
            let mask = match NUM_BITS {
                $bits => <$t>::MAX,
                _ => ((1 << NUM_BITS) - 1),
            };

            // `$bits` values of NUM_BITS bits each occupy exactly `NUM_BITS * $bytes` bytes.
            assert!(input.len() >= NUM_BITS * $bytes);

            // Read the `output_idx`-th little-endian `$t`-sized word from `input`.
            let r = |output_idx: usize| {
                <$t>::from_le_bytes(
                    input[output_idx * $bytes..output_idx * $bytes + $bytes]
                        .try_into()
                        .unwrap(),
                )
            };

            // @NOTE
            // I was surprised too, but this macro vs. a for loop saves around 4.5 - 5x on
            // performance in a microbenchmark. Although the code it generates is completely
            // insane. There should be something we can do here to make this less code, sane code
            // and faster code.
            seq_macro!(i in 0..$bits {
                // Bit range [start_bit, end_bit) of value `i` within the packed stream.
                let start_bit = i * NUM_BITS;
                let end_bit = start_bit + NUM_BITS;

                // Word positions: `$bits` is also the bit width of one `$t` word, so
                // dividing a bit position by `$bits` yields the word index and the
                // remainder yields the offset inside that word.
                let start_bit_offset = start_bit % $bits;
                let end_bit_offset = end_bit % $bits;
                let start_byte = start_bit / $bits;
                let end_byte = end_bit / $bits;
                if start_byte != end_byte && end_bit_offset != 0 {
                    // The value straddles two words: take the high bits of the first word
                    // and prepend the low bits of the second.
                    let val = r(start_byte);
                    let a = val >> start_bit_offset;
                    let val = r(end_byte);
                    let b = val << (NUM_BITS - end_bit_offset);

                    output[i] = a | (b & mask);
                } else {
                    // The value is fully contained in a single word.
                    let val = r(start_byte);
                    output[i] = (val >> start_bit_offset) & mask;
                }
            });
        }
    };
}

/// Macro that generates unpack functions that accept num_bits as a runtime parameter.
macro_rules!
unpack {96($name:ident, $t:ty, $bytes:literal, $bits:tt) => {97mod $name {98unpack_impl!($t, $bytes, $bits);99}100101/// Unpack packed `input` into `output` with a bit width of `num_bits`102pub fn $name(input: &[u8], output: &mut [$t; $bits], num_bits: usize) {103// This will get optimised into a jump table104//105// @NOTE106// This jumptable approach saves around 2 - 2.5x on performance over no jumptable and no107// generics.108seq_macro!(i in 0..=$bits {109if i == num_bits {110return $name::unpack::<i>(input, output);111}112});113unreachable!("invalid num_bits {}", num_bits);114}115};116}117118unpack!(unpack16, u16, 2, 16);119unpack!(unpack32, u32, 4, 32);120unpack!(unpack64, u64, 8, 64);121122#[cfg(test)]123mod tests {124use super::*;125126#[test]127fn test_basic() {128let input = [0xFF; 4096];129130for i in 0..=32 {131let mut output = [0; 32];132unpack32(&input, &mut output, i);133for (idx, out) in output.iter().enumerate() {134assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}");135}136}137138for i in 0..=64 {139let mut output = [0; 64];140unpack64(&input, &mut output, i);141for (idx, out) in output.iter().enumerate() {142assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}");143}144}145}146}147148149