Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/parquet/encoding/uleb128.rs
6940 views
1
// Reads an uleb128 encoded integer with at most 56 bits (8 bytes with 7 bits worth of payload each).
2
/// Returns the integer and the number of bytes that made up this integer.
3
///
4
/// If the returned length is bigger than 8 this means the integer required more than 8 bytes and the remaining bytes need to be read sequentially and combined with the return value.
5
///
6
/// # Safety
7
/// `data` needs to contain at least 8 bytes.
8
#[target_feature(enable = "bmi2")]
9
#[cfg(target_feature = "bmi2")]
10
pub unsafe fn decode_uleb_bmi2(data: &[u8]) -> (u64, usize) {
11
const CONT_MARKER: u64 = 0x80808080_80808080;
12
debug_assert!(data.len() >= 8);
13
14
unsafe {
15
let word = data.as_ptr().cast::<u64>().read_unaligned();
16
// mask indicating continuation bytes
17
let mask = std::arch::x86_64::_pext_u64(word, CONT_MARKER);
18
let len = (!mask).trailing_zeros() + 1;
19
// which payload bits to extract
20
let ext = std::arch::x86_64::_bzhi_u64(!CONT_MARKER, 8 * len);
21
let payload = std::arch::x86_64::_pext_u64(word, ext);
22
23
(payload, len as _)
24
}
25
}
26
27
pub fn decode(values: &[u8]) -> (u64, usize) {
28
#[cfg(target_feature = "bmi2")]
29
{
30
if polars_utils::cpuid::has_fast_bmi2() && values.len() >= 8 {
31
let (result, consumed) = unsafe { decode_uleb_bmi2(values) };
32
if consumed <= 8 {
33
return (result, consumed);
34
}
35
}
36
}
37
38
let mut result = 0;
39
let mut shift = 0;
40
41
let mut consumed = 0;
42
for byte in values {
43
consumed += 1;
44
45
#[cfg(debug_assertions)]
46
debug_assert!(!(shift == 63 && *byte > 1));
47
48
result |= u64::from(byte & 0b01111111) << shift;
49
50
if byte & 0b10000000 == 0 {
51
break;
52
}
53
54
shift += 7;
55
}
56
(result, consumed)
57
}
58
59
/// Encodes `value` in ULEB128 into `container`. The exact number of bytes written
60
/// depends on `value`, and cannot be determined upfront. The maximum number of bytes
61
/// required are 10.
62
/// # Panic
63
/// This function may panic if `container.len() < 10` and `value` requires more bytes.
64
pub fn encode(mut value: u64, container: &mut [u8]) -> usize {
65
assert!(container.len() >= 10);
66
let mut consumed = 0;
67
let mut iter = container.iter_mut();
68
loop {
69
let mut byte = (value as u8) & !128;
70
value >>= 7;
71
if value != 0 {
72
byte |= 128;
73
}
74
*iter.next().unwrap() = byte;
75
consumed += 1;
76
if value == 0 {
77
break;
78
}
79
}
80
consumed
81
}
82
83
#[cfg(test)]
84
mod tests {
85
use super::*;
86
87
#[test]
88
fn decode_1() {
89
let data = vec![0xe5, 0x8e, 0x26, 0xDE, 0xAD, 0xBE, 0xEF];
90
let (value, len) = decode(&data);
91
assert_eq!(value, 624_485);
92
assert_eq!(len, 3);
93
}
94
95
#[test]
96
fn decode_2() {
97
let data = vec![0b00010000, 0b00000001, 0b00000011, 0b00000011];
98
let (value, len) = decode(&data);
99
assert_eq!(value, 16);
100
assert_eq!(len, 1);
101
}
102
103
#[test]
104
fn round_trip() {
105
let original = 123124234u64;
106
let mut container = [0u8; 10];
107
let encoded_len = encode(original, &mut container);
108
let (value, len) = decode(&container);
109
assert_eq!(value, original);
110
assert_eq!(len, encoded_len);
111
}
112
113
#[test]
114
fn min_value() {
115
let original = u64::MIN;
116
let mut container = [0u8; 10];
117
let encoded_len = encode(original, &mut container);
118
let (value, len) = decode(&container);
119
assert_eq!(value, original);
120
assert_eq!(len, encoded_len);
121
}
122
123
#[test]
124
fn max_value() {
125
let original = u64::MAX;
126
let mut container = [0u8; 10];
127
let encoded_len = encode(original, &mut container);
128
let (value, len) = decode(&container);
129
assert_eq!(value, original);
130
assert_eq!(len, encoded_len);
131
}
132
}
133
134