Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-row/src/fixed/numeric.rs
8420 views
1
#![allow(unsafe_op_in_unsafe_fn)]
2
use std::fmt::Debug;
3
use std::mem::MaybeUninit;
4
5
use arrow::array::{Array, PrimitiveArray};
6
use arrow::bitmap::Bitmap;
7
use arrow::datatypes::ArrowDataType;
8
use arrow::types::NativeType;
9
use polars_utils::float16::pf16;
10
use polars_utils::slice::*;
11
use polars_utils::total_ord::{canonical_f16, canonical_f32, canonical_f64};
12
13
use crate::row::RowEncodingOptions;
14
/// Builds a fixed-size value out of a borrowed byte slice.
pub(crate) trait FromSlice {
    /// Converts `slice` into `Self`.
    fn from_slice(slice: &[u8]) -> Self;
}

impl<const N: usize> FromSlice for [u8; N] {
    /// Copies `slice` into an `N`-byte array.
    ///
    /// # Panics
    ///
    /// Panics when `slice.len() != N`.
    #[inline]
    fn from_slice(slice: &[u8]) -> Self {
        <[u8; N]>::try_from(slice).unwrap()
    }
}
24
25
/// Encodes a value of a particular fixed-width type into bytes.
pub trait FixedLengthEncoding: Copy + Debug {
    /// Number of bytes one encoded value occupies in a row: one leading
    /// validity byte followed by the bytes of [`Self::Encoded`].
    const ENCODED_LEN: usize = 1 + std::mem::size_of::<Self::Encoded>();

    /// Byte container holding the encoded representation of the value.
    type Encoded: Sized + Copy + AsRef<[u8]> + AsMut<[u8]>;

    /// Converts the value into its encoded byte representation.
    fn encode(self) -> Self::Encoded;

    /// Reconstructs the value from its encoded byte representation.
    fn decode(encoded: Self::Encoded) -> Self;

    /// Decodes a value whose encoded bytes were bitwise inverted: every byte
    /// is flipped back first and the result is handed to [`Self::decode`].
    fn decode_reverse(mut encoded: Self::Encoded) -> Self {
        encoded
            .as_mut()
            .iter_mut()
            .for_each(|byte| *byte = !*byte);
        Self::decode(encoded)
    }
}
44
45
// encode as big endian
46
macro_rules! encode_unsigned {
47
($n:expr, $t:ty) => {
48
impl FixedLengthEncoding for $t {
49
type Encoded = [u8; $n];
50
51
fn encode(self) -> [u8; $n] {
52
self.to_be_bytes()
53
}
54
55
fn decode(encoded: Self::Encoded) -> Self {
56
Self::from_be_bytes(encoded)
57
}
58
}
59
};
60
}
61
62
encode_unsigned!(1, u8);
63
encode_unsigned!(2, u16);
64
encode_unsigned!(4, u32);
65
encode_unsigned!(8, u64);
66
encode_unsigned!(16, u128);
67
68
// toggle the sign bit and then encode as big indian
69
macro_rules! encode_signed {
70
($n:expr, $t:ty) => {
71
impl FixedLengthEncoding for $t {
72
type Encoded = [u8; $n];
73
74
fn encode(self) -> [u8; $n] {
75
#[cfg(target_endian = "big")]
76
{
77
todo!()
78
}
79
80
let mut b = self.to_be_bytes();
81
// Toggle top "sign" bit to ensure consistent sort order
82
b[0] ^= 0x80;
83
b
84
}
85
86
fn decode(mut encoded: Self::Encoded) -> Self {
87
// Toggle top "sign" bit
88
encoded[0] ^= 0x80;
89
Self::from_be_bytes(encoded)
90
}
91
}
92
};
93
}
94
95
encode_signed!(1, i8);
96
encode_signed!(2, i16);
97
encode_signed!(4, i32);
98
encode_signed!(8, i64);
99
encode_signed!(16, i128);
100
101
impl FixedLengthEncoding for pf16 {
102
type Encoded = [u8; 2];
103
104
fn encode(self) -> [u8; 2] {
105
let s = canonical_f16(self).to_bits() as i16;
106
let val = s ^ (((s >> 15) as u16) >> 1) as i16;
107
val.encode()
108
}
109
110
fn decode(encoded: Self::Encoded) -> Self {
111
let bits = i16::decode(encoded);
112
let val = bits ^ (((bits >> 15) as u16) >> 1) as i16;
113
Self::from_bits(val as u16)
114
}
115
}
116
117
impl FixedLengthEncoding for f32 {
118
type Encoded = [u8; 4];
119
120
fn encode(self) -> [u8; 4] {
121
// https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260
122
let s = canonical_f32(self).to_bits() as i32;
123
let val = s ^ (((s >> 31) as u32) >> 1) as i32;
124
val.encode()
125
}
126
127
fn decode(encoded: Self::Encoded) -> Self {
128
let bits = i32::decode(encoded);
129
let val = bits ^ (((bits >> 31) as u32) >> 1) as i32;
130
Self::from_bits(val as u32)
131
}
132
}
133
134
impl FixedLengthEncoding for f64 {
135
type Encoded = [u8; 8];
136
137
fn encode(self) -> [u8; 8] {
138
// https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260
139
let s = canonical_f64(self).to_bits() as i64;
140
let val = s ^ (((s >> 63) as u64) >> 1) as i64;
141
val.encode()
142
}
143
144
fn decode(encoded: Self::Encoded) -> Self {
145
let bits = i64::decode(encoded);
146
let val = bits ^ (((bits >> 63) as u64) >> 1) as i64;
147
Self::from_bits(val as u64)
148
}
149
}
150
151
pub unsafe fn encode<T: NativeType + FixedLengthEncoding>(
152
buffer: &mut [MaybeUninit<u8>],
153
arr: &PrimitiveArray<T>,
154
opt: RowEncodingOptions,
155
offsets: &mut [usize],
156
) {
157
if arr.null_count() == 0 {
158
crate::fixed::numeric::encode_slice(buffer, arr.values().as_slice(), opt, offsets)
159
} else {
160
crate::fixed::numeric::encode_iter(
161
buffer,
162
arr.into_iter().map(|v| v.copied()),
163
opt,
164
offsets,
165
)
166
}
167
}
168
169
#[inline]
170
unsafe fn encode_value<T: FixedLengthEncoding>(
171
value: &T,
172
offset: &mut usize,
173
descending: bool,
174
buf: &mut [MaybeUninit<u8>],
175
) {
176
let end_offset = *offset + T::ENCODED_LEN;
177
let dst = unsafe { buf.get_unchecked_mut(*offset..end_offset) };
178
// set valid
179
dst[0] = MaybeUninit::new(1);
180
let mut encoded = value.encode();
181
182
// invert bits to reverse order
183
if descending {
184
for v in encoded.as_mut() {
185
*v = !*v
186
}
187
}
188
189
dst[1..].copy_from_slice(encoded.as_ref().as_uninit());
190
*offset = end_offset;
191
}
192
193
unsafe fn encode_opt_value<T: FixedLengthEncoding>(
194
opt_value: Option<T>,
195
offset: &mut usize,
196
opt: RowEncodingOptions,
197
buffer: &mut [MaybeUninit<u8>],
198
) {
199
let descending = opt.contains(RowEncodingOptions::DESCENDING);
200
if let Some(value) = opt_value {
201
encode_value(&value, offset, descending, buffer);
202
} else {
203
unsafe { *buffer.get_unchecked_mut(*offset) = MaybeUninit::new(opt.null_sentinel()) };
204
let end_offset = *offset + T::ENCODED_LEN;
205
206
// initialize remaining bytes
207
let remainder = unsafe { buffer.get_unchecked_mut(*offset + 1..end_offset) };
208
remainder.fill(MaybeUninit::new(0));
209
210
*offset = end_offset;
211
}
212
}
213
214
pub(crate) unsafe fn encode_slice<T: FixedLengthEncoding>(
215
buffer: &mut [MaybeUninit<u8>],
216
input: &[T],
217
opt: RowEncodingOptions,
218
row_starts: &mut [usize],
219
) {
220
let descending = opt.contains(RowEncodingOptions::DESCENDING);
221
for (offset, value) in row_starts.iter_mut().zip(input) {
222
encode_value(value, offset, descending, buffer);
223
}
224
}
225
226
pub(crate) unsafe fn encode_iter<I: Iterator<Item = Option<T>>, T: FixedLengthEncoding>(
227
buffer: &mut [MaybeUninit<u8>],
228
input: I,
229
opt: RowEncodingOptions,
230
row_starts: &mut [usize],
231
) {
232
for (offset, opt_value) in row_starts.iter_mut().zip(input) {
233
encode_opt_value(opt_value, offset, opt, buffer);
234
}
235
}
236
237
pub(crate) unsafe fn decode_primitive<T: NativeType + FixedLengthEncoding>(
238
rows: &mut [&[u8]],
239
opt: RowEncodingOptions,
240
) -> PrimitiveArray<T>
241
where
242
T::Encoded: FromSlice,
243
{
244
let dtype: ArrowDataType = T::PRIMITIVE.into();
245
let mut has_nulls = false;
246
let descending = opt.contains(RowEncodingOptions::DESCENDING);
247
let null_sentinel = opt.null_sentinel();
248
249
let values = rows
250
.iter()
251
.map(|row| {
252
has_nulls |= *row.get_unchecked(0) == null_sentinel;
253
// skip null sentinel
254
let start = 1;
255
let end = start + T::ENCODED_LEN - 1;
256
let slice = row.get_unchecked(start..end);
257
let bytes = T::Encoded::from_slice(slice);
258
259
if descending {
260
T::decode_reverse(bytes)
261
} else {
262
T::decode(bytes)
263
}
264
})
265
.collect::<Vec<_>>();
266
267
let validity = if has_nulls {
268
let null_sentinel = opt.null_sentinel();
269
Some(decode_nulls(rows, null_sentinel))
270
} else {
271
None
272
};
273
274
// validity byte and data length
275
let increment_len = T::ENCODED_LEN;
276
277
increment_row_counter(rows, increment_len);
278
PrimitiveArray::new(dtype, values.into(), validity)
279
}
280
281
/// Advances every row slice by `fixed_size` bytes, dropping that many bytes
/// from its front.
///
/// # Safety
///
/// Every row must be at least `fixed_size` bytes long; the re-slicing is not
/// bounds checked.
unsafe fn increment_row_counter(rows: &mut [&[u8]], fixed_size: usize) {
    rows.iter_mut().for_each(|row| {
        let current: &[u8] = *row;
        // SAFETY: the caller guarantees `fixed_size <= current.len()`.
        *row = unsafe { current.get_unchecked(fixed_size..) };
    });
}
286
287
/// Builds a validity bitmap with one bit per row: the bit is set when the
/// row's first byte differs from `null_sentinel`.
///
/// # Safety
///
/// Every row must contain at least one byte; the first byte is read without
/// a bounds check.
pub(super) unsafe fn decode_nulls(rows: &[&[u8]], null_sentinel: u8) -> Bitmap {
    let validity: Bitmap = rows
        .iter()
        .map(|row| {
            // SAFETY: the caller guarantees that each row is non-empty.
            let marker = unsafe { *row.get_unchecked(0) };
            marker != null_sentinel
        })
        .collect();
    validity
}
292
293