Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-row/src/row.rs
6939 views
1
#![allow(unsafe_op_in_unsafe_fn)]
2
use std::sync::Arc;
3
4
use arrow::array::{BinaryArray, BinaryViewArray};
5
use arrow::datatypes::ArrowDataType;
6
use arrow::ffi::mmap;
7
use arrow::offset::{Offsets, OffsetsBuffer};
8
use polars_compute::cast::binary_to_binview;
9
use polars_dtype::categorical::CategoricalMapping;
10
11
const BOOLEAN_TRUE_SENTINEL: u8 = 0x03;
12
const BOOLEAN_FALSE_SENTINEL: u8 = 0x02;
13
14
/// Additional context provided to row encoding regarding a column.
15
///
16
/// This allows communication based on the Polars datatype instead on the Arrow datatype. Since
17
/// polars-row is used under polars-core, we don't have access to the actual datatypes.
18
#[derive(Debug, Clone)]
19
pub enum RowEncodingContext {
20
Struct(Vec<Option<RowEncodingContext>>),
21
/// Categorical / Enum
22
Categorical(RowEncodingCategoricalContext),
23
/// Decimal with given precision
24
Decimal(usize),
25
}
26
27
#[derive(Debug, Clone)]
28
pub struct RowEncodingCategoricalContext {
29
pub is_enum: bool,
30
pub mapping: Arc<CategoricalMapping>,
31
}
32
33
bitflags::bitflags! {
34
/// Options for the Polars Row Encoding.
35
///
36
/// The row encoding provides a method to combine several columns into one binary column which
37
/// has the same sort-order as the original columns.test
38
///
39
/// By default, the row encoding provides the ascending, nulls first sort-order of the columns.
40
#[derive(Debug, Clone, Copy, Default)]
41
pub struct RowEncodingOptions: u8 {
42
/// Sort in descending order instead of ascending order
43
const DESCENDING = 0x01;
44
/// Sort such that nulls / missing values are last
45
const NULLS_LAST = 0x02;
46
47
/// Ignore all order-related flags and don't encode order-preserving. This will keep
48
/// uniqueness.
49
///
50
/// This is faster for several encodings
51
const NO_ORDER = 0x04;
52
}
53
}
54
55
const LIST_CONTINUATION_TOKEN: u8 = 0xFE;
56
const EMPTY_STR_TOKEN: u8 = 0x01;
57
58
impl RowEncodingOptions {
59
pub fn new_sorted(descending: bool, nulls_last: bool) -> Self {
60
let mut slf = Self::default();
61
slf.set(Self::DESCENDING, descending);
62
slf.set(Self::NULLS_LAST, nulls_last);
63
slf
64
}
65
66
pub fn new_unsorted() -> Self {
67
Self::NO_ORDER
68
}
69
70
pub fn is_ordered(self) -> bool {
71
!self.contains(Self::NO_ORDER)
72
}
73
74
pub fn null_sentinel(self) -> u8 {
75
if self.contains(Self::NULLS_LAST) {
76
0xFF
77
} else {
78
0x00
79
}
80
}
81
82
pub(crate) fn bool_true_sentinel(self) -> u8 {
83
if self.contains(Self::DESCENDING) {
84
!BOOLEAN_TRUE_SENTINEL
85
} else {
86
BOOLEAN_TRUE_SENTINEL
87
}
88
}
89
90
pub(crate) fn bool_false_sentinel(self) -> u8 {
91
if self.contains(Self::DESCENDING) {
92
!BOOLEAN_FALSE_SENTINEL
93
} else {
94
BOOLEAN_FALSE_SENTINEL
95
}
96
}
97
98
pub fn list_null_sentinel(self) -> u8 {
99
self.null_sentinel()
100
}
101
102
pub fn list_continuation_token(self) -> u8 {
103
if self.contains(Self::DESCENDING) {
104
!LIST_CONTINUATION_TOKEN
105
} else {
106
LIST_CONTINUATION_TOKEN
107
}
108
}
109
110
pub fn list_termination_token(self) -> u8 {
111
!self.list_continuation_token()
112
}
113
114
pub fn empty_str_token(self) -> u8 {
115
if self.contains(Self::DESCENDING) {
116
!EMPTY_STR_TOKEN
117
} else {
118
EMPTY_STR_TOKEN
119
}
120
}
121
122
pub fn into_nested(mut self) -> RowEncodingOptions {
123
// Correct nested ordering (see #22557)
124
self.set(
125
RowEncodingOptions::NULLS_LAST,
126
self.contains(RowEncodingOptions::DESCENDING),
127
);
128
self
129
}
130
}
131
132
#[derive(Default, Clone)]
133
pub struct RowsEncoded {
134
pub(crate) values: Vec<u8>,
135
pub(crate) offsets: Vec<usize>,
136
}
137
138
unsafe fn rows_to_array(buf: Vec<u8>, offsets: Vec<usize>) -> BinaryArray<i64> {
139
let offsets = if size_of::<usize>() == size_of::<i64>() {
140
assert!(
141
(*offsets.last().unwrap() as u64) < i64::MAX as u64,
142
"row encoding output overflowed"
143
);
144
145
// SAFETY: we checked overflow and size
146
bytemuck::cast_vec::<usize, i64>(offsets)
147
} else {
148
offsets.into_iter().map(|v| v as i64).collect()
149
};
150
151
// SAFETY: monotonically increasing
152
let offsets = Offsets::new_unchecked(offsets);
153
154
BinaryArray::new(ArrowDataType::LargeBinary, offsets.into(), buf.into(), None)
155
}
156
157
impl RowsEncoded {
158
pub(crate) fn new(values: Vec<u8>, offsets: Vec<usize>) -> Self {
159
RowsEncoded { values, offsets }
160
}
161
162
pub fn iter(&self) -> RowsEncodedIter<'_> {
163
let iter = self.offsets[1..].iter();
164
let offset = self.offsets[0];
165
RowsEncodedIter {
166
offset,
167
end: iter,
168
values: &self.values,
169
}
170
}
171
172
/// Borrows the buffers and returns a [`BinaryArray`].
173
///
174
/// # Safety
175
/// The lifetime of that `BinaryArray` is tied to the lifetime of
176
/// `Self`. The caller must ensure that both stay alive for the same time.
177
pub unsafe fn borrow_array(&self) -> BinaryArray<i64> {
178
let (_, values, _) = unsafe { mmap::slice(&self.values) }.into_inner();
179
let offsets = if size_of::<usize>() == size_of::<i64>() {
180
assert!(
181
(*self.offsets.last().unwrap() as u64) < i64::MAX as u64,
182
"row encoding output overflowed"
183
);
184
185
let offsets = bytemuck::cast_slice::<usize, i64>(self.offsets.as_slice());
186
let (_, offsets, _) = unsafe { mmap::slice(offsets) }.into_inner();
187
offsets
188
} else {
189
self.offsets.iter().map(|&v| v as i64).collect()
190
};
191
let offsets = OffsetsBuffer::new_unchecked(offsets);
192
193
BinaryArray::new(ArrowDataType::LargeBinary, offsets, values, None)
194
}
195
196
/// This conversion is free.
197
pub fn into_array(self) -> BinaryArray<i64> {
198
unsafe { rows_to_array(self.values, self.offsets) }
199
}
200
201
/// This does allocate views.
202
pub fn into_binview(self) -> BinaryViewArray {
203
binary_to_binview(&self.into_array())
204
}
205
206
#[cfg(test)]
207
pub fn get(&self, i: usize) -> &[u8] {
208
let start = self.offsets[i];
209
let end = self.offsets[i + 1];
210
&self.values[start..end]
211
}
212
}
213
214
pub struct RowsEncodedIter<'a> {
215
offset: usize,
216
end: std::slice::Iter<'a, usize>,
217
values: &'a [u8],
218
}
219
220
impl<'a> Iterator for RowsEncodedIter<'a> {
221
type Item = &'a [u8];
222
223
fn next(&mut self) -> Option<Self::Item> {
224
let new_offset = *self.end.next()?;
225
let payload = unsafe { self.values.get_unchecked(self.offset..new_offset) };
226
self.offset = new_offset;
227
Some(payload)
228
}
229
230
fn size_hint(&self) -> (usize, Option<usize>) {
231
self.end.size_hint()
232
}
233
}
234
235