Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/array/binview/view.rs
6939 views
1
use std::cmp::Ordering;
2
use std::fmt::{self, Display, Formatter};
3
4
use bytemuck::{Pod, Zeroable};
5
use polars_error::*;
6
use polars_utils::min_max::MinMax;
7
use polars_utils::nulls::IsNull;
8
use polars_utils::total_ord::{TotalEq, TotalOrd};
9
10
use crate::datatypes::PrimitiveType;
11
use crate::types::{Bytes16Alignment4, NativeType};
12
13
// We use this instead of u128 because we want alignment of <= 8 bytes.
14
/// A reference to a set of bytes.
15
///
16
/// If `length <= 12`, these bytes are inlined over the `prefix`, `buffer_idx` and `offset` fields.
17
/// If `length > 12`, these fields specify a slice of a buffer.
18
#[derive(Copy, Clone, Default)]
19
#[repr(C)]
20
pub struct View {
21
/// The length of the string/bytes.
22
pub length: u32,
23
/// First 4 bytes of string/bytes data.
24
pub prefix: u32,
25
/// The buffer index.
26
pub buffer_idx: u32,
27
/// The offset into the buffer.
28
pub offset: u32,
29
}
30
31
impl fmt::Debug for View {
32
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
33
if self.length <= Self::MAX_INLINE_SIZE {
34
fmt.debug_struct("View")
35
.field("length", &self.length)
36
.field("content", &unsafe {
37
std::slice::from_raw_parts(
38
(self as *const _ as *const u8).add(4),
39
self.length as usize,
40
)
41
})
42
.finish()
43
} else {
44
fmt.debug_struct("View")
45
.field("length", &self.length)
46
.field("prefix", &self.prefix.to_be_bytes())
47
.field("buffer_idx", &self.buffer_idx)
48
.field("offset", &self.offset)
49
.finish()
50
}
51
}
52
}
53
54
impl View {
55
pub const MAX_INLINE_SIZE: u32 = 12;
56
57
#[inline(always)]
58
pub fn is_inline(&self) -> bool {
59
self.length <= Self::MAX_INLINE_SIZE
60
}
61
62
#[inline(always)]
63
pub fn as_u128(self) -> u128 {
64
unsafe { std::mem::transmute(self) }
65
}
66
67
/// Create a new inline view without verifying the length
68
///
69
/// # Safety
70
///
71
/// It needs to hold that `bytes.len() <= View::MAX_INLINE_SIZE`.
72
#[inline]
73
pub unsafe fn new_inline_unchecked(bytes: &[u8]) -> Self {
74
debug_assert!(bytes.len() <= u32::MAX as usize);
75
debug_assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
76
77
let mut view = Self {
78
length: bytes.len() as u32,
79
..Default::default()
80
};
81
82
let view_ptr = &mut view as *mut _ as *mut u8;
83
84
// SAFETY:
85
// - bytes length <= 12,
86
// - size_of::<View> == 16
87
// - View is laid out as [length, prefix, buffer_idx, offset] (using repr(C))
88
// - By grabbing the view_ptr and adding 4, we have provenance over prefix, buffer_idx and
89
// offset. (i.e. the same could not be achieved with &mut self.prefix as *mut _ as *mut u8)
90
unsafe {
91
let inline_data_ptr = view_ptr.add(4);
92
core::ptr::copy_nonoverlapping(bytes.as_ptr(), inline_data_ptr, bytes.len());
93
}
94
view
95
}
96
97
/// Create a new inline view
98
///
99
/// # Panics
100
///
101
/// Panics if the `bytes.len() > View::MAX_INLINE_SIZE`.
102
#[inline]
103
pub fn new_inline(bytes: &[u8]) -> Self {
104
assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
105
unsafe { Self::new_inline_unchecked(bytes) }
106
}
107
108
/// Create a new inline view
109
///
110
/// # Safety
111
///
112
/// It needs to hold that `bytes.len() > View::MAX_INLINE_SIZE`.
113
#[inline]
114
pub unsafe fn new_noninline_unchecked(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
115
debug_assert!(bytes.len() <= u32::MAX as usize);
116
debug_assert!(bytes.len() as u32 > View::MAX_INLINE_SIZE);
117
118
// SAFETY: The invariant of this function guarantees that this is safe.
119
let prefix = unsafe { u32::from_le_bytes(bytes[0..4].try_into().unwrap_unchecked()) };
120
Self {
121
length: bytes.len() as u32,
122
prefix,
123
buffer_idx,
124
offset,
125
}
126
}
127
128
#[inline]
129
pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
130
debug_assert!(bytes.len() <= u32::MAX as usize);
131
132
// SAFETY: We verify the invariant with the outer if statement
133
unsafe {
134
if bytes.len() as u32 <= Self::MAX_INLINE_SIZE {
135
Self::new_inline_unchecked(bytes)
136
} else {
137
Self::new_noninline_unchecked(bytes, buffer_idx, offset)
138
}
139
}
140
}
141
142
/// Constructs a byteslice from this view.
143
///
144
/// # Safety
145
/// Assumes that this view is valid for the given buffers.
146
#[inline]
147
pub unsafe fn get_slice_unchecked<'a, B: AsRef<[u8]>>(&'a self, buffers: &'a [B]) -> &'a [u8] {
148
unsafe {
149
if self.length <= Self::MAX_INLINE_SIZE {
150
self.get_inlined_slice_unchecked()
151
} else {
152
self.get_external_slice_unchecked(buffers)
153
}
154
}
155
}
156
157
/// Construct a byte slice from an inline view, if it is inline.
158
#[inline]
159
pub fn get_inlined_slice(&self) -> Option<&[u8]> {
160
if self.length <= Self::MAX_INLINE_SIZE {
161
unsafe { Some(self.get_inlined_slice_unchecked()) }
162
} else {
163
None
164
}
165
}
166
167
/// Construct a byte slice from an inline view.
168
///
169
/// # Safety
170
/// Assumes that this view is inlinable.
171
#[inline]
172
pub unsafe fn get_inlined_slice_unchecked(&self) -> &[u8] {
173
debug_assert!(self.length <= View::MAX_INLINE_SIZE);
174
let ptr = self as *const View as *const u8;
175
unsafe { std::slice::from_raw_parts(ptr.add(4), self.length as usize) }
176
}
177
178
/// Construct a byte slice from an external view.
179
///
180
/// # Safety
181
/// Assumes that this view is in the external buffers.
182
#[inline]
183
pub unsafe fn get_external_slice_unchecked<'a, B: AsRef<[u8]>>(
184
&self,
185
buffers: &'a [B],
186
) -> &'a [u8] {
187
debug_assert!(self.length > View::MAX_INLINE_SIZE);
188
let data = buffers.get_unchecked(self.buffer_idx as usize);
189
let offset = self.offset as usize;
190
data.as_ref()
191
.get_unchecked(offset..offset + self.length as usize)
192
}
193
194
/// Extend a `Vec<View>` with inline views slices of `src` with `width`.
195
///
196
/// This tries to use SIMD to optimize the copying and can be massively faster than doing a
197
/// `views.extend(src.chunks_exact(width).map(View::new_inline))`.
198
///
199
/// # Panics
200
///
201
/// This function panics if `src.len()` is not divisible by `width`, `width >
202
/// View::MAX_INLINE_SIZE` or `width == 0`.
203
pub fn extend_with_inlinable_strided(views: &mut Vec<Self>, src: &[u8], width: u8) {
204
macro_rules! dispatch {
205
($n:ident = $match:ident in [$($v:literal),+ $(,)?] => $block:block, otherwise = $otherwise:expr) => {
206
match $match {
207
$(
208
$v => {
209
const $n: usize = $v;
210
211
$block
212
}
213
)+
214
_ => $otherwise,
215
}
216
}
217
}
218
219
let width = width as usize;
220
221
assert!(width > 0);
222
assert!(width <= View::MAX_INLINE_SIZE as usize);
223
224
assert_eq!(src.len() % width, 0);
225
226
let num_values = src.len() / width;
227
228
views.reserve(num_values);
229
230
#[allow(unused_mut)]
231
let mut src = src;
232
233
dispatch! {
234
N = width in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] => {
235
#[cfg(feature = "simd")]
236
{
237
macro_rules! repeat_with {
238
($i:ident = [$($v:literal),+ $(,)?] => $block:block) => {
239
$({
240
const $i: usize = $v;
241
242
$block
243
})+
244
}
245
}
246
247
use std::simd::*;
248
249
// SAFETY: This is always allowed, since views.len() is always in the Vec
250
// buffer.
251
let mut dst = unsafe { views.as_mut_ptr().add(views.len()).cast::<u8>() };
252
253
let length_mask = u8x16::from_array([N as u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
254
255
const BLOCKS_PER_LOAD: usize = 16 / N;
256
const BYTES_PER_LOOP: usize = N * BLOCKS_PER_LOAD;
257
258
let num_loops = (src.len() / BYTES_PER_LOOP).saturating_sub(1);
259
260
for _ in 0..num_loops {
261
// SAFETY: The num_loops calculates how many times we can do this.
262
let loaded = u8x16::from_array(unsafe {
263
src.get_unchecked(..16).try_into().unwrap()
264
});
265
src = unsafe { src.get_unchecked(BYTES_PER_LOOP..) };
266
267
// This way we can reuse the same load for multiple views.
268
repeat_with!(
269
I = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] => {
270
if I < BLOCKS_PER_LOAD {
271
let zero = u8x16::default();
272
const SWIZZLE: [usize; 16] = const {
273
let mut swizzle = [16usize; 16];
274
275
let mut i = 0;
276
while i < N {
277
let idx = i + I * N;
278
if idx < 16 {
279
swizzle[4+i] = idx;
280
}
281
i += 1;
282
}
283
284
swizzle
285
};
286
287
let scattered = simd_swizzle!(loaded, zero, SWIZZLE);
288
let view_bytes = (scattered | length_mask).to_array();
289
290
// SAFETY: dst has the capacity reserved and view_bytes is 16
291
// bytes long.
292
unsafe {
293
core::ptr::copy_nonoverlapping(view_bytes.as_ptr(), dst, 16);
294
dst = dst.add(16);
295
}
296
}
297
}
298
);
299
}
300
301
unsafe {
302
views.set_len(views.len() + num_loops * BLOCKS_PER_LOAD);
303
}
304
}
305
306
views.extend(src.chunks_exact(N).map(|slice| unsafe {
307
View::new_inline_unchecked(slice)
308
}));
309
},
310
otherwise = unreachable!()
311
}
312
}
313
}
314
315
impl IsNull for View {
316
const HAS_NULLS: bool = false;
317
type Inner = Self;
318
319
fn is_null(&self) -> bool {
320
false
321
}
322
323
fn unwrap_inner(self) -> Self::Inner {
324
self
325
}
326
}
327
328
impl Display for View {
329
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
330
write!(f, "{self:?}")
331
}
332
}
333
334
unsafe impl Zeroable for View {}
335
336
unsafe impl Pod for View {}
337
338
impl PartialEq for View {
339
fn eq(&self, other: &Self) -> bool {
340
self.as_u128() == other.as_u128()
341
}
342
}
343
344
// These are 'implemented' because we want to implement NativeType
345
// for View, that should probably not be done.
346
impl TotalOrd for View {
347
fn tot_cmp(&self, _other: &Self) -> Ordering {
348
unimplemented!()
349
}
350
}
351
352
impl TotalEq for View {
353
fn tot_eq(&self, other: &Self) -> bool {
354
self.eq(other)
355
}
356
}
357
358
impl MinMax for View {
359
fn nan_min_lt(&self, _other: &Self) -> bool {
360
unimplemented!()
361
}
362
363
fn nan_max_lt(&self, _other: &Self) -> bool {
364
unimplemented!()
365
}
366
}
367
368
impl NativeType for View {
369
const PRIMITIVE: PrimitiveType = PrimitiveType::UInt128;
370
371
type Bytes = [u8; 16];
372
type AlignedBytes = Bytes16Alignment4;
373
374
#[inline]
375
fn to_le_bytes(&self) -> Self::Bytes {
376
self.as_u128().to_le_bytes()
377
}
378
379
#[inline]
380
fn to_be_bytes(&self) -> Self::Bytes {
381
self.as_u128().to_be_bytes()
382
}
383
384
#[inline]
385
fn from_le_bytes(bytes: Self::Bytes) -> Self {
386
Self::from(u128::from_le_bytes(bytes))
387
}
388
389
#[inline]
390
fn from_be_bytes(bytes: Self::Bytes) -> Self {
391
Self::from(u128::from_be_bytes(bytes))
392
}
393
}
394
395
impl From<u128> for View {
396
#[inline]
397
fn from(value: u128) -> Self {
398
unsafe { std::mem::transmute(value) }
399
}
400
}
401
402
impl From<View> for u128 {
403
#[inline]
404
fn from(value: View) -> Self {
405
value.as_u128()
406
}
407
}
408
409
pub fn validate_views<B: AsRef<[u8]>, F>(
410
views: &[View],
411
buffers: &[B],
412
validate_bytes: F,
413
) -> PolarsResult<()>
414
where
415
F: Fn(&[u8]) -> PolarsResult<()>,
416
{
417
for view in views {
418
if let Some(inline_slice) = view.get_inlined_slice() {
419
if view.length < View::MAX_INLINE_SIZE && view.as_u128() >> (32 + view.length * 8) != 0
420
{
421
polars_bail!(ComputeError: "view contained non-zero padding in prefix");
422
}
423
424
validate_bytes(inline_slice)?;
425
} else {
426
let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| {
427
polars_err!(OutOfBounds: "view index out of bounds\n\nGot: {} buffers and index: {}", buffers.len(), view.buffer_idx)
428
})?;
429
430
let start = view.offset as usize;
431
let end = start + view.length as usize;
432
let b = data
433
.as_ref()
434
.get(start..end)
435
.ok_or_else(|| polars_err!(OutOfBounds: "buffer slice out of bounds"))?;
436
437
polars_ensure!(b.starts_with(&view.prefix.to_le_bytes()), ComputeError: "prefix does not match string data");
438
validate_bytes(b)?;
439
}
440
}
441
442
Ok(())
443
}
444
445
pub fn validate_binary_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {
446
validate_views(views, buffers, |_| Ok(()))
447
}
448
449
fn validate_utf8(b: &[u8]) -> PolarsResult<()> {
450
match simdutf8::basic::from_utf8(b) {
451
Ok(_) => Ok(()),
452
Err(_) => Err(polars_err!(ComputeError: "invalid utf8")),
453
}
454
}
455
456
pub fn validate_utf8_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {
457
validate_views(views, buffers, validate_utf8)
458
}
459
460
/// Checks the views for valid UTF-8. Assumes the first num_trusted_buffers are
461
/// valid UTF-8 without checking.
462
/// # Safety
463
/// The views and buffers must uphold the invariants of BinaryView otherwise we will go OOB.
464
pub unsafe fn validate_views_utf8_only<B: AsRef<[u8]>>(
465
views: &[View],
466
buffers: &[B],
467
mut num_trusted_buffers: usize,
468
) -> PolarsResult<()> {
469
unsafe {
470
while num_trusted_buffers < buffers.len()
471
&& buffers[num_trusted_buffers].as_ref().is_ascii()
472
{
473
num_trusted_buffers += 1;
474
}
475
476
// Fast path if all buffers are ASCII (or there are no buffers).
477
if num_trusted_buffers >= buffers.len() {
478
for view in views {
479
if let Some(inlined_slice) = view.get_inlined_slice() {
480
validate_utf8(inlined_slice)?;
481
}
482
}
483
} else {
484
for view in views {
485
if view.length <= View::MAX_INLINE_SIZE
486
|| view.buffer_idx as usize >= num_trusted_buffers
487
{
488
validate_utf8(view.get_slice_unchecked(buffers))?;
489
}
490
}
491
}
492
493
Ok(())
494
}
495
}
496
497