Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/array/binview/view.rs
8406 views
1
use std::cmp::Ordering;
2
use std::fmt::{self, Display, Formatter};
3
4
use bytemuck::{Pod, Zeroable};
5
use polars_error::*;
6
use polars_utils::min_max::MinMax;
7
use polars_utils::nulls::IsNull;
8
use polars_utils::total_ord::{TotalEq, TotalOrd};
9
10
use crate::datatypes::PrimitiveType;
11
use crate::types::{Bytes16Alignment4, NativeType};
12
13
// We use this instead of u128 because we want alignment of <= 8 bytes.
14
/// A reference to a set of bytes.
15
///
16
/// If `length <= 12`, these bytes are inlined over the `prefix`, `buffer_idx` and `offset` fields.
17
/// If `length > 12`, these fields specify a slice of a buffer.
18
#[derive(Copy, Clone, Default)]
19
#[repr(C)]
20
pub struct View {
21
/// The length of the string/bytes.
22
pub length: u32,
23
/// First 4 bytes of string/bytes data.
24
pub prefix: u32,
25
/// The buffer index.
26
pub buffer_idx: u32,
27
/// The offset into the buffer.
28
pub offset: u32,
29
}
30
31
impl fmt::Debug for View {
32
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
33
if self.length <= Self::MAX_INLINE_SIZE {
34
fmt.debug_struct("View")
35
.field("length", &self.length)
36
.field("content", &unsafe {
37
std::slice::from_raw_parts(
38
(self as *const _ as *const u8).add(4),
39
self.length as usize,
40
)
41
})
42
.finish()
43
} else {
44
fmt.debug_struct("View")
45
.field("length", &self.length)
46
.field("prefix", &self.prefix.to_be_bytes())
47
.field("buffer_idx", &self.buffer_idx)
48
.field("offset", &self.offset)
49
.finish()
50
}
51
}
52
}
53
54
impl View {
55
pub const MAX_INLINE_SIZE: u32 = 12;
56
57
#[inline(always)]
58
pub fn is_inline(&self) -> bool {
59
self.length <= Self::MAX_INLINE_SIZE
60
}
61
62
#[inline(always)]
63
pub fn as_u128(self) -> u128 {
64
unsafe { std::mem::transmute(self) }
65
}
66
67
/// Create a new inline view without verifying the length
68
///
69
/// # Safety
70
///
71
/// It needs to hold that `bytes.len() <= View::MAX_INLINE_SIZE`.
72
#[inline]
73
pub unsafe fn new_inline_unchecked(bytes: &[u8]) -> Self {
74
debug_assert!(bytes.len() <= u32::MAX as usize);
75
debug_assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
76
77
let mut view = Self {
78
length: bytes.len() as u32,
79
..Default::default()
80
};
81
82
let view_ptr = &mut view as *mut _ as *mut u8;
83
84
// SAFETY:
85
// - bytes length <= 12,
86
// - size_of::<View> == 16
87
// - View is laid out as [length, prefix, buffer_idx, offset] (using repr(C))
88
// - By grabbing the view_ptr and adding 4, we have provenance over prefix, buffer_idx and
89
// offset. (i.e. the same could not be achieved with &mut self.prefix as *mut _ as *mut u8)
90
unsafe {
91
let inline_data_ptr = view_ptr.add(4);
92
core::ptr::copy_nonoverlapping(bytes.as_ptr(), inline_data_ptr, bytes.len());
93
}
94
view
95
}
96
97
/// Create a new inline view
98
///
99
/// # Panics
100
///
101
/// Panics if the `bytes.len() > View::MAX_INLINE_SIZE`.
102
#[inline]
103
pub fn new_inline(bytes: &[u8]) -> Self {
104
assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
105
unsafe { Self::new_inline_unchecked(bytes) }
106
}
107
108
/// Create a new inline view
109
///
110
/// # Safety
111
///
112
/// It needs to hold that `bytes.len() > View::MAX_INLINE_SIZE`.
113
#[inline]
114
pub unsafe fn new_noninline_unchecked(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
115
debug_assert!(bytes.len() <= u32::MAX as usize);
116
debug_assert!(bytes.len() as u32 > View::MAX_INLINE_SIZE);
117
118
// SAFETY: The invariant of this function guarantees that this is safe.
119
let prefix = unsafe { u32::from_le_bytes(bytes[0..4].try_into().unwrap_unchecked()) };
120
Self {
121
length: bytes.len() as u32,
122
prefix,
123
buffer_idx,
124
offset,
125
}
126
}
127
128
#[inline]
129
pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
130
assert!(bytes.len() <= u32::MAX as usize);
131
132
// SAFETY: We verify the invariant with the outer if statement
133
unsafe {
134
if bytes.len() as u32 <= Self::MAX_INLINE_SIZE {
135
Self::new_inline_unchecked(bytes)
136
} else {
137
Self::new_noninline_unchecked(bytes, buffer_idx, offset)
138
}
139
}
140
}
141
142
#[inline]
143
pub fn new_with_buffers(
144
bytes: &[u8],
145
buffer_idx_offset: u32,
146
buffers: &mut Vec<Vec<u8>>,
147
) -> Self {
148
unsafe {
149
if bytes.len() <= Self::MAX_INLINE_SIZE as usize {
150
Self::new_inline_unchecked(bytes)
151
} else {
152
assert!(bytes.len() <= u32::MAX as usize);
153
let num_buffers = buffers.len();
154
if let Some(buf) = buffers.last_mut() {
155
if buf.len() + bytes.len() <= buf.capacity() {
156
let buf_idx = buffer_idx_offset + num_buffers as u32 - 1;
157
let offset = buf.len() as u32;
158
buf.extend_from_slice(bytes);
159
return Self::new_noninline_unchecked(bytes, buf_idx, offset);
160
}
161
}
162
163
Self::new_with_buffers_slow(bytes, buffer_idx_offset, buffers)
164
}
165
}
166
}
167
168
/// # Safety
169
///
170
/// It needs to hold that `bytes.len() > View::MAX_INLINE_SIZE`.
171
#[cold]
172
unsafe fn new_with_buffers_slow(
173
bytes: &[u8],
174
buffer_idx_offset: u32,
175
buffers: &mut Vec<Vec<u8>>,
176
) -> Self {
177
// Resize last buffer instead of making new buffer.
178
const RESIZE_LIMIT: usize = 8192;
179
180
if buffers.is_empty() {
181
buffers.push(bytes.to_vec());
182
return View::new_noninline_unchecked(bytes, buffer_idx_offset, 0);
183
}
184
185
let num_buffers = buffers.len();
186
let old_cap = buffers.last().unwrap().capacity();
187
let new_cap = (old_cap + old_cap).clamp(bytes.len(), u32::MAX as usize);
188
if new_cap <= RESIZE_LIMIT {
189
let buffer_idx = buffer_idx_offset + num_buffers as u32 - 1;
190
let buf = buffers.last_mut().unwrap();
191
let offset = buf.len() as u32;
192
buf.reserve(new_cap - buf.len());
193
buf.extend_from_slice(bytes);
194
View::new_noninline_unchecked(bytes, buffer_idx, offset)
195
} else {
196
let buffer_idx = buffer_idx_offset + num_buffers as u32;
197
let mut buf = Vec::with_capacity(new_cap);
198
buf.extend_from_slice(bytes);
199
buffers.push(buf);
200
View::new_noninline_unchecked(bytes, buffer_idx, 0)
201
}
202
}
203
204
/// Constructs a byteslice from this view.
205
///
206
/// # Safety
207
/// Assumes that this view is valid for the given buffers.
208
#[inline]
209
pub unsafe fn get_slice_unchecked<'a, B: AsRef<[u8]>>(&'a self, buffers: &'a [B]) -> &'a [u8] {
210
unsafe {
211
if self.length <= Self::MAX_INLINE_SIZE {
212
self.get_inlined_slice_unchecked()
213
} else {
214
self.get_external_slice_unchecked(buffers)
215
}
216
}
217
}
218
219
/// Construct a byte slice from an inline view, if it is inline.
220
#[inline]
221
pub fn get_inlined_slice(&self) -> Option<&[u8]> {
222
if self.length <= Self::MAX_INLINE_SIZE {
223
unsafe { Some(self.get_inlined_slice_unchecked()) }
224
} else {
225
None
226
}
227
}
228
229
/// Construct a byte slice from an inline view.
230
///
231
/// # Safety
232
/// Assumes that this view is inlinable.
233
#[inline]
234
pub unsafe fn get_inlined_slice_unchecked(&self) -> &[u8] {
235
debug_assert!(self.length <= View::MAX_INLINE_SIZE);
236
let ptr = self as *const View as *const u8;
237
unsafe { std::slice::from_raw_parts(ptr.add(4), self.length as usize) }
238
}
239
240
/// Construct a byte slice from an external view.
241
///
242
/// # Safety
243
/// Assumes that this view is in the external buffers.
244
#[inline]
245
pub unsafe fn get_external_slice_unchecked<'a, B: AsRef<[u8]>>(
246
&self,
247
buffers: &'a [B],
248
) -> &'a [u8] {
249
debug_assert!(self.length > View::MAX_INLINE_SIZE);
250
let data = buffers.get_unchecked(self.buffer_idx as usize);
251
let offset = self.offset as usize;
252
data.as_ref()
253
.get_unchecked(offset..offset + self.length as usize)
254
}
255
256
/// Extend a `Vec<View>` with inline views slices of `src` with `width`.
257
///
258
/// This tries to use SIMD to optimize the copying and can be massively faster than doing a
259
/// `views.extend(src.chunks_exact(width).map(View::new_inline))`.
260
///
261
/// # Panics
262
///
263
/// This function panics if `src.len()` is not divisible by `width`, `width >
264
/// View::MAX_INLINE_SIZE` or `width == 0`.
265
pub fn extend_with_inlinable_strided(views: &mut Vec<Self>, src: &[u8], width: u8) {
266
macro_rules! dispatch {
267
($n:ident = $match:ident in [$($v:literal),+ $(,)?] => $block:block, otherwise = $otherwise:expr) => {
268
match $match {
269
$(
270
$v => {
271
const $n: usize = $v;
272
273
$block
274
}
275
)+
276
_ => $otherwise,
277
}
278
}
279
}
280
281
let width = width as usize;
282
283
assert!(width > 0);
284
assert!(width <= View::MAX_INLINE_SIZE as usize);
285
286
assert_eq!(src.len() % width, 0);
287
288
let num_values = src.len() / width;
289
290
views.reserve(num_values);
291
292
#[allow(unused_mut)]
293
let mut src = src;
294
295
dispatch! {
296
N = width in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] => {
297
#[cfg(feature = "simd")]
298
{
299
macro_rules! repeat_with {
300
($i:ident = [$($v:literal),+ $(,)?] => $block:block) => {
301
$({
302
const $i: usize = $v;
303
304
$block
305
})+
306
}
307
}
308
309
use std::simd::*;
310
311
// SAFETY: This is always allowed, since views.len() is always in the Vec
312
// buffer.
313
let mut dst = unsafe { views.as_mut_ptr().add(views.len()).cast::<u8>() };
314
315
let length_mask = u8x16::from_array([N as u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
316
317
const BLOCKS_PER_LOAD: usize = 16 / N;
318
const BYTES_PER_LOOP: usize = N * BLOCKS_PER_LOAD;
319
320
let num_loops = (src.len() / BYTES_PER_LOOP).saturating_sub(1);
321
322
for _ in 0..num_loops {
323
// SAFETY: The num_loops calculates how many times we can do this.
324
let loaded = u8x16::from_array(unsafe {
325
src.get_unchecked(..16).try_into().unwrap()
326
});
327
src = unsafe { src.get_unchecked(BYTES_PER_LOOP..) };
328
329
// This way we can reuse the same load for multiple views.
330
repeat_with!(
331
I = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] => {
332
if I < BLOCKS_PER_LOAD {
333
let zero = u8x16::default();
334
const SWIZZLE: [usize; 16] = const {
335
let mut swizzle = [16usize; 16];
336
337
let mut i = 0;
338
while i < N {
339
let idx = i + I * N;
340
if idx < 16 {
341
swizzle[4+i] = idx;
342
}
343
i += 1;
344
}
345
346
swizzle
347
};
348
349
let scattered = simd_swizzle!(loaded, zero, SWIZZLE);
350
let view_bytes = (scattered | length_mask).to_array();
351
352
// SAFETY: dst has the capacity reserved and view_bytes is 16
353
// bytes long.
354
unsafe {
355
core::ptr::copy_nonoverlapping(view_bytes.as_ptr(), dst, 16);
356
dst = dst.add(16);
357
}
358
}
359
}
360
);
361
}
362
363
unsafe {
364
views.set_len(views.len() + num_loops * BLOCKS_PER_LOAD);
365
}
366
}
367
368
views.extend(src.chunks_exact(N).map(|slice| unsafe {
369
View::new_inline_unchecked(slice)
370
}));
371
},
372
otherwise = unreachable!()
373
}
374
}
375
}
376
377
impl IsNull for View {
378
const HAS_NULLS: bool = false;
379
type Inner = Self;
380
381
fn is_null(&self) -> bool {
382
false
383
}
384
385
fn unwrap_inner(self) -> Self::Inner {
386
self
387
}
388
}
389
390
impl Display for View {
391
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
392
write!(f, "{self:?}")
393
}
394
}
395
396
unsafe impl Zeroable for View {}
397
398
unsafe impl Pod for View {}
399
400
impl PartialEq for View {
401
fn eq(&self, other: &Self) -> bool {
402
self.as_u128() == other.as_u128()
403
}
404
}
405
406
// These are 'implemented' because we want to implement NativeType
407
// for View, that should probably not be done.
408
impl TotalOrd for View {
409
fn tot_cmp(&self, _other: &Self) -> Ordering {
410
unimplemented!()
411
}
412
}
413
414
impl TotalEq for View {
415
fn tot_eq(&self, other: &Self) -> bool {
416
self.eq(other)
417
}
418
}
419
420
impl MinMax for View {
421
fn nan_min_lt(&self, _other: &Self) -> bool {
422
unimplemented!()
423
}
424
425
fn nan_max_lt(&self, _other: &Self) -> bool {
426
unimplemented!()
427
}
428
}
429
430
impl NativeType for View {
431
const PRIMITIVE: PrimitiveType = PrimitiveType::UInt128;
432
433
type Bytes = [u8; 16];
434
type AlignedBytes = Bytes16Alignment4;
435
436
#[inline]
437
fn to_le_bytes(&self) -> Self::Bytes {
438
self.as_u128().to_le_bytes()
439
}
440
441
#[inline]
442
fn to_be_bytes(&self) -> Self::Bytes {
443
self.as_u128().to_be_bytes()
444
}
445
446
#[inline]
447
fn from_le_bytes(bytes: Self::Bytes) -> Self {
448
Self::from(u128::from_le_bytes(bytes))
449
}
450
451
#[inline]
452
fn from_be_bytes(bytes: Self::Bytes) -> Self {
453
Self::from(u128::from_be_bytes(bytes))
454
}
455
}
456
457
impl From<u128> for View {
458
#[inline]
459
fn from(value: u128) -> Self {
460
unsafe { std::mem::transmute(value) }
461
}
462
}
463
464
impl From<View> for u128 {
465
#[inline]
466
fn from(value: View) -> Self {
467
value.as_u128()
468
}
469
}
470
471
pub fn validate_views<B: AsRef<[u8]>, F>(
472
views: &[View],
473
buffers: &[B],
474
validate_bytes: F,
475
) -> PolarsResult<()>
476
where
477
F: Fn(&[u8]) -> PolarsResult<()>,
478
{
479
for view in views {
480
if let Some(inline_slice) = view.get_inlined_slice() {
481
if view.length < View::MAX_INLINE_SIZE && view.as_u128() >> (32 + view.length * 8) != 0
482
{
483
polars_bail!(ComputeError: "view contained non-zero padding in prefix");
484
}
485
486
validate_bytes(inline_slice)?;
487
} else {
488
let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| {
489
polars_err!(OutOfBounds: "view index out of bounds\n\nGot: {} buffers and index: {}", buffers.len(), view.buffer_idx)
490
})?;
491
492
let start = view.offset as usize;
493
let end = start + view.length as usize;
494
let b = data
495
.as_ref()
496
.get(start..end)
497
.ok_or_else(|| polars_err!(OutOfBounds: "buffer slice out of bounds"))?;
498
499
polars_ensure!(b.starts_with(&view.prefix.to_le_bytes()), ComputeError: "prefix does not match string data");
500
validate_bytes(b)?;
501
}
502
}
503
504
Ok(())
505
}
506
507
pub fn validate_binary_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {
508
validate_views(views, buffers, |_| Ok(()))
509
}
510
511
fn validate_utf8(b: &[u8]) -> PolarsResult<()> {
512
match simdutf8::basic::from_utf8(b) {
513
Ok(_) => Ok(()),
514
Err(_) => Err(polars_err!(ComputeError: "invalid utf8")),
515
}
516
}
517
518
pub fn validate_utf8_views<B: AsRef<[u8]>>(views: &[View], buffers: &[B]) -> PolarsResult<()> {
519
validate_views(views, buffers, validate_utf8)
520
}
521
522
/// Checks the views for valid UTF-8. Assumes the first num_trusted_buffers are
523
/// valid UTF-8 without checking.
524
/// # Safety
525
/// The views and buffers must uphold the invariants of BinaryView otherwise we will go OOB.
526
pub unsafe fn validate_views_utf8_only<B: AsRef<[u8]>>(
527
views: &[View],
528
buffers: &[B],
529
mut num_trusted_buffers: usize,
530
) -> PolarsResult<()> {
531
unsafe {
532
while num_trusted_buffers < buffers.len()
533
&& buffers[num_trusted_buffers].as_ref().is_ascii()
534
{
535
num_trusted_buffers += 1;
536
}
537
538
// Fast path if all buffers are ASCII (or there are no buffers).
539
if num_trusted_buffers >= buffers.len() {
540
for view in views {
541
if let Some(inlined_slice) = view.get_inlined_slice() {
542
validate_utf8(inlined_slice)?;
543
}
544
}
545
} else {
546
for view in views {
547
if view.length <= View::MAX_INLINE_SIZE
548
|| view.buffer_idx as usize >= num_trusted_buffers
549
{
550
validate_utf8(view.get_slice_unchecked(buffers))?;
551
}
552
}
553
}
554
555
Ok(())
556
}
557
}
558
559