// Source: GitHub repository pola-rs/polars
// Path: blob/main/crates/polars-row/src/encode.rs
1
#![allow(unsafe_op_in_unsafe_fn)]
2
use std::mem::MaybeUninit;
3
4
use arrow::array::{
5
Array, BinaryArray, BinaryViewArray, BooleanArray, FixedSizeListArray, ListArray,
6
PrimitiveArray, StructArray, UInt8Array, UInt16Array, UInt32Array, Utf8Array, Utf8ViewArray,
7
};
8
use arrow::bitmap::Bitmap;
9
use arrow::datatypes::ArrowDataType;
10
use arrow::types::{NativeType, Offset};
11
use polars_dtype::categorical::CatNative;
12
use polars_utils::float16::pf16;
13
14
use crate::fixed::numeric::FixedLengthEncoding;
15
use crate::fixed::{boolean, decimal, numeric};
16
use crate::row::{RowEncodingOptions, RowsEncoded};
17
use crate::variable::{binary, no_order, utf8};
18
use crate::widths::RowWidths;
19
use crate::{
20
ArrayRef, RowEncodingCategoricalContext, RowEncodingContext, with_match_arrow_primitive_type,
21
};
22
23
pub fn convert_columns(
24
num_rows: usize,
25
columns: &[ArrayRef],
26
opts: &[RowEncodingOptions],
27
dicts: &[Option<RowEncodingContext>],
28
) -> RowsEncoded {
29
let mut rows = RowsEncoded::new(vec![], vec![]);
30
convert_columns_amortized(
31
num_rows,
32
columns,
33
opts.iter().copied().zip(dicts.iter().map(|v| v.as_ref())),
34
&mut rows,
35
);
36
rows
37
}
38
39
pub fn convert_columns_no_order(
40
num_rows: usize,
41
columns: &[ArrayRef],
42
dicts: &[Option<RowEncodingContext>],
43
) -> RowsEncoded {
44
let mut rows = RowsEncoded::new(vec![], vec![]);
45
convert_columns_amortized_no_order(num_rows, columns, dicts, &mut rows);
46
rows
47
}
48
49
pub fn convert_columns_amortized_no_order(
50
num_rows: usize,
51
columns: &[ArrayRef],
52
dicts: &[Option<RowEncodingContext>],
53
rows: &mut RowsEncoded,
54
) {
55
convert_columns_amortized(
56
num_rows,
57
columns,
58
std::iter::repeat_n(RowEncodingOptions::default(), columns.len())
59
.zip(dicts.iter().map(|v| v.as_ref())),
60
rows,
61
);
62
}
63
64
pub fn convert_columns_amortized<'a>(
65
num_rows: usize,
66
columns: &[ArrayRef],
67
fields: impl IntoIterator<Item = (RowEncodingOptions, Option<&'a RowEncodingContext>)> + Clone,
68
rows: &mut RowsEncoded,
69
) {
70
let mut masked_out_max_length = 0;
71
let mut row_widths = RowWidths::new(num_rows);
72
let mut encoders = columns
73
.iter()
74
.zip(fields.clone())
75
.map(|(column, (opt, dicts))| {
76
get_encoder(
77
column.as_ref(),
78
opt,
79
dicts,
80
&mut row_widths,
81
&mut masked_out_max_length,
82
)
83
})
84
.collect::<Vec<_>>();
85
86
// Create an offsets array, we append 0 at the beginning here so it can serve as the final
87
// offset array.
88
let mut offsets = Vec::with_capacity(num_rows + 1);
89
offsets.push(0);
90
row_widths.extend_with_offsets(&mut offsets);
91
92
// Create a buffer without initializing everything to zero.
93
let total_num_bytes = row_widths.sum();
94
let mut out = Vec::<u8>::with_capacity(total_num_bytes + masked_out_max_length);
95
let buffer = &mut out.spare_capacity_mut()[..total_num_bytes + masked_out_max_length];
96
97
let masked_out_write_offset = total_num_bytes;
98
let mut scratches = EncodeScratches::default();
99
for (encoder, (opt, dict)) in encoders.iter_mut().zip(fields) {
100
unsafe {
101
encode_array(
102
buffer,
103
encoder,
104
opt,
105
dict,
106
&mut offsets[1..],
107
masked_out_write_offset,
108
&mut scratches,
109
)
110
};
111
}
112
// SAFETY: All the bytes in out up to total_num_bytes should now be initialized.
113
unsafe {
114
out.set_len(total_num_bytes);
115
}
116
117
*rows = RowsEncoded {
118
values: out,
119
offsets,
120
};
121
}
122
123
fn list_num_column_bytes<O: Offset>(
124
array: &dyn Array,
125
opt: RowEncodingOptions,
126
dicts: Option<&RowEncodingContext>,
127
row_widths: &mut RowWidths,
128
masked_out_max_width: &mut usize,
129
) -> Encoder {
130
let array = array.as_any().downcast_ref::<ListArray<O>>().unwrap();
131
let values = array.values();
132
133
let mut list_row_widths = RowWidths::new(values.len());
134
let encoder = get_encoder(
135
values.as_ref(),
136
opt.into_nested(),
137
dicts,
138
&mut list_row_widths,
139
masked_out_max_width,
140
);
141
142
match array.validity() {
143
None => row_widths.push_iter(array.offsets().offset_and_length_iter().map(
144
|(offset, length)| {
145
let mut sum = 0;
146
for i in offset..offset + length {
147
sum += list_row_widths.get(i);
148
}
149
1 + length + sum
150
},
151
)),
152
Some(validity) => row_widths.push_iter(
153
array
154
.offsets()
155
.offset_and_length_iter()
156
.zip(validity.iter())
157
.map(|((offset, length), is_valid)| {
158
if !is_valid {
159
if length > 0 {
160
for i in offset..offset + length {
161
*masked_out_max_width =
162
(*masked_out_max_width).max(list_row_widths.get(i));
163
}
164
}
165
return 1;
166
}
167
168
let mut sum = 0;
169
for i in offset..offset + length {
170
sum += list_row_widths.get(i);
171
}
172
1 + length + sum
173
}),
174
),
175
};
176
177
Encoder {
178
array: array.to_boxed(),
179
state: Some(Box::new(EncoderState::List(
180
Box::new(encoder),
181
list_row_widths,
182
))),
183
}
184
}
185
186
fn biniter_num_column_bytes(
187
array: &dyn Array,
188
iter: impl ExactSizeIterator<Item = usize>,
189
validity: Option<&Bitmap>,
190
opt: RowEncodingOptions,
191
row_widths: &mut RowWidths,
192
) -> Encoder {
193
if opt.contains(RowEncodingOptions::NO_ORDER) {
194
match validity {
195
None => row_widths.push_iter(iter.map(|v| no_order::len_from_item(Some(v), opt))),
196
Some(validity) => row_widths.push_iter(
197
iter.zip(validity.iter())
198
.map(|(v, is_valid)| no_order::len_from_item(is_valid.then_some(v), opt)),
199
),
200
}
201
} else {
202
match validity {
203
None => row_widths.push_iter(
204
iter.map(|v| crate::variable::binary::encoded_len_from_len(Some(v), opt)),
205
),
206
Some(validity) => row_widths.push_iter(
207
iter.zip(validity.iter())
208
.map(|(v, is_valid)| binary::encoded_len_from_len(is_valid.then_some(v), opt)),
209
),
210
}
211
};
212
213
Encoder {
214
array: array.to_boxed(),
215
state: None,
216
}
217
}
218
219
fn striter_num_column_bytes(
220
array: &dyn Array,
221
iter: impl ExactSizeIterator<Item = usize>,
222
validity: Option<&Bitmap>,
223
opt: RowEncodingOptions,
224
row_widths: &mut RowWidths,
225
) -> Encoder {
226
if opt.contains(RowEncodingOptions::NO_ORDER) {
227
match validity {
228
None => row_widths.push_iter(iter.map(|v| no_order::len_from_item(Some(v), opt))),
229
Some(validity) => row_widths.push_iter(
230
iter.zip(validity.iter())
231
.map(|(v, is_valid)| no_order::len_from_item(is_valid.then_some(v), opt)),
232
),
233
}
234
} else {
235
match validity {
236
None => row_widths
237
.push_iter(iter.map(|v| crate::variable::utf8::len_from_item(Some(v), opt))),
238
Some(validity) => row_widths.push_iter(
239
iter.zip(validity.iter())
240
.map(|(v, is_valid)| utf8::len_from_item(is_valid.then_some(v), opt)),
241
),
242
}
243
};
244
245
Encoder {
246
array: array.to_boxed(),
247
state: None,
248
}
249
}
250
251
/// Get the encoder for a specific array.
252
fn get_encoder(
253
array: &dyn Array,
254
opt: RowEncodingOptions,
255
dict: Option<&RowEncodingContext>,
256
row_widths: &mut RowWidths,
257
masked_out_max_width: &mut usize,
258
) -> Encoder {
259
use ArrowDataType as D;
260
let dtype = array.dtype();
261
262
// Fast path: column has a fixed size encoding
263
if let Some(size) = fixed_size(dtype, opt, dict) {
264
row_widths.push_constant(size);
265
let state = match dtype {
266
D::FixedSizeList(_, width) => {
267
let array = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
268
269
debug_assert_eq!(array.values().len(), array.len() * width);
270
let mut nested_row_widths = RowWidths::new(array.values().len());
271
let nested_encoder = get_encoder(
272
array.values().as_ref(),
273
opt.into_nested(),
274
dict,
275
&mut nested_row_widths,
276
masked_out_max_width,
277
);
278
Some(EncoderState::FixedSizeList(
279
Box::new(nested_encoder),
280
*width,
281
nested_row_widths,
282
))
283
},
284
D::Struct(_) => {
285
let struct_array = array.as_any().downcast_ref::<StructArray>().unwrap();
286
287
Some(EncoderState::Struct(match dict {
288
None => struct_array
289
.values()
290
.iter()
291
.map(|array| {
292
get_encoder(
293
array.as_ref(),
294
opt.into_nested(),
295
None,
296
&mut RowWidths::new(row_widths.num_rows()),
297
masked_out_max_width,
298
)
299
})
300
.collect(),
301
Some(RowEncodingContext::Struct(dicts)) => struct_array
302
.values()
303
.iter()
304
.zip(dicts)
305
.map(|(array, dict)| {
306
get_encoder(
307
array.as_ref(),
308
opt,
309
dict.as_ref(),
310
&mut RowWidths::new(row_widths.num_rows()),
311
masked_out_max_width,
312
)
313
})
314
.collect(),
315
_ => unreachable!(),
316
}))
317
},
318
_ => None,
319
};
320
321
let state = state.map(Box::new);
322
return Encoder {
323
array: array.to_boxed(),
324
state,
325
};
326
}
327
328
// Non-fixed-size categorical path.
329
if let Some(RowEncodingContext::Categorical(ctx)) = dict {
330
match dtype {
331
D::UInt8 => {
332
assert!(opt.is_ordered() && !ctx.is_enum);
333
let dc_array = array.as_any().downcast_ref::<UInt8Array>().unwrap();
334
return striter_num_column_bytes(
335
array,
336
dc_array.values_iter().map(|cat| {
337
ctx.mapping
338
.cat_to_str(cat.as_cat())
339
.map(|s| s.len())
340
.unwrap_or(0)
341
}),
342
dc_array.validity(),
343
opt,
344
row_widths,
345
);
346
},
347
D::UInt16 => {
348
assert!(opt.is_ordered() && !ctx.is_enum);
349
let dc_array = array.as_any().downcast_ref::<UInt16Array>().unwrap();
350
return striter_num_column_bytes(
351
array,
352
dc_array.values_iter().map(|cat| {
353
ctx.mapping
354
.cat_to_str(cat.as_cat())
355
.map(|s| s.len())
356
.unwrap_or(0)
357
}),
358
dc_array.validity(),
359
opt,
360
row_widths,
361
);
362
},
363
D::UInt32 => {
364
assert!(opt.is_ordered() && !ctx.is_enum);
365
let dc_array = array.as_any().downcast_ref::<UInt32Array>().unwrap();
366
return striter_num_column_bytes(
367
array,
368
dc_array.values_iter().map(|cat| {
369
ctx.mapping
370
.cat_to_str(cat.as_cat())
371
.map(|s| s.len())
372
.unwrap_or(0)
373
}),
374
dc_array.validity(),
375
opt,
376
row_widths,
377
);
378
},
379
_ => {
380
// Fall through to below, should be nested type containing categorical.
381
debug_assert!(dtype.is_nested())
382
},
383
}
384
}
385
386
match dtype {
387
D::FixedSizeList(_, width) => {
388
let array = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
389
390
debug_assert_eq!(array.values().len(), array.len() * width);
391
let mut nested_row_widths = RowWidths::new(array.values().len());
392
let nested_encoder = get_encoder(
393
array.values().as_ref(),
394
opt.into_nested(),
395
dict,
396
&mut nested_row_widths,
397
masked_out_max_width,
398
);
399
400
let mut fsl_row_widths = nested_row_widths.collapse_chunks(*width, array.len());
401
fsl_row_widths.push_constant(1); // validity byte
402
403
row_widths.push(&fsl_row_widths);
404
Encoder {
405
array: array.to_boxed(),
406
state: Some(Box::new(EncoderState::FixedSizeList(
407
Box::new(nested_encoder),
408
*width,
409
nested_row_widths,
410
))),
411
}
412
},
413
D::Struct(_) => {
414
let array = array.as_any().downcast_ref::<StructArray>().unwrap();
415
416
let mut nested_encoders = Vec::with_capacity(array.values().len());
417
row_widths.push_constant(1); // validity byte
418
match dict {
419
None => {
420
for array in array.values() {
421
let encoder = get_encoder(
422
array.as_ref(),
423
opt.into_nested(),
424
None,
425
row_widths,
426
masked_out_max_width,
427
);
428
nested_encoders.push(encoder);
429
}
430
},
431
Some(RowEncodingContext::Struct(dicts)) => {
432
for (array, dict) in array.values().iter().zip(dicts) {
433
let encoder = get_encoder(
434
array.as_ref(),
435
opt.into_nested(),
436
dict.as_ref(),
437
row_widths,
438
masked_out_max_width,
439
);
440
nested_encoders.push(encoder);
441
}
442
},
443
_ => unreachable!(),
444
}
445
Encoder {
446
array: array.to_boxed(),
447
state: Some(Box::new(EncoderState::Struct(nested_encoders))),
448
}
449
},
450
451
D::List(_) => {
452
list_num_column_bytes::<i32>(array, opt, dict, row_widths, masked_out_max_width)
453
},
454
D::LargeList(_) => {
455
list_num_column_bytes::<i64>(array, opt, dict, row_widths, masked_out_max_width)
456
},
457
458
D::BinaryView => {
459
let dc_array = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
460
biniter_num_column_bytes(
461
array,
462
dc_array.views().iter().map(|v| v.length as usize),
463
dc_array.validity(),
464
opt,
465
row_widths,
466
)
467
},
468
D::Binary => {
469
let dc_array = array.as_any().downcast_ref::<BinaryArray<i32>>().unwrap();
470
biniter_num_column_bytes(
471
array,
472
dc_array.offsets().lengths(),
473
dc_array.validity(),
474
opt,
475
row_widths,
476
)
477
},
478
D::LargeBinary => {
479
let dc_array = array.as_any().downcast_ref::<BinaryArray<i64>>().unwrap();
480
biniter_num_column_bytes(
481
array,
482
dc_array.offsets().lengths(),
483
dc_array.validity(),
484
opt,
485
row_widths,
486
)
487
},
488
489
D::Utf8View => {
490
let dc_array = array.as_any().downcast_ref::<Utf8ViewArray>().unwrap();
491
striter_num_column_bytes(
492
array,
493
dc_array.views().iter().map(|v| v.length as usize),
494
dc_array.validity(),
495
opt,
496
row_widths,
497
)
498
},
499
D::Utf8 => {
500
let dc_array = array.as_any().downcast_ref::<Utf8Array<i32>>().unwrap();
501
striter_num_column_bytes(
502
array,
503
dc_array.offsets().lengths(),
504
dc_array.validity(),
505
opt,
506
row_widths,
507
)
508
},
509
D::LargeUtf8 => {
510
let dc_array = array.as_any().downcast_ref::<Utf8Array<i64>>().unwrap();
511
striter_num_column_bytes(
512
array,
513
dc_array.offsets().lengths(),
514
dc_array.validity(),
515
opt,
516
row_widths,
517
)
518
},
519
520
D::Union(_) => unreachable!(),
521
D::Map(_, _) => unreachable!(),
522
D::Extension(_) => unreachable!(),
523
D::Unknown => unreachable!(),
524
525
// All non-physical types
526
D::Timestamp(_, _)
527
| D::Date32
528
| D::Date64
529
| D::Time32(_)
530
| D::Time64(_)
531
| D::Duration(_)
532
| D::Interval(_)
533
| D::Dictionary(_, _, _)
534
| D::Decimal(_, _)
535
| D::Decimal32(_, _)
536
| D::Decimal64(_, _)
537
| D::Decimal256(_, _) => unreachable!(),
538
539
// Should be fixed size type
540
_ => unreachable!(),
541
}
542
}
543
544
struct Encoder {
545
array: Box<dyn Array>,
546
547
/// State contains nested encoders and extra information needed to encode.
548
state: Option<Box<EncoderState>>,
549
}
550
551
enum EncoderState {
552
List(Box<Encoder>, RowWidths),
553
FixedSizeList(Box<Encoder>, usize, RowWidths),
554
Struct(Vec<Encoder>),
555
}
556
557
unsafe fn encode_strs<'a>(
558
buffer: &mut [MaybeUninit<u8>],
559
iter: impl Iterator<Item = Option<&'a str>>,
560
opt: RowEncodingOptions,
561
offsets: &mut [usize],
562
) {
563
if opt.contains(RowEncodingOptions::NO_ORDER) {
564
no_order::encode_variable_no_order(
565
buffer,
566
iter.map(|v| v.map(str::as_bytes)),
567
opt,
568
offsets,
569
);
570
} else {
571
utf8::encode_str(buffer, iter, opt, offsets);
572
}
573
}
574
575
unsafe fn encode_bins<'a>(
576
buffer: &mut [MaybeUninit<u8>],
577
iter: impl Iterator<Item = Option<&'a [u8]>>,
578
opt: RowEncodingOptions,
579
offsets: &mut [usize],
580
) {
581
if opt.contains(RowEncodingOptions::NO_ORDER) {
582
no_order::encode_variable_no_order(buffer, iter, opt, offsets);
583
} else {
584
binary::encode_iter(buffer, iter, opt, offsets);
585
}
586
}
587
588
unsafe fn encode_cat_array<T: NativeType + FixedLengthEncoding + CatNative>(
589
buffer: &mut [MaybeUninit<u8>],
590
keys: &PrimitiveArray<T>,
591
opt: RowEncodingOptions,
592
ctx: &RowEncodingCategoricalContext,
593
offsets: &mut [usize],
594
) {
595
if ctx.is_enum || !opt.is_ordered() {
596
numeric::encode(buffer, keys, opt, offsets);
597
} else {
598
utf8::encode_str(
599
buffer,
600
keys.iter()
601
.map(|k| k.map(|&cat| ctx.mapping.cat_to_str_unchecked(cat.as_cat()))),
602
opt,
603
offsets,
604
);
605
}
606
}
607
608
unsafe fn encode_flat_array(
609
buffer: &mut [MaybeUninit<u8>],
610
array: &dyn Array,
611
opt: RowEncodingOptions,
612
dict: Option<&RowEncodingContext>,
613
offsets: &mut [usize],
614
) {
615
use ArrowDataType as D;
616
617
if let Some(RowEncodingContext::Categorical(ctx)) = dict {
618
match array.dtype() {
619
D::UInt8 => {
620
let keys = array.as_any().downcast_ref::<PrimitiveArray<u8>>().unwrap();
621
encode_cat_array(buffer, keys, opt, ctx, offsets);
622
},
623
D::UInt16 => {
624
let keys = array
625
.as_any()
626
.downcast_ref::<PrimitiveArray<u16>>()
627
.unwrap();
628
encode_cat_array(buffer, keys, opt, ctx, offsets);
629
},
630
D::UInt32 => {
631
let keys = array
632
.as_any()
633
.downcast_ref::<PrimitiveArray<u32>>()
634
.unwrap();
635
encode_cat_array(buffer, keys, opt, ctx, offsets);
636
},
637
_ => unreachable!(),
638
};
639
return;
640
}
641
642
match array.dtype() {
643
D::Null => {},
644
D::Boolean => {
645
let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
646
boolean::encode_bool(buffer, array.iter(), opt, offsets);
647
},
648
649
dt if dt.is_numeric() => {
650
if matches!(dt, D::Int128) {
651
if let Some(RowEncodingContext::Decimal(precision)) = dict {
652
decimal::encode(
653
buffer,
654
array
655
.as_any()
656
.downcast_ref::<PrimitiveArray<i128>>()
657
.unwrap(),
658
opt,
659
offsets,
660
*precision,
661
);
662
return;
663
}
664
}
665
666
with_match_arrow_primitive_type!(dt, |$T| {
667
let array = array.as_any().downcast_ref::<PrimitiveArray<$T>>().unwrap();
668
numeric::encode(buffer, array, opt, offsets);
669
})
670
},
671
672
D::Binary => {
673
let array = array.as_any().downcast_ref::<BinaryArray<i32>>().unwrap();
674
encode_bins(buffer, array.iter(), opt, offsets);
675
},
676
D::LargeBinary => {
677
let array = array.as_any().downcast_ref::<BinaryArray<i64>>().unwrap();
678
encode_bins(buffer, array.iter(), opt, offsets);
679
},
680
D::BinaryView => {
681
let array = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
682
encode_bins(buffer, array.iter(), opt, offsets);
683
},
684
D::Utf8 => {
685
let array = array.as_any().downcast_ref::<Utf8Array<i32>>().unwrap();
686
encode_strs(buffer, array.iter(), opt, offsets);
687
},
688
D::LargeUtf8 => {
689
let array = array.as_any().downcast_ref::<Utf8Array<i64>>().unwrap();
690
encode_strs(buffer, array.iter(), opt, offsets);
691
},
692
D::Utf8View => {
693
let array = array.as_any().downcast_ref::<Utf8ViewArray>().unwrap();
694
encode_strs(buffer, array.iter(), opt, offsets);
695
},
696
697
// Lexical ordered Categorical are cast to PrimitiveArray above.
698
D::Dictionary(_, _, _) => todo!(),
699
700
D::FixedSizeBinary(_) => todo!(),
701
D::Decimal(_, _) => todo!(),
702
D::Decimal32(_, _) => todo!(),
703
D::Decimal64(_, _) => todo!(),
704
D::Decimal256(_, _) => todo!(),
705
706
D::Union(_) => todo!(),
707
D::Map(_, _) => todo!(),
708
D::Extension(_) => todo!(),
709
D::Unknown => todo!(),
710
711
// All are non-physical types.
712
D::Timestamp(_, _)
713
| D::Date32
714
| D::Date64
715
| D::Time32(_)
716
| D::Time64(_)
717
| D::Duration(_)
718
| D::Interval(_) => unreachable!(),
719
720
_ => unreachable!(),
721
}
722
}
723
724
/// Reusable scratch vectors handed to [`encode_array`] so nested encodes can share
/// allocations.
#[derive(Default)]
struct EncodeScratches {
    /// Write positions of the child values of a list column.
    nested_offsets: Vec<usize>,
    /// Scratch byte buffer. NOTE(review): only cleared, never read or written, by the code
    /// visible in this file.
    nested_buffer: Vec<u8>,
}

impl EncodeScratches {
    /// Empties both scratch vectors; their capacity is kept for reuse.
    fn clear(&mut self) {
        let Self {
            nested_offsets,
            nested_buffer,
        } = self;
        nested_offsets.clear();
        nested_buffer.clear();
    }
}
736
737
unsafe fn encode_array(
738
buffer: &mut [MaybeUninit<u8>],
739
encoder: &Encoder,
740
opt: RowEncodingOptions,
741
dict: Option<&RowEncodingContext>,
742
offsets: &mut [usize],
743
masked_out_write_offset: usize, // Masked out values need to be written somewhere. We just
744
// reserved space at the end and tell all values to write
745
// there.
746
scratches: &mut EncodeScratches,
747
) {
748
let Some(state) = &encoder.state else {
749
// This is actually the main path.
750
//
751
// If no nested types or special types are needed, this path is taken.
752
return encode_flat_array(buffer, encoder.array.as_ref(), opt, dict, offsets);
753
};
754
755
match state.as_ref() {
756
EncoderState::List(nested_encoder, nested_row_widths) => {
757
// @TODO: make more general.
758
let array = encoder
759
.array
760
.as_any()
761
.downcast_ref::<ListArray<i64>>()
762
.unwrap();
763
764
scratches.clear();
765
766
scratches
767
.nested_offsets
768
.reserve(nested_row_widths.num_rows());
769
let nested_offsets = &mut scratches.nested_offsets;
770
771
let list_null_sentinel = opt.list_null_sentinel();
772
let list_continuation_token = opt.list_continuation_token();
773
let list_termination_token = opt.list_termination_token();
774
775
match array.validity() {
776
None => {
777
for (i, (offset, length)) in
778
array.offsets().offset_and_length_iter().enumerate()
779
{
780
for j in offset..offset + length {
781
buffer[offsets[i]] = MaybeUninit::new(list_continuation_token);
782
offsets[i] += 1;
783
784
nested_offsets.push(offsets[i]);
785
offsets[i] += nested_row_widths.get(j);
786
}
787
buffer[offsets[i]] = MaybeUninit::new(list_termination_token);
788
offsets[i] += 1;
789
}
790
},
791
Some(validity) => {
792
for (i, ((offset, length), is_valid)) in array
793
.offsets()
794
.offset_and_length_iter()
795
.zip(validity.iter())
796
.enumerate()
797
{
798
if !is_valid {
799
buffer[offsets[i]] = MaybeUninit::new(list_null_sentinel);
800
offsets[i] += 1;
801
802
// Values might have been masked out.
803
if length > 0 {
804
nested_offsets
805
.extend(std::iter::repeat_n(masked_out_write_offset, length));
806
}
807
808
continue;
809
}
810
811
for j in offset..offset + length {
812
buffer[offsets[i]] = MaybeUninit::new(list_continuation_token);
813
offsets[i] += 1;
814
815
nested_offsets.push(offsets[i]);
816
offsets[i] += nested_row_widths.get(j);
817
}
818
buffer[offsets[i]] = MaybeUninit::new(list_termination_token);
819
offsets[i] += 1;
820
}
821
},
822
}
823
824
unsafe {
825
encode_array(
826
buffer,
827
nested_encoder,
828
opt.into_nested(),
829
dict,
830
nested_offsets,
831
masked_out_write_offset,
832
&mut EncodeScratches::default(),
833
)
834
};
835
},
836
EncoderState::FixedSizeList(array, width, nested_row_widths) => {
837
encode_validity(buffer, encoder.array.validity(), opt, offsets);
838
839
if *width == 0 {
840
return;
841
}
842
843
let mut child_offsets = Vec::with_capacity(offsets.len() * width);
844
for (i, offset) in offsets.iter_mut().enumerate() {
845
for j in 0..*width {
846
child_offsets.push(*offset);
847
*offset += nested_row_widths.get((i * width) + j);
848
}
849
}
850
851
encode_array(
852
buffer,
853
array.as_ref(),
854
opt.into_nested(),
855
dict,
856
&mut child_offsets,
857
masked_out_write_offset,
858
scratches,
859
);
860
for (i, offset) in offsets.iter_mut().enumerate() {
861
*offset = child_offsets[(i + 1) * width - 1];
862
}
863
},
864
EncoderState::Struct(arrays) => {
865
encode_validity(buffer, encoder.array.validity(), opt, offsets);
866
867
match dict {
868
None => {
869
for array in arrays {
870
encode_array(
871
buffer,
872
array,
873
opt.into_nested(),
874
None,
875
offsets,
876
masked_out_write_offset,
877
scratches,
878
);
879
}
880
},
881
Some(RowEncodingContext::Struct(dicts)) => {
882
for (array, dict) in arrays.iter().zip(dicts) {
883
encode_array(
884
buffer,
885
array,
886
opt.into_nested(),
887
dict.as_ref(),
888
offsets,
889
masked_out_write_offset,
890
scratches,
891
);
892
}
893
},
894
_ => unreachable!(),
895
}
896
},
897
}
898
}
899
900
unsafe fn encode_validity(
901
buffer: &mut [MaybeUninit<u8>],
902
validity: Option<&Bitmap>,
903
opt: RowEncodingOptions,
904
row_starts: &mut [usize],
905
) {
906
let null_sentinel = opt.null_sentinel();
907
match validity {
908
None => {
909
for row_start in row_starts.iter_mut() {
910
buffer[*row_start] = MaybeUninit::new(1);
911
*row_start += 1;
912
}
913
},
914
Some(validity) => {
915
for (row_start, is_valid) in row_starts.iter_mut().zip(validity.iter()) {
916
let v = if is_valid {
917
MaybeUninit::new(1)
918
} else {
919
MaybeUninit::new(null_sentinel)
920
};
921
buffer[*row_start] = v;
922
*row_start += 1;
923
}
924
},
925
}
926
}
927
928
pub fn fixed_size(
929
dtype: &ArrowDataType,
930
opt: RowEncodingOptions,
931
dict: Option<&RowEncodingContext>,
932
) -> Option<usize> {
933
use ArrowDataType as D;
934
use numeric::FixedLengthEncoding;
935
936
if let Some(RowEncodingContext::Categorical(ctx)) = dict {
937
// If ordered categorical (non-enum) we encode strings, otherwise physical.
938
if !ctx.is_enum && opt.is_ordered() {
939
return None;
940
}
941
}
942
943
Some(match dtype {
944
D::Null => 0,
945
D::Boolean => 1,
946
947
D::UInt8 => u8::ENCODED_LEN,
948
D::UInt16 => u16::ENCODED_LEN,
949
D::UInt32 => u32::ENCODED_LEN,
950
D::UInt64 => u64::ENCODED_LEN,
951
D::UInt128 => u128::ENCODED_LEN,
952
953
D::Int8 => i8::ENCODED_LEN,
954
D::Int16 => i16::ENCODED_LEN,
955
D::Int32 => i32::ENCODED_LEN,
956
D::Int64 => i64::ENCODED_LEN,
957
D::Int128 => match dict {
958
None => i128::ENCODED_LEN,
959
Some(RowEncodingContext::Decimal(precision)) => decimal::len_from_precision(*precision),
960
_ => unreachable!(),
961
},
962
963
D::Float16 => pf16::ENCODED_LEN,
964
D::Float32 => f32::ENCODED_LEN,
965
D::Float64 => f64::ENCODED_LEN,
966
D::FixedSizeList(f, width) => 1 + width * fixed_size(f.dtype(), opt, dict)?,
967
D::Struct(fs) => match dict {
968
None => {
969
let mut sum = 0;
970
for f in fs {
971
sum += fixed_size(f.dtype(), opt, None)?;
972
}
973
1 + sum
974
},
975
Some(RowEncodingContext::Struct(dicts)) => {
976
let mut sum = 0;
977
for (f, dict) in fs.iter().zip(dicts) {
978
sum += fixed_size(f.dtype(), opt, dict.as_ref())?;
979
}
980
1 + sum
981
},
982
_ => unreachable!(),
983
},
984
_ => return None,
985
})
986
}
987
988
#[cfg(test)]
mod tests {
    use arrow::array::proptest::{
        ArrayArbitraryOptions, ArrowDataTypeArbitraryOptions, ArrowDataTypeArbitrarySelection,
        array_with_options,
    };

    use super::*;

    // Strategy: pick a row count below 100, then generate one or two arrays of that length
    // with any data type except those selected by `BINARY`.
    proptest::prop_compose! {
        fn arrays
            ()
            (length in 0..100usize)
            (arrays in proptest::collection::vec(array_with_options(length, ArrayArbitraryOptions {
                dtype: ArrowDataTypeArbitraryOptions {
                    allowed_dtypes: ArrowDataTypeArbitrarySelection::all() & !ArrowDataTypeArbitrarySelection::BINARY,
                    ..Default::default()
                }
            }), 1..3))
        -> Vec<Box<dyn Array>> {
            arrays
        }
    }

    // Smoke test: encoding arbitrary arrays without contexts must not panic. The encoded
    // output itself is not inspected.
    proptest::proptest! {
        #[test]
        fn test_encode_arrays
            (columns in arrays())
        {
            let dicts: Vec<Option<RowEncodingContext>> = columns.iter().map(|_| None).collect();
            convert_columns_no_order(columns[0].len(), &columns, &dicts);
        }
    }
}
1022
1023