Book a Demo!
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Policies · Sign Up · Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-parquet/src/arrow/write/pages.rs
6940 views
1
use std::fmt::Debug;
2
3
use arrow::array::{Array, FixedSizeListArray, ListArray, MapArray, StructArray};
4
use arrow::bitmap::{Bitmap, MutableBitmap};
5
use arrow::datatypes::PhysicalType;
6
use arrow::offset::{Offset, OffsetsBuffer};
7
use polars_error::{PolarsResult, polars_bail};
8
9
use super::{ColumnWriteOptions, WriteOptions, array_to_pages};
10
use crate::arrow::read::schema::is_nullable;
11
use crate::parquet::page::Page;
12
use crate::parquet::schema::types::{ParquetType, PrimitiveType as ParquetPrimitiveType};
13
use crate::write::DynIter;
14
15
#[derive(Debug, Clone, PartialEq)]
16
pub struct PrimitiveNested {
17
pub is_optional: bool,
18
pub validity: Option<Bitmap>,
19
pub length: usize,
20
}
21
22
#[derive(Debug, Clone, PartialEq)]
23
pub struct ListNested<O: Offset> {
24
pub is_optional: bool,
25
pub offsets: OffsetsBuffer<O>,
26
pub validity: Option<Bitmap>,
27
}
28
29
#[derive(Debug, Clone, PartialEq)]
30
pub struct FixedSizeListNested {
31
pub validity: Option<Bitmap>,
32
pub is_optional: bool,
33
pub width: usize,
34
pub length: usize,
35
}
36
37
#[derive(Debug, Clone, PartialEq)]
38
pub struct StructNested {
39
pub is_optional: bool,
40
pub validity: Option<Bitmap>,
41
pub length: usize,
42
}
43
44
impl<O: Offset> ListNested<O> {
45
pub fn new(offsets: OffsetsBuffer<O>, validity: Option<Bitmap>, is_optional: bool) -> Self {
46
Self {
47
is_optional,
48
offsets,
49
validity,
50
}
51
}
52
}
53
54
/// Descriptor of nested information of a field
55
#[derive(Debug, Clone, PartialEq)]
56
pub enum Nested {
57
/// a primitive (leaf or parquet column)
58
Primitive(PrimitiveNested),
59
List(ListNested<i32>),
60
LargeList(ListNested<i64>),
61
FixedSizeList(FixedSizeListNested),
62
Struct(StructNested),
63
}
64
65
impl Nested {
66
/// Returns the length (number of rows) of the element
67
pub fn len(&self) -> usize {
68
match self {
69
Nested::Primitive(nested) => nested.length,
70
Nested::List(nested) => nested.offsets.len_proxy(),
71
Nested::LargeList(nested) => nested.offsets.len_proxy(),
72
Nested::FixedSizeList(nested) => nested.length,
73
Nested::Struct(nested) => nested.length,
74
}
75
}
76
77
pub fn primitive(validity: Option<Bitmap>, is_optional: bool, length: usize) -> Self {
78
Self::Primitive(PrimitiveNested {
79
validity,
80
is_optional,
81
length,
82
})
83
}
84
85
pub fn list(validity: Option<Bitmap>, is_optional: bool, offsets: OffsetsBuffer<i32>) -> Self {
86
Self::List(ListNested {
87
validity,
88
is_optional,
89
offsets,
90
})
91
}
92
93
pub fn large_list(
94
validity: Option<Bitmap>,
95
is_optional: bool,
96
offsets: OffsetsBuffer<i64>,
97
) -> Self {
98
Self::LargeList(ListNested {
99
validity,
100
is_optional,
101
offsets,
102
})
103
}
104
105
pub fn fixed_size_list(
106
validity: Option<Bitmap>,
107
is_optional: bool,
108
width: usize,
109
length: usize,
110
) -> Self {
111
Self::FixedSizeList(FixedSizeListNested {
112
validity,
113
is_optional,
114
width,
115
length,
116
})
117
}
118
119
pub fn structure(validity: Option<Bitmap>, is_optional: bool, length: usize) -> Self {
120
Self::Struct(StructNested {
121
validity,
122
is_optional,
123
length,
124
})
125
}
126
}
127
128
/// Constructs the necessary `Vec<Vec<Nested>>` to write the rep and def levels of `array` to parquet
129
pub fn to_nested(array: &dyn Array, type_: &ParquetType) -> PolarsResult<Vec<Vec<Nested>>> {
130
let mut nested = vec![];
131
132
to_nested_recursive(array, type_, &mut nested, vec![])?;
133
Ok(nested)
134
}
135
136
fn to_nested_recursive(
137
array: &dyn Array,
138
type_: &ParquetType,
139
nested: &mut Vec<Vec<Nested>>,
140
mut parents: Vec<Nested>,
141
) -> PolarsResult<()> {
142
let is_optional = is_nullable(type_.get_field_info());
143
144
if !is_optional && array.null_count() > 0 {
145
polars_bail!(InvalidOperation: "writing a missing value to required field '{}'", type_.name());
146
}
147
148
use PhysicalType::*;
149
match array.dtype().to_physical_type() {
150
Struct => {
151
let array = array.as_any().downcast_ref::<StructArray>().unwrap();
152
let fields = if let ParquetType::GroupType { fields, .. } = type_ {
153
fields
154
} else {
155
polars_bail!(InvalidOperation:
156
"Parquet type must be a group for a struct array",
157
)
158
};
159
160
parents.push(Nested::Struct(StructNested {
161
is_optional,
162
validity: array.validity().cloned(),
163
length: array.len(),
164
}));
165
166
for (type_, array) in fields.iter().zip(array.values()) {
167
to_nested_recursive(array.as_ref(), type_, nested, parents.clone())?;
168
}
169
},
170
FixedSizeList => {
171
let array = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
172
let type_ = if let ParquetType::GroupType { fields, .. } = type_ {
173
if let ParquetType::GroupType { fields, .. } = &fields[0] {
174
&fields[0]
175
} else {
176
polars_bail!(InvalidOperation:
177
"Parquet type must be a group for a list array",
178
)
179
}
180
} else {
181
polars_bail!(InvalidOperation:
182
"Parquet type must be a group for a list array",
183
)
184
};
185
186
parents.push(Nested::FixedSizeList(FixedSizeListNested {
187
validity: array.validity().cloned(),
188
length: array.len(),
189
width: array.size(),
190
is_optional,
191
}));
192
to_nested_recursive(array.values().as_ref(), type_, nested, parents)?;
193
},
194
List => {
195
let array = array.as_any().downcast_ref::<ListArray<i32>>().unwrap();
196
let type_ = if let ParquetType::GroupType { fields, .. } = type_ {
197
if let ParquetType::GroupType { fields, .. } = &fields[0] {
198
&fields[0]
199
} else {
200
polars_bail!(InvalidOperation:
201
"Parquet type must be a group for a list array",
202
)
203
}
204
} else {
205
polars_bail!(InvalidOperation:
206
"Parquet type must be a group for a list array",
207
)
208
};
209
210
parents.push(Nested::List(ListNested::new(
211
array.offsets().clone(),
212
array.validity().cloned(),
213
is_optional,
214
)));
215
to_nested_recursive(array.values().as_ref(), type_, nested, parents)?;
216
},
217
LargeList => {
218
let array = array.as_any().downcast_ref::<ListArray<i64>>().unwrap();
219
let type_ = if let ParquetType::GroupType { fields, .. } = type_ {
220
if let ParquetType::GroupType { fields, .. } = &fields[0] {
221
&fields[0]
222
} else {
223
polars_bail!(InvalidOperation:
224
"Parquet type must be a group for a list array",
225
)
226
}
227
} else {
228
polars_bail!(InvalidOperation:
229
"Parquet type must be a group for a list array",
230
)
231
};
232
233
parents.push(Nested::LargeList(ListNested::new(
234
array.offsets().clone(),
235
array.validity().cloned(),
236
is_optional,
237
)));
238
to_nested_recursive(array.values().as_ref(), type_, nested, parents)?;
239
},
240
Map => {
241
let array = array.as_any().downcast_ref::<MapArray>().unwrap();
242
let type_ = if let ParquetType::GroupType { fields, .. } = type_ {
243
if let ParquetType::GroupType { fields, .. } = &fields[0] {
244
&fields[0]
245
} else {
246
polars_bail!(InvalidOperation:
247
"Parquet type must be a group for a map array",
248
)
249
}
250
} else {
251
polars_bail!(InvalidOperation:
252
"Parquet type must be a group for a map array",
253
)
254
};
255
256
parents.push(Nested::List(ListNested::new(
257
array.offsets().clone(),
258
array.validity().cloned(),
259
is_optional,
260
)));
261
to_nested_recursive(array.field().as_ref(), type_, nested, parents)?;
262
},
263
_ => {
264
parents.push(Nested::Primitive(PrimitiveNested {
265
validity: array.validity().cloned(),
266
is_optional,
267
length: array.len(),
268
}));
269
nested.push(parents)
270
},
271
}
272
Ok(())
273
}
274
275
fn expand_list_validity<'a, O: Offset>(
276
array: &'a ListArray<O>,
277
validity: BitmapState,
278
array_stack: &mut Vec<(&'a dyn Array, BitmapState)>,
279
) {
280
let BitmapState::SomeSet(list_validity) = validity else {
281
array_stack.push((
282
array.values().as_ref(),
283
match validity {
284
BitmapState::AllSet => BitmapState::AllSet,
285
BitmapState::SomeSet(_) => unreachable!(),
286
BitmapState::AllUnset(_) => BitmapState::AllUnset(array.values().len()),
287
},
288
));
289
return;
290
};
291
292
let offsets = array.offsets().buffer();
293
let mut validity = MutableBitmap::with_capacity(array.values().len());
294
let mut list_validity_iter = list_validity.iter();
295
296
// @NOTE: We need to take into account here that the list might only point to a slice of the
297
// values, therefore we need to extend the validity mask with dummy values to match the length
298
// of the values array.
299
300
let mut idx = 0;
301
validity.extend_constant(offsets[0].to_usize(), false);
302
while list_validity_iter.num_remaining() > 0 {
303
let num_ones = list_validity_iter.take_leading_ones();
304
let num_elements = offsets[idx + num_ones] - offsets[idx];
305
validity.extend_constant(num_elements.to_usize(), true);
306
307
idx += num_ones;
308
309
let num_zeros = list_validity_iter.take_leading_zeros();
310
let num_elements = offsets[idx + num_zeros] - offsets[idx];
311
validity.extend_constant(num_elements.to_usize(), false);
312
313
idx += num_zeros;
314
}
315
validity.extend_constant(array.values().len() - validity.len(), false);
316
317
debug_assert_eq!(idx, array.len());
318
let validity = validity.freeze();
319
320
debug_assert_eq!(validity.len(), array.values().len());
321
array_stack.push((array.values().as_ref(), BitmapState::SomeSet(validity)));
322
}
323
324
#[derive(Clone)]
325
enum BitmapState {
326
AllSet,
327
SomeSet(Bitmap),
328
AllUnset(usize),
329
}
330
331
impl From<Option<&Bitmap>> for BitmapState {
    /// Classifies an optional validity bitmap.
    ///
    /// A missing bitmap or one without unset bits becomes `AllSet`, a bitmap with
    /// only unset bits becomes `AllUnset`, and anything else is kept as `SomeSet`.
    fn from(bm: Option<&Bitmap>) -> Self {
        match bm {
            None => Self::AllSet,
            Some(bitmap) => match bitmap.unset_bits() {
                0 => Self::AllSet,
                unset if unset == bitmap.len() => Self::AllUnset(unset),
                _ => Self::SomeSet(bitmap.clone()),
            },
        }
    }
}
348
349
impl From<BitmapState> for Option<Bitmap> {
350
fn from(bms: BitmapState) -> Self {
351
match bms {
352
BitmapState::AllSet => None,
353
BitmapState::SomeSet(bm) => Some(bm),
354
BitmapState::AllUnset(len) => Some(Bitmap::new_zeroed(len)),
355
}
356
}
357
}
358
359
impl std::ops::BitAnd for &BitmapState {
360
type Output = BitmapState;
361
362
fn bitand(self, rhs: Self) -> Self::Output {
363
use BitmapState as B;
364
match (self, rhs) {
365
(B::AllSet, B::AllSet) => B::AllSet,
366
(B::AllSet, B::SomeSet(v)) | (B::SomeSet(v), B::AllSet) => B::SomeSet(v.clone()),
367
(B::SomeSet(lhs), B::SomeSet(rhs)) => {
368
let result = lhs & rhs;
369
let null_count = result.unset_bits();
370
371
if null_count == 0 {
372
B::AllSet
373
} else if null_count == result.len() {
374
B::AllUnset(result.len())
375
} else {
376
B::SomeSet(result)
377
}
378
},
379
(B::AllUnset(len), _) | (_, B::AllUnset(len)) => B::AllUnset(*len),
380
}
381
}
382
}
383
384
/// Convert [`Array`] to a `Vec<Box<dyn Array>>` leaves in DFS order.
385
///
386
/// Each leaf array has the validity propagated from the nesting levels above.
387
pub fn to_leaves(array: &dyn Array, leaves: &mut Vec<Box<dyn Array>>) {
388
use PhysicalType as P;
389
390
leaves.clear();
391
let mut array_stack: Vec<(&dyn Array, BitmapState)> = Vec::new();
392
393
array_stack.push((array, BitmapState::AllSet));
394
395
while let Some((array, inherited_validity)) = array_stack.pop() {
396
let child_validity = BitmapState::from(array.validity());
397
let validity = (&child_validity) & (&inherited_validity);
398
399
match array.dtype().to_physical_type() {
400
P::Struct => {
401
let array = array.as_any().downcast_ref::<StructArray>().unwrap();
402
403
leaves.reserve(array.len().saturating_sub(1));
404
array
405
.values()
406
.iter()
407
.rev()
408
.for_each(|field| array_stack.push((field.as_ref(), validity.clone())));
409
},
410
P::List => {
411
let array = array.as_any().downcast_ref::<ListArray<i32>>().unwrap();
412
expand_list_validity(array, validity, &mut array_stack);
413
},
414
P::LargeList => {
415
let array = array.as_any().downcast_ref::<ListArray<i64>>().unwrap();
416
expand_list_validity(array, validity, &mut array_stack);
417
},
418
P::FixedSizeList => {
419
let array = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
420
421
let BitmapState::SomeSet(fsl_validity) = validity else {
422
array_stack.push((
423
array.values().as_ref(),
424
match validity {
425
BitmapState::AllSet => BitmapState::AllSet,
426
BitmapState::SomeSet(_) => unreachable!(),
427
BitmapState::AllUnset(_) => BitmapState::AllUnset(array.values().len()),
428
},
429
));
430
continue;
431
};
432
433
let num_values = array.values().len();
434
let size = array.size();
435
436
let mut validity = MutableBitmap::with_capacity(num_values);
437
let mut fsl_validity_iter = fsl_validity.iter();
438
439
let mut idx = 0;
440
while fsl_validity_iter.num_remaining() > 0 {
441
let num_ones = fsl_validity_iter.take_leading_ones();
442
let num_elements = num_ones * size;
443
validity.extend_constant(num_elements, true);
444
445
idx += num_ones;
446
447
let num_zeros = fsl_validity_iter.take_leading_zeros();
448
let num_elements = num_zeros * size;
449
validity.extend_constant(num_elements, false);
450
451
idx += num_zeros;
452
}
453
454
debug_assert_eq!(idx, array.len());
455
456
let validity = BitmapState::SomeSet(validity.freeze());
457
458
array_stack.push((array.values().as_ref(), validity));
459
},
460
P::Map => {
461
let array = array.as_any().downcast_ref::<MapArray>().unwrap();
462
array_stack.push((array.field().as_ref(), validity));
463
},
464
P::Null
465
| P::Boolean
466
| P::Primitive(_)
467
| P::Binary
468
| P::FixedSizeBinary
469
| P::LargeBinary
470
| P::Utf8
471
| P::LargeUtf8
472
| P::Dictionary(_)
473
| P::BinaryView
474
| P::Utf8View => {
475
leaves.push(array.with_validity(validity.into()));
476
},
477
478
other => todo!("Writing {:?} to parquet not yet implemented", other),
479
}
480
}
481
}
482
483
/// Convert `ParquetType` to `Vec<ParquetPrimitiveType>` leaves in DFS order.
484
pub fn to_parquet_leaves(type_: ParquetType) -> Vec<ParquetPrimitiveType> {
485
let mut leaves = vec![];
486
to_parquet_leaves_recursive(type_, &mut leaves);
487
leaves
488
}
489
490
fn to_parquet_leaves_recursive(type_: ParquetType, leaves: &mut Vec<ParquetPrimitiveType>) {
491
match type_ {
492
ParquetType::PrimitiveType(primitive) => leaves.push(primitive),
493
ParquetType::GroupType { fields, .. } => {
494
fields
495
.into_iter()
496
.for_each(|type_| to_parquet_leaves_recursive(type_, leaves));
497
},
498
}
499
}
500
501
/// Returns a vector of iterators of [`Page`], one per leaf column in the array
502
pub fn array_to_columns<A: AsRef<dyn Array> + Send + Sync>(
503
array: A,
504
type_: ParquetType,
505
column_options: &ColumnWriteOptions,
506
options: WriteOptions,
507
) -> PolarsResult<Vec<DynIter<'static, PolarsResult<Page>>>> {
508
let array = array.as_ref();
509
510
let nested = to_nested(array, &type_)?;
511
let types = to_parquet_leaves(type_);
512
513
let mut values = Vec::new();
514
to_leaves(array, &mut values);
515
516
let mut field_options = Vec::with_capacity(types.len());
517
column_options.to_leaves(&mut field_options);
518
519
assert_eq!(field_options.len(), types.len());
520
521
values
522
.iter()
523
.zip(nested)
524
.zip(types)
525
.zip(field_options)
526
.map(|(((values, nested), type_), field_options)| {
527
array_to_pages(values.as_ref(), type_, &nested, options, field_options)
528
})
529
.collect()
530
}
531
532
pub fn arrays_to_columns<A: AsRef<dyn Array> + Send + Sync>(
533
arrays: &[A],
534
type_: ParquetType,
535
options: WriteOptions,
536
column_options: &ColumnWriteOptions,
537
) -> PolarsResult<Vec<DynIter<'static, PolarsResult<Page>>>> {
538
let array = arrays[0].as_ref();
539
let nested = to_nested(array, &type_)?;
540
541
let types = to_parquet_leaves(type_);
542
543
let mut field_options = Vec::with_capacity(types.len());
544
column_options.to_leaves(&mut field_options);
545
546
// leaves; index level is nesting depth.
547
// index i: has a vec because we have multiple chunks.
548
let mut leaves = vec![];
549
550
// Ensure we transpose the leaves. So that all the leaves from the same columns are at the same level vec.
551
let mut scratch = vec![];
552
for arr in arrays {
553
to_leaves(arr.as_ref(), &mut scratch);
554
for (i, leave) in std::mem::take(&mut scratch).into_iter().enumerate() {
555
while i < leaves.len() {
556
leaves.push(vec![]);
557
}
558
leaves[i].push(leave);
559
}
560
}
561
562
leaves
563
.into_iter()
564
.zip(nested)
565
.zip(types)
566
.zip(field_options)
567
.map(move |(((values, nested), type_), column_options)| {
568
let iter = values.into_iter().map(|leave_values| {
569
array_to_pages(
570
leave_values.as_ref(),
571
type_.clone(),
572
&nested,
573
options,
574
column_options,
575
)
576
});
577
578
// Need a scratch to bubble up the error :/
579
let mut scratch = Vec::with_capacity(iter.size_hint().0);
580
for v in iter {
581
scratch.push(v?)
582
}
583
Ok(DynIter::new(scratch.into_iter().flatten()))
584
})
585
.collect::<PolarsResult<Vec<_>>>()
586
}
587
588
#[cfg(test)]
mod tests {
    use arrow::array::*;
    use arrow::datatypes::*;

    use super::super::{FieldInfo, ParquetPhysicalType};
    use super::*;
    use crate::parquet::schema::Repetition;
    use crate::parquet::schema::types::{
        GroupLogicalType, PrimitiveConvertedType, PrimitiveLogicalType,
    };

    /// Field info with the given name and repetition and no id.
    fn info(name: &str, repetition: Repetition) -> FieldInfo {
        FieldInfo {
            name: name.into(),
            repetition,
            id: None,
        }
    }

    /// Required primitive parquet field without logical or converted type.
    fn required_leaf(name: &str, physical_type: ParquetPhysicalType) -> ParquetType {
        ParquetType::PrimitiveType(ParquetPrimitiveType {
            field_info: info(name, Repetition::Required),
            logical_type: None,
            converted_type: None,
            physical_type,
        })
    }

    /// Parquet group without logical or converted type.
    fn group(name: &str, repetition: Repetition, fields: Vec<ParquetType>) -> ParquetType {
        ParquetType::GroupType {
            field_info: info(name, repetition),
            logical_type: None,
            converted_type: None,
            fields,
        }
    }

    /// Optional group `a { required boolean b; required int32 c }`.
    fn optional_bool_int_group() -> ParquetType {
        group(
            "a",
            Repetition::Optional,
            vec![
                required_leaf("b", ParquetPhysicalType::Boolean),
                required_leaf("c", ParquetPhysicalType::Int32),
            ],
        )
    }

    /// Validity shared by the struct fixture and the expected descriptors.
    fn struct_validity() -> Bitmap {
        Bitmap::from([true, true, false, true])
    }

    /// Four-row struct `{ b: bool, c: i32 }` whose third row is null.
    fn bool_int_struct() -> StructArray {
        let boolean = BooleanArray::from_slice([false, false, true, true]).boxed();
        let int = Int32Array::from_slice([42, 28, 19, 31]).boxed();

        let fields = vec![
            Field::new("b".into(), ArrowDataType::Boolean, false),
            Field::new("c".into(), ArrowDataType::Int32, false),
        ];

        StructArray::new(
            ArrowDataType::Struct(fields),
            4,
            vec![boolean, int],
            Some(struct_validity()),
        )
    }

    #[test]
    fn test_struct() {
        let array = bool_int_struct();
        let type_ = optional_bool_int_group();

        let a = to_nested(&array, &type_).unwrap();

        let leaf_path = vec![
            Nested::structure(Some(struct_validity()), true, 4),
            Nested::primitive(None, false, 4),
        ];
        assert_eq!(a, vec![leaf_path; 2]);
    }

    #[test]
    fn test_struct_struct() {
        let inner = bool_int_struct();

        let fields = vec![
            Field::new("b".into(), inner.dtype().clone(), true),
            Field::new("c".into(), inner.dtype().clone(), true),
        ];

        let array = StructArray::new(
            ArrowDataType::Struct(fields),
            4,
            vec![inner.clone().boxed(), inner.boxed()],
            None,
        );

        let inner_type = optional_bool_int_group();
        let type_ = group(
            "a",
            Repetition::Required,
            vec![inner_type.clone(), inner_type],
        );

        let a = to_nested(&array, &type_).unwrap();

        // The same path is expected for a.b.b, a.b.c, a.c.b and a.c.c.
        let leaf_path = vec![
            Nested::structure(None, false, 4),
            Nested::structure(Some(struct_validity()), true, 4),
            Nested::primitive(None, false, 4),
        ];
        assert_eq!(a, vec![leaf_path; 4]);
    }

    #[test]
    fn test_list_struct() {
        let inner = bool_int_struct();

        let array = ListArray::new(
            ArrowDataType::List(Box::new(Field::new(
                "l".into(),
                inner.dtype().clone(),
                true,
            ))),
            vec![0i32, 2, 4].try_into().unwrap(),
            inner.boxed(),
            None,
        );

        let type_ = group(
            "l",
            Repetition::Required,
            vec![group(
                "list",
                Repetition::Repeated,
                vec![optional_bool_int_group()],
            )],
        );

        let a = to_nested(&array, &type_).unwrap();

        let leaf_path = vec![
            Nested::List(ListNested::<i32> {
                is_optional: false,
                offsets: vec![0, 2, 4].try_into().unwrap(),
                validity: None,
            }),
            Nested::structure(Some(struct_validity()), true, 4),
            Nested::primitive(None, false, 4),
        ];
        assert_eq!(a, vec![leaf_path; 2]);
    }

    #[test]
    fn test_map() {
        let kv_type = ArrowDataType::Struct(vec![
            Field::new("k".into(), ArrowDataType::Utf8, false),
            Field::new("v".into(), ArrowDataType::Int32, false),
        ]);
        let kv_field = Field::new("kv".into(), kv_type.clone(), false);
        let map_type = ArrowDataType::Map(Box::new(kv_field), false);

        let key_array = Utf8Array::<i32>::from_slice(["k1", "k2", "k3", "k4", "k5", "k6"]).boxed();
        let val_array = Int32Array::from_slice([42, 28, 19, 31, 21, 17]).boxed();
        let kv_array = StructArray::try_new(kv_type, 6, vec![key_array, val_array], None)
            .unwrap()
            .boxed();
        let offsets = OffsetsBuffer::try_from(vec![0, 2, 3, 4, 6]).unwrap();

        let array = MapArray::try_new(map_type, offsets, kv_array, None).unwrap();

        let key_type = ParquetType::PrimitiveType(ParquetPrimitiveType {
            field_info: info("k", Repetition::Required),
            logical_type: Some(PrimitiveLogicalType::String),
            converted_type: Some(PrimitiveConvertedType::Utf8),
            physical_type: ParquetPhysicalType::ByteArray,
        });
        let entries_type = group(
            "kv",
            Repetition::Optional,
            vec![key_type, required_leaf("v", ParquetPhysicalType::Int32)],
        );

        let type_ = ParquetType::GroupType {
            field_info: info("m", Repetition::Required),
            logical_type: Some(GroupLogicalType::Map),
            converted_type: None,
            fields: vec![group("map", Repetition::Repeated, vec![entries_type])],
        };

        let a = to_nested(&array, &type_).unwrap();

        let leaf_path = vec![
            Nested::List(ListNested::<i32> {
                is_optional: false,
                offsets: vec![0, 2, 3, 4, 6].try_into().unwrap(),
                validity: None,
            }),
            Nested::structure(None, true, 6),
            Nested::primitive(None, false, 6),
        ];
        assert_eq!(a, vec![leaf_path; 2]);
    }
}
974
975