// Source: GitHub repository pola-rs/polars
// Path: blob/main/crates/polars-parquet/src/arrow/write/pages.rs
1
use std::fmt::Debug;
2
3
use arrow::array::{Array, FixedSizeListArray, ListArray, MapArray, StructArray};
4
use arrow::bitmap::{Bitmap, MutableBitmap};
5
use arrow::datatypes::{ArrowDataType, PhysicalType};
6
use arrow::offset::{Offset, OffsetsBuffer};
7
use polars_error::{PolarsResult, polars_bail};
8
9
use super::{Encoding, WriteOptions, array_to_pages};
10
use crate::arrow::read::schema::is_nullable;
11
use crate::parquet::page::Page;
12
use crate::parquet::schema::types::{ParquetType, PrimitiveType as ParquetPrimitiveType};
13
use crate::write::DynIter;
14
15
#[derive(Debug, Clone, PartialEq)]
16
pub struct PrimitiveNested {
17
pub is_optional: bool,
18
pub validity: Option<Bitmap>,
19
pub length: usize,
20
}
21
22
#[derive(Debug, Clone, PartialEq)]
23
pub struct ListNested<O: Offset> {
24
pub is_optional: bool,
25
pub offsets: OffsetsBuffer<O>,
26
pub validity: Option<Bitmap>,
27
}
28
29
#[derive(Debug, Clone, PartialEq)]
30
pub struct FixedSizeListNested {
31
pub validity: Option<Bitmap>,
32
pub is_optional: bool,
33
pub width: usize,
34
pub length: usize,
35
}
36
37
#[derive(Debug, Clone, PartialEq)]
38
pub struct StructNested {
39
pub is_optional: bool,
40
pub validity: Option<Bitmap>,
41
pub length: usize,
42
}
43
44
impl<O: Offset> ListNested<O> {
45
pub fn new(offsets: OffsetsBuffer<O>, validity: Option<Bitmap>, is_optional: bool) -> Self {
46
Self {
47
is_optional,
48
offsets,
49
validity,
50
}
51
}
52
}
53
54
/// Descriptor of nested information of a field
55
#[derive(Debug, Clone, PartialEq)]
56
pub enum Nested {
57
/// a primitive (leaf or parquet column)
58
Primitive(PrimitiveNested),
59
List(ListNested<i32>),
60
LargeList(ListNested<i64>),
61
FixedSizeList(FixedSizeListNested),
62
Struct(StructNested),
63
}
64
65
impl Nested {
66
/// Returns the length (number of rows) of the element
67
pub fn len(&self) -> usize {
68
match self {
69
Nested::Primitive(nested) => nested.length,
70
Nested::List(nested) => nested.offsets.len_proxy(),
71
Nested::LargeList(nested) => nested.offsets.len_proxy(),
72
Nested::FixedSizeList(nested) => nested.length,
73
Nested::Struct(nested) => nested.length,
74
}
75
}
76
77
pub fn primitive(validity: Option<Bitmap>, is_optional: bool, length: usize) -> Self {
78
Self::Primitive(PrimitiveNested {
79
validity,
80
is_optional,
81
length,
82
})
83
}
84
85
pub fn list(validity: Option<Bitmap>, is_optional: bool, offsets: OffsetsBuffer<i32>) -> Self {
86
Self::List(ListNested {
87
validity,
88
is_optional,
89
offsets,
90
})
91
}
92
93
pub fn large_list(
94
validity: Option<Bitmap>,
95
is_optional: bool,
96
offsets: OffsetsBuffer<i64>,
97
) -> Self {
98
Self::LargeList(ListNested {
99
validity,
100
is_optional,
101
offsets,
102
})
103
}
104
105
pub fn fixed_size_list(
106
validity: Option<Bitmap>,
107
is_optional: bool,
108
width: usize,
109
length: usize,
110
) -> Self {
111
Self::FixedSizeList(FixedSizeListNested {
112
validity,
113
is_optional,
114
width,
115
length,
116
})
117
}
118
119
pub fn structure(validity: Option<Bitmap>, is_optional: bool, length: usize) -> Self {
120
Self::Struct(StructNested {
121
validity,
122
is_optional,
123
length,
124
})
125
}
126
}
127
128
/// Constructs the necessary `Vec<Vec<Nested>>` to write the rep and def levels of `array` to parquet
129
pub fn to_nested(array: &dyn Array, type_: &ParquetType) -> PolarsResult<Vec<Vec<Nested>>> {
130
let mut nested = vec![];
131
132
to_nested_recursive(array, type_, &mut nested, vec![])?;
133
Ok(nested)
134
}
135
136
fn to_nested_recursive(
137
array: &dyn Array,
138
type_: &ParquetType,
139
nested: &mut Vec<Vec<Nested>>,
140
mut parents: Vec<Nested>,
141
) -> PolarsResult<()> {
142
let is_optional = is_nullable(type_.get_field_info());
143
144
if !is_optional && array.null_count() > 0 {
145
polars_bail!(InvalidOperation: "writing a missing value to required field '{}'", type_.name());
146
}
147
148
use PhysicalType::*;
149
match array.dtype().to_physical_type() {
150
Struct => {
151
let array = array.as_any().downcast_ref::<StructArray>().unwrap();
152
let fields = if let ParquetType::GroupType { fields, .. } = type_ {
153
fields
154
} else {
155
// @NOTE: Support empty struct by mapping to Boolean array.
156
if let ArrowDataType::Struct(fs) = array.dtype()
157
&& fs.is_empty()
158
{
159
parents.push(Nested::Primitive(PrimitiveNested {
160
validity: array.validity().cloned(),
161
is_optional,
162
length: array.len(),
163
}));
164
nested.push(parents);
165
return Ok(());
166
}
167
168
polars_bail!(InvalidOperation:
169
"Parquet type must be a group for a struct array",
170
)
171
};
172
173
parents.push(Nested::Struct(StructNested {
174
is_optional,
175
validity: array.validity().cloned(),
176
length: array.len(),
177
}));
178
179
for (type_, array) in fields.iter().zip(array.values()) {
180
to_nested_recursive(array.as_ref(), type_, nested, parents.clone())?;
181
}
182
},
183
FixedSizeList => {
184
let array = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
185
let type_ = if let ParquetType::GroupType { fields, .. } = type_ {
186
if let ParquetType::GroupType { fields, .. } = &fields[0] {
187
&fields[0]
188
} else {
189
polars_bail!(InvalidOperation:
190
"Parquet type must be a group for a list array",
191
)
192
}
193
} else {
194
polars_bail!(InvalidOperation:
195
"Parquet type must be a group for a list array",
196
)
197
};
198
199
parents.push(Nested::FixedSizeList(FixedSizeListNested {
200
validity: array.validity().cloned(),
201
length: array.len(),
202
width: array.size(),
203
is_optional,
204
}));
205
to_nested_recursive(array.values().as_ref(), type_, nested, parents)?;
206
},
207
List => {
208
let array = array.as_any().downcast_ref::<ListArray<i32>>().unwrap();
209
let type_ = if let ParquetType::GroupType { fields, .. } = type_ {
210
if let ParquetType::GroupType { fields, .. } = &fields[0] {
211
&fields[0]
212
} else {
213
polars_bail!(InvalidOperation:
214
"Parquet type must be a group for a list array",
215
)
216
}
217
} else {
218
polars_bail!(InvalidOperation:
219
"Parquet type must be a group for a list array",
220
)
221
};
222
223
parents.push(Nested::List(ListNested::new(
224
array.offsets().clone(),
225
array.validity().cloned(),
226
is_optional,
227
)));
228
to_nested_recursive(array.values().as_ref(), type_, nested, parents)?;
229
},
230
LargeList => {
231
let array = array.as_any().downcast_ref::<ListArray<i64>>().unwrap();
232
let type_ = if let ParquetType::GroupType { fields, .. } = type_ {
233
if let ParquetType::GroupType { fields, .. } = &fields[0] {
234
&fields[0]
235
} else {
236
polars_bail!(InvalidOperation:
237
"Parquet type must be a group for a list array",
238
)
239
}
240
} else {
241
polars_bail!(InvalidOperation:
242
"Parquet type must be a group for a list array",
243
)
244
};
245
246
parents.push(Nested::LargeList(ListNested::new(
247
array.offsets().clone(),
248
array.validity().cloned(),
249
is_optional,
250
)));
251
to_nested_recursive(array.values().as_ref(), type_, nested, parents)?;
252
},
253
Map => {
254
let array = array.as_any().downcast_ref::<MapArray>().unwrap();
255
let type_ = if let ParquetType::GroupType { fields, .. } = type_ {
256
if let ParquetType::GroupType { fields, .. } = &fields[0] {
257
&fields[0]
258
} else {
259
polars_bail!(InvalidOperation:
260
"Parquet type must be a group for a map array",
261
)
262
}
263
} else {
264
polars_bail!(InvalidOperation:
265
"Parquet type must be a group for a map array",
266
)
267
};
268
269
parents.push(Nested::List(ListNested::new(
270
array.offsets().clone(),
271
array.validity().cloned(),
272
is_optional,
273
)));
274
to_nested_recursive(array.field().as_ref(), type_, nested, parents)?;
275
},
276
_ => {
277
parents.push(Nested::Primitive(PrimitiveNested {
278
validity: array.validity().cloned(),
279
is_optional,
280
length: array.len(),
281
}));
282
nested.push(parents)
283
},
284
}
285
Ok(())
286
}
287
288
fn expand_list_validity<'a, O: Offset>(
289
array: &'a ListArray<O>,
290
validity: BitmapState,
291
array_stack: &mut Vec<(&'a dyn Array, BitmapState)>,
292
) {
293
let BitmapState::SomeSet(list_validity) = validity else {
294
array_stack.push((
295
array.values().as_ref(),
296
match validity {
297
BitmapState::AllSet => BitmapState::AllSet,
298
BitmapState::SomeSet(_) => unreachable!(),
299
BitmapState::AllUnset(_) => BitmapState::AllUnset(array.values().len()),
300
},
301
));
302
return;
303
};
304
305
let offsets = array.offsets().buffer();
306
let mut validity = MutableBitmap::with_capacity(array.values().len());
307
let mut list_validity_iter = list_validity.iter();
308
309
// @NOTE: We need to take into account here that the list might only point to a slice of the
310
// values, therefore we need to extend the validity mask with dummy values to match the length
311
// of the values array.
312
313
let mut idx = 0;
314
validity.extend_constant(offsets[0].to_usize(), false);
315
while list_validity_iter.num_remaining() > 0 {
316
let num_ones = list_validity_iter.take_leading_ones();
317
let num_elements = offsets[idx + num_ones] - offsets[idx];
318
validity.extend_constant(num_elements.to_usize(), true);
319
320
idx += num_ones;
321
322
let num_zeros = list_validity_iter.take_leading_zeros();
323
let num_elements = offsets[idx + num_zeros] - offsets[idx];
324
validity.extend_constant(num_elements.to_usize(), false);
325
326
idx += num_zeros;
327
}
328
validity.extend_constant(array.values().len() - validity.len(), false);
329
330
debug_assert_eq!(idx, array.len());
331
let validity = validity.freeze();
332
333
debug_assert_eq!(validity.len(), array.values().len());
334
array_stack.push((array.values().as_ref(), BitmapState::SomeSet(validity)));
335
}
336
337
#[derive(Clone)]
338
enum BitmapState {
339
AllSet,
340
SomeSet(Bitmap),
341
AllUnset(usize),
342
}
343
344
impl From<Option<&Bitmap>> for BitmapState {
    /// Classifies an optional validity bitmap.
    ///
    /// A missing bitmap, or one without unset bits, becomes `AllSet`; a bitmap with only unset
    /// bits becomes `AllUnset`; anything else is kept (cloned) as `SomeSet`.
    fn from(bm: Option<&Bitmap>) -> Self {
        match bm {
            None => Self::AllSet,
            Some(bitmap) => match bitmap.unset_bits() {
                0 => Self::AllSet,
                nulls if nulls == bitmap.len() => Self::AllUnset(bitmap.len()),
                _ => Self::SomeSet(bitmap.clone()),
            },
        }
    }
}
361
362
impl From<BitmapState> for Option<Bitmap> {
363
fn from(bms: BitmapState) -> Self {
364
match bms {
365
BitmapState::AllSet => None,
366
BitmapState::SomeSet(bm) => Some(bm),
367
BitmapState::AllUnset(len) => Some(Bitmap::new_zeroed(len)),
368
}
369
}
370
}
371
372
impl std::ops::BitAnd for &BitmapState {
373
type Output = BitmapState;
374
375
fn bitand(self, rhs: Self) -> Self::Output {
376
use BitmapState as B;
377
match (self, rhs) {
378
(B::AllSet, B::AllSet) => B::AllSet,
379
(B::AllSet, B::SomeSet(v)) | (B::SomeSet(v), B::AllSet) => B::SomeSet(v.clone()),
380
(B::SomeSet(lhs), B::SomeSet(rhs)) => {
381
let result = lhs & rhs;
382
let null_count = result.unset_bits();
383
384
if null_count == 0 {
385
B::AllSet
386
} else if null_count == result.len() {
387
B::AllUnset(result.len())
388
} else {
389
B::SomeSet(result)
390
}
391
},
392
(B::AllUnset(len), _) | (_, B::AllUnset(len)) => B::AllUnset(*len),
393
}
394
}
395
}
396
397
/// Convert [`Array`] to a `Vec<Box<dyn Array>>` leaves in DFS order.
398
///
399
/// Each leaf array has the validity propagated from the nesting levels above.
400
pub fn to_leaves(array: &dyn Array, leaves: &mut Vec<Box<dyn Array>>) {
401
use PhysicalType as P;
402
403
leaves.clear();
404
let mut array_stack: Vec<(&dyn Array, BitmapState)> = Vec::new();
405
406
array_stack.push((array, BitmapState::AllSet));
407
408
while let Some((array, inherited_validity)) = array_stack.pop() {
409
let child_validity = BitmapState::from(array.validity());
410
let validity = (&child_validity) & (&inherited_validity);
411
412
match array.dtype().to_physical_type() {
413
P::Struct if !matches!(array.dtype(), ArrowDataType::Struct(fs) if fs.is_empty()) => {
414
let array = array.as_any().downcast_ref::<StructArray>().unwrap();
415
416
leaves.reserve(array.len().saturating_sub(1));
417
array
418
.values()
419
.iter()
420
.rev()
421
.for_each(|field| array_stack.push((field.as_ref(), validity.clone())));
422
},
423
P::List => {
424
let array = array.as_any().downcast_ref::<ListArray<i32>>().unwrap();
425
expand_list_validity(array, validity, &mut array_stack);
426
},
427
P::LargeList => {
428
let array = array.as_any().downcast_ref::<ListArray<i64>>().unwrap();
429
expand_list_validity(array, validity, &mut array_stack);
430
},
431
P::FixedSizeList => {
432
let array = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
433
434
let BitmapState::SomeSet(fsl_validity) = validity else {
435
array_stack.push((
436
array.values().as_ref(),
437
match validity {
438
BitmapState::AllSet => BitmapState::AllSet,
439
BitmapState::SomeSet(_) => unreachable!(),
440
BitmapState::AllUnset(_) => BitmapState::AllUnset(array.values().len()),
441
},
442
));
443
continue;
444
};
445
446
let num_values = array.values().len();
447
let size = array.size();
448
449
let mut validity = MutableBitmap::with_capacity(num_values);
450
let mut fsl_validity_iter = fsl_validity.iter();
451
452
let mut idx = 0;
453
while fsl_validity_iter.num_remaining() > 0 {
454
let num_ones = fsl_validity_iter.take_leading_ones();
455
let num_elements = num_ones * size;
456
validity.extend_constant(num_elements, true);
457
458
idx += num_ones;
459
460
let num_zeros = fsl_validity_iter.take_leading_zeros();
461
let num_elements = num_zeros * size;
462
validity.extend_constant(num_elements, false);
463
464
idx += num_zeros;
465
}
466
467
debug_assert_eq!(idx, array.len());
468
469
let validity = BitmapState::SomeSet(validity.freeze());
470
471
array_stack.push((array.values().as_ref(), validity));
472
},
473
P::Map => {
474
let array = array.as_any().downcast_ref::<MapArray>().unwrap();
475
array_stack.push((array.field().as_ref(), validity));
476
},
477
P::Null
478
| P::Boolean
479
| P::Primitive(_)
480
| P::Binary
481
| P::FixedSizeBinary
482
| P::LargeBinary
483
| P::Utf8
484
| P::LargeUtf8
485
| P::Dictionary(_)
486
| P::BinaryView
487
| P::Utf8View
488
| P::Struct => {
489
leaves.push(array.with_validity(validity.into()));
490
},
491
492
other => todo!("Writing {:?} to parquet not yet implemented", other),
493
}
494
}
495
}
496
497
/// Convert `ParquetType` to `Vec<ParquetPrimitiveType>` leaves in DFS order.
498
pub fn to_parquet_leaves(type_: ParquetType) -> Vec<ParquetPrimitiveType> {
499
let mut leaves = vec![];
500
to_parquet_leaves_recursive(type_, &mut leaves);
501
leaves
502
}
503
504
fn to_parquet_leaves_recursive(type_: ParquetType, leaves: &mut Vec<ParquetPrimitiveType>) {
505
match type_ {
506
ParquetType::PrimitiveType(primitive) => leaves.push(primitive),
507
ParquetType::GroupType { fields, .. } => {
508
fields
509
.into_iter()
510
.for_each(|type_| to_parquet_leaves_recursive(type_, leaves));
511
},
512
}
513
}
514
515
/// Returns a vector of iterators of [`Page`], one per leaf column in the array
516
pub fn array_to_columns<A: AsRef<dyn Array> + Send + Sync>(
517
array: A,
518
type_: ParquetType,
519
options: WriteOptions,
520
encoding: &[Encoding],
521
) -> PolarsResult<Vec<DynIter<'static, PolarsResult<Page>>>> {
522
let array = array.as_ref();
523
524
let nested = to_nested(array, &type_)?;
525
526
let types = to_parquet_leaves(type_);
527
528
let mut values = Vec::new();
529
to_leaves(array, &mut values);
530
531
assert_eq!(encoding.len(), types.len());
532
533
let x = values
534
.iter()
535
.zip(nested)
536
.zip(types)
537
.zip(encoding.iter())
538
.map(|(((values, nested), type_), encoding)| {
539
array_to_pages(values.as_ref(), type_, &nested, options, *encoding)
540
})
541
.collect::<PolarsResult<Vec<DynIter<'static, PolarsResult<Page>>>>>()?;
542
Ok(x)
543
}
544
545
pub fn arrays_to_columns<A: AsRef<dyn Array> + Send + Sync>(
546
arrays: &[A],
547
type_: ParquetType,
548
options: WriteOptions,
549
encoding: &[Encoding],
550
) -> PolarsResult<Vec<DynIter<'static, PolarsResult<Page>>>> {
551
let array = arrays[0].as_ref();
552
let nested = to_nested(array, &type_)?;
553
554
let types = to_parquet_leaves(type_);
555
556
// leaves; index level is nesting depth.
557
// index i: has a vec because we have multiple chunks.
558
let mut leaves = vec![];
559
560
// Ensure we transpose the leaves. So that all the leaves from the same columns are at the same level vec.
561
let mut scratch = vec![];
562
for arr in arrays {
563
to_leaves(arr.as_ref(), &mut scratch);
564
for (i, leave) in std::mem::take(&mut scratch).into_iter().enumerate() {
565
while i < leaves.len() {
566
leaves.push(vec![]);
567
}
568
leaves[i].push(leave);
569
}
570
}
571
572
leaves
573
.into_iter()
574
.zip(nested)
575
.zip(types)
576
.zip(encoding.iter())
577
.map(move |(((values, nested), type_), encoding)| {
578
let iter = values.into_iter().map(|leave_values| {
579
array_to_pages(
580
leave_values.as_ref(),
581
type_.clone(),
582
&nested,
583
options,
584
*encoding,
585
)
586
});
587
588
// Need a scratch to bubble up the error :/
589
let mut scratch = Vec::with_capacity(iter.size_hint().0);
590
for v in iter {
591
scratch.push(v?)
592
}
593
Ok(DynIter::new(scratch.into_iter().flatten()))
594
})
595
.collect::<PolarsResult<Vec<_>>>()
596
}
597
598
#[cfg(test)]
mod tests {
    use arrow::array::*;
    use arrow::datatypes::*;

    use super::super::{FieldInfo, ParquetPhysicalType};
    use super::*;
    use crate::parquet::schema::Repetition;
    use crate::parquet::schema::types::{
        GroupLogicalType, PrimitiveConvertedType, PrimitiveLogicalType,
    };

    /// Field info without an id.
    fn field_info(name: &str, repetition: Repetition) -> FieldInfo {
        FieldInfo {
            name: name.into(),
            repetition,
            id: None,
        }
    }

    /// A required primitive parquet column.
    fn required_leaf(
        name: &str,
        physical_type: ParquetPhysicalType,
        logical_type: Option<PrimitiveLogicalType>,
        converted_type: Option<PrimitiveConvertedType>,
    ) -> ParquetType {
        ParquetType::PrimitiveType(ParquetPrimitiveType {
            field_info: field_info(name, Repetition::Required),
            logical_type,
            converted_type,
            physical_type,
        })
    }

    /// A parquet group without a converted type.
    fn group(
        name: &str,
        repetition: Repetition,
        logical_type: Option<GroupLogicalType>,
        fields: Vec<ParquetType>,
    ) -> ParquetType {
        ParquetType::GroupType {
            field_info: field_info(name, repetition),
            logical_type,
            converted_type: None,
            fields,
        }
    }

    /// Validity shared by the struct fixtures: the third row is null.
    fn struct_validity() -> Option<Bitmap> {
        Some(Bitmap::from([true, true, false, true]))
    }

    /// A 4-row `struct { b: bool, c: i32 }` array whose third row is null.
    fn bool_int_struct() -> StructArray {
        let boolean = BooleanArray::from_slice([false, false, true, true]).boxed();
        let int = Int32Array::from_slice([42, 28, 19, 31]).boxed();

        let fields = vec![
            Field::new("b".into(), ArrowDataType::Boolean, false),
            Field::new("c".into(), ArrowDataType::Int32, false),
        ];

        StructArray::new(
            ArrowDataType::Struct(fields),
            4,
            vec![boolean, int],
            struct_validity(),
        )
    }

    /// The optional parquet group `a { required b: boolean, required c: int32 }`.
    fn bool_int_group() -> ParquetType {
        group(
            "a",
            Repetition::Optional,
            None,
            vec![
                required_leaf("b", ParquetPhysicalType::Boolean, None, None),
                required_leaf("c", ParquetPhysicalType::Int32, None, None),
            ],
        )
    }

    #[test]
    fn test_struct() {
        let array = bool_int_struct();
        let type_ = bool_int_group();

        let a = to_nested(&array, &type_).unwrap();

        let per_leaf = vec![
            Nested::structure(struct_validity(), true, 4),
            Nested::primitive(None, false, 4),
        ];
        assert_eq!(a, vec![per_leaf.clone(), per_leaf]);
    }

    #[test]
    fn test_struct_struct() {
        let inner = bool_int_struct();

        let fields = vec![
            Field::new("b".into(), inner.dtype().clone(), true),
            Field::new("c".into(), inner.dtype().clone(), true),
        ];

        let array = StructArray::new(
            ArrowDataType::Struct(fields),
            4,
            vec![Box::new(inner.clone()), Box::new(inner)],
            None,
        );

        let inner_type = bool_int_group();
        let type_ = group(
            "a",
            Repetition::Required,
            None,
            vec![inner_type.clone(), inner_type],
        );

        let a = to_nested(&array, &type_).unwrap();

        // Identical for each of the leaves a.b.b, a.b.c, a.c.b and a.c.c.
        let per_leaf = vec![
            Nested::structure(None, false, 4),
            Nested::structure(struct_validity(), true, 4),
            Nested::primitive(None, false, 4),
        ];
        assert_eq!(a, vec![per_leaf; 4]);
    }

    #[test]
    fn test_list_struct() {
        let array = bool_int_struct();

        let array = ListArray::new(
            ArrowDataType::List(Box::new(Field::new(
                "l".into(),
                array.dtype().clone(),
                true,
            ))),
            vec![0i32, 2, 4].try_into().unwrap(),
            Box::new(array),
            None,
        );

        let type_ = group(
            "l",
            Repetition::Required,
            None,
            vec![group(
                "list",
                Repetition::Repeated,
                None,
                vec![bool_int_group()],
            )],
        );

        let a = to_nested(&array, &type_).unwrap();

        let per_leaf = vec![
            Nested::List(ListNested::<i32> {
                is_optional: false,
                offsets: vec![0, 2, 4].try_into().unwrap(),
                validity: None,
            }),
            Nested::structure(struct_validity(), true, 4),
            Nested::primitive(None, false, 4),
        ];
        assert_eq!(a, vec![per_leaf.clone(), per_leaf]);
    }

    #[test]
    fn test_map() {
        let kv_type = ArrowDataType::Struct(vec![
            Field::new("k".into(), ArrowDataType::Utf8, false),
            Field::new("v".into(), ArrowDataType::Int32, false),
        ]);
        let kv_field = Field::new("kv".into(), kv_type.clone(), false);
        let map_type = ArrowDataType::Map(Box::new(kv_field), false);

        let key_array = Utf8Array::<i32>::from_slice(["k1", "k2", "k3", "k4", "k5", "k6"]).boxed();
        let val_array = Int32Array::from_slice([42, 28, 19, 31, 21, 17]).boxed();
        let kv_array = StructArray::try_new(kv_type, 6, vec![key_array, val_array], None)
            .unwrap()
            .boxed();
        let offsets = OffsetsBuffer::try_from(vec![0, 2, 3, 4, 6]).unwrap();

        let array = MapArray::try_new(map_type, offsets, kv_array, None).unwrap();

        let kv_group = group(
            "kv",
            Repetition::Optional,
            None,
            vec![
                required_leaf(
                    "k",
                    ParquetPhysicalType::ByteArray,
                    Some(PrimitiveLogicalType::String),
                    Some(PrimitiveConvertedType::Utf8),
                ),
                required_leaf("v", ParquetPhysicalType::Int32, None, None),
            ],
        );

        let type_ = group(
            "m",
            Repetition::Required,
            Some(GroupLogicalType::Map),
            vec![group("map", Repetition::Repeated, None, vec![kv_group])],
        );

        let a = to_nested(&array, &type_).unwrap();

        let per_leaf = vec![
            Nested::List(ListNested::<i32> {
                is_optional: false,
                offsets: vec![0, 2, 3, 4, 6].try_into().unwrap(),
                validity: None,
            }),
            Nested::structure(None, true, 6),
            Nested::primitive(None, false, 6),
        ];
        assert_eq!(a, vec![per_leaf.clone(), per_leaf]);
    }
}
984
985