Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-io/src/csv/read/builder.rs
8420 views
1
use arrow::array::MutableBinaryViewArray;
2
#[cfg(feature = "dtype-decimal")]
3
use polars_compute::decimal::str_to_dec128;
4
#[cfg(feature = "dtype-categorical")]
5
use polars_core::chunked_array::builder::CategoricalChunkedBuilder;
6
use polars_core::prelude::*;
7
use polars_error::to_compute_err;
8
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
9
use polars_time::chunkedarray::string::Pattern;
10
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
11
use polars_time::prelude::string::infer::{
12
DatetimeInfer, StrpTimeParser, TryFromWithUnit, infer_pattern_single,
13
};
14
#[cfg(feature = "dtype-f16")]
15
use polars_utils::float16::pf16;
16
use polars_utils::vec::PushUnchecked;
17
18
use super::options::CsvEncoding;
19
use super::parser::{could_be_whitespace_fast, skip_whitespace};
20
use super::utils::escape_field;
21
22
pub(crate) trait PrimitiveParser: PolarsNumericType {
23
fn parse(bytes: &[u8]) -> Option<Self::Native>;
24
}
25
26
#[cfg(feature = "dtype-f16")]
impl PrimitiveParser for Float16Type {
    /// Parses the field as an `f32` with `fast_float2`, then converts it to `pf16`.
    ///
    /// Returns `None` if either the parse or the conversion fails.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<pf16> {
        use num_traits::FromPrimitive;

        let wide: f32 = fast_float2::parse(bytes).ok()?;
        pf16::from_f32(wide)
    }
}
35
36
impl PrimitiveParser for Float32Type {
37
#[inline]
38
fn parse(bytes: &[u8]) -> Option<f32> {
39
fast_float2::parse(bytes).ok()
40
}
41
}
42
impl PrimitiveParser for Float64Type {
43
#[inline]
44
fn parse(bytes: &[u8]) -> Option<f64> {
45
fast_float2::parse(bytes).ok()
46
}
47
}
48
49
#[cfg(feature = "dtype-u8")]
impl PrimitiveParser for UInt8Type {
    /// Parses the field as a `u8` via `atoi_simd::parse_skipped`; a parse error maps to `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<u8> {
        atoi_simd::parse_skipped(bytes).ok()
    }
}
56
#[cfg(feature = "dtype-u16")]
impl PrimitiveParser for UInt16Type {
    /// Parses the field as a `u16` via `atoi_simd::parse_skipped`; a parse error maps to `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<u16> {
        atoi_simd::parse_skipped(bytes).ok()
    }
}
63
impl PrimitiveParser for UInt32Type {
64
#[inline]
65
fn parse(bytes: &[u8]) -> Option<u32> {
66
atoi_simd::parse_skipped(bytes).ok()
67
}
68
}
69
impl PrimitiveParser for UInt64Type {
70
#[inline]
71
fn parse(bytes: &[u8]) -> Option<u64> {
72
atoi_simd::parse_skipped(bytes).ok()
73
}
74
}
75
#[cfg(feature = "dtype-u128")]
impl PrimitiveParser for UInt128Type {
    /// Parses the field as a `u128` via `atoi_simd::parse_skipped`; a parse error maps to `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<u128> {
        atoi_simd::parse_skipped(bytes).ok()
    }
}
82
#[cfg(feature = "dtype-i8")]
impl PrimitiveParser for Int8Type {
    /// Parses the field as an `i8` via `atoi_simd::parse_skipped`; a parse error maps to `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<i8> {
        atoi_simd::parse_skipped(bytes).ok()
    }
}
89
#[cfg(feature = "dtype-i16")]
impl PrimitiveParser for Int16Type {
    /// Parses the field as an `i16` via `atoi_simd::parse_skipped`; a parse error maps to `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<i16> {
        atoi_simd::parse_skipped(bytes).ok()
    }
}
96
impl PrimitiveParser for Int32Type {
97
#[inline]
98
fn parse(bytes: &[u8]) -> Option<i32> {
99
atoi_simd::parse_skipped(bytes).ok()
100
}
101
}
102
impl PrimitiveParser for Int64Type {
103
#[inline]
104
fn parse(bytes: &[u8]) -> Option<i64> {
105
atoi_simd::parse_skipped(bytes).ok()
106
}
107
}
108
#[cfg(feature = "dtype-i128")]
impl PrimitiveParser for Int128Type {
    /// Parses the field as an `i128` via `atoi_simd::parse_skipped`; a parse error maps to `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<i128> {
        atoi_simd::parse_skipped(bytes).ok()
    }
}
115
116
trait ParsedBuilder {
117
fn parse_bytes(
118
&mut self,
119
bytes: &[u8],
120
ignore_errors: bool,
121
_needs_escaping: bool,
122
_missing_is_null: bool,
123
_time_unit: Option<TimeUnit>,
124
) -> PolarsResult<()>;
125
}
126
127
impl<T> ParsedBuilder for PrimitiveChunkedBuilder<T>
128
where
129
T: PolarsNumericType + PrimitiveParser,
130
{
131
#[inline]
132
fn parse_bytes(
133
&mut self,
134
mut bytes: &[u8],
135
ignore_errors: bool,
136
needs_escaping: bool,
137
_missing_is_null: bool,
138
_time_unit: Option<TimeUnit>,
139
) -> PolarsResult<()> {
140
if !bytes.is_empty() && needs_escaping {
141
bytes = &bytes[1..bytes.len() - 1];
142
}
143
144
if !bytes.is_empty() && could_be_whitespace_fast(bytes[0]) {
145
bytes = skip_whitespace(bytes);
146
}
147
148
if bytes.is_empty() {
149
self.append_null();
150
return Ok(());
151
}
152
153
match T::parse(bytes) {
154
Some(value) => self.append_value(value),
155
None => {
156
if ignore_errors {
157
self.append_null()
158
} else {
159
polars_bail!(ComputeError: "invalid primitive value found during CSV parsing")
160
}
161
},
162
}
163
Ok(())
164
}
165
}
166
167
pub struct Utf8Field {
168
name: PlSmallStr,
169
mutable: MutableBinaryViewArray<[u8]>,
170
scratch: Vec<u8>,
171
quote_char: u8,
172
encoding: CsvEncoding,
173
}
174
175
impl Utf8Field {
176
fn new(
177
name: PlSmallStr,
178
capacity: usize,
179
quote_char: Option<u8>,
180
encoding: CsvEncoding,
181
) -> Self {
182
Self {
183
name,
184
mutable: MutableBinaryViewArray::with_capacity(capacity),
185
scratch: vec![],
186
quote_char: quote_char.unwrap_or(b'"'),
187
encoding,
188
}
189
}
190
}
191
192
#[inline]
193
pub fn validate_utf8(bytes: &[u8]) -> bool {
194
simdutf8::basic::from_utf8(bytes).is_ok()
195
}
196
197
impl ParsedBuilder for Utf8Field {
198
#[inline]
199
fn parse_bytes(
200
&mut self,
201
bytes: &[u8],
202
ignore_errors: bool,
203
needs_escaping: bool,
204
missing_is_null: bool,
205
_time_unit: Option<TimeUnit>,
206
) -> PolarsResult<()> {
207
if bytes.is_empty() {
208
if missing_is_null {
209
self.mutable.push_null()
210
} else {
211
self.mutable.push(Some([]))
212
}
213
return Ok(());
214
}
215
216
// note that one branch writes without updating the length, so we must do that later.
217
let escaped_bytes = if needs_escaping {
218
self.scratch.clear();
219
self.scratch.reserve(bytes.len());
220
polars_ensure!(bytes.len() > 1 && bytes.last() == Some(&self.quote_char), ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.", std::str::from_utf8(bytes).map_err(to_compute_err)?);
221
222
// SAFETY:
223
// we just allocated enough capacity and data_len is correct.
224
unsafe {
225
let n_written =
226
escape_field(bytes, self.quote_char, self.scratch.spare_capacity_mut());
227
self.scratch.set_len(n_written);
228
}
229
230
self.scratch.as_slice()
231
} else {
232
bytes
233
};
234
235
if matches!(self.encoding, CsvEncoding::LossyUtf8) | ignore_errors {
236
// It is important that this happens after escaping, as invalid escaped string can produce
237
// invalid utf8.
238
let parse_result = validate_utf8(escaped_bytes);
239
240
match parse_result {
241
true => {
242
let value = escaped_bytes;
243
self.mutable.push_value(value)
244
},
245
false => {
246
if matches!(self.encoding, CsvEncoding::LossyUtf8) {
247
// TODO! do this without allocating
248
let s = String::from_utf8_lossy(escaped_bytes);
249
self.mutable.push_value(s.as_ref().as_bytes())
250
} else if ignore_errors {
251
self.mutable.push_null()
252
} else {
253
// If field before escaping is valid utf8, the escaping is incorrect.
254
if needs_escaping && validate_utf8(bytes) {
255
polars_bail!(ComputeError: "string field is not properly escaped");
256
} else {
257
polars_bail!(ComputeError: "invalid utf-8 sequence");
258
}
259
}
260
},
261
}
262
} else {
263
self.mutable.push_value(escaped_bytes)
264
}
265
266
Ok(())
267
}
268
}
269
270
/// Builder state for a categorical/enum column read from CSV.
#[cfg(feature = "dtype-categorical")]
pub struct CategoricalField<T: PolarsCategoricalType> {
    /// Reusable buffer that receives the unescaped form of quoted fields.
    escape_scratch: Vec<u8>,
    /// Quote byte used when unescaping quoted fields.
    quote_char: u8,
    /// Underlying categorical builder the parsed strings are appended to.
    builder: CategoricalChunkedBuilder<T>,
}
276
277
#[cfg(feature = "dtype-categorical")]
impl<T: PolarsCategoricalType> CategoricalField<T> {
    /// Creates a categorical builder for `dtype` with `capacity` reserved.
    ///
    /// A missing `quote_char` falls back to `"`.
    fn new(name: PlSmallStr, capacity: usize, quote_char: Option<u8>, dtype: DataType) -> Self {
        let mut builder = CategoricalChunkedBuilder::new(name, dtype);
        builder.reserve(capacity);

        Self {
            escape_scratch: Vec::new(),
            quote_char: quote_char.unwrap_or(b'"'),
            builder,
        }
    }

    /// Appends one categorical field.
    ///
    /// Empty fields become null. Invalid UTF-8 becomes null when
    /// `ignore_errors` is set and a `ComputeError` otherwise. Quoted fields
    /// (`needs_escaping`) are unescaped into the scratch buffer before being
    /// appended.
    #[inline]
    fn parse_bytes(
        &mut self,
        bytes: &[u8],
        ignore_errors: bool,
        needs_escaping: bool,
        _missing_is_null: bool,
        _time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()> {
        if bytes.is_empty() {
            self.builder.append_null();
            return Ok(());
        }

        if !validate_utf8(bytes) {
            if ignore_errors {
                self.builder.append_null();
                return Ok(());
            }
            polars_bail!(ComputeError: "invalid utf-8 sequence");
        }

        let key = if needs_escaping {
            polars_ensure!(bytes.len() > 1, ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.", std::str::from_utf8(bytes).map_err(to_compute_err)?);
            self.escape_scratch.clear();
            self.escape_scratch.reserve(bytes.len());
            // SAFETY:
            // the scratch buffer was just cleared and reserved for
            // `bytes.len()` bytes; the length is set to the byte count
            // reported by `escape_field`.
            unsafe {
                let written = escape_field(
                    bytes,
                    self.quote_char,
                    self.escape_scratch.spare_capacity_mut(),
                );
                self.escape_scratch.set_len(written);
            }

            // SAFETY:
            // the input passed the UTF-8 check above.
            unsafe { std::str::from_utf8_unchecked(&self.escape_scratch) }
        } else {
            // SAFETY:
            // the input passed the UTF-8 check above.
            unsafe { std::str::from_utf8_unchecked(bytes) }
        };

        self.builder.append_str(key)?;
        Ok(())
    }
}
337
338
impl ParsedBuilder for BooleanChunkedBuilder {
339
#[inline]
340
fn parse_bytes(
341
&mut self,
342
bytes: &[u8],
343
ignore_errors: bool,
344
needs_escaping: bool,
345
_missing_is_null: bool,
346
_time_unit: Option<TimeUnit>,
347
) -> PolarsResult<()> {
348
let bytes = if needs_escaping {
349
&bytes[1..bytes.len() - 1]
350
} else {
351
bytes
352
};
353
if bytes.eq_ignore_ascii_case(b"false") {
354
self.append_value(false);
355
} else if bytes.eq_ignore_ascii_case(b"true") {
356
self.append_value(true);
357
} else if ignore_errors || bytes.is_empty() {
358
self.append_null();
359
} else {
360
polars_bail!(
361
ComputeError: "error while parsing value {} as boolean",
362
String::from_utf8_lossy(bytes),
363
);
364
}
365
Ok(())
366
}
367
}
368
369
/// Builder state for a decimal column read from CSV.
#[cfg(feature = "dtype-decimal")]
pub struct DecimalField {
    /// Stores the parsed decimals as `i128` values.
    builder: PrimitiveChunkedBuilder<Int128Type>,
    /// Precision passed to the decimal parser and used for the final dtype.
    precision: usize,
    /// Scale passed to the decimal parser and used for the final dtype.
    scale: usize,
    /// Forwarded to the decimal parser; selects `,` as the decimal separator.
    decimal_comma: bool,
}
376
377
#[cfg(feature = "dtype-decimal")]
impl DecimalField {
    /// Creates an empty decimal builder with room for `capacity` values.
    fn new(
        name: PlSmallStr,
        capacity: usize,
        precision: usize,
        scale: usize,
        decimal_comma: bool,
    ) -> Self {
        Self {
            builder: PrimitiveChunkedBuilder::<Int128Type>::new(name, capacity),
            precision,
            scale,
            decimal_comma,
        }
    }
}
395
396
#[cfg(feature = "dtype-decimal")]
impl ParsedBuilder for DecimalField {
    /// Parses one decimal CSV field and appends it.
    ///
    /// Empty fields (after quote stripping and the whitespace skip) become
    /// null. A field that fails to parse becomes null when `ignore_errors`
    /// is set and a `ComputeError` otherwise.
    #[inline]
    fn parse_bytes(
        &mut self,
        mut bytes: &[u8],
        ignore_errors: bool,
        needs_escaping: bool,
        _missing_is_null: bool,
        _time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()> {
        // Drop the surrounding quote bytes. Both must be present: slicing a
        // 1-byte field with `[1..len - 1]` would panic, so such a malformed
        // field is left untouched and rejected by the parser below instead.
        if needs_escaping && bytes.len() >= 2 {
            bytes = &bytes[1..bytes.len() - 1];
        }

        // Cheap first-byte check before calling the whitespace skipper.
        if !bytes.is_empty() && could_be_whitespace_fast(bytes[0]) {
            bytes = skip_whitespace(bytes);
        }

        if bytes.is_empty() {
            self.builder.append_null();
            return Ok(());
        }

        match str_to_dec128(bytes, self.precision, self.scale, self.decimal_comma) {
            Some(value) => self.builder.append_value(value),
            None if ignore_errors => self.builder.append_null(),
            None => {
                polars_bail!(ComputeError: "invalid decimal value found during CSV parsing")
            },
        }

        Ok(())
    }
}
434
435
/// Builder state for a date/datetime column read from CSV.
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
pub struct DatetimeField<T: PolarsNumericType> {
    /// Parser for the most recently successful pattern; `None` until a field
    /// has been parsed successfully.
    compiled: Option<DatetimeInfer<T>>,
    /// Stores the parsed values in their physical numeric representation.
    builder: PrimitiveChunkedBuilder<T>,
}
440
441
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
impl<T: PolarsNumericType> DatetimeField<T> {
    /// Creates an empty builder with room for `capacity` values and no
    /// compiled pattern yet.
    fn new(name: PlSmallStr, capacity: usize) -> Self {
        Self {
            compiled: None,
            builder: PrimitiveChunkedBuilder::<T>::new(name, capacity),
        }
    }
}
451
452
/// Slow-path date/datetime parser.
///
/// Decodes `bytes` as UTF-8, picks a pattern (the one of the already compiled
/// parser, otherwise one inferred from this value), builds a fresh
/// `DatetimeInfer` for it and parses the value. On success the value is
/// appended and the new parser is stored in `buf.compiled`.
///
/// Every failure (invalid UTF-8, no pattern found, parser construction error,
/// parse failure) appends a null when `ignore_errors` is set and returns an
/// error otherwise.
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
fn slow_datetime_parser<T>(
    buf: &mut DatetimeField<T>,
    bytes: &[u8],
    time_unit: Option<TimeUnit>,
    ignore_errors: bool,
) -> PolarsResult<()>
where
    T: PolarsNumericType,
    DatetimeInfer<T>: TryFromWithUnit<Pattern>,
{
    let text: &str = if bytes.is_ascii() {
        // SAFETY:
        // ASCII-only input is always valid UTF-8.
        unsafe { std::str::from_utf8_unchecked(bytes) }
    } else if let Ok(decoded) = std::str::from_utf8(bytes) {
        decoded
    } else if ignore_errors {
        buf.builder.append_null();
        return Ok(());
    } else {
        polars_bail!(ComputeError: "invalid utf-8 sequence");
    };

    // Prefer the pattern of the compiled parser; only infer when there is none.
    let known_pattern = buf
        .compiled
        .as_ref()
        .map(|compiled| compiled.pattern)
        .or_else(|| infer_pattern_single(text));
    let Some(pattern) = known_pattern else {
        if ignore_errors {
            buf.builder.append_null();
            return Ok(());
        }
        polars_bail!(ComputeError: "could not find a 'date/datetime' pattern for '{}'", text)
    };

    let mut infer: DatetimeInfer<T> = match DatetimeInfer::try_from_with_unit(pattern, time_unit)
    {
        Ok(infer) => infer,
        Err(_) if ignore_errors => {
            buf.builder.append_null();
            return Ok(());
        },
        Err(err) => return Err(err),
    };

    let parsed = infer.parse(text);
    match parsed {
        Some(value) => {
            // Keep the parser that worked so later fields can try it first.
            buf.compiled = Some(infer);
            buf.builder.append_value(value);
            Ok(())
        },
        None if ignore_errors => {
            buf.builder.append_null();
            Ok(())
        },
        None => {
            polars_bail!(ComputeError: "could not parse '{}' with pattern '{:?}'", text, pattern)
        },
    }
}
521
522
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
impl<T> ParsedBuilder for DatetimeField<T>
where
    T: PolarsNumericType,
    DatetimeInfer<T>: TryFromWithUnit<Pattern> + StrpTimeParser<T::Native>,
{
    /// Parses one date/datetime CSV field and appends it.
    ///
    /// Empty fields become null. When a compiled parser exists it is tried
    /// first on the raw bytes; if there is none, or it fails, the field is
    /// handed to `slow_datetime_parser`.
    #[inline]
    fn parse_bytes(
        &mut self,
        mut bytes: &[u8],
        ignore_errors: bool,
        needs_escaping: bool,
        _missing_is_null: bool,
        time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()> {
        // Drop the surrounding quote bytes when both are present.
        if needs_escaping && bytes.len() >= 2 {
            bytes = &bytes[1..bytes.len() - 1]
        }

        if bytes.is_empty() {
            // for types other than string `_missing_is_null` is irrelevant; we always append null
            self.builder.append_null();
            return Ok(());
        }

        // Fast path: reuse the compiled parser, if any.
        let fast_path = self
            .compiled
            .as_mut()
            .and_then(|compiled| compiled.parse_bytes(bytes, time_unit));

        match fast_path {
            Some(parsed) => {
                self.builder.append_value(parsed);
                Ok(())
            },
            // No compiled parser yet, or it rejected this value: fall back on
            // the slower parser, which needs a UTF-8 check first.
            None => slow_datetime_parser(self, bytes, time_unit, ignore_errors),
        }
    }
}
564
565
pub fn init_builders(
566
projection: &[usize],
567
capacity: usize,
568
schema: &Schema,
569
quote_char: Option<u8>,
570
encoding: CsvEncoding,
571
decimal_comma: bool,
572
) -> PolarsResult<Vec<Builder>> {
573
projection
574
.iter()
575
.map(|&i| {
576
let (name, dtype) = schema.get_at_index(i).unwrap();
577
let name = name.clone();
578
let builder = match dtype {
579
&DataType::Boolean => Builder::Boolean(BooleanChunkedBuilder::new(name, capacity)),
580
#[cfg(feature = "dtype-i8")]
581
&DataType::Int8 => Builder::Int8(PrimitiveChunkedBuilder::new(name, capacity)),
582
#[cfg(feature = "dtype-i16")]
583
&DataType::Int16 => Builder::Int16(PrimitiveChunkedBuilder::new(name, capacity)),
584
&DataType::Int32 => Builder::Int32(PrimitiveChunkedBuilder::new(name, capacity)),
585
&DataType::Int64 => Builder::Int64(PrimitiveChunkedBuilder::new(name, capacity)),
586
#[cfg(feature = "dtype-i128")]
587
&DataType::Int128 => Builder::Int128(PrimitiveChunkedBuilder::new(name, capacity)),
588
#[cfg(feature = "dtype-u8")]
589
&DataType::UInt8 => Builder::UInt8(PrimitiveChunkedBuilder::new(name, capacity)),
590
#[cfg(feature = "dtype-u16")]
591
&DataType::UInt16 => Builder::UInt16(PrimitiveChunkedBuilder::new(name, capacity)),
592
&DataType::UInt32 => Builder::UInt32(PrimitiveChunkedBuilder::new(name, capacity)),
593
&DataType::UInt64 => Builder::UInt64(PrimitiveChunkedBuilder::new(name, capacity)),
594
#[cfg(feature = "dtype-u128")]
595
&DataType::UInt128 => {
596
Builder::UInt128(PrimitiveChunkedBuilder::new(name, capacity))
597
},
598
#[cfg(feature = "dtype-f16")]
599
&DataType::Float16 => {
600
if decimal_comma {
601
Builder::DecimalFloat16(
602
PrimitiveChunkedBuilder::new(name, capacity),
603
Default::default(),
604
)
605
} else {
606
Builder::Float16(PrimitiveChunkedBuilder::new(name, capacity))
607
}
608
},
609
&DataType::Float32 => {
610
if decimal_comma {
611
Builder::DecimalFloat32(
612
PrimitiveChunkedBuilder::new(name, capacity),
613
Default::default(),
614
)
615
} else {
616
Builder::Float32(PrimitiveChunkedBuilder::new(name, capacity))
617
}
618
},
619
&DataType::Float64 => {
620
if decimal_comma {
621
Builder::DecimalFloat64(
622
PrimitiveChunkedBuilder::new(name, capacity),
623
Default::default(),
624
)
625
} else {
626
Builder::Float64(PrimitiveChunkedBuilder::new(name, capacity))
627
}
628
},
629
#[cfg(feature = "dtype-decimal")]
630
&DataType::Decimal(precision, scale) => Builder::Decimal(DecimalField::new(
631
name,
632
capacity,
633
precision,
634
scale,
635
decimal_comma,
636
)),
637
&DataType::String => {
638
Builder::Utf8(Utf8Field::new(name, capacity, quote_char, encoding))
639
},
640
#[cfg(feature = "dtype-datetime")]
641
DataType::Datetime(time_unit, time_zone) => Builder::Datetime {
642
buf: DatetimeField::new(name, capacity),
643
time_unit: *time_unit,
644
time_zone: time_zone.clone(),
645
},
646
#[cfg(feature = "dtype-date")]
647
&DataType::Date => Builder::Date(DatetimeField::new(name, capacity)),
648
#[cfg(feature = "dtype-categorical")]
649
DataType::Categorical(_, _) | DataType::Enum(_, _) => {
650
match dtype.cat_physical().unwrap() {
651
CategoricalPhysical::U8 => {
652
Builder::Categorical8(CategoricalField::<Categorical8Type>::new(
653
name,
654
capacity,
655
quote_char,
656
dtype.clone(),
657
))
658
},
659
CategoricalPhysical::U16 => {
660
Builder::Categorical16(CategoricalField::<Categorical16Type>::new(
661
name,
662
capacity,
663
quote_char,
664
dtype.clone(),
665
))
666
},
667
CategoricalPhysical::U32 => {
668
Builder::Categorical32(CategoricalField::<Categorical32Type>::new(
669
name,
670
capacity,
671
quote_char,
672
dtype.clone(),
673
))
674
},
675
}
676
},
677
dt => polars_bail!(
678
ComputeError: "unsupported data type when reading CSV: {} when reading CSV", dt,
679
),
680
};
681
Ok(builder)
682
})
683
.collect()
684
}
685
686
#[allow(clippy::large_enum_variant)]
687
pub enum Builder {
688
Boolean(BooleanChunkedBuilder),
689
#[cfg(feature = "dtype-i8")]
690
Int8(PrimitiveChunkedBuilder<Int8Type>),
691
#[cfg(feature = "dtype-i16")]
692
Int16(PrimitiveChunkedBuilder<Int16Type>),
693
Int32(PrimitiveChunkedBuilder<Int32Type>),
694
Int64(PrimitiveChunkedBuilder<Int64Type>),
695
#[cfg(feature = "dtype-i128")]
696
Int128(PrimitiveChunkedBuilder<Int128Type>),
697
#[cfg(feature = "dtype-u8")]
698
UInt8(PrimitiveChunkedBuilder<UInt8Type>),
699
#[cfg(feature = "dtype-u16")]
700
UInt16(PrimitiveChunkedBuilder<UInt16Type>),
701
UInt32(PrimitiveChunkedBuilder<UInt32Type>),
702
UInt64(PrimitiveChunkedBuilder<UInt64Type>),
703
#[cfg(feature = "dtype-u128")]
704
UInt128(PrimitiveChunkedBuilder<UInt128Type>),
705
#[cfg(feature = "dtype-f16")]
706
Float16(PrimitiveChunkedBuilder<Float16Type>),
707
Float32(PrimitiveChunkedBuilder<Float32Type>),
708
Float64(PrimitiveChunkedBuilder<Float64Type>),
709
#[cfg(feature = "dtype-decimal")]
710
Decimal(DecimalField),
711
/// Stores the Utf8 fields and the total string length seen for that column
712
Utf8(Utf8Field),
713
#[cfg(feature = "dtype-datetime")]
714
Datetime {
715
buf: DatetimeField<Int64Type>,
716
time_unit: TimeUnit,
717
time_zone: Option<TimeZone>,
718
},
719
#[cfg(feature = "dtype-date")]
720
Date(DatetimeField<Int32Type>),
721
#[cfg(feature = "dtype-categorical")]
722
Categorical8(CategoricalField<Categorical8Type>),
723
#[cfg(feature = "dtype-categorical")]
724
Categorical16(CategoricalField<Categorical16Type>),
725
#[cfg(feature = "dtype-categorical")]
726
Categorical32(CategoricalField<Categorical32Type>),
727
#[cfg(feature = "dtype-f16")]
728
DecimalFloat16(PrimitiveChunkedBuilder<Float16Type>, Vec<u8>),
729
DecimalFloat32(PrimitiveChunkedBuilder<Float32Type>, Vec<u8>),
730
DecimalFloat64(PrimitiveChunkedBuilder<Float64Type>, Vec<u8>),
731
}
732
733
impl Builder {
734
pub fn into_series(self) -> PolarsResult<Series> {
735
let s = match self {
736
Builder::Boolean(v) => v.finish().into_series(),
737
#[cfg(feature = "dtype-i8")]
738
Builder::Int8(v) => v.finish().into_series(),
739
#[cfg(feature = "dtype-i16")]
740
Builder::Int16(v) => v.finish().into_series(),
741
Builder::Int32(v) => v.finish().into_series(),
742
Builder::Int64(v) => v.finish().into_series(),
743
#[cfg(feature = "dtype-i128")]
744
Builder::Int128(v) => v.finish().into_series(),
745
#[cfg(feature = "dtype-u8")]
746
Builder::UInt8(v) => v.finish().into_series(),
747
#[cfg(feature = "dtype-u16")]
748
Builder::UInt16(v) => v.finish().into_series(),
749
Builder::UInt32(v) => v.finish().into_series(),
750
Builder::UInt64(v) => v.finish().into_series(),
751
#[cfg(feature = "dtype-u128")]
752
Builder::UInt128(v) => v.finish().into_series(),
753
#[cfg(feature = "dtype-f16")]
754
Builder::Float16(v) => v.finish().into_series(),
755
Builder::Float32(v) => v.finish().into_series(),
756
Builder::Float64(v) => v.finish().into_series(),
757
#[cfg(feature = "dtype-f16")]
758
Builder::DecimalFloat16(v, _) => v.finish().into_series(),
759
Builder::DecimalFloat32(v, _) => v.finish().into_series(),
760
Builder::DecimalFloat64(v, _) => v.finish().into_series(),
761
#[cfg(feature = "dtype-decimal")]
762
Builder::Decimal(DecimalField {
763
builder,
764
precision,
765
scale,
766
..
767
}) => unsafe {
768
builder
769
.finish()
770
.into_series()
771
.from_physical_unchecked(&DataType::Decimal(precision, scale))
772
.unwrap()
773
},
774
#[cfg(feature = "dtype-datetime")]
775
Builder::Datetime {
776
buf,
777
time_unit,
778
time_zone,
779
} => buf
780
.builder
781
.finish()
782
.into_series()
783
.cast(&DataType::Datetime(time_unit, time_zone))
784
.unwrap(),
785
#[cfg(feature = "dtype-date")]
786
Builder::Date(v) => v
787
.builder
788
.finish()
789
.into_series()
790
.cast(&DataType::Date)
791
.unwrap(),
792
793
Builder::Utf8(v) => {
794
let arr = v.mutable.freeze();
795
StringChunked::with_chunk(v.name, unsafe { arr.to_utf8view_unchecked() })
796
.into_series()
797
},
798
#[cfg(feature = "dtype-categorical")]
799
Builder::Categorical8(buf) => buf.builder.finish().into_series(),
800
#[cfg(feature = "dtype-categorical")]
801
Builder::Categorical16(buf) => buf.builder.finish().into_series(),
802
#[cfg(feature = "dtype-categorical")]
803
Builder::Categorical32(buf) => buf.builder.finish().into_series(),
804
};
805
Ok(s)
806
}
807
808
pub fn add_null(&mut self, valid: bool) {
809
match self {
810
Builder::Boolean(v) => v.append_null(),
811
#[cfg(feature = "dtype-i8")]
812
Builder::Int8(v) => v.append_null(),
813
#[cfg(feature = "dtype-i16")]
814
Builder::Int16(v) => v.append_null(),
815
Builder::Int32(v) => v.append_null(),
816
Builder::Int64(v) => v.append_null(),
817
#[cfg(feature = "dtype-i128")]
818
Builder::Int128(v) => v.append_null(),
819
#[cfg(feature = "dtype-u8")]
820
Builder::UInt8(v) => v.append_null(),
821
#[cfg(feature = "dtype-u16")]
822
Builder::UInt16(v) => v.append_null(),
823
Builder::UInt32(v) => v.append_null(),
824
Builder::UInt64(v) => v.append_null(),
825
#[cfg(feature = "dtype-u128")]
826
Builder::UInt128(v) => v.append_null(),
827
#[cfg(feature = "dtype-f16")]
828
Builder::Float16(v) => v.append_null(),
829
Builder::Float32(v) => v.append_null(),
830
Builder::Float64(v) => v.append_null(),
831
#[cfg(feature = "dtype-decimal")]
832
Builder::Decimal(buf) => buf.builder.append_null(),
833
#[cfg(feature = "dtype-f16")]
834
Builder::DecimalFloat16(v, _) => v.append_null(),
835
Builder::DecimalFloat32(v, _) => v.append_null(),
836
Builder::DecimalFloat64(v, _) => v.append_null(),
837
Builder::Utf8(v) => {
838
if valid {
839
v.mutable.push_value("")
840
} else {
841
v.mutable.push_null()
842
}
843
},
844
#[cfg(feature = "dtype-datetime")]
845
Builder::Datetime { buf, .. } => buf.builder.append_null(),
846
#[cfg(feature = "dtype-date")]
847
Builder::Date(v) => v.builder.append_null(),
848
#[cfg(feature = "dtype-categorical")]
849
Builder::Categorical8(buf) => buf.builder.append_null(),
850
#[cfg(feature = "dtype-categorical")]
851
Builder::Categorical16(buf) => buf.builder.append_null(),
852
#[cfg(feature = "dtype-categorical")]
853
Builder::Categorical32(buf) => buf.builder.append_null(),
854
};
855
}
856
857
pub fn dtype(&self) -> DataType {
858
match self {
859
Builder::Boolean(_) => DataType::Boolean,
860
#[cfg(feature = "dtype-i8")]
861
Builder::Int8(_) => DataType::Int8,
862
#[cfg(feature = "dtype-i16")]
863
Builder::Int16(_) => DataType::Int16,
864
Builder::Int32(_) => DataType::Int32,
865
Builder::Int64(_) => DataType::Int64,
866
#[cfg(feature = "dtype-i128")]
867
Builder::Int128(_) => DataType::Int128,
868
#[cfg(feature = "dtype-u8")]
869
Builder::UInt8(_) => DataType::UInt8,
870
#[cfg(feature = "dtype-u16")]
871
Builder::UInt16(_) => DataType::UInt16,
872
Builder::UInt32(_) => DataType::UInt32,
873
Builder::UInt64(_) => DataType::UInt64,
874
#[cfg(feature = "dtype-u128")]
875
Builder::UInt128(_) => DataType::UInt128,
876
#[cfg(feature = "dtype-f16")]
877
Builder::Float16(_) | Builder::DecimalFloat16(_, _) => DataType::Float16,
878
Builder::Float32(_) | Builder::DecimalFloat32(_, _) => DataType::Float32,
879
Builder::Float64(_) | Builder::DecimalFloat64(_, _) => DataType::Float64,
880
#[cfg(feature = "dtype-decimal")]
881
Builder::Decimal(DecimalField {
882
precision, scale, ..
883
}) => DataType::Decimal(*precision, *scale),
884
Builder::Utf8(_) => DataType::String,
885
#[cfg(feature = "dtype-datetime")]
886
Builder::Datetime { time_unit, .. } => DataType::Datetime(*time_unit, None),
887
#[cfg(feature = "dtype-date")]
888
Builder::Date(_) => DataType::Date,
889
#[cfg(feature = "dtype-categorical")]
890
Builder::Categorical8(buf) => buf.builder.dtype().clone(),
891
#[cfg(feature = "dtype-categorical")]
892
Builder::Categorical16(buf) => buf.builder.dtype().clone(),
893
#[cfg(feature = "dtype-categorical")]
894
Builder::Categorical32(buf) => buf.builder.dtype().clone(),
895
}
896
}
897
898
#[inline]
899
pub fn add(
900
&mut self,
901
bytes: &[u8],
902
ignore_errors: bool,
903
needs_escaping: bool,
904
missing_is_null: bool,
905
) -> PolarsResult<()> {
906
use Builder::*;
907
match self {
908
Boolean(buf) => <BooleanChunkedBuilder as ParsedBuilder>::parse_bytes(
909
buf,
910
bytes,
911
ignore_errors,
912
needs_escaping,
913
missing_is_null,
914
None,
915
),
916
#[cfg(feature = "dtype-i8")]
917
Int8(buf) => <PrimitiveChunkedBuilder<Int8Type> as ParsedBuilder>::parse_bytes(
918
buf,
919
bytes,
920
ignore_errors,
921
needs_escaping,
922
missing_is_null,
923
None,
924
),
925
#[cfg(feature = "dtype-i16")]
926
Int16(buf) => <PrimitiveChunkedBuilder<Int16Type> as ParsedBuilder>::parse_bytes(
927
buf,
928
bytes,
929
ignore_errors,
930
needs_escaping,
931
missing_is_null,
932
None,
933
),
934
Int32(buf) => <PrimitiveChunkedBuilder<Int32Type> as ParsedBuilder>::parse_bytes(
935
buf,
936
bytes,
937
ignore_errors,
938
needs_escaping,
939
missing_is_null,
940
None,
941
),
942
Int64(buf) => <PrimitiveChunkedBuilder<Int64Type> as ParsedBuilder>::parse_bytes(
943
buf,
944
bytes,
945
ignore_errors,
946
needs_escaping,
947
missing_is_null,
948
None,
949
),
950
#[cfg(feature = "dtype-i128")]
951
Int128(buf) => <PrimitiveChunkedBuilder<Int128Type> as ParsedBuilder>::parse_bytes(
952
buf,
953
bytes,
954
ignore_errors,
955
needs_escaping,
956
missing_is_null,
957
None,
958
),
959
#[cfg(feature = "dtype-u8")]
960
UInt8(buf) => <PrimitiveChunkedBuilder<UInt8Type> as ParsedBuilder>::parse_bytes(
961
buf,
962
bytes,
963
ignore_errors,
964
needs_escaping,
965
missing_is_null,
966
None,
967
),
968
#[cfg(feature = "dtype-u16")]
969
UInt16(buf) => <PrimitiveChunkedBuilder<UInt16Type> as ParsedBuilder>::parse_bytes(
970
buf,
971
bytes,
972
ignore_errors,
973
needs_escaping,
974
missing_is_null,
975
None,
976
),
977
UInt32(buf) => <PrimitiveChunkedBuilder<UInt32Type> as ParsedBuilder>::parse_bytes(
978
buf,
979
bytes,
980
ignore_errors,
981
needs_escaping,
982
missing_is_null,
983
None,
984
),
985
UInt64(buf) => <PrimitiveChunkedBuilder<UInt64Type> as ParsedBuilder>::parse_bytes(
986
buf,
987
bytes,
988
ignore_errors,
989
needs_escaping,
990
missing_is_null,
991
None,
992
),
993
#[cfg(feature = "dtype-u128")]
994
UInt128(buf) => <PrimitiveChunkedBuilder<UInt128Type> as ParsedBuilder>::parse_bytes(
995
buf,
996
bytes,
997
ignore_errors,
998
needs_escaping,
999
missing_is_null,
1000
None,
1001
),
1002
#[cfg(feature = "dtype-f16")]
1003
Float16(buf) => <PrimitiveChunkedBuilder<Float16Type> as ParsedBuilder>::parse_bytes(
1004
buf,
1005
bytes,
1006
ignore_errors,
1007
needs_escaping,
1008
missing_is_null,
1009
None,
1010
),
1011
Float32(buf) => <PrimitiveChunkedBuilder<Float32Type> as ParsedBuilder>::parse_bytes(
1012
buf,
1013
bytes,
1014
ignore_errors,
1015
needs_escaping,
1016
missing_is_null,
1017
None,
1018
),
1019
Float64(buf) => <PrimitiveChunkedBuilder<Float64Type> as ParsedBuilder>::parse_bytes(
1020
buf,
1021
bytes,
1022
ignore_errors,
1023
needs_escaping,
1024
missing_is_null,
1025
None,
1026
),
1027
#[cfg(feature = "dtype-f16")]
1028
DecimalFloat16(buf, scratch) => {
1029
prepare_decimal_comma(bytes, scratch);
1030
<PrimitiveChunkedBuilder<Float16Type> as ParsedBuilder>::parse_bytes(
1031
buf,
1032
scratch,
1033
ignore_errors,
1034
needs_escaping,
1035
missing_is_null,
1036
None,
1037
)
1038
},
1039
DecimalFloat32(buf, scratch) => {
1040
prepare_decimal_comma(bytes, scratch);
1041
<PrimitiveChunkedBuilder<Float32Type> as ParsedBuilder>::parse_bytes(
1042
buf,
1043
scratch,
1044
ignore_errors,
1045
needs_escaping,
1046
missing_is_null,
1047
None,
1048
)
1049
},
1050
DecimalFloat64(buf, scratch) => {
1051
prepare_decimal_comma(bytes, scratch);
1052
<PrimitiveChunkedBuilder<Float64Type> as ParsedBuilder>::parse_bytes(
1053
buf,
1054
scratch,
1055
ignore_errors,
1056
needs_escaping,
1057
missing_is_null,
1058
None,
1059
)
1060
},
1061
#[cfg(feature = "dtype-decimal")]
1062
Decimal(buf) => <DecimalField as ParsedBuilder>::parse_bytes(
1063
buf,
1064
bytes,
1065
ignore_errors,
1066
needs_escaping,
1067
missing_is_null,
1068
None,
1069
),
1070
Utf8(buf) => <Utf8Field as ParsedBuilder>::parse_bytes(
1071
buf,
1072
bytes,
1073
ignore_errors,
1074
needs_escaping,
1075
missing_is_null,
1076
None,
1077
),
1078
#[cfg(feature = "dtype-datetime")]
1079
Datetime { buf, time_unit, .. } => {
1080
<DatetimeField<Int64Type> as ParsedBuilder>::parse_bytes(
1081
buf,
1082
bytes,
1083
ignore_errors,
1084
needs_escaping,
1085
missing_is_null,
1086
Some(*time_unit),
1087
)
1088
},
1089
#[cfg(feature = "dtype-date")]
1090
Date(buf) => <DatetimeField<Int32Type> as ParsedBuilder>::parse_bytes(
1091
buf,
1092
bytes,
1093
ignore_errors,
1094
needs_escaping,
1095
missing_is_null,
1096
None,
1097
),
1098
#[cfg(feature = "dtype-categorical")]
1099
Categorical8(buf) => {
1100
buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
1101
},
1102
#[cfg(feature = "dtype-categorical")]
1103
Categorical16(buf) => {
1104
buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
1105
},
1106
#[cfg(feature = "dtype-categorical")]
1107
Categorical32(buf) => {
1108
buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
1109
},
1110
}
1111
}
1112
}
1113
1114
#[inline]
1115
fn prepare_decimal_comma(bytes: &[u8], scratch: &mut Vec<u8>) {
1116
scratch.clear();
1117
scratch.reserve(bytes.len());
1118
1119
// SAFETY: we pre-allocated.
1120
for &byte in bytes {
1121
if byte == b',' {
1122
unsafe { scratch.push_unchecked(b'.') }
1123
} else {
1124
unsafe { scratch.push_unchecked(byte) }
1125
}
1126
}
1127
}
1128
1129