Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-io/src/csv/read/buffer.rs
6939 views
1
use arrow::array::MutableBinaryViewArray;
2
#[cfg(feature = "dtype-categorical")]
3
use polars_core::chunked_array::builder::CategoricalChunkedBuilder;
4
use polars_core::prelude::*;
5
use polars_error::to_compute_err;
6
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
7
use polars_time::chunkedarray::string::Pattern;
8
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
9
use polars_time::prelude::string::infer::{
10
DatetimeInfer, StrpTimeParser, TryFromWithUnit, infer_pattern_single,
11
};
12
use polars_utils::vec::PushUnchecked;
13
14
use super::options::CsvEncoding;
15
use super::parser::{is_whitespace, skip_whitespace};
16
use super::utils::escape_field;
17
18
pub(crate) trait PrimitiveParser: PolarsNumericType {
19
fn parse(bytes: &[u8]) -> Option<Self::Native>;
20
}
21
22
impl PrimitiveParser for Float32Type {
23
#[inline]
24
fn parse(bytes: &[u8]) -> Option<f32> {
25
fast_float2::parse(bytes).ok()
26
}
27
}
28
impl PrimitiveParser for Float64Type {
29
#[inline]
30
fn parse(bytes: &[u8]) -> Option<f64> {
31
fast_float2::parse(bytes).ok()
32
}
33
}
34
35
#[cfg(feature = "dtype-u8")]
impl PrimitiveParser for UInt8Type {
    /// Delegates to `atoi_simd::parse_skipped`; any parse error is reported as `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<u8> {
        let parsed: Result<u8, _> = atoi_simd::parse_skipped(bytes);
        parsed.ok()
    }
}
42
#[cfg(feature = "dtype-u16")]
impl PrimitiveParser for UInt16Type {
    /// Delegates to `atoi_simd::parse_skipped`; any parse error is reported as `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<u16> {
        let parsed: Result<u16, _> = atoi_simd::parse_skipped(bytes);
        parsed.ok()
    }
}
49
impl PrimitiveParser for UInt32Type {
50
#[inline]
51
fn parse(bytes: &[u8]) -> Option<u32> {
52
atoi_simd::parse_skipped(bytes).ok()
53
}
54
}
55
impl PrimitiveParser for UInt64Type {
56
#[inline]
57
fn parse(bytes: &[u8]) -> Option<u64> {
58
atoi_simd::parse_skipped(bytes).ok()
59
}
60
}
61
#[cfg(feature = "dtype-i8")]
impl PrimitiveParser for Int8Type {
    /// Delegates to `atoi_simd::parse_skipped`; any parse error is reported as `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<i8> {
        let parsed: Result<i8, _> = atoi_simd::parse_skipped(bytes);
        parsed.ok()
    }
}
68
#[cfg(feature = "dtype-i16")]
impl PrimitiveParser for Int16Type {
    /// Delegates to `atoi_simd::parse_skipped`; any parse error is reported as `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<i16> {
        let parsed: Result<i16, _> = atoi_simd::parse_skipped(bytes);
        parsed.ok()
    }
}
75
impl PrimitiveParser for Int32Type {
76
#[inline]
77
fn parse(bytes: &[u8]) -> Option<i32> {
78
atoi_simd::parse_skipped(bytes).ok()
79
}
80
}
81
impl PrimitiveParser for Int64Type {
82
#[inline]
83
fn parse(bytes: &[u8]) -> Option<i64> {
84
atoi_simd::parse_skipped(bytes).ok()
85
}
86
}
87
#[cfg(feature = "dtype-i128")]
impl PrimitiveParser for Int128Type {
    /// Delegates to `atoi_simd::parse_skipped`; any parse error is reported as `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<i128> {
        let parsed: Result<i128, _> = atoi_simd::parse_skipped(bytes);
        parsed.ok()
    }
}
94
95
trait ParsedBuffer {
96
fn parse_bytes(
97
&mut self,
98
bytes: &[u8],
99
ignore_errors: bool,
100
_needs_escaping: bool,
101
_missing_is_null: bool,
102
_time_unit: Option<TimeUnit>,
103
) -> PolarsResult<()>;
104
}
105
106
impl<T> ParsedBuffer for PrimitiveChunkedBuilder<T>
107
where
108
T: PolarsNumericType + PrimitiveParser,
109
{
110
#[inline]
111
fn parse_bytes(
112
&mut self,
113
bytes: &[u8],
114
ignore_errors: bool,
115
needs_escaping: bool,
116
_missing_is_null: bool,
117
_time_unit: Option<TimeUnit>,
118
) -> PolarsResult<()> {
119
if bytes.is_empty() {
120
self.append_null()
121
} else {
122
let bytes = if needs_escaping {
123
&bytes[1..bytes.len() - 1]
124
} else {
125
bytes
126
};
127
128
// legacy comment (remember this if you decide to use Results again):
129
// its faster to work on options.
130
// if we need to throw an error, we parse again to be able to throw the error
131
132
match T::parse(bytes) {
133
Some(value) => self.append_value(value),
134
None => {
135
// try again without whitespace
136
if !bytes.is_empty() && is_whitespace(bytes[0]) {
137
let bytes = skip_whitespace(bytes);
138
return self.parse_bytes(
139
bytes,
140
ignore_errors,
141
false, // escaping was already done
142
_missing_is_null,
143
None,
144
);
145
}
146
polars_ensure!(
147
bytes.is_empty() || ignore_errors,
148
ComputeError: "remaining bytes non-empty",
149
);
150
self.append_null()
151
},
152
};
153
}
154
Ok(())
155
}
156
}
157
158
pub struct Utf8Field {
159
name: PlSmallStr,
160
mutable: MutableBinaryViewArray<[u8]>,
161
scratch: Vec<u8>,
162
quote_char: u8,
163
encoding: CsvEncoding,
164
}
165
166
impl Utf8Field {
167
fn new(
168
name: PlSmallStr,
169
capacity: usize,
170
quote_char: Option<u8>,
171
encoding: CsvEncoding,
172
) -> Self {
173
Self {
174
name,
175
mutable: MutableBinaryViewArray::with_capacity(capacity),
176
scratch: vec![],
177
quote_char: quote_char.unwrap_or(b'"'),
178
encoding,
179
}
180
}
181
}
182
183
#[inline]
184
pub fn validate_utf8(bytes: &[u8]) -> bool {
185
simdutf8::basic::from_utf8(bytes).is_ok()
186
}
187
188
impl ParsedBuffer for Utf8Field {
189
#[inline]
190
fn parse_bytes(
191
&mut self,
192
bytes: &[u8],
193
ignore_errors: bool,
194
needs_escaping: bool,
195
missing_is_null: bool,
196
_time_unit: Option<TimeUnit>,
197
) -> PolarsResult<()> {
198
if bytes.is_empty() {
199
if missing_is_null {
200
self.mutable.push_null()
201
} else {
202
self.mutable.push(Some([]))
203
}
204
return Ok(());
205
}
206
207
// note that one branch writes without updating the length, so we must do that later.
208
let escaped_bytes = if needs_escaping {
209
self.scratch.clear();
210
self.scratch.reserve(bytes.len());
211
polars_ensure!(bytes.len() > 1 && bytes.last() == Some(&self.quote_char), ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.", std::str::from_utf8(bytes).map_err(to_compute_err)?);
212
213
// SAFETY:
214
// we just allocated enough capacity and data_len is correct.
215
unsafe {
216
let n_written =
217
escape_field(bytes, self.quote_char, self.scratch.spare_capacity_mut());
218
self.scratch.set_len(n_written);
219
}
220
221
self.scratch.as_slice()
222
} else {
223
bytes
224
};
225
226
if matches!(self.encoding, CsvEncoding::LossyUtf8) | ignore_errors {
227
// It is important that this happens after escaping, as invalid escaped string can produce
228
// invalid utf8.
229
let parse_result = validate_utf8(escaped_bytes);
230
231
match parse_result {
232
true => {
233
let value = escaped_bytes;
234
self.mutable.push_value(value)
235
},
236
false => {
237
if matches!(self.encoding, CsvEncoding::LossyUtf8) {
238
// TODO! do this without allocating
239
let s = String::from_utf8_lossy(escaped_bytes);
240
self.mutable.push_value(s.as_ref().as_bytes())
241
} else if ignore_errors {
242
self.mutable.push_null()
243
} else {
244
// If field before escaping is valid utf8, the escaping is incorrect.
245
if needs_escaping && validate_utf8(bytes) {
246
polars_bail!(ComputeError: "string field is not properly escaped");
247
} else {
248
polars_bail!(ComputeError: "invalid utf-8 sequence");
249
}
250
}
251
},
252
}
253
} else {
254
self.mutable.push_value(escaped_bytes)
255
}
256
257
Ok(())
258
}
259
}
260
261
/// Column buffer for categorical and enum data.
#[cfg(feature = "dtype-categorical")]
pub struct CategoricalField<T: PolarsCategoricalType> {
    /// Reusable buffer that receives the unescaped form of quoted fields.
    escape_scratch: Vec<u8>,
    /// Quote byte; `new` falls back to `b'"'` when none is configured.
    quote_char: u8,
    /// Builder that collects the parsed string values.
    builder: CategoricalChunkedBuilder<T>,
}
267
268
#[cfg(feature = "dtype-categorical")]
impl<T: PolarsCategoricalType> CategoricalField<T> {
    /// Creates a categorical buffer for `dtype` with room reserved for `capacity`
    /// values. A `quote_char` of `None` falls back to `b'"'`.
    fn new(name: PlSmallStr, capacity: usize, quote_char: Option<u8>, dtype: DataType) -> Self {
        let mut builder = CategoricalChunkedBuilder::new(name, dtype);
        builder.reserve(capacity);

        Self {
            escape_scratch: vec![],
            quote_char: quote_char.unwrap_or(b'"'),
            builder,
        }
    }

    /// Appends one field to the categorical builder.
    ///
    /// * An empty field appends a null.
    /// * Invalid UTF-8 appends a null when `ignore_errors` is set.
    /// * A quoted field (`needs_escaping`) is unescaped into `escape_scratch`
    ///   before it is appended.
    ///
    /// # Errors
    ///
    /// Returns a `ComputeError` for invalid UTF-8 (unless `ignore_errors`), for a
    /// quoted field that is shorter than two bytes or does not end with the quote
    /// character, and propagates any error from the builder's `append_str`.
    #[inline]
    fn parse_bytes(
        &mut self,
        bytes: &[u8],
        ignore_errors: bool,
        needs_escaping: bool,
        _missing_is_null: bool,
        _time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()> {
        if bytes.is_empty() {
            self.builder.append_null();
            return Ok(());
        }

        if !validate_utf8(bytes) {
            polars_ensure!(ignore_errors, ComputeError: "invalid utf-8 sequence");
            self.builder.append_null();
            return Ok(());
        }

        let key = if needs_escaping {
            // Same well-formedness check as `Utf8Field`. Requiring the closing quote
            // matters for soundness here: UTF-8 was validated on the *raw* field, and
            // unescaping drops the first and last byte. If the last byte were the
            // tail of a multi-byte character, the scratch buffer would no longer be
            // valid UTF-8.
            polars_ensure!(bytes.len() > 1 && bytes.last() == Some(&self.quote_char), ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.", std::str::from_utf8(bytes).map_err(to_compute_err)?);
            self.escape_scratch.clear();
            self.escape_scratch.reserve(bytes.len());
            // SAFETY:
            // the scratch buffer was cleared and has room for `bytes.len()` bytes;
            // `escape_field` (defined in `super::utils`) is expected to write at most
            // that many bytes and to return how many it wrote.
            unsafe {
                let n_written = escape_field(
                    bytes,
                    self.quote_char,
                    self.escape_scratch.spare_capacity_mut(),
                );
                self.escape_scratch.set_len(n_written);
            }

            // SAFETY:
            // `bytes` passed the UTF-8 check and ends with the quote byte.
            // NOTE(review): this also assumes the field starts with the (ASCII)
            // quote byte, as `needs_escaping` implies, and that unescaping only
            // removes quote bytes - confirm against `escape_field`.
            unsafe { std::str::from_utf8_unchecked(&self.escape_scratch) }
        } else {
            // SAFETY:
            // just did utf8 check
            unsafe { std::str::from_utf8_unchecked(bytes) }
        };
        self.builder.append_str(key)?;

        Ok(())
    }
}
328
329
impl ParsedBuffer for BooleanChunkedBuilder {
330
#[inline]
331
fn parse_bytes(
332
&mut self,
333
bytes: &[u8],
334
ignore_errors: bool,
335
needs_escaping: bool,
336
_missing_is_null: bool,
337
_time_unit: Option<TimeUnit>,
338
) -> PolarsResult<()> {
339
let bytes = if needs_escaping {
340
&bytes[1..bytes.len() - 1]
341
} else {
342
bytes
343
};
344
if bytes.eq_ignore_ascii_case(b"false") {
345
self.append_value(false);
346
} else if bytes.eq_ignore_ascii_case(b"true") {
347
self.append_value(true);
348
} else if ignore_errors || bytes.is_empty() {
349
self.append_null();
350
} else {
351
polars_bail!(
352
ComputeError: "error while parsing value {} as boolean",
353
String::from_utf8_lossy(bytes),
354
);
355
}
356
Ok(())
357
}
358
}
359
360
/// Column buffer for date / datetime data, stored as the physical numeric type `T`.
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
pub struct DatetimeField<T: PolarsNumericType> {
    /// Parser for the pattern that last parsed a value successfully; `None` until
    /// the first value has been parsed.
    compiled: Option<DatetimeInfer<T>>,
    /// Builder that collects the parsed values.
    builder: PrimitiveChunkedBuilder<T>,
}
365
366
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
impl<T: PolarsNumericType> DatetimeField<T> {
    /// Creates an empty buffer named `name` with room for `capacity` values and no
    /// compiled pattern yet.
    fn new(name: PlSmallStr, capacity: usize) -> Self {
        Self {
            compiled: None,
            builder: PrimitiveChunkedBuilder::<T>::new(name, capacity),
        }
    }
}
376
377
/// Fallback parser for date / datetime fields.
///
/// Decodes `bytes` as UTF-8, picks a pattern (the one already stored in
/// `buf.compiled`, otherwise one inferred from this value), builds a fresh
/// `DatetimeInfer` for it and parses the value. On success the new parser is
/// stored in `buf.compiled` and the value is appended.
///
/// Every failure (invalid UTF-8, no pattern found, parser construction failed,
/// value not parsable) appends a null when `ignore_errors` is set and returns an
/// error otherwise.
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
fn slow_datetime_parser<T>(
    buf: &mut DatetimeField<T>,
    bytes: &[u8],
    time_unit: Option<TimeUnit>,
    ignore_errors: bool,
) -> PolarsResult<()>
where
    T: PolarsNumericType,
    DatetimeInfer<T>: TryFromWithUnit<Pattern>,
{
    let val = if bytes.is_ascii() {
        // SAFETY:
        // ASCII-only bytes are always valid UTF-8.
        unsafe { std::str::from_utf8_unchecked(bytes) }
    } else {
        match std::str::from_utf8(bytes) {
            Ok(text) => text,
            Err(_) if ignore_errors => {
                buf.builder.append_null();
                return Ok(());
            },
            Err(_) => polars_bail!(ComputeError: "invalid utf-8 sequence"),
        }
    };

    // Prefer the pattern that is already known; only infer one when there is none.
    let known_pattern = buf.compiled.as_ref().map(|compiled| compiled.pattern);
    let pattern = match known_pattern.or_else(|| infer_pattern_single(val)) {
        Some(found) => found,
        None if ignore_errors => {
            buf.builder.append_null();
            return Ok(());
        },
        None => {
            polars_bail!(ComputeError: "could not find a 'date/datetime' pattern for '{}'", val)
        },
    };

    let mut infer = match DatetimeInfer::try_from_with_unit(pattern, time_unit) {
        Ok(infer) => infer,
        Err(_) if ignore_errors => {
            buf.builder.append_null();
            return Ok(());
        },
        Err(err) => return Err(err),
    };

    match infer.parse(val) {
        Some(parsed) => {
            // Keep the parser around for the following values.
            buf.compiled = Some(infer);
            buf.builder.append_value(parsed);
            Ok(())
        },
        None if ignore_errors => {
            buf.builder.append_null();
            Ok(())
        },
        None => {
            polars_bail!(ComputeError: "could not parse '{}' with pattern '{:?}'", val, pattern)
        },
    }
}
446
447
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
impl<T> ParsedBuffer for DatetimeField<T>
where
    T: PolarsNumericType,
    DatetimeInfer<T>: TryFromWithUnit<Pattern> + StrpTimeParser<T::Native>,
{
    /// Parses one date / datetime field and appends it to the builder.
    ///
    /// Quotes are stripped when `needs_escaping` is set and the field has at least
    /// two bytes. An empty field appends a null. Otherwise the already compiled
    /// parser (if any) is tried first; when there is none, or it rejects the
    /// value, `slow_datetime_parser` takes over.
    #[inline]
    fn parse_bytes(
        &mut self,
        bytes: &[u8],
        ignore_errors: bool,
        needs_escaping: bool,
        _missing_is_null: bool,
        time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()> {
        let bytes = if needs_escaping && bytes.len() >= 2 {
            &bytes[1..bytes.len() - 1]
        } else {
            bytes
        };

        if bytes.is_empty() {
            // for types other than string `_missing_is_null` is irrelevant; we always append null
            self.builder.append_null();
            return Ok(());
        }

        // Fast path: reuse the compiled parser when one exists.
        let fast_path = self
            .compiled
            .as_mut()
            .and_then(|compiled| compiled.parse_bytes(bytes, time_unit));

        match fast_path {
            Some(parsed) => {
                self.builder.append_value(parsed);
                Ok(())
            },
            // Fall back on the slower parser, which also does the UTF-8 checking.
            None => slow_datetime_parser(self, bytes, time_unit, ignore_errors),
        }
    }
}
489
490
pub fn init_buffers(
491
projection: &[usize],
492
capacity: usize,
493
schema: &Schema,
494
quote_char: Option<u8>,
495
encoding: CsvEncoding,
496
decimal_comma: bool,
497
) -> PolarsResult<Vec<Buffer>> {
498
projection
499
.iter()
500
.map(|&i| {
501
let (name, dtype) = schema.get_at_index(i).unwrap();
502
let name = name.clone();
503
let builder = match dtype {
504
&DataType::Boolean => Buffer::Boolean(BooleanChunkedBuilder::new(name, capacity)),
505
#[cfg(feature = "dtype-i8")]
506
&DataType::Int8 => Buffer::Int8(PrimitiveChunkedBuilder::new(name, capacity)),
507
#[cfg(feature = "dtype-i16")]
508
&DataType::Int16 => Buffer::Int16(PrimitiveChunkedBuilder::new(name, capacity)),
509
&DataType::Int32 => Buffer::Int32(PrimitiveChunkedBuilder::new(name, capacity)),
510
&DataType::Int64 => Buffer::Int64(PrimitiveChunkedBuilder::new(name, capacity)),
511
#[cfg(feature = "dtype-i128")]
512
&DataType::Int128 => Buffer::Int128(PrimitiveChunkedBuilder::new(name, capacity)),
513
#[cfg(feature = "dtype-u8")]
514
&DataType::UInt8 => Buffer::UInt8(PrimitiveChunkedBuilder::new(name, capacity)),
515
#[cfg(feature = "dtype-u16")]
516
&DataType::UInt16 => Buffer::UInt16(PrimitiveChunkedBuilder::new(name, capacity)),
517
&DataType::UInt32 => Buffer::UInt32(PrimitiveChunkedBuilder::new(name, capacity)),
518
&DataType::UInt64 => Buffer::UInt64(PrimitiveChunkedBuilder::new(name, capacity)),
519
&DataType::Float32 => {
520
if decimal_comma {
521
Buffer::DecimalFloat32(
522
PrimitiveChunkedBuilder::new(name, capacity),
523
Default::default(),
524
)
525
} else {
526
Buffer::Float32(PrimitiveChunkedBuilder::new(name, capacity))
527
}
528
},
529
&DataType::Float64 => {
530
if decimal_comma {
531
Buffer::DecimalFloat64(
532
PrimitiveChunkedBuilder::new(name, capacity),
533
Default::default(),
534
)
535
} else {
536
Buffer::Float64(PrimitiveChunkedBuilder::new(name, capacity))
537
}
538
},
539
&DataType::String => {
540
Buffer::Utf8(Utf8Field::new(name, capacity, quote_char, encoding))
541
},
542
#[cfg(feature = "dtype-datetime")]
543
DataType::Datetime(time_unit, time_zone) => Buffer::Datetime {
544
buf: DatetimeField::new(name, capacity),
545
time_unit: *time_unit,
546
time_zone: time_zone.clone(),
547
},
548
#[cfg(feature = "dtype-date")]
549
&DataType::Date => Buffer::Date(DatetimeField::new(name, capacity)),
550
#[cfg(feature = "dtype-categorical")]
551
DataType::Categorical(_, _) | DataType::Enum(_, _) => {
552
match dtype.cat_physical().unwrap() {
553
CategoricalPhysical::U8 => {
554
Buffer::Categorical8(CategoricalField::<Categorical8Type>::new(
555
name,
556
capacity,
557
quote_char,
558
dtype.clone(),
559
))
560
},
561
CategoricalPhysical::U16 => {
562
Buffer::Categorical16(CategoricalField::<Categorical16Type>::new(
563
name,
564
capacity,
565
quote_char,
566
dtype.clone(),
567
))
568
},
569
CategoricalPhysical::U32 => {
570
Buffer::Categorical32(CategoricalField::<Categorical32Type>::new(
571
name,
572
capacity,
573
quote_char,
574
dtype.clone(),
575
))
576
},
577
}
578
},
579
dt => polars_bail!(
580
ComputeError: "unsupported data type when reading CSV: {} when reading CSV", dt,
581
),
582
};
583
Ok(builder)
584
})
585
.collect()
586
}
587
588
#[allow(clippy::large_enum_variant)]
589
pub enum Buffer {
590
Boolean(BooleanChunkedBuilder),
591
#[cfg(feature = "dtype-i8")]
592
Int8(PrimitiveChunkedBuilder<Int8Type>),
593
#[cfg(feature = "dtype-i16")]
594
Int16(PrimitiveChunkedBuilder<Int16Type>),
595
Int32(PrimitiveChunkedBuilder<Int32Type>),
596
Int64(PrimitiveChunkedBuilder<Int64Type>),
597
#[cfg(feature = "dtype-i128")]
598
Int128(PrimitiveChunkedBuilder<Int128Type>),
599
#[cfg(feature = "dtype-u8")]
600
UInt8(PrimitiveChunkedBuilder<UInt8Type>),
601
#[cfg(feature = "dtype-u16")]
602
UInt16(PrimitiveChunkedBuilder<UInt16Type>),
603
UInt32(PrimitiveChunkedBuilder<UInt32Type>),
604
UInt64(PrimitiveChunkedBuilder<UInt64Type>),
605
Float32(PrimitiveChunkedBuilder<Float32Type>),
606
Float64(PrimitiveChunkedBuilder<Float64Type>),
607
/// Stores the Utf8 fields and the total string length seen for that column
608
Utf8(Utf8Field),
609
#[cfg(feature = "dtype-datetime")]
610
Datetime {
611
buf: DatetimeField<Int64Type>,
612
time_unit: TimeUnit,
613
time_zone: Option<TimeZone>,
614
},
615
#[cfg(feature = "dtype-date")]
616
Date(DatetimeField<Int32Type>),
617
#[cfg(feature = "dtype-categorical")]
618
Categorical8(CategoricalField<Categorical8Type>),
619
#[cfg(feature = "dtype-categorical")]
620
Categorical16(CategoricalField<Categorical16Type>),
621
#[cfg(feature = "dtype-categorical")]
622
Categorical32(CategoricalField<Categorical32Type>),
623
DecimalFloat32(PrimitiveChunkedBuilder<Float32Type>, Vec<u8>),
624
DecimalFloat64(PrimitiveChunkedBuilder<Float64Type>, Vec<u8>),
625
}
626
627
impl Buffer {
628
pub fn into_series(self) -> PolarsResult<Series> {
629
let s = match self {
630
Buffer::Boolean(v) => v.finish().into_series(),
631
#[cfg(feature = "dtype-i8")]
632
Buffer::Int8(v) => v.finish().into_series(),
633
#[cfg(feature = "dtype-i16")]
634
Buffer::Int16(v) => v.finish().into_series(),
635
Buffer::Int32(v) => v.finish().into_series(),
636
Buffer::Int64(v) => v.finish().into_series(),
637
#[cfg(feature = "dtype-i128")]
638
Buffer::Int128(v) => v.finish().into_series(),
639
#[cfg(feature = "dtype-u8")]
640
Buffer::UInt8(v) => v.finish().into_series(),
641
#[cfg(feature = "dtype-u16")]
642
Buffer::UInt16(v) => v.finish().into_series(),
643
Buffer::UInt32(v) => v.finish().into_series(),
644
Buffer::UInt64(v) => v.finish().into_series(),
645
Buffer::Float32(v) => v.finish().into_series(),
646
Buffer::Float64(v) => v.finish().into_series(),
647
Buffer::DecimalFloat32(v, _) => v.finish().into_series(),
648
Buffer::DecimalFloat64(v, _) => v.finish().into_series(),
649
#[cfg(feature = "dtype-datetime")]
650
Buffer::Datetime {
651
buf,
652
time_unit,
653
time_zone,
654
} => buf
655
.builder
656
.finish()
657
.into_series()
658
.cast(&DataType::Datetime(time_unit, time_zone))
659
.unwrap(),
660
#[cfg(feature = "dtype-date")]
661
Buffer::Date(v) => v
662
.builder
663
.finish()
664
.into_series()
665
.cast(&DataType::Date)
666
.unwrap(),
667
668
Buffer::Utf8(v) => {
669
let arr = v.mutable.freeze();
670
StringChunked::with_chunk(v.name, unsafe { arr.to_utf8view_unchecked() })
671
.into_series()
672
},
673
#[cfg(feature = "dtype-categorical")]
674
Buffer::Categorical8(buf) => buf.builder.finish().into_series(),
675
#[cfg(feature = "dtype-categorical")]
676
Buffer::Categorical16(buf) => buf.builder.finish().into_series(),
677
#[cfg(feature = "dtype-categorical")]
678
Buffer::Categorical32(buf) => buf.builder.finish().into_series(),
679
};
680
Ok(s)
681
}
682
683
pub fn add_null(&mut self, valid: bool) {
684
match self {
685
Buffer::Boolean(v) => v.append_null(),
686
#[cfg(feature = "dtype-i8")]
687
Buffer::Int8(v) => v.append_null(),
688
#[cfg(feature = "dtype-i16")]
689
Buffer::Int16(v) => v.append_null(),
690
Buffer::Int32(v) => v.append_null(),
691
Buffer::Int64(v) => v.append_null(),
692
#[cfg(feature = "dtype-i128")]
693
Buffer::Int128(v) => v.append_null(),
694
#[cfg(feature = "dtype-u8")]
695
Buffer::UInt8(v) => v.append_null(),
696
#[cfg(feature = "dtype-u16")]
697
Buffer::UInt16(v) => v.append_null(),
698
Buffer::UInt32(v) => v.append_null(),
699
Buffer::UInt64(v) => v.append_null(),
700
Buffer::Float32(v) => v.append_null(),
701
Buffer::Float64(v) => v.append_null(),
702
Buffer::DecimalFloat32(v, _) => v.append_null(),
703
Buffer::DecimalFloat64(v, _) => v.append_null(),
704
Buffer::Utf8(v) => {
705
if valid {
706
v.mutable.push_value("")
707
} else {
708
v.mutable.push_null()
709
}
710
},
711
#[cfg(feature = "dtype-datetime")]
712
Buffer::Datetime { buf, .. } => buf.builder.append_null(),
713
#[cfg(feature = "dtype-date")]
714
Buffer::Date(v) => v.builder.append_null(),
715
#[cfg(feature = "dtype-categorical")]
716
Buffer::Categorical8(buf) => buf.builder.append_null(),
717
#[cfg(feature = "dtype-categorical")]
718
Buffer::Categorical16(buf) => buf.builder.append_null(),
719
#[cfg(feature = "dtype-categorical")]
720
Buffer::Categorical32(buf) => buf.builder.append_null(),
721
};
722
}
723
724
pub fn dtype(&self) -> DataType {
725
match self {
726
Buffer::Boolean(_) => DataType::Boolean,
727
#[cfg(feature = "dtype-i8")]
728
Buffer::Int8(_) => DataType::Int8,
729
#[cfg(feature = "dtype-i16")]
730
Buffer::Int16(_) => DataType::Int16,
731
Buffer::Int32(_) => DataType::Int32,
732
Buffer::Int64(_) => DataType::Int64,
733
#[cfg(feature = "dtype-i128")]
734
Buffer::Int128(_) => DataType::Int128,
735
#[cfg(feature = "dtype-u8")]
736
Buffer::UInt8(_) => DataType::UInt8,
737
#[cfg(feature = "dtype-u16")]
738
Buffer::UInt16(_) => DataType::UInt16,
739
Buffer::UInt32(_) => DataType::UInt32,
740
Buffer::UInt64(_) => DataType::UInt64,
741
Buffer::Float32(_) | Buffer::DecimalFloat32(_, _) => DataType::Float32,
742
Buffer::Float64(_) | Buffer::DecimalFloat64(_, _) => DataType::Float64,
743
Buffer::Utf8(_) => DataType::String,
744
#[cfg(feature = "dtype-datetime")]
745
Buffer::Datetime { time_unit, .. } => DataType::Datetime(*time_unit, None),
746
#[cfg(feature = "dtype-date")]
747
Buffer::Date(_) => DataType::Date,
748
#[cfg(feature = "dtype-categorical")]
749
Buffer::Categorical8(buf) => buf.builder.dtype().clone(),
750
#[cfg(feature = "dtype-categorical")]
751
Buffer::Categorical16(buf) => buf.builder.dtype().clone(),
752
#[cfg(feature = "dtype-categorical")]
753
Buffer::Categorical32(buf) => buf.builder.dtype().clone(),
754
}
755
}
756
757
#[inline]
758
pub fn add(
759
&mut self,
760
bytes: &[u8],
761
ignore_errors: bool,
762
needs_escaping: bool,
763
missing_is_null: bool,
764
) -> PolarsResult<()> {
765
use Buffer::*;
766
match self {
767
Boolean(buf) => <BooleanChunkedBuilder as ParsedBuffer>::parse_bytes(
768
buf,
769
bytes,
770
ignore_errors,
771
needs_escaping,
772
missing_is_null,
773
None,
774
),
775
#[cfg(feature = "dtype-i8")]
776
Int8(buf) => <PrimitiveChunkedBuilder<Int8Type> as ParsedBuffer>::parse_bytes(
777
buf,
778
bytes,
779
ignore_errors,
780
needs_escaping,
781
missing_is_null,
782
None,
783
),
784
#[cfg(feature = "dtype-i16")]
785
Int16(buf) => <PrimitiveChunkedBuilder<Int16Type> as ParsedBuffer>::parse_bytes(
786
buf,
787
bytes,
788
ignore_errors,
789
needs_escaping,
790
missing_is_null,
791
None,
792
),
793
Int32(buf) => <PrimitiveChunkedBuilder<Int32Type> as ParsedBuffer>::parse_bytes(
794
buf,
795
bytes,
796
ignore_errors,
797
needs_escaping,
798
missing_is_null,
799
None,
800
),
801
Int64(buf) => <PrimitiveChunkedBuilder<Int64Type> as ParsedBuffer>::parse_bytes(
802
buf,
803
bytes,
804
ignore_errors,
805
needs_escaping,
806
missing_is_null,
807
None,
808
),
809
#[cfg(feature = "dtype-i128")]
810
Int128(buf) => <PrimitiveChunkedBuilder<Int128Type> as ParsedBuffer>::parse_bytes(
811
buf,
812
bytes,
813
ignore_errors,
814
needs_escaping,
815
missing_is_null,
816
None,
817
),
818
#[cfg(feature = "dtype-u8")]
819
UInt8(buf) => <PrimitiveChunkedBuilder<UInt8Type> as ParsedBuffer>::parse_bytes(
820
buf,
821
bytes,
822
ignore_errors,
823
needs_escaping,
824
missing_is_null,
825
None,
826
),
827
#[cfg(feature = "dtype-u16")]
828
UInt16(buf) => <PrimitiveChunkedBuilder<UInt16Type> as ParsedBuffer>::parse_bytes(
829
buf,
830
bytes,
831
ignore_errors,
832
needs_escaping,
833
missing_is_null,
834
None,
835
),
836
UInt32(buf) => <PrimitiveChunkedBuilder<UInt32Type> as ParsedBuffer>::parse_bytes(
837
buf,
838
bytes,
839
ignore_errors,
840
needs_escaping,
841
missing_is_null,
842
None,
843
),
844
UInt64(buf) => <PrimitiveChunkedBuilder<UInt64Type> as ParsedBuffer>::parse_bytes(
845
buf,
846
bytes,
847
ignore_errors,
848
needs_escaping,
849
missing_is_null,
850
None,
851
),
852
Float32(buf) => <PrimitiveChunkedBuilder<Float32Type> as ParsedBuffer>::parse_bytes(
853
buf,
854
bytes,
855
ignore_errors,
856
needs_escaping,
857
missing_is_null,
858
None,
859
),
860
Float64(buf) => <PrimitiveChunkedBuilder<Float64Type> as ParsedBuffer>::parse_bytes(
861
buf,
862
bytes,
863
ignore_errors,
864
needs_escaping,
865
missing_is_null,
866
None,
867
),
868
DecimalFloat32(buf, scratch) => {
869
prepare_decimal_comma(bytes, scratch);
870
<PrimitiveChunkedBuilder<Float32Type> as ParsedBuffer>::parse_bytes(
871
buf,
872
scratch,
873
ignore_errors,
874
needs_escaping,
875
missing_is_null,
876
None,
877
)
878
},
879
DecimalFloat64(buf, scratch) => {
880
prepare_decimal_comma(bytes, scratch);
881
<PrimitiveChunkedBuilder<Float64Type> as ParsedBuffer>::parse_bytes(
882
buf,
883
scratch,
884
ignore_errors,
885
needs_escaping,
886
missing_is_null,
887
None,
888
)
889
},
890
Utf8(buf) => <Utf8Field as ParsedBuffer>::parse_bytes(
891
buf,
892
bytes,
893
ignore_errors,
894
needs_escaping,
895
missing_is_null,
896
None,
897
),
898
#[cfg(feature = "dtype-datetime")]
899
Datetime { buf, time_unit, .. } => {
900
<DatetimeField<Int64Type> as ParsedBuffer>::parse_bytes(
901
buf,
902
bytes,
903
ignore_errors,
904
needs_escaping,
905
missing_is_null,
906
Some(*time_unit),
907
)
908
},
909
#[cfg(feature = "dtype-date")]
910
Date(buf) => <DatetimeField<Int32Type> as ParsedBuffer>::parse_bytes(
911
buf,
912
bytes,
913
ignore_errors,
914
needs_escaping,
915
missing_is_null,
916
None,
917
),
918
#[cfg(feature = "dtype-categorical")]
919
Categorical8(buf) => {
920
buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
921
},
922
#[cfg(feature = "dtype-categorical")]
923
Categorical16(buf) => {
924
buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
925
},
926
#[cfg(feature = "dtype-categorical")]
927
Categorical32(buf) => {
928
buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
929
},
930
}
931
}
932
}
933
934
#[inline]
935
fn prepare_decimal_comma(bytes: &[u8], scratch: &mut Vec<u8>) {
936
scratch.clear();
937
scratch.reserve(bytes.len());
938
939
// SAFETY: we pre-allocated.
940
for &byte in bytes {
941
if byte == b',' {
942
unsafe { scratch.push_unchecked(b'.') }
943
} else {
944
unsafe { scratch.push_unchecked(byte) }
945
}
946
}
947
}
948
949