Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-io/src/csv/write/write_impl/serializer.rs
6939 views
1
//! This file is complicated because we have complicated escape handling. We want to avoid having
2
//! to write down each combination of type & escaping, but we also want the compiler to optimize them
3
//! to efficient machine code - so no dynamic dispatch. That means a lot of generics and macros.
4
//!
5
//! We need to differentiate between several kinds of types, and several kinds of escaping we support:
6
//!
7
//! - The simplest escaping mechanisms are [`QuoteStyle::Always`] and [`QuoteStyle::Never`].
8
//! For `Never` we just never quote. For `Always` we pass any serializer that never quotes
9
//! to [`quote_serializer()`] then it becomes quoted properly.
10
//! - [`QuoteStyle::Necessary`] (the default) is only relevant for strings and floats with decimal_comma,
11
//! as these are the only types that can have newlines (row separators), commas (default column separators)
12
//! or quotes. String escaping is complicated anyway, and it is all inside [`string_serializer()`].
13
//! - The real complication is [`QuoteStyle::NonNumeric`], that doesn't quote numbers (unless necessary)
14
//! and nulls, and quotes any other thing. The problem is that nulls can be within any type, so we
15
//! need to handle two possibilities of quoting everywhere.
16
//!
17
//! So in case the chosen style is anything but `NonNumeric`, we statically know for each column except strings
18
//! whether it should be quoted (and for strings too when not `Necessary`). There we use
19
//! `quote_serializer()` or nothing.
20
//!
21
//! But to help with `NonNumeric`, each serializer carries the potential to distinguish between nulls and non-nulls,
22
//! and quote the latter and not the former. But in order to not have the branch when we statically know the answer,
23
//! we have an option to statically disable it with a const generic flag `QUOTE_NON_NULL`. Numbers (that should never
24
//! be quoted with `NonNumeric`) just always disable this flag.
25
//!
26
//! So we have three possibilities:
27
//!
28
//! 1. A serializer that never quotes. This is a bare serializer with `QUOTE_NON_NULL = false`.
29
//! 2. A serializer that always quotes. This is a serializer wrapped with `quote_serializer()`,
30
//! but also with `QUOTE_NON_NULL = false`.
31
//! 3. A serializer that quotes only non-nulls. This is a bare serializer with `QUOTE_NON_NULL = true`.
32
33
use std::fmt::LowerExp;
34
use std::io::Write;
35
36
use arrow::array::{Array, BooleanArray, NullArray, PrimitiveArray, Utf8ViewArray};
37
use arrow::legacy::time_zone::Tz;
38
use arrow::types::NativeType;
39
#[cfg(feature = "timezones")]
40
use chrono::TimeZone;
41
use memchr::{memchr_iter, memchr3};
42
use num_traits::NumCast;
43
use polars_core::prelude::*;
44
45
use crate::csv::write::{QuoteStyle, SerializeOptions};
46
47
/// Panic message used when `serialize()` is called after the value iterator is exhausted.
const TOO_MANY_MSG: &str = "too many items requested from CSV serializer";
48
/// Panic message used when an array cannot be downcast to the type a serializer expects.
const ARRAY_MISMATCH_MSG: &str = "wrong array type";
49
50
/// A `fmt::Write` sink that accepts any text and discards it.
///
/// It is used to dry-run a chrono format against a sample value, so that an invalid format
/// is reported as an error up front instead of while serializing rows.
// Only the feature-gated temporal serializers use this type, so it can be unused in some builds.
#[allow(dead_code)]
struct IgnoreFmt;

impl std::fmt::Write for IgnoreFmt {
    fn write_str(&mut self, _discarded: &str) -> std::fmt::Result {
        // Nothing is stored; every write trivially succeeds.
        Ok(())
    }
}
57
58
pub(super) trait Serializer<'a> {
59
fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions);
60
// Updates the array without changing the configuration.
61
fn update_array(&mut self, array: &'a dyn Array);
62
}
63
64
fn make_serializer<'a, T, I: Iterator<Item = Option<T>>, const QUOTE_NON_NULL: bool>(
65
f: impl FnMut(T, &mut Vec<u8>, &SerializeOptions),
66
iter: I,
67
update_array: impl FnMut(&'a dyn Array) -> I,
68
) -> impl Serializer<'a> {
69
struct SerializerImpl<F, I, Update, const QUOTE_NON_NULL: bool> {
70
f: F,
71
iter: I,
72
update_array: Update,
73
}
74
75
impl<'a, T, F, I, Update, const QUOTE_NON_NULL: bool> Serializer<'a>
76
for SerializerImpl<F, I, Update, QUOTE_NON_NULL>
77
where
78
F: FnMut(T, &mut Vec<u8>, &SerializeOptions),
79
I: Iterator<Item = Option<T>>,
80
Update: FnMut(&'a dyn Array) -> I,
81
{
82
fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {
83
let item = self.iter.next().expect(TOO_MANY_MSG);
84
match item {
85
Some(item) => {
86
if QUOTE_NON_NULL {
87
buf.push(options.quote_char);
88
}
89
(self.f)(item, buf, options);
90
if QUOTE_NON_NULL {
91
buf.push(options.quote_char);
92
}
93
},
94
None => buf.extend_from_slice(options.null.as_bytes()),
95
}
96
}
97
98
fn update_array(&mut self, array: &'a dyn Array) {
99
self.iter = (self.update_array)(array);
100
}
101
}
102
103
SerializerImpl::<_, _, _, QUOTE_NON_NULL> {
104
f,
105
iter,
106
update_array,
107
}
108
}
109
110
fn integer_serializer<I: NativeType + itoa::Integer>(
111
array: &PrimitiveArray<I>,
112
) -> impl Serializer<'_> {
113
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
114
let mut buffer = itoa::Buffer::new();
115
let value = buffer.format(item);
116
buf.extend_from_slice(value.as_bytes());
117
};
118
119
make_serializer::<_, _, false>(f, array.iter(), |array| {
120
array
121
.as_any()
122
.downcast_ref::<PrimitiveArray<I>>()
123
.expect(ARRAY_MISMATCH_MSG)
124
.iter()
125
})
126
}
127
128
fn float_serializer_no_precision_autoformat<I: NativeType + ryu::Float>(
129
array: &PrimitiveArray<I>,
130
) -> impl Serializer<'_> {
131
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
132
let mut buffer = ryu::Buffer::new();
133
let value = buffer.format(item);
134
buf.extend_from_slice(value.as_bytes());
135
};
136
137
make_serializer::<_, _, false>(f, array.iter(), |array| {
138
array
139
.as_any()
140
.downcast_ref::<PrimitiveArray<I>>()
141
.expect(ARRAY_MISMATCH_MSG)
142
.iter()
143
})
144
}
145
146
fn float_serializer_no_precision_autoformat_decimal_comma<I: NativeType + ryu::Float>(
147
array: &PrimitiveArray<I>,
148
) -> impl Serializer<'_> {
149
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
150
let mut buffer = ryu::Buffer::new();
151
let value = buffer.format(item).as_bytes();
152
153
for ch in value {
154
buf.push(if *ch == b'.' { b',' } else { *ch });
155
}
156
};
157
158
make_serializer::<_, _, false>(f, array.iter(), |array| {
159
array
160
.as_any()
161
.downcast_ref::<PrimitiveArray<I>>()
162
.expect(ARRAY_MISMATCH_MSG)
163
.iter()
164
})
165
}
166
167
fn float_serializer_no_precision_scientific<I: NativeType + LowerExp>(
168
array: &PrimitiveArray<I>,
169
) -> impl Serializer<'_> {
170
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
171
// Float writing into a buffer of `Vec<u8>` cannot fail.
172
let _ = write!(buf, "{item:.e}");
173
};
174
175
make_serializer::<_, _, false>(f, array.iter(), |array| {
176
array
177
.as_any()
178
.downcast_ref::<PrimitiveArray<I>>()
179
.expect(ARRAY_MISMATCH_MSG)
180
.iter()
181
})
182
}
183
184
fn float_serializer_no_precision_scientific_decimal_comma<I: NativeType + LowerExp>(
185
array: &PrimitiveArray<I>,
186
) -> impl Serializer<'_> {
187
let mut scratch = Vec::new();
188
189
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
190
// Float writing into a buffer of `Vec<u8>` cannot fail.
191
let _ = write!(&mut scratch, "{item:.e}");
192
for c in &mut scratch {
193
if *c == b'.' {
194
*c = b',';
195
break;
196
}
197
}
198
buf.extend_from_slice(&scratch);
199
};
200
201
make_serializer::<_, _, false>(f, array.iter(), |array| {
202
array
203
.as_any()
204
.downcast_ref::<PrimitiveArray<I>>()
205
.expect(ARRAY_MISMATCH_MSG)
206
.iter()
207
})
208
}
209
210
fn float_serializer_no_precision_positional<I: NativeType + NumCast>(
211
array: &PrimitiveArray<I>,
212
) -> impl Serializer<'_> {
213
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
214
let v: f64 = NumCast::from(item).unwrap();
215
let _ = write!(buf, "{v}");
216
};
217
218
make_serializer::<_, _, false>(f, array.iter(), |array| {
219
array
220
.as_any()
221
.downcast_ref::<PrimitiveArray<I>>()
222
.expect(ARRAY_MISMATCH_MSG)
223
.iter()
224
})
225
}
226
227
fn float_serializer_no_precision_positional_decimal_comma<I: NativeType + NumCast>(
228
array: &PrimitiveArray<I>,
229
) -> impl Serializer<'_> {
230
let mut scratch = Vec::new();
231
232
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
233
scratch.clear();
234
let v: f64 = NumCast::from(item).unwrap();
235
let _ = write!(&mut scratch, "{v}");
236
for c in &mut scratch {
237
if *c == b'.' {
238
*c = b',';
239
break;
240
}
241
}
242
buf.extend_from_slice(&scratch);
243
};
244
245
make_serializer::<_, _, false>(f, array.iter(), |array| {
246
array
247
.as_any()
248
.downcast_ref::<PrimitiveArray<I>>()
249
.expect(ARRAY_MISMATCH_MSG)
250
.iter()
251
})
252
}
253
254
fn float_serializer_with_precision_scientific<I: NativeType + LowerExp>(
255
array: &PrimitiveArray<I>,
256
precision: usize,
257
) -> impl Serializer<'_> {
258
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
259
// Float writing into a buffer of `Vec<u8>` cannot fail.
260
let _ = write!(buf, "{item:.precision$e}");
261
};
262
263
make_serializer::<_, _, false>(f, array.iter(), |array| {
264
array
265
.as_any()
266
.downcast_ref::<PrimitiveArray<I>>()
267
.expect(ARRAY_MISMATCH_MSG)
268
.iter()
269
})
270
}
271
272
fn float_serializer_with_precision_scientific_decimal_comma<I: NativeType + LowerExp>(
273
array: &PrimitiveArray<I>,
274
precision: usize,
275
) -> impl Serializer<'_> {
276
let mut scratch = Vec::new();
277
278
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
279
scratch.clear();
280
// Float writing into a buffer of `Vec<u8>` cannot fail.
281
let _ = write!(&mut scratch, "{item:.precision$e}");
282
for c in &mut scratch {
283
if *c == b'.' {
284
*c = b',';
285
break;
286
}
287
}
288
buf.extend_from_slice(&scratch);
289
};
290
291
make_serializer::<_, _, false>(f, array.iter(), |array| {
292
array
293
.as_any()
294
.downcast_ref::<PrimitiveArray<I>>()
295
.expect(ARRAY_MISMATCH_MSG)
296
.iter()
297
})
298
}
299
300
fn float_serializer_with_precision_positional<I: NativeType>(
301
array: &PrimitiveArray<I>,
302
precision: usize,
303
) -> impl Serializer<'_> {
304
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
305
// Float writing into a buffer of `Vec<u8>` cannot fail.
306
let _ = write!(buf, "{item:.precision$}");
307
};
308
309
make_serializer::<_, _, false>(f, array.iter(), |array| {
310
array
311
.as_any()
312
.downcast_ref::<PrimitiveArray<I>>()
313
.expect(ARRAY_MISMATCH_MSG)
314
.iter()
315
})
316
}
317
318
fn float_serializer_with_precision_positional_decimal_comma<I: NativeType>(
319
array: &PrimitiveArray<I>,
320
precision: usize,
321
) -> impl Serializer<'_> {
322
let mut scratch = Vec::new();
323
324
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
325
scratch.clear();
326
let _ = write!(&mut scratch, "{item:.precision$}");
327
for c in &mut scratch {
328
if *c == b'.' {
329
*c = b',';
330
break;
331
}
332
}
333
buf.extend_from_slice(&scratch);
334
};
335
336
make_serializer::<_, _, false>(f, array.iter(), |array| {
337
array
338
.as_any()
339
.downcast_ref::<PrimitiveArray<I>>()
340
.expect(ARRAY_MISMATCH_MSG)
341
.iter()
342
})
343
}
344
345
fn null_serializer(_array: &NullArray) -> impl Serializer<'_> {
346
struct NullSerializer;
347
impl<'a> Serializer<'a> for NullSerializer {
348
fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {
349
buf.extend_from_slice(options.null.as_bytes());
350
}
351
fn update_array(&mut self, _array: &'a dyn Array) {}
352
}
353
NullSerializer
354
}
355
356
fn bool_serializer<const QUOTE_NON_NULL: bool>(array: &BooleanArray) -> impl Serializer<'_> {
357
let f = move |item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
358
let s = if item { "true" } else { "false" };
359
buf.extend_from_slice(s.as_bytes());
360
};
361
362
make_serializer::<_, _, QUOTE_NON_NULL>(f, array.iter(), |array| {
363
array
364
.as_any()
365
.downcast_ref::<BooleanArray>()
366
.expect(ARRAY_MISMATCH_MSG)
367
.iter()
368
})
369
}
370
371
/// Creates a serializer for decimal values stored as `i128` with the given `scale`.
///
/// Whether trailing zeros are trimmed is read once, when the serializer is built, from
/// `arrow::compute::decimal::get_trim_decimal_zeros()`.
///
/// The serializer itself never quotes values (`QUOTE_NON_NULL = false`).
#[cfg(feature = "dtype-decimal")]
fn decimal_serializer(array: &PrimitiveArray<i128>, scale: usize) -> impl Serializer<'_> {
    let trim_zeros = arrow::compute::decimal::get_trim_decimal_zeros();
    // One formatting buffer is reused for every value.
    let mut formatter = arrow::compute::decimal::DecimalFmtBuffer::new();

    let write_decimal = move |&value, out: &mut Vec<u8>, _options: &SerializeOptions| {
        let text = formatter.format(value, scale, trim_zeros);
        out.extend_from_slice(text.as_bytes());
    };

    make_serializer::<_, _, false>(write_decimal, array.iter(), |new_array| {
        new_array
            .as_any()
            .downcast_ref::<PrimitiveArray<i128>>()
            .expect(ARRAY_MISMATCH_MSG)
            .iter()
    })
}
388
389
/// Creates a serializer that hands each non-null primitive value to `callback`, which writes
/// it into the output buffer.
///
/// With `QUOTE_NON_NULL = true`, non-null values are wrapped in the quote character.
#[cfg(any(
    feature = "dtype-date",
    feature = "dtype-time",
    feature = "dtype-datetime"
))]
fn callback_serializer<'a, T: NativeType, const QUOTE_NON_NULL: bool>(
    array: &'a PrimitiveArray<T>,
    mut callback: impl FnMut(T, &mut Vec<u8>) + 'a,
) -> impl Serializer<'a> {
    // Adapt the two-argument callback to the three-argument shape `make_serializer` expects.
    let write_value = move |&value, out: &mut Vec<u8>, _options: &SerializeOptions| {
        callback(value, out);
    };

    make_serializer::<_, _, QUOTE_NON_NULL>(write_value, array.iter(), |new_array| {
        new_array
            .as_any()
            .downcast_ref::<PrimitiveArray<T>>()
            .expect(ARRAY_MISMATCH_MSG)
            .iter()
    })
}
410
411
#[cfg(any(feature = "dtype-date", feature = "dtype-time"))]
412
/// Iterator over a slice of parsed chrono format items, as handed to `format_with_items`.
type ChronoFormatIter<'a, 'b> = std::slice::Iter<'a, chrono::format::Item<'b>>;
413
414
/// Builds the serializer for date and time columns.
///
/// * `format_str` – optional strftime-style format; when `None` the value's `Display`
///   output is used.
/// * `description` – type name used in error messages.
/// * `array` – the array to serialize; it is downcast to `PrimitiveArray<Underlying>`.
/// * `sample_value` – a value used to dry-run `format_str` before any row is written.
/// * `convert` – turns the physical value into the chrono value `T`.
/// * `format_fn` – formats a `T` with the parsed format items.
///
/// # Errors
///
/// Returns a `ComputeError` when `format_str` cannot be parsed or cannot be applied to
/// `sample_value`.
///
/// # Panics
///
/// Panics if `array` is not a `PrimitiveArray<Underlying>`.
#[cfg(any(feature = "dtype-date", feature = "dtype-time"))]
fn date_and_time_serializer<'a, Underlying: NativeType, T: std::fmt::Display>(
    format_str: &'a Option<String>,
    description: &str,
    array: &'a dyn Array,
    sample_value: T,
    mut convert: impl FnMut(Underlying) -> T + Send + 'a,
    mut format_fn: impl for<'b> FnMut(
            &T,
            ChronoFormatIter<'b, 'a>,
        ) -> chrono::format::DelayedFormat<ChronoFormatIter<'b, 'a>>
        + Send
        + 'a,
    options: &SerializeOptions,
) -> PolarsResult<Box<dyn Serializer<'a> + Send + 'a>> {
    let array = array
        .as_any()
        .downcast_ref::<PrimitiveArray<Underlying>>()
        .unwrap();

    let serializer = match format_str {
        None => {
            let callback = move |raw, buf: &mut Vec<u8>| {
                let value = convert(raw);
                // Formatting dates into `Vec<u8>` cannot fail.
                let _ = write!(buf, "{value}");
            };
            date_and_time_final_serializer(array, callback, options)
        },
        Some(format_str) => {
            use std::fmt::Write;

            // Both failure modes below report the same error.
            let invalid_format = || {
                polars_err!(ComputeError: "cannot format {description} with format '{format_str}'")
            };

            let format = chrono::format::StrftimeItems::new(format_str)
                .parse()
                .map_err(|_| invalid_format())?;

            // Fail fast for an invalid format. This returns the error to the user sooner, and
            // allows us to not return `Result` from `serialize()`.
            write!(IgnoreFmt, "{}", format_fn(&sample_value, format.iter()))
                .map_err(|_| invalid_format())?;

            let callback = move |raw, buf: &mut Vec<u8>| {
                let value = convert(raw);
                // We checked the format is valid above.
                let _ = write!(buf, "{}", format_fn(&value, format.iter()));
            };
            date_and_time_final_serializer(array, callback, options)
        },
    };
    Ok(serializer)
}
459
460
/// Wraps `callback` into a boxed serializer with the quoting required by `options.quote_style`.
///
/// - `Always`: every cell, including nulls, is quoted.
/// - `NonNumeric`: only non-null cells are quoted.
/// - `Necessary` / `Never`: nothing is quoted.
#[cfg(any(
    feature = "dtype-date",
    feature = "dtype-time",
    feature = "dtype-datetime"
))]
fn date_and_time_final_serializer<'a, T: NativeType>(
    array: &'a PrimitiveArray<T>,
    callback: impl FnMut(T, &mut Vec<u8>) + Send + 'a,
    options: &SerializeOptions,
) -> Box<dyn Serializer<'a> + Send + 'a> {
    match options.quote_style {
        QuoteStyle::Always => {
            let unquoted = callback_serializer::<T, false>(array, callback);
            Box::new(quote_serializer(unquoted)) as Box<dyn Serializer + Send>
        },
        QuoteStyle::NonNumeric => Box::new(callback_serializer::<T, true>(array, callback)),
        QuoteStyle::Necessary | QuoteStyle::Never => {
            Box::new(callback_serializer::<T, false>(array, callback))
        },
    }
}
478
479
pub(super) fn string_serializer<'a, Iter: Send + 'a>(
480
mut f: impl FnMut(&mut Iter) -> Option<&str> + Send + 'a,
481
options: &SerializeOptions,
482
mut update: impl FnMut(&'a dyn Array) -> Iter + Send + 'a,
483
array: &'a dyn Array,
484
) -> Box<dyn Serializer<'a> + 'a + Send> {
485
const LF: u8 = b'\n';
486
const CR: u8 = b'\r';
487
488
struct StringSerializer<F, Iter, Update> {
489
serialize: F,
490
update: Update,
491
iter: Iter,
492
}
493
494
impl<'a, F, Iter, Update> Serializer<'a> for StringSerializer<F, Iter, Update>
495
where
496
F: FnMut(&mut Iter, &mut Vec<u8>, &SerializeOptions),
497
Update: FnMut(&'a dyn Array) -> Iter,
498
{
499
fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {
500
(self.serialize)(&mut self.iter, buf, options);
501
}
502
503
fn update_array(&mut self, array: &'a dyn Array) {
504
self.iter = (self.update)(array);
505
}
506
}
507
508
fn serialize_str_escaped(buf: &mut Vec<u8>, s: &[u8], quote_char: u8, quoted: bool) {
509
let mut iter = memchr_iter(quote_char, s);
510
let first_quote = iter.next();
511
match first_quote {
512
None => buf.extend_from_slice(s),
513
Some(mut quote_pos) => {
514
if !quoted {
515
buf.push(quote_char);
516
}
517
let mut start_pos = 0;
518
loop {
519
buf.extend_from_slice(&s[start_pos..quote_pos]);
520
buf.extend_from_slice(&[quote_char, quote_char]);
521
match iter.next() {
522
Some(quote) => {
523
start_pos = quote_pos + 1;
524
quote_pos = quote;
525
},
526
None => {
527
buf.extend_from_slice(&s[quote_pos + 1..]);
528
break;
529
},
530
}
531
}
532
if !quoted {
533
buf.push(quote_char);
534
}
535
},
536
}
537
}
538
539
let iter = update(array);
540
match options.quote_style {
541
QuoteStyle::Always => {
542
let serialize =
543
move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {
544
let quote_char = options.quote_char;
545
buf.push(quote_char);
546
let Some(s) = f(iter) else {
547
buf.extend_from_slice(options.null.as_bytes());
548
buf.push(quote_char);
549
return;
550
};
551
serialize_str_escaped(buf, s.as_bytes(), quote_char, true);
552
buf.push(quote_char);
553
};
554
Box::new(StringSerializer {
555
serialize,
556
update,
557
iter,
558
})
559
},
560
QuoteStyle::NonNumeric => {
561
let serialize =
562
move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {
563
let Some(s) = f(iter) else {
564
buf.extend_from_slice(options.null.as_bytes());
565
return;
566
};
567
let quote_char = options.quote_char;
568
buf.push(quote_char);
569
serialize_str_escaped(buf, s.as_bytes(), quote_char, true);
570
buf.push(quote_char);
571
};
572
Box::new(StringSerializer {
573
serialize,
574
update,
575
iter,
576
})
577
},
578
QuoteStyle::Necessary => {
579
let serialize =
580
move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {
581
let Some(s) = f(iter) else {
582
buf.extend_from_slice(options.null.as_bytes());
583
return;
584
};
585
let quote_char = options.quote_char;
586
// An empty string conflicts with null, so it is necessary to quote.
587
if s.is_empty() {
588
buf.extend_from_slice(&[quote_char, quote_char]);
589
return;
590
}
591
let needs_quote = memchr3(options.separator, LF, CR, s.as_bytes()).is_some();
592
if needs_quote {
593
buf.push(quote_char);
594
}
595
serialize_str_escaped(buf, s.as_bytes(), quote_char, needs_quote);
596
if needs_quote {
597
buf.push(quote_char);
598
}
599
};
600
Box::new(StringSerializer {
601
serialize,
602
update,
603
iter,
604
})
605
},
606
QuoteStyle::Never => {
607
let serialize =
608
move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {
609
let Some(s) = f(iter) else {
610
buf.extend_from_slice(options.null.as_bytes());
611
return;
612
};
613
buf.extend_from_slice(s.as_bytes());
614
};
615
Box::new(StringSerializer {
616
serialize,
617
update,
618
iter,
619
})
620
},
621
}
622
}
623
624
fn quote_serializer<'a>(serializer: impl Serializer<'a>) -> impl Serializer<'a> {
625
struct QuoteSerializer<S>(S);
626
impl<'a, S: Serializer<'a>> Serializer<'a> for QuoteSerializer<S> {
627
fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {
628
buf.push(options.quote_char);
629
self.0.serialize(buf, options);
630
buf.push(options.quote_char);
631
}
632
633
fn update_array(&mut self, array: &'a dyn Array) {
634
self.0.update_array(array);
635
}
636
}
637
QuoteSerializer(serializer)
638
}
639
640
pub(super) fn serializer_for<'a>(
641
array: &'a dyn Array,
642
options: &'a SerializeOptions,
643
dtype: &'a DataType,
644
_datetime_format: &'a str,
645
_time_zone: Option<Tz>,
646
) -> PolarsResult<Box<dyn Serializer<'a> + Send + 'a>> {
647
// The needs_quotes flag captures the quote logic for the quote_wrapper! macro
648
// It is targeted at numerical types primarily; other types may required additional logic
649
let needs_quotes = match dtype {
650
DataType::Float32 | DataType::Float64 => {
651
// When comma is used as both the field separator and decimal separator, quoting
652
// may be required. Specifically, when:
653
// - quote_style is Always, or
654
// - quote_style is Necessary or Non-Numeric, the field separator is also a comma,
655
// and the float string field contains a comma character (no precision or precision > 0)
656
//
657
// In some rare cases, a field may get quoted when it is not strictly necessary
658
// (e.g., in scientific notation when only the first digit is non-zero such as '1e12',
659
// or null values in 'non_numeric' quote_style).
660
661
let mut should_quote = options.decimal_comma && options.separator == b',';
662
if let Some(precision) = options.float_precision {
663
should_quote &= precision > 0;
664
}
665
666
match options.quote_style {
667
QuoteStyle::Always => true,
668
QuoteStyle::Necessary | QuoteStyle::NonNumeric => should_quote,
669
QuoteStyle::Never => false,
670
}
671
},
672
_ => options.quote_style == QuoteStyle::Always,
673
};
674
675
macro_rules! quote_wrapper {
676
($make_serializer:path, $($arg:tt)*) => {{
677
let serializer = $make_serializer(array.as_any().downcast_ref().unwrap(), $($arg)*);
678
if needs_quotes {
679
Box::new(quote_serializer(serializer)) as Box<dyn Serializer + Send>
680
} else {
681
Box::new(serializer)
682
}
683
}};
684
($make_serializer:path) => { quote_wrapper!($make_serializer,) };
685
}
686
687
let serializer = match dtype {
688
DataType::Int8 => quote_wrapper!(integer_serializer::<i8>),
689
DataType::UInt8 => quote_wrapper!(integer_serializer::<u8>),
690
DataType::Int16 => quote_wrapper!(integer_serializer::<i16>),
691
DataType::UInt16 => quote_wrapper!(integer_serializer::<u16>),
692
DataType::Int32 => quote_wrapper!(integer_serializer::<i32>),
693
DataType::UInt32 => quote_wrapper!(integer_serializer::<u32>),
694
DataType::Int64 => quote_wrapper!(integer_serializer::<i64>),
695
DataType::UInt64 => quote_wrapper!(integer_serializer::<u64>),
696
DataType::Int128 => quote_wrapper!(integer_serializer::<i128>),
697
DataType::Float32 => {
698
match (
699
options.decimal_comma,
700
options.float_precision,
701
options.float_scientific,
702
) {
703
// standard decimal separator (period)
704
(false, Some(precision), Some(true)) => {
705
quote_wrapper!(float_serializer_with_precision_scientific::<f32>, precision)
706
},
707
(false, Some(precision), _) => {
708
quote_wrapper!(float_serializer_with_precision_positional::<f32>, precision)
709
},
710
(false, None, Some(true)) => {
711
quote_wrapper!(float_serializer_no_precision_scientific::<f32>)
712
},
713
(false, None, Some(false)) => {
714
quote_wrapper!(float_serializer_no_precision_positional::<f32>)
715
},
716
(false, None, None) => {
717
quote_wrapper!(float_serializer_no_precision_autoformat::<f32>)
718
},
719
720
// comma as the decimal separator
721
(true, Some(precision), Some(true)) => quote_wrapper!(
722
float_serializer_with_precision_scientific_decimal_comma::<f32>,
723
precision
724
),
725
(true, Some(precision), _) => quote_wrapper!(
726
float_serializer_with_precision_positional_decimal_comma::<f32>,
727
precision
728
),
729
(true, None, Some(true)) => {
730
quote_wrapper!(float_serializer_no_precision_scientific_decimal_comma::<f32>)
731
},
732
(true, None, Some(false)) => {
733
quote_wrapper!(float_serializer_no_precision_positional_decimal_comma::<f32>)
734
},
735
(true, None, None) => {
736
quote_wrapper!(float_serializer_no_precision_autoformat_decimal_comma::<f32>)
737
},
738
}
739
},
740
DataType::Float64 => {
741
match (
742
options.decimal_comma,
743
options.float_precision,
744
options.float_scientific,
745
) {
746
// standard decimal separator (period)
747
(false, Some(precision), Some(true)) => {
748
quote_wrapper!(float_serializer_with_precision_scientific::<f64>, precision)
749
},
750
(false, Some(precision), _) => {
751
quote_wrapper!(float_serializer_with_precision_positional::<f64>, precision)
752
},
753
(false, None, Some(true)) => {
754
quote_wrapper!(float_serializer_no_precision_scientific::<f64>)
755
},
756
(false, None, Some(false)) => {
757
quote_wrapper!(float_serializer_no_precision_positional::<f64>)
758
},
759
(false, None, None) => {
760
quote_wrapper!(float_serializer_no_precision_autoformat::<f64>)
761
},
762
763
// comma as the decimal separator
764
(true, Some(precision), Some(true)) => quote_wrapper!(
765
float_serializer_with_precision_scientific_decimal_comma::<f64>,
766
precision
767
),
768
(true, Some(precision), _) => quote_wrapper!(
769
float_serializer_with_precision_positional_decimal_comma::<f64>,
770
precision
771
),
772
(true, None, Some(true)) => {
773
quote_wrapper!(float_serializer_no_precision_scientific_decimal_comma::<f64>)
774
},
775
(true, None, Some(false)) => {
776
quote_wrapper!(float_serializer_no_precision_positional_decimal_comma::<f64>)
777
},
778
(true, None, None) => {
779
quote_wrapper!(float_serializer_no_precision_autoformat_decimal_comma::<f64>)
780
},
781
}
782
},
783
DataType::Null => quote_wrapper!(null_serializer),
784
DataType::Boolean => {
785
let array = array.as_any().downcast_ref().unwrap();
786
match options.quote_style {
787
QuoteStyle::Always => Box::new(quote_serializer(bool_serializer::<false>(array)))
788
as Box<dyn Serializer + Send>,
789
QuoteStyle::NonNumeric => Box::new(bool_serializer::<true>(array)),
790
_ => Box::new(bool_serializer::<false>(array)),
791
}
792
},
793
#[cfg(feature = "dtype-date")]
794
DataType::Date => date_and_time_serializer(
795
&options.date_format,
796
"NaiveDate",
797
array,
798
chrono::NaiveDate::MAX,
799
arrow::temporal_conversions::date32_to_date,
800
|date, items| date.format_with_items(items),
801
options,
802
)?,
803
#[cfg(feature = "dtype-time")]
804
DataType::Time => date_and_time_serializer(
805
&options.time_format,
806
"NaiveTime",
807
array,
808
chrono::NaiveTime::MIN,
809
arrow::temporal_conversions::time64ns_to_time,
810
|time, items| time.format_with_items(items),
811
options,
812
)?,
813
#[cfg(feature = "dtype-datetime")]
814
DataType::Datetime(time_unit, _) => {
815
let format = chrono::format::StrftimeItems::new(_datetime_format)
816
.parse()
817
.map_err(|_| {
818
polars_err!(
819
ComputeError: "cannot format {} with format '{_datetime_format}'",
820
if _time_zone.is_some() { "DateTime" } else { "NaiveDateTime" },
821
)
822
})?;
823
use std::fmt::Write;
824
let sample_datetime = match _time_zone {
825
#[cfg(feature = "timezones")]
826
Some(time_zone) => time_zone
827
.from_utc_datetime(&chrono::NaiveDateTime::MAX)
828
.format_with_items(format.iter()),
829
#[cfg(not(feature = "timezones"))]
830
Some(_) => panic!("activate 'timezones' feature"),
831
None => chrono::NaiveDateTime::MAX.format_with_items(format.iter()),
832
};
833
// Fail fast for invalid format. This return error faster to the user, and allows us to not return
834
// `Result` from `serialize()`.
835
write!(IgnoreFmt, "{sample_datetime}").map_err(|_| {
836
polars_err!(
837
ComputeError: "cannot format {} with format '{_datetime_format}'",
838
if _time_zone.is_some() { "DateTime" } else { "NaiveDateTime" },
839
)
840
})?;
841
842
let array = array.as_any().downcast_ref().unwrap();
843
844
macro_rules! time_unit_serializer {
845
($convert:ident) => {
846
match _time_zone {
847
#[cfg(feature = "timezones")]
848
Some(time_zone) => {
849
let callback = move |item, buf: &mut Vec<u8>| {
850
let item = arrow::temporal_conversions::$convert(item);
851
let item = time_zone.from_utc_datetime(&item);
852
// We checked the format is valid above.
853
let _ = write!(buf, "{}", item.format_with_items(format.iter()));
854
};
855
date_and_time_final_serializer(array, callback, options)
856
},
857
#[cfg(not(feature = "timezones"))]
858
Some(_) => panic!("activate 'timezones' feature"),
859
None => {
860
let callback = move |item, buf: &mut Vec<u8>| {
861
let item = arrow::temporal_conversions::$convert(item);
862
// We checked the format is valid above.
863
let _ = write!(buf, "{}", item.format_with_items(format.iter()));
864
};
865
date_and_time_final_serializer(array, callback, options)
866
},
867
}
868
};
869
}
870
871
match time_unit {
872
TimeUnit::Nanoseconds => time_unit_serializer!(timestamp_ns_to_datetime),
873
TimeUnit::Microseconds => time_unit_serializer!(timestamp_us_to_datetime),
874
TimeUnit::Milliseconds => time_unit_serializer!(timestamp_ms_to_datetime),
875
}
876
},
877
DataType::String => string_serializer(
878
|iter| Iterator::next(iter).expect(TOO_MANY_MSG),
879
options,
880
|arr| {
881
arr.as_any()
882
.downcast_ref::<Utf8ViewArray>()
883
.expect(ARRAY_MISMATCH_MSG)
884
.iter()
885
},
886
array,
887
),
888
#[cfg(feature = "dtype-categorical")]
889
DataType::Categorical(_, mapping) | DataType::Enum(_, mapping) => {
890
polars_core::with_match_categorical_physical_type!(dtype.cat_physical().unwrap(), |$C| {
891
string_serializer(
892
|iter| {
893
let &idx: &<$C as PolarsCategoricalType>::Native = Iterator::next(iter).expect(TOO_MANY_MSG)?;
894
Some(unsafe { mapping.cat_to_str_unchecked(idx.as_cat()) })
895
},
896
options,
897
|arr| {
898
arr.as_any()
899
.downcast_ref::<PrimitiveArray<<$C as PolarsCategoricalType>::Native>>()
900
.expect(ARRAY_MISMATCH_MSG)
901
.iter()
902
},
903
array,
904
)
905
})
906
},
907
#[cfg(feature = "dtype-decimal")]
908
DataType::Decimal(_, scale) => {
909
quote_wrapper!(decimal_serializer, scale.unwrap_or(0))
910
},
911
_ => {
912
polars_bail!(ComputeError: "datatype {dtype} cannot be written to CSV\n\nConsider using JSON or a binary format.")
913
},
914
};
915
Ok(serializer)
916
}
917
918
#[cfg(test)]
mod test {
    use arrow::array::NullArray;
    use polars_core::prelude::ArrowDataType;

    use super::string_serializer;
    use crate::csv::write::options::{QuoteStyle, SerializeOptions};

    /// Serializes the single optional string `s` with `options` and panics unless the output
    /// equals `expected`.
    #[track_caller]
    fn check_string_serialization(options: &SerializeOptions, s: Option<&str>, expected: &str) {
        // The array is never read: the iterator handed to the serializer is just `s` itself.
        let fake_array = NullArray::new(ArrowDataType::Null, 0);
        let mut serializer = string_serializer(|s| *s, options, |_| s, &fake_array);
        let mut out = Vec::new();
        serializer.serialize(&mut out, options);
        let serialized = std::str::from_utf8(&out).unwrap();
        // Don't use `assert_eq!()` because it prints debug format and it's hard to read with
        // all the escapes.
        assert!(
            serialized == expected,
            "CSV string {s:?} wasn't serialized correctly: expected: `{expected}`, got: `{serialized}`"
        );
    }

    /// Runs every `(input, expected)` pair against default options with the given quote style.
    #[track_caller]
    fn check_all(quote_style: QuoteStyle, cases: &[(Option<&str>, &str)]) {
        let options = SerializeOptions {
            quote_style,
            ..SerializeOptions::default()
        };
        for &(input, expected) in cases {
            check_string_serialization(&options, input, expected);
        }
    }

    // It is the most complex serializer with most edge cases, it definitely needs a
    // comprehensive test.
    #[test]
    fn test_string_serializer() {
        check_all(
            QuoteStyle::Always,
            &[
                (None, r#""""#),
                (Some(""), r#""""#),
                (Some("a"), r#""a""#),
                (Some("\""), r#""""""#),
                (Some("a\"\"b"), r#""a""""b""#),
            ],
        );

        check_all(
            QuoteStyle::Necessary,
            &[
                (None, r#""#),
                (Some(""), r#""""#),
                (Some("a"), r#"a"#),
                (Some("\""), r#""""""#),
                (Some("a\"\"b"), r#""a""""b""#),
                (Some("a b"), r#"a b"#),
                (Some("a,b"), r#""a,b""#),
                (Some("a\nb"), "\"a\nb\""),
                (Some("a\rb"), "\"a\rb\""),
            ],
        );

        check_all(
            QuoteStyle::Never,
            &[
                (None, ""),
                (Some(""), ""),
                (Some("a"), "a"),
                (Some("\""), "\""),
                (Some("a\"\"b"), "a\"\"b"),
                (Some("a b"), "a b"),
                (Some("a,b"), "a,b"),
                (Some("a\nb"), "a\nb"),
                (Some("a\rb"), "a\rb"),
            ],
        );

        check_all(
            QuoteStyle::NonNumeric,
            &[
                (None, ""),
                (Some(""), r#""""#),
                (Some("a"), r#""a""#),
                (Some("\""), r#""""""#),
                (Some("a\"\"b"), r#""a""""b""#),
                (Some("a b"), r#""a b""#),
                (Some("a,b"), r#""a,b""#),
                (Some("a\nb"), "\"a\nb\""),
                (Some("a\rb"), "\"a\rb\""),
            ],
        );
    }
}
997
998