Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-io/src/csv/write/write_impl/serializer.rs
8409 views
1
//! This file is complicated because we have complicated escape handling. We want to avoid having
2
//! to write down each combination of type & escaping, but we also want the compiler to optimize them
3
//! to efficient machine code - so no dynamic dispatch. That means a lot of generics and macros.
4
//!
5
//! We need to differentiate between several kinds of types, and several kinds of escaping we support:
6
//!
7
//! - The simplest escaping mechanisms are [`QuoteStyle::Always`] and [`QuoteStyle::Never`].
8
//! For `Never` we just never quote. For `Always` we pass any serializer that never quotes
9
//! to [`quote_serializer()`] then it becomes quoted properly.
10
//! - [`QuoteStyle::Necessary`] (the default) is only relevant for strings and floats with decimal_comma,
11
//! as these are the only types that can have newlines (row separators), commas (default column separators)
12
//! or quotes. String escaping is complicated anyway, and it is all inside [`string_serializer()`].
13
//! - The real complication is [`QuoteStyle::NonNumeric`], that doesn't quote numbers (unless necessary)
14
//! and nulls, and quotes any other thing. The problem is that nulls can be within any type, so we
15
//! need to handle two possibilities of quoting everywhere.
16
//!
17
//! So in case the chosen style is anything but `NonNumeric`, we statically know for each column except strings
18
//! whether it should be quoted (and for strings too when not `Necessary`). There we use
19
//! `quote_serializer()` or nothing.
20
//!
21
//! But to help with `NonNumeric`, each serializer carries the potential to distinguish between nulls and non-nulls,
22
//! and quote the latter and not the former. But in order to not have the branch when we statically know the answer,
23
//! we have an option to statically disable it with a const generic flag `QUOTE_NON_NULL`. Numbers (that should never
24
//! be quoted with `NonNumeric`) just always disable this flag.
25
//!
26
//! So we have three possibilities:
27
//!
28
//! 1. A serializer that never quotes. This is a bare serializer with `QUOTE_NON_NULL = false`.
29
//! 2. A serializer that always quotes. This is a serializer wrapped with `quote_serializer()`,
30
//! but also with `QUOTE_NON_NULL = false`.
31
//! 3. A serializer that quotes only non-nulls. This is a bare serializer with `QUOTE_NON_NULL = true`.
32
33
use std::fmt::LowerExp;
34
use std::io::Write;
35
36
use arrow::array::{Array, BooleanArray, Float16Array, NullArray, PrimitiveArray, Utf8ViewArray};
37
use arrow::legacy::time_zone::Tz;
38
use arrow::types::NativeType;
39
#[cfg(feature = "timezones")]
40
use chrono::TimeZone;
41
use memchr::{memchr_iter, memchr3};
42
use num_traits::NumCast;
43
use polars_core::prelude::*;
44
use polars_utils::float16::pf16;
45
46
use crate::csv::write::{QuoteStyle, SerializeOptions};
47
48
const TOO_MANY_MSG: &str = "too many items requested from CSV serializer";
49
const ARRAY_MISMATCH_MSG: &str = "wrong array type";
50
51
/// A formatting sink that throws away everything written to it.
///
/// Writing a `Display` value into it tells us only whether formatting succeeds.
// Only used by the feature-gated temporal serializers, so it can be unused in some builds.
#[allow(dead_code)]
struct IgnoreFmt;

impl std::fmt::Write for IgnoreFmt {
    /// Accepts any text and discards it; never fails.
    fn write_str(&mut self, _discarded: &str) -> std::fmt::Result {
        Ok(())
    }
}
58
59
pub(super) trait Serializer<'a> {
60
fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions);
61
}
62
63
fn make_serializer<'a, T, I: Iterator<Item = Option<T>>, const QUOTE_NON_NULL: bool>(
64
f: impl FnMut(T, &mut Vec<u8>, &SerializeOptions),
65
iter: I,
66
) -> impl Serializer<'a> {
67
struct SerializerImpl<F, I, const QUOTE_NON_NULL: bool> {
68
f: F,
69
iter: I,
70
}
71
72
impl<'a, T, F, I, const QUOTE_NON_NULL: bool> Serializer<'a>
73
for SerializerImpl<F, I, QUOTE_NON_NULL>
74
where
75
F: FnMut(T, &mut Vec<u8>, &SerializeOptions),
76
I: Iterator<Item = Option<T>>,
77
{
78
fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {
79
let item = self.iter.next().expect(TOO_MANY_MSG);
80
match item {
81
Some(item) => {
82
if QUOTE_NON_NULL {
83
buf.push(options.quote_char);
84
}
85
(self.f)(item, buf, options);
86
if QUOTE_NON_NULL {
87
buf.push(options.quote_char);
88
}
89
},
90
None => buf.extend_from_slice(options.null.as_bytes()),
91
}
92
}
93
}
94
95
SerializerImpl::<_, _, QUOTE_NON_NULL> { f, iter }
96
}
97
98
fn integer_serializer<I: NativeType + itoa::Integer>(
99
array: &PrimitiveArray<I>,
100
) -> impl Serializer<'_> {
101
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
102
let mut buffer = itoa::Buffer::new();
103
let value = buffer.format(item);
104
buf.extend_from_slice(value.as_bytes());
105
};
106
107
make_serializer::<_, _, false>(f, array.iter())
108
}
109
110
fn float_serializer_no_precision_autoformat_f16(array: &Float16Array) -> impl Serializer<'_> {
111
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
112
let mut buffer = zmij::Buffer::new();
113
let cast: f32 = NumCast::from(item).unwrap();
114
let value = buffer.format(cast);
115
buf.extend_from_slice(value.as_bytes());
116
};
117
float_serializer_no_precision_autoformat_(array, f)
118
}
119
120
fn float_serializer_no_precision_autoformat<I: NativeType + zmij::Float>(
121
array: &PrimitiveArray<I>,
122
) -> impl Serializer<'_> {
123
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
124
let mut buffer = zmij::Buffer::new();
125
let value = buffer.format(item);
126
buf.extend_from_slice(value.as_bytes());
127
};
128
float_serializer_no_precision_autoformat_(array, f)
129
}
130
131
fn float_serializer_no_precision_autoformat_<
132
'a,
133
I: NativeType,
134
F: Fn(&'a I, &mut Vec<u8>, &SerializeOptions),
135
>(
136
array: &'a PrimitiveArray<I>,
137
f: F,
138
) -> impl Serializer<'a> {
139
make_serializer::<_, _, false>(f, array.iter())
140
}
141
142
fn float_serializer_no_precision_autoformat_decimal_comma_f16(
143
array: &Float16Array,
144
) -> impl Serializer<'_> {
145
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
146
let mut buffer = zmij::Buffer::new();
147
let cast: f32 = NumCast::from(item).unwrap();
148
let value = buffer.format(cast);
149
150
for ch in value.as_bytes() {
151
buf.push(if *ch == b'.' { b',' } else { *ch });
152
}
153
};
154
float_serializer_no_precision_autoformat_decimal_comma_(array, f)
155
}
156
157
fn float_serializer_no_precision_autoformat_decimal_comma<I: NativeType + zmij::Float>(
158
array: &PrimitiveArray<I>,
159
) -> impl Serializer<'_> {
160
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
161
let mut buffer = zmij::Buffer::new();
162
let value = buffer.format(item).as_bytes();
163
164
for ch in value {
165
buf.push(if *ch == b'.' { b',' } else { *ch });
166
}
167
};
168
float_serializer_no_precision_autoformat_decimal_comma_(array, f)
169
}
170
171
fn float_serializer_no_precision_autoformat_decimal_comma_<
172
'a,
173
I: NativeType,
174
F: Fn(&'a I, &mut Vec<u8>, &SerializeOptions),
175
>(
176
array: &'a PrimitiveArray<I>,
177
f: F,
178
) -> impl Serializer<'a> {
179
make_serializer::<_, _, false>(f, array.iter())
180
}
181
182
fn float_serializer_no_precision_scientific<I: NativeType + LowerExp>(
183
array: &PrimitiveArray<I>,
184
) -> impl Serializer<'_> {
185
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
186
// Float writing into a buffer of `Vec<u8>` cannot fail.
187
let _ = write!(buf, "{item:.e}");
188
};
189
190
make_serializer::<_, _, false>(f, array.iter())
191
}
192
193
fn float_serializer_no_precision_scientific_decimal_comma<I: NativeType + LowerExp>(
194
array: &PrimitiveArray<I>,
195
) -> impl Serializer<'_> {
196
let mut scratch = Vec::new();
197
198
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
199
// Float writing into a buffer of `Vec<u8>` cannot fail.
200
let _ = write!(&mut scratch, "{item:.e}");
201
for c in &mut scratch {
202
if *c == b'.' {
203
*c = b',';
204
break;
205
}
206
}
207
buf.extend_from_slice(&scratch);
208
};
209
210
make_serializer::<_, _, false>(f, array.iter())
211
}
212
213
fn float_serializer_no_precision_positional<I: NativeType + NumCast>(
214
array: &PrimitiveArray<I>,
215
) -> impl Serializer<'_> {
216
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
217
let v: f64 = NumCast::from(item).unwrap();
218
let _ = write!(buf, "{v}");
219
};
220
221
make_serializer::<_, _, false>(f, array.iter())
222
}
223
224
fn float_serializer_no_precision_positional_decimal_comma<I: NativeType + NumCast>(
225
array: &PrimitiveArray<I>,
226
) -> impl Serializer<'_> {
227
let mut scratch = Vec::new();
228
229
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
230
scratch.clear();
231
let v: f64 = NumCast::from(item).unwrap();
232
let _ = write!(&mut scratch, "{v}");
233
for c in &mut scratch {
234
if *c == b'.' {
235
*c = b',';
236
break;
237
}
238
}
239
buf.extend_from_slice(&scratch);
240
};
241
242
make_serializer::<_, _, false>(f, array.iter())
243
}
244
245
fn float_serializer_with_precision_scientific<I: NativeType + LowerExp>(
246
array: &PrimitiveArray<I>,
247
precision: usize,
248
) -> impl Serializer<'_> {
249
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
250
// Float writing into a buffer of `Vec<u8>` cannot fail.
251
let _ = write!(buf, "{item:.precision$e}");
252
};
253
254
make_serializer::<_, _, false>(f, array.iter())
255
}
256
257
fn float_serializer_with_precision_scientific_decimal_comma<I: NativeType + LowerExp>(
258
array: &PrimitiveArray<I>,
259
precision: usize,
260
) -> impl Serializer<'_> {
261
let mut scratch = Vec::new();
262
263
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
264
scratch.clear();
265
// Float writing into a buffer of `Vec<u8>` cannot fail.
266
let _ = write!(&mut scratch, "{item:.precision$e}");
267
for c in &mut scratch {
268
if *c == b'.' {
269
*c = b',';
270
break;
271
}
272
}
273
buf.extend_from_slice(&scratch);
274
};
275
276
make_serializer::<_, _, false>(f, array.iter())
277
}
278
279
fn float_serializer_with_precision_positional<I: NativeType>(
280
array: &PrimitiveArray<I>,
281
precision: usize,
282
) -> impl Serializer<'_> {
283
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
284
// Float writing into a buffer of `Vec<u8>` cannot fail.
285
let _ = write!(buf, "{item:.precision$}");
286
};
287
288
make_serializer::<_, _, false>(f, array.iter())
289
}
290
291
fn float_serializer_with_precision_positional_decimal_comma<I: NativeType>(
292
array: &PrimitiveArray<I>,
293
precision: usize,
294
) -> impl Serializer<'_> {
295
let mut scratch = Vec::new();
296
297
let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
298
scratch.clear();
299
let _ = write!(&mut scratch, "{item:.precision$}");
300
for c in &mut scratch {
301
if *c == b'.' {
302
*c = b',';
303
break;
304
}
305
}
306
buf.extend_from_slice(&scratch);
307
};
308
309
make_serializer::<_, _, false>(f, array.iter())
310
}
311
312
fn null_serializer(_array: &NullArray) -> impl Serializer<'_> {
313
struct NullSerializer;
314
impl<'a> Serializer<'a> for NullSerializer {
315
fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {
316
buf.extend_from_slice(options.null.as_bytes());
317
}
318
}
319
NullSerializer
320
}
321
322
fn bool_serializer<const QUOTE_NON_NULL: bool>(array: &BooleanArray) -> impl Serializer<'_> {
323
let f = move |item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
324
let s = if item { "true" } else { "false" };
325
buf.extend_from_slice(s.as_bytes());
326
};
327
328
make_serializer::<_, _, QUOTE_NON_NULL>(f, array.iter())
329
}
330
331
/// Serializer for 128-bit decimals with the given `scale`.
///
/// The trim-trailing-zeros setting is read once, when the serializer is built; the decimal
/// separator is taken from `options.decimal_comma` on every call.
#[cfg(feature = "dtype-decimal")]
fn decimal_serializer(array: &PrimitiveArray<i128>, scale: usize) -> impl Serializer<'_> {
    let trim_zeros = arrow::compute::decimal::get_trim_decimal_zeros();
    let mut formatter = polars_compute::decimal::DecimalFmtBuffer::new();

    let write_decimal = move |&value, out: &mut Vec<u8>, options: &SerializeOptions| {
        let text = formatter.format_dec128(value, scale, trim_zeros, options.decimal_comma);
        out.extend_from_slice(text.as_bytes());
    };

    make_serializer::<_, _, false>(write_decimal, array.iter())
}
346
347
/// Adapts a `callback(value, buf)` formatter, which does not need the serialize options,
/// into a [`Serializer`] over the values of `array`.
#[cfg(any(
    feature = "dtype-date",
    feature = "dtype-time",
    feature = "dtype-datetime"
))]
fn callback_serializer<'a, T: NativeType, const QUOTE_NON_NULL: bool>(
    array: &'a PrimitiveArray<T>,
    mut callback: impl FnMut(T, &mut Vec<u8>) + 'a,
) -> impl Serializer<'a> {
    // Drop the options argument that `make_serializer` passes along.
    let forward = move |&value, out: &mut Vec<u8>, _: &SerializeOptions| callback(value, out);

    make_serializer::<_, _, QUOTE_NON_NULL>(forward, array.iter())
}
362
363
/// Iterator over pre-parsed chrono format items.
#[cfg(any(feature = "dtype-date", feature = "dtype-time"))]
type ChronoFormatIter<'a, 'b> = std::slice::Iter<'a, chrono::format::Item<'b>>;

/// Builds the serializer for a date or time column.
///
/// - `format_str`: optional strftime pattern; without one, values are written with their
///   `Display` implementation.
/// - `description`: type name used in error messages.
/// - `array`: downcast to the primitive array of `Underlying`.
/// - `sample_value`: value used to test-drive `format_str` up front.
/// - `convert`: maps the physical value to the chrono value `T`.
/// - `format_fn`: formats a `T` with the parsed format items.
///
/// # Errors
///
/// Returns a `ComputeError` if `format_str` cannot be parsed, or if formatting
/// `sample_value` with it fails.
///
/// # Panics
///
/// Panics if `array` is not a primitive array of `Underlying`.
#[cfg(any(feature = "dtype-date", feature = "dtype-time"))]
fn date_and_time_serializer<'a, Underlying: NativeType, T: std::fmt::Display>(
    format_str: Option<&'a str>,
    description: &str,
    array: &'a dyn Array,
    sample_value: T,
    mut convert: impl FnMut(Underlying) -> T + Send + 'a,
    mut format_fn: impl for<'b> FnMut(
        &T,
        ChronoFormatIter<'b, 'a>,
    ) -> chrono::format::DelayedFormat<ChronoFormatIter<'b, 'a>>
    + Send
    + 'a,
    options: &SerializeOptions,
) -> PolarsResult<Box<dyn Serializer<'a> + Send + 'a>> {
    use std::fmt::Write;

    let array = array.as_any().downcast_ref().unwrap();

    // No custom format: fall back to the value's `Display` output.
    let Some(format_str) = format_str else {
        let callback = move |item, buf: &mut Vec<u8>| {
            let item = convert(item);
            // Formatting dates into `Vec<u8>` cannot fail.
            let _ = write!(buf, "{item}");
        };
        return Ok(date_and_time_final_serializer(array, callback, options));
    };

    let invalid_format =
        || polars_err!(ComputeError: "cannot format {description} with format '{format_str}'");

    let format = chrono::format::StrftimeItems::new(format_str)
        .parse()
        .map_err(|_| invalid_format())?;

    // Fail fast for an invalid format. This returns the error to the user sooner, and lets
    // `serialize()` stay infallible.
    write!(IgnoreFmt, "{}", format_fn(&sample_value, format.iter()))
        .map_err(|_| invalid_format())?;

    let callback = move |item, buf: &mut Vec<u8>| {
        let item = convert(item);
        // We checked the format is valid above.
        let _ = write!(buf, "{}", format_fn(&item, format.iter()));
    };
    Ok(date_and_time_final_serializer(array, callback, options))
}
411
412
/// Wraps a temporal `callback` formatter in the quoting dictated by `options.quote_style`
/// and boxes the result.
///
/// - `Always`: every cell is quoted, nulls included.
/// - `NonNumeric`: only non-null cells are quoted.
/// - `Necessary` / `Never`: nothing is quoted.
#[cfg(any(
    feature = "dtype-date",
    feature = "dtype-time",
    feature = "dtype-datetime"
))]
fn date_and_time_final_serializer<'a, T: NativeType>(
    array: &'a PrimitiveArray<T>,
    callback: impl FnMut(T, &mut Vec<u8>) + Send + 'a,
    options: &SerializeOptions,
) -> Box<dyn Serializer<'a> + Send + 'a> {
    let boxed: Box<dyn Serializer<'a> + Send + 'a> = match options.quote_style {
        QuoteStyle::NonNumeric => Box::new(callback_serializer::<T, true>(array, callback)),
        QuoteStyle::Always => {
            let unquoted = callback_serializer::<T, false>(array, callback);
            Box::new(quote_serializer(unquoted))
        },
        QuoteStyle::Necessary | QuoteStyle::Never => {
            Box::new(callback_serializer::<T, false>(array, callback))
        },
    };
    boxed
}
430
431
pub(super) fn string_serializer<'a, Iter: Send + 'a>(
432
mut f: impl FnMut(&mut Iter) -> Option<&str> + Send + 'a,
433
options: &SerializeOptions,
434
mut update: impl FnMut(&'a dyn Array) -> Iter + Send + 'a,
435
array: &'a dyn Array,
436
) -> Box<dyn Serializer<'a> + 'a + Send> {
437
const LF: u8 = b'\n';
438
const CR: u8 = b'\r';
439
440
struct StringSerializer<F, Iter> {
441
serialize: F,
442
iter: Iter,
443
}
444
445
impl<'a, F, Iter> Serializer<'a> for StringSerializer<F, Iter>
446
where
447
F: FnMut(&mut Iter, &mut Vec<u8>, &SerializeOptions),
448
{
449
fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {
450
(self.serialize)(&mut self.iter, buf, options);
451
}
452
}
453
454
fn serialize_str_escaped(buf: &mut Vec<u8>, s: &[u8], quote_char: u8, quoted: bool) {
455
let mut iter = memchr_iter(quote_char, s);
456
let first_quote = iter.next();
457
match first_quote {
458
None => buf.extend_from_slice(s),
459
Some(mut quote_pos) => {
460
if !quoted {
461
buf.push(quote_char);
462
}
463
let mut start_pos = 0;
464
loop {
465
buf.extend_from_slice(&s[start_pos..quote_pos]);
466
buf.extend_from_slice(&[quote_char, quote_char]);
467
match iter.next() {
468
Some(quote) => {
469
start_pos = quote_pos + 1;
470
quote_pos = quote;
471
},
472
None => {
473
buf.extend_from_slice(&s[quote_pos + 1..]);
474
break;
475
},
476
}
477
}
478
if !quoted {
479
buf.push(quote_char);
480
}
481
},
482
}
483
}
484
485
let iter = update(array);
486
match options.quote_style {
487
QuoteStyle::Always => {
488
let serialize =
489
move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {
490
let quote_char = options.quote_char;
491
buf.push(quote_char);
492
let Some(s) = f(iter) else {
493
buf.extend_from_slice(options.null.as_bytes());
494
buf.push(quote_char);
495
return;
496
};
497
serialize_str_escaped(buf, s.as_bytes(), quote_char, true);
498
buf.push(quote_char);
499
};
500
Box::new(StringSerializer { serialize, iter })
501
},
502
QuoteStyle::NonNumeric => {
503
let serialize =
504
move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {
505
let Some(s) = f(iter) else {
506
buf.extend_from_slice(options.null.as_bytes());
507
return;
508
};
509
let quote_char = options.quote_char;
510
buf.push(quote_char);
511
serialize_str_escaped(buf, s.as_bytes(), quote_char, true);
512
buf.push(quote_char);
513
};
514
Box::new(StringSerializer { serialize, iter })
515
},
516
QuoteStyle::Necessary => {
517
let serialize =
518
move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {
519
let Some(s) = f(iter) else {
520
buf.extend_from_slice(options.null.as_bytes());
521
return;
522
};
523
let quote_char = options.quote_char;
524
// An empty string conflicts with null, so it is necessary to quote.
525
if s.is_empty() {
526
buf.extend_from_slice(&[quote_char, quote_char]);
527
return;
528
}
529
let needs_quote = memchr3(options.separator, LF, CR, s.as_bytes()).is_some();
530
if needs_quote {
531
buf.push(quote_char);
532
}
533
serialize_str_escaped(buf, s.as_bytes(), quote_char, needs_quote);
534
if needs_quote {
535
buf.push(quote_char);
536
}
537
};
538
Box::new(StringSerializer { serialize, iter })
539
},
540
QuoteStyle::Never => {
541
let serialize =
542
move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {
543
let Some(s) = f(iter) else {
544
buf.extend_from_slice(options.null.as_bytes());
545
return;
546
};
547
buf.extend_from_slice(s.as_bytes());
548
};
549
Box::new(StringSerializer { serialize, iter })
550
},
551
}
552
}
553
554
fn quote_serializer<'a>(serializer: impl Serializer<'a>) -> impl Serializer<'a> {
555
struct QuoteSerializer<S>(S);
556
impl<'a, S: Serializer<'a>> Serializer<'a> for QuoteSerializer<S> {
557
fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {
558
buf.push(options.quote_char);
559
self.0.serialize(buf, options);
560
buf.push(options.quote_char);
561
}
562
}
563
QuoteSerializer(serializer)
564
}
565
566
pub(super) fn serializer_for<'a>(
567
array: &'a dyn Array,
568
options: &'a SerializeOptions,
569
dtype: &'a DataType,
570
_datetime_format: &'a str,
571
_time_zone: Option<Tz>,
572
) -> PolarsResult<Box<dyn Serializer<'a> + Send + 'a>> {
573
// The needs_quotes flag captures the quote logic for the quote_wrapper! macro
574
// It is targeted at numerical types primarily; other types may required additional logic
575
let needs_quotes = match dtype {
576
DataType::Float16 | DataType::Float32 | DataType::Float64 => {
577
// When comma is used as both the field separator and decimal separator, quoting
578
// may be required. Specifically, when:
579
// - quote_style is Always, or
580
// - quote_style is Necessary or Non-Numeric, the field separator is also a comma,
581
// and the float string field contains a comma character (no precision or precision > 0)
582
//
583
// In some rare cases, a field may get quoted when it is not strictly necessary
584
// (e.g., in scientific notation when only the first digit is non-zero such as '1e12',
585
// or null values in 'non_numeric' quote_style).
586
587
let mut should_quote = options.decimal_comma && options.separator == b',';
588
if let Some(precision) = options.float_precision {
589
should_quote &= precision > 0;
590
}
591
592
match options.quote_style {
593
QuoteStyle::Always => true,
594
QuoteStyle::Necessary | QuoteStyle::NonNumeric => should_quote,
595
QuoteStyle::Never => false,
596
}
597
},
598
#[cfg(feature = "dtype-decimal")]
599
DataType::Decimal(_, scale) => {
600
// Similar to logic for float data-types, but need to consider scale rather than precision
601
let should_quote = options.decimal_comma && options.separator == b',' && *scale > 0;
602
603
match options.quote_style {
604
QuoteStyle::Always => true,
605
QuoteStyle::Necessary | QuoteStyle::NonNumeric => should_quote,
606
QuoteStyle::Never => false,
607
}
608
},
609
_ => options.quote_style == QuoteStyle::Always,
610
};
611
612
macro_rules! quote_wrapper {
613
($make_serializer:path, $($arg:tt)*) => {{
614
let serializer = $make_serializer(array.as_any().downcast_ref().unwrap(), $($arg)*);
615
if needs_quotes {
616
Box::new(quote_serializer(serializer)) as Box<dyn Serializer + Send>
617
} else {
618
Box::new(serializer)
619
}
620
}};
621
($make_serializer:path) => { quote_wrapper!($make_serializer,) };
622
}
623
624
let serializer = match dtype {
625
DataType::Int8 => quote_wrapper!(integer_serializer::<i8>),
626
DataType::UInt8 => quote_wrapper!(integer_serializer::<u8>),
627
DataType::Int16 => quote_wrapper!(integer_serializer::<i16>),
628
DataType::UInt16 => quote_wrapper!(integer_serializer::<u16>),
629
DataType::Int32 => quote_wrapper!(integer_serializer::<i32>),
630
DataType::UInt32 => quote_wrapper!(integer_serializer::<u32>),
631
DataType::Int64 => quote_wrapper!(integer_serializer::<i64>),
632
DataType::UInt64 => quote_wrapper!(integer_serializer::<u64>),
633
DataType::Int128 => quote_wrapper!(integer_serializer::<i128>),
634
DataType::UInt128 => quote_wrapper!(integer_serializer::<u128>),
635
DataType::Float16 => {
636
match (
637
options.decimal_comma,
638
options.float_precision,
639
options.float_scientific,
640
) {
641
// standard decimal separator (period)
642
(false, Some(precision), Some(true)) => {
643
quote_wrapper!(
644
float_serializer_with_precision_scientific::<pf16>,
645
precision
646
)
647
},
648
(false, Some(precision), _) => {
649
quote_wrapper!(
650
float_serializer_with_precision_positional::<pf16>,
651
precision
652
)
653
},
654
(false, None, Some(true)) => {
655
quote_wrapper!(float_serializer_no_precision_scientific::<pf16>)
656
},
657
(false, None, Some(false)) => {
658
quote_wrapper!(float_serializer_no_precision_positional::<pf16>)
659
},
660
(false, None, None) => {
661
quote_wrapper!(float_serializer_no_precision_autoformat_f16)
662
},
663
664
// comma as the decimal separator
665
(true, Some(precision), Some(true)) => quote_wrapper!(
666
float_serializer_with_precision_scientific_decimal_comma::<pf16>,
667
precision
668
),
669
(true, Some(precision), _) => quote_wrapper!(
670
float_serializer_with_precision_positional_decimal_comma::<pf16>,
671
precision
672
),
673
(true, None, Some(true)) => {
674
quote_wrapper!(float_serializer_no_precision_scientific_decimal_comma::<pf16>)
675
},
676
(true, None, Some(false)) => {
677
quote_wrapper!(float_serializer_no_precision_positional_decimal_comma::<pf16>)
678
},
679
(true, None, None) => {
680
quote_wrapper!(float_serializer_no_precision_autoformat_decimal_comma_f16)
681
},
682
}
683
},
684
DataType::Float32 => {
685
match (
686
options.decimal_comma,
687
options.float_precision,
688
options.float_scientific,
689
) {
690
// standard decimal separator (period)
691
(false, Some(precision), Some(true)) => {
692
quote_wrapper!(float_serializer_with_precision_scientific::<f32>, precision)
693
},
694
(false, Some(precision), _) => {
695
quote_wrapper!(float_serializer_with_precision_positional::<f32>, precision)
696
},
697
(false, None, Some(true)) => {
698
quote_wrapper!(float_serializer_no_precision_scientific::<f32>)
699
},
700
(false, None, Some(false)) => {
701
quote_wrapper!(float_serializer_no_precision_positional::<f32>)
702
},
703
(false, None, None) => {
704
quote_wrapper!(float_serializer_no_precision_autoformat::<f32>)
705
},
706
707
// comma as the decimal separator
708
(true, Some(precision), Some(true)) => quote_wrapper!(
709
float_serializer_with_precision_scientific_decimal_comma::<f32>,
710
precision
711
),
712
(true, Some(precision), _) => quote_wrapper!(
713
float_serializer_with_precision_positional_decimal_comma::<f32>,
714
precision
715
),
716
(true, None, Some(true)) => {
717
quote_wrapper!(float_serializer_no_precision_scientific_decimal_comma::<f32>)
718
},
719
(true, None, Some(false)) => {
720
quote_wrapper!(float_serializer_no_precision_positional_decimal_comma::<f32>)
721
},
722
(true, None, None) => {
723
quote_wrapper!(float_serializer_no_precision_autoformat_decimal_comma::<f32>)
724
},
725
}
726
},
727
DataType::Float64 => {
728
match (
729
options.decimal_comma,
730
options.float_precision,
731
options.float_scientific,
732
) {
733
// standard decimal separator (period)
734
(false, Some(precision), Some(true)) => {
735
quote_wrapper!(float_serializer_with_precision_scientific::<f64>, precision)
736
},
737
(false, Some(precision), _) => {
738
quote_wrapper!(float_serializer_with_precision_positional::<f64>, precision)
739
},
740
(false, None, Some(true)) => {
741
quote_wrapper!(float_serializer_no_precision_scientific::<f64>)
742
},
743
(false, None, Some(false)) => {
744
quote_wrapper!(float_serializer_no_precision_positional::<f64>)
745
},
746
(false, None, None) => {
747
quote_wrapper!(float_serializer_no_precision_autoformat::<f64>)
748
},
749
750
// comma as the decimal separator
751
(true, Some(precision), Some(true)) => quote_wrapper!(
752
float_serializer_with_precision_scientific_decimal_comma::<f64>,
753
precision
754
),
755
(true, Some(precision), _) => quote_wrapper!(
756
float_serializer_with_precision_positional_decimal_comma::<f64>,
757
precision
758
),
759
(true, None, Some(true)) => {
760
quote_wrapper!(float_serializer_no_precision_scientific_decimal_comma::<f64>)
761
},
762
(true, None, Some(false)) => {
763
quote_wrapper!(float_serializer_no_precision_positional_decimal_comma::<f64>)
764
},
765
(true, None, None) => {
766
quote_wrapper!(float_serializer_no_precision_autoformat_decimal_comma::<f64>)
767
},
768
}
769
},
770
DataType::Null => quote_wrapper!(null_serializer),
771
DataType::Boolean => {
772
let array = array.as_any().downcast_ref().unwrap();
773
match options.quote_style {
774
QuoteStyle::Always => Box::new(quote_serializer(bool_serializer::<false>(array)))
775
as Box<dyn Serializer + Send>,
776
QuoteStyle::NonNumeric => Box::new(bool_serializer::<true>(array)),
777
_ => Box::new(bool_serializer::<false>(array)),
778
}
779
},
780
#[cfg(feature = "dtype-date")]
781
DataType::Date => date_and_time_serializer(
782
options.date_format.as_deref(),
783
"NaiveDate",
784
array,
785
chrono::NaiveDate::MAX,
786
arrow::temporal_conversions::date32_to_date,
787
|date, items| date.format_with_items(items),
788
options,
789
)?,
790
#[cfg(feature = "dtype-time")]
791
DataType::Time => date_and_time_serializer(
792
Some(options.time_format.as_deref().unwrap_or("%T%.9f")),
793
"NaiveTime",
794
array,
795
chrono::NaiveTime::MIN,
796
arrow::temporal_conversions::time64ns_to_time,
797
|time, items| time.format_with_items(items),
798
options,
799
)?,
800
#[cfg(feature = "dtype-datetime")]
801
DataType::Datetime(time_unit, _) => {
802
let format = chrono::format::StrftimeItems::new(_datetime_format)
803
.parse()
804
.map_err(|_| {
805
polars_err!(
806
ComputeError: "cannot format {} with format '{_datetime_format}'",
807
if _time_zone.is_some() { "DateTime" } else { "NaiveDateTime" },
808
)
809
})?;
810
use std::fmt::Write;
811
let sample_datetime = match _time_zone {
812
#[cfg(feature = "timezones")]
813
Some(time_zone) => time_zone
814
.from_utc_datetime(&chrono::NaiveDateTime::MAX)
815
.format_with_items(format.iter()),
816
#[cfg(not(feature = "timezones"))]
817
Some(_) => panic!("activate 'timezones' feature"),
818
None => chrono::NaiveDateTime::MAX.format_with_items(format.iter()),
819
};
820
// Fail fast for invalid format. This return error faster to the user, and allows us to not return
821
// `Result` from `serialize()`.
822
write!(IgnoreFmt, "{sample_datetime}").map_err(|_| {
823
polars_err!(
824
ComputeError: "cannot format {} with format '{_datetime_format}'",
825
if _time_zone.is_some() { "DateTime" } else { "NaiveDateTime" },
826
)
827
})?;
828
829
let array = array.as_any().downcast_ref().unwrap();
830
831
macro_rules! time_unit_serializer {
832
($convert:ident) => {
833
match _time_zone {
834
#[cfg(feature = "timezones")]
835
Some(time_zone) => {
836
let callback = move |item, buf: &mut Vec<u8>| {
837
let item = arrow::temporal_conversions::$convert(item);
838
let item = time_zone.from_utc_datetime(&item);
839
// We checked the format is valid above.
840
let _ = write!(buf, "{}", item.format_with_items(format.iter()));
841
};
842
date_and_time_final_serializer(array, callback, options)
843
},
844
#[cfg(not(feature = "timezones"))]
845
Some(_) => panic!("activate 'timezones' feature"),
846
None => {
847
let callback = move |item, buf: &mut Vec<u8>| {
848
let item = arrow::temporal_conversions::$convert(item);
849
// We checked the format is valid above.
850
let _ = write!(buf, "{}", item.format_with_items(format.iter()));
851
};
852
date_and_time_final_serializer(array, callback, options)
853
},
854
}
855
};
856
}
857
858
match time_unit {
859
TimeUnit::Nanoseconds => time_unit_serializer!(timestamp_ns_to_datetime),
860
TimeUnit::Microseconds => time_unit_serializer!(timestamp_us_to_datetime),
861
TimeUnit::Milliseconds => time_unit_serializer!(timestamp_ms_to_datetime),
862
}
863
},
864
DataType::String => string_serializer(
865
|iter| Iterator::next(iter).expect(TOO_MANY_MSG),
866
options,
867
|arr| {
868
arr.as_any()
869
.downcast_ref::<Utf8ViewArray>()
870
.expect(ARRAY_MISMATCH_MSG)
871
.iter()
872
},
873
array,
874
),
875
#[cfg(feature = "dtype-categorical")]
876
DataType::Categorical(_, mapping) | DataType::Enum(_, mapping) => {
877
polars_core::with_match_categorical_physical_type!(dtype.cat_physical().unwrap(), |$C| {
878
string_serializer(
879
|iter| {
880
let &idx: &<$C as PolarsCategoricalType>::Native = Iterator::next(iter).expect(TOO_MANY_MSG)?;
881
Some(unsafe { mapping.cat_to_str_unchecked(idx.as_cat()) })
882
},
883
options,
884
|arr| {
885
arr.as_any()
886
.downcast_ref::<PrimitiveArray<<$C as PolarsCategoricalType>::Native>>()
887
.expect(ARRAY_MISMATCH_MSG)
888
.iter()
889
},
890
array,
891
)
892
})
893
},
894
#[cfg(feature = "dtype-decimal")]
895
DataType::Decimal(_, scale) => {
896
quote_wrapper!(decimal_serializer, *scale)
897
},
898
_ => {
899
polars_bail!(ComputeError: "datatype {dtype} cannot be written to CSV\n\nConsider using JSON or a binary format.")
900
},
901
};
902
Ok(serializer)
903
}
904
905
#[cfg(test)]
mod test {
    use arrow::array::NullArray;
    use polars_core::prelude::ArrowDataType;

    use super::string_serializer;
    use crate::csv::write::options::{QuoteStyle, SerializeOptions};

    // It is the most complex serializer with most edge cases, it definitely needs a comprehensive test.
    #[test]
    fn test_string_serializer() {
        /// Serializes a single optional string under `options` and compares it to `expected`.
        #[track_caller]
        fn check_string_serialization(options: &SerializeOptions, s: Option<&str>, expected: &str) {
            // The array is never read: the "iterator" is just the value under test.
            let fake_array = NullArray::new(ArrowDataType::Null, 0);
            let mut serializer = string_serializer(|s| *s, options, |_| s, &fake_array);
            let mut buf = Vec::new();
            serializer.serialize(&mut buf, options);
            let serialized = std::str::from_utf8(&buf).unwrap();
            // Don't use `assert_eq!()` because it prints debug format and it's hard to read with all the escapes.
            assert!(
                serialized == expected,
                "CSV string {s:?} wasn't serialized correctly: expected: `{expected}`, got: `{serialized}`"
            );
        }

        /// Default options with only the quote style overridden.
        fn options_with(quote_style: QuoteStyle) -> SerializeOptions {
            SerializeOptions {
                quote_style,
                ..SerializeOptions::default()
            }
        }

        let always_quote = options_with(QuoteStyle::Always);
        for (input, expected) in [
            (None, r#""""#),
            (Some(""), r#""""#),
            (Some("a"), r#""a""#),
            (Some("\""), r#""""""#),
            (Some("a\"\"b"), r#""a""""b""#),
        ] {
            check_string_serialization(&always_quote, input, expected);
        }

        let necessary_quote = options_with(QuoteStyle::Necessary);
        for (input, expected) in [
            (None, r#""#),
            (Some(""), r#""""#),
            (Some("a"), r#"a"#),
            (Some("\""), r#""""""#),
            (Some("a\"\"b"), r#""a""""b""#),
            (Some("a b"), r#"a b"#),
            (Some("a,b"), r#""a,b""#),
            (Some("a\nb"), "\"a\nb\""),
            (Some("a\rb"), "\"a\rb\""),
        ] {
            check_string_serialization(&necessary_quote, input, expected);
        }

        let never_quote = options_with(QuoteStyle::Never);
        for (input, expected) in [
            (None, ""),
            (Some(""), ""),
            (Some("a"), "a"),
            (Some("\""), "\""),
            (Some("a\"\"b"), "a\"\"b"),
            (Some("a b"), "a b"),
            (Some("a,b"), "a,b"),
            (Some("a\nb"), "a\nb"),
            (Some("a\rb"), "a\rb"),
        ] {
            check_string_serialization(&never_quote, input, expected);
        }

        let non_numeric_quote = options_with(QuoteStyle::NonNumeric);
        for (input, expected) in [
            (None, ""),
            (Some(""), r#""""#),
            (Some("a"), r#""a""#),
            (Some("\""), r#""""""#),
            (Some("a\"\"b"), r#""a""""b""#),
            (Some("a b"), r#""a b""#),
            (Some("a,b"), r#""a,b""#),
            (Some("a\nb"), "\"a\nb\""),
            (Some("a\rb"), "\"a\rb\""),
        ] {
            check_string_serialization(&non_numeric_quote, input, expected);
        }
    }
}
984
985