// Path: blob/main/crates/polars-io/src/csv/write/write_impl/serializer.rs
// 6939 views
//! This file is complicated because we have complicated escape handling. We want to avoid having1//! to write down each combination of type & escaping, but we also want the compiler to optimize them2//! to efficient machine code - so no dynamic dispatch. That means a lot of generics and macros.3//!4//! We need to differentiate between several kinds of types, and several kinds of escaping we support:5//!6//! - The simplest escaping mechanism are [`QuoteStyle::Always`] and [`QuoteStyle::Never`].7//! For `Never` we just never quote. For `Always` we pass any serializer that never quotes8//! to [`quote_serializer()`] then it becomes quoted properly.9//! - [`QuoteStyle::Necessary`] (the default) is only relevant for strings and floats with decimal_comma,10//! as these are the only types that can have newlines (row separators), commas (default column separators)11//! or quotes. String escaping is complicated anyway, and it is all inside [`string_serializer()`].12//! - The real complication is [`QuoteStyle::NonNumeric`], that doesn't quote numbers (unless necessary)13//! and nulls, and quotes any other thing. The problem is that nulls can be within any type, so we14//! need to handle two possibilities of quoting everywhere.15//!16//! So in case the chosen style is anything but `NonNumeric`, we statically know for each column except strings17//! whether it should be quoted (and for strings too when not `Necessary`). There we use18//! `quote_serializer()` or nothing.19//!20//! But to help with `NonNumeric`, each serializer carry the potential to distinguish between nulls and non-nulls,21//! and quote the latter and not the former. But in order to not have the branch when we statically know the answer,22//! we have an option to statically disable it with a const generic flag `QUOTE_NON_NULL`. Numbers (that should never23//! be quoted with `NonNumeric`) just always disable this flag.24//!25//! So we have three possibilities:26//!27//! 1. A serializer that never quotes. 
This is a bare serializer with `QUOTE_NON_NULL = false`.28//! 2. A serializer that always quotes. This is a serializer wrapped with `quote_serializer()`,29//! but also with `QUOTE_NON_NULL = false`.30//! 3. A serializer that quotes only non-nulls. This is a bare serializer with `QUOTE_NON_NULL = true`.3132use std::fmt::LowerExp;33use std::io::Write;3435use arrow::array::{Array, BooleanArray, NullArray, PrimitiveArray, Utf8ViewArray};36use arrow::legacy::time_zone::Tz;37use arrow::types::NativeType;38#[cfg(feature = "timezones")]39use chrono::TimeZone;40use memchr::{memchr_iter, memchr3};41use num_traits::NumCast;42use polars_core::prelude::*;4344use crate::csv::write::{QuoteStyle, SerializeOptions};4546const TOO_MANY_MSG: &str = "too many items requested from CSV serializer";47const ARRAY_MISMATCH_MSG: &str = "wrong array type";4849#[allow(dead_code)]50struct IgnoreFmt;51impl std::fmt::Write for IgnoreFmt {52fn write_str(&mut self, _s: &str) -> std::fmt::Result {53Ok(())54}55}5657pub(super) trait Serializer<'a> {58fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions);59// Updates the array without changing the configuration.60fn update_array(&mut self, array: &'a dyn Array);61}6263fn make_serializer<'a, T, I: Iterator<Item = Option<T>>, const QUOTE_NON_NULL: bool>(64f: impl FnMut(T, &mut Vec<u8>, &SerializeOptions),65iter: I,66update_array: impl FnMut(&'a dyn Array) -> I,67) -> impl Serializer<'a> {68struct SerializerImpl<F, I, Update, const QUOTE_NON_NULL: bool> {69f: F,70iter: I,71update_array: Update,72}7374impl<'a, T, F, I, Update, const QUOTE_NON_NULL: bool> Serializer<'a>75for SerializerImpl<F, I, Update, QUOTE_NON_NULL>76where77F: FnMut(T, &mut Vec<u8>, &SerializeOptions),78I: Iterator<Item = Option<T>>,79Update: FnMut(&'a dyn Array) -> I,80{81fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {82let item = self.iter.next().expect(TOO_MANY_MSG);83match item {84Some(item) => {85if QUOTE_NON_NULL 
{86buf.push(options.quote_char);87}88(self.f)(item, buf, options);89if QUOTE_NON_NULL {90buf.push(options.quote_char);91}92},93None => buf.extend_from_slice(options.null.as_bytes()),94}95}9697fn update_array(&mut self, array: &'a dyn Array) {98self.iter = (self.update_array)(array);99}100}101102SerializerImpl::<_, _, _, QUOTE_NON_NULL> {103f,104iter,105update_array,106}107}108109fn integer_serializer<I: NativeType + itoa::Integer>(110array: &PrimitiveArray<I>,111) -> impl Serializer<'_> {112let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {113let mut buffer = itoa::Buffer::new();114let value = buffer.format(item);115buf.extend_from_slice(value.as_bytes());116};117118make_serializer::<_, _, false>(f, array.iter(), |array| {119array120.as_any()121.downcast_ref::<PrimitiveArray<I>>()122.expect(ARRAY_MISMATCH_MSG)123.iter()124})125}126127fn float_serializer_no_precision_autoformat<I: NativeType + ryu::Float>(128array: &PrimitiveArray<I>,129) -> impl Serializer<'_> {130let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {131let mut buffer = ryu::Buffer::new();132let value = buffer.format(item);133buf.extend_from_slice(value.as_bytes());134};135136make_serializer::<_, _, false>(f, array.iter(), |array| {137array138.as_any()139.downcast_ref::<PrimitiveArray<I>>()140.expect(ARRAY_MISMATCH_MSG)141.iter()142})143}144145fn float_serializer_no_precision_autoformat_decimal_comma<I: NativeType + ryu::Float>(146array: &PrimitiveArray<I>,147) -> impl Serializer<'_> {148let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {149let mut buffer = ryu::Buffer::new();150let value = buffer.format(item).as_bytes();151152for ch in value {153buf.push(if *ch == b'.' 
{ b',' } else { *ch });154}155};156157make_serializer::<_, _, false>(f, array.iter(), |array| {158array159.as_any()160.downcast_ref::<PrimitiveArray<I>>()161.expect(ARRAY_MISMATCH_MSG)162.iter()163})164}165166fn float_serializer_no_precision_scientific<I: NativeType + LowerExp>(167array: &PrimitiveArray<I>,168) -> impl Serializer<'_> {169let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {170// Float writing into a buffer of `Vec<u8>` cannot fail.171let _ = write!(buf, "{item:.e}");172};173174make_serializer::<_, _, false>(f, array.iter(), |array| {175array176.as_any()177.downcast_ref::<PrimitiveArray<I>>()178.expect(ARRAY_MISMATCH_MSG)179.iter()180})181}182183fn float_serializer_no_precision_scientific_decimal_comma<I: NativeType + LowerExp>(184array: &PrimitiveArray<I>,185) -> impl Serializer<'_> {186let mut scratch = Vec::new();187188let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {189// Float writing into a buffer of `Vec<u8>` cannot fail.190let _ = write!(&mut scratch, "{item:.e}");191for c in &mut scratch {192if *c == b'.' 
{193*c = b',';194break;195}196}197buf.extend_from_slice(&scratch);198};199200make_serializer::<_, _, false>(f, array.iter(), |array| {201array202.as_any()203.downcast_ref::<PrimitiveArray<I>>()204.expect(ARRAY_MISMATCH_MSG)205.iter()206})207}208209fn float_serializer_no_precision_positional<I: NativeType + NumCast>(210array: &PrimitiveArray<I>,211) -> impl Serializer<'_> {212let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {213let v: f64 = NumCast::from(item).unwrap();214let _ = write!(buf, "{v}");215};216217make_serializer::<_, _, false>(f, array.iter(), |array| {218array219.as_any()220.downcast_ref::<PrimitiveArray<I>>()221.expect(ARRAY_MISMATCH_MSG)222.iter()223})224}225226fn float_serializer_no_precision_positional_decimal_comma<I: NativeType + NumCast>(227array: &PrimitiveArray<I>,228) -> impl Serializer<'_> {229let mut scratch = Vec::new();230231let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {232scratch.clear();233let v: f64 = NumCast::from(item).unwrap();234let _ = write!(&mut scratch, "{v}");235for c in &mut scratch {236if *c == b'.' 
{237*c = b',';238break;239}240}241buf.extend_from_slice(&scratch);242};243244make_serializer::<_, _, false>(f, array.iter(), |array| {245array246.as_any()247.downcast_ref::<PrimitiveArray<I>>()248.expect(ARRAY_MISMATCH_MSG)249.iter()250})251}252253fn float_serializer_with_precision_scientific<I: NativeType + LowerExp>(254array: &PrimitiveArray<I>,255precision: usize,256) -> impl Serializer<'_> {257let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {258// Float writing into a buffer of `Vec<u8>` cannot fail.259let _ = write!(buf, "{item:.precision$e}");260};261262make_serializer::<_, _, false>(f, array.iter(), |array| {263array264.as_any()265.downcast_ref::<PrimitiveArray<I>>()266.expect(ARRAY_MISMATCH_MSG)267.iter()268})269}270271fn float_serializer_with_precision_scientific_decimal_comma<I: NativeType + LowerExp>(272array: &PrimitiveArray<I>,273precision: usize,274) -> impl Serializer<'_> {275let mut scratch = Vec::new();276277let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {278scratch.clear();279// Float writing into a buffer of `Vec<u8>` cannot fail.280let _ = write!(&mut scratch, "{item:.precision$e}");281for c in &mut scratch {282if *c == b'.' 
{283*c = b',';284break;285}286}287buf.extend_from_slice(&scratch);288};289290make_serializer::<_, _, false>(f, array.iter(), |array| {291array292.as_any()293.downcast_ref::<PrimitiveArray<I>>()294.expect(ARRAY_MISMATCH_MSG)295.iter()296})297}298299fn float_serializer_with_precision_positional<I: NativeType>(300array: &PrimitiveArray<I>,301precision: usize,302) -> impl Serializer<'_> {303let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {304// Float writing into a buffer of `Vec<u8>` cannot fail.305let _ = write!(buf, "{item:.precision$}");306};307308make_serializer::<_, _, false>(f, array.iter(), |array| {309array310.as_any()311.downcast_ref::<PrimitiveArray<I>>()312.expect(ARRAY_MISMATCH_MSG)313.iter()314})315}316317fn float_serializer_with_precision_positional_decimal_comma<I: NativeType>(318array: &PrimitiveArray<I>,319precision: usize,320) -> impl Serializer<'_> {321let mut scratch = Vec::new();322323let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {324scratch.clear();325let _ = write!(&mut scratch, "{item:.precision$}");326for c in &mut scratch {327if *c == b'.' 
{328*c = b',';329break;330}331}332buf.extend_from_slice(&scratch);333};334335make_serializer::<_, _, false>(f, array.iter(), |array| {336array337.as_any()338.downcast_ref::<PrimitiveArray<I>>()339.expect(ARRAY_MISMATCH_MSG)340.iter()341})342}343344fn null_serializer(_array: &NullArray) -> impl Serializer<'_> {345struct NullSerializer;346impl<'a> Serializer<'a> for NullSerializer {347fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {348buf.extend_from_slice(options.null.as_bytes());349}350fn update_array(&mut self, _array: &'a dyn Array) {}351}352NullSerializer353}354355fn bool_serializer<const QUOTE_NON_NULL: bool>(array: &BooleanArray) -> impl Serializer<'_> {356let f = move |item, buf: &mut Vec<u8>, _options: &SerializeOptions| {357let s = if item { "true" } else { "false" };358buf.extend_from_slice(s.as_bytes());359};360361make_serializer::<_, _, QUOTE_NON_NULL>(f, array.iter(), |array| {362array363.as_any()364.downcast_ref::<BooleanArray>()365.expect(ARRAY_MISMATCH_MSG)366.iter()367})368}369370#[cfg(feature = "dtype-decimal")]371fn decimal_serializer(array: &PrimitiveArray<i128>, scale: usize) -> impl Serializer<'_> {372let trim_zeros = arrow::compute::decimal::get_trim_decimal_zeros();373374let mut fmt_buf = arrow::compute::decimal::DecimalFmtBuffer::new();375let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {376buf.extend_from_slice(fmt_buf.format(item, scale, trim_zeros).as_bytes());377};378379make_serializer::<_, _, false>(f, array.iter(), |array| {380array381.as_any()382.downcast_ref::<PrimitiveArray<i128>>()383.expect(ARRAY_MISMATCH_MSG)384.iter()385})386}387388#[cfg(any(389feature = "dtype-date",390feature = "dtype-time",391feature = "dtype-datetime"392))]393fn callback_serializer<'a, T: NativeType, const QUOTE_NON_NULL: bool>(394array: &'a PrimitiveArray<T>,395mut callback: impl FnMut(T, &mut Vec<u8>) + 'a,396) -> impl Serializer<'a> {397let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| 
{398callback(item, buf);399};400401make_serializer::<_, _, QUOTE_NON_NULL>(f, array.iter(), |array| {402array403.as_any()404.downcast_ref::<PrimitiveArray<T>>()405.expect(ARRAY_MISMATCH_MSG)406.iter()407})408}409410#[cfg(any(feature = "dtype-date", feature = "dtype-time"))]411type ChronoFormatIter<'a, 'b> = std::slice::Iter<'a, chrono::format::Item<'b>>;412413#[cfg(any(feature = "dtype-date", feature = "dtype-time"))]414fn date_and_time_serializer<'a, Underlying: NativeType, T: std::fmt::Display>(415format_str: &'a Option<String>,416description: &str,417array: &'a dyn Array,418sample_value: T,419mut convert: impl FnMut(Underlying) -> T + Send + 'a,420mut format_fn: impl for<'b> FnMut(421&T,422ChronoFormatIter<'b, 'a>,423) -> chrono::format::DelayedFormat<ChronoFormatIter<'b, 'a>>424+ Send425+ 'a,426options: &SerializeOptions,427) -> PolarsResult<Box<dyn Serializer<'a> + Send + 'a>> {428let array = array.as_any().downcast_ref().unwrap();429let serializer = match format_str {430Some(format_str) => {431let format = chrono::format::StrftimeItems::new(format_str).parse().map_err(432|_| polars_err!(ComputeError: "cannot format {description} with format '{format_str}'"),433)?;434use std::fmt::Write;435// Fail fast for invalid format. 
This return error faster to the user, and allows us to not return436// `Result` from `serialize()`.437write!(IgnoreFmt, "{}", format_fn(&sample_value, format.iter())).map_err(438|_| polars_err!(ComputeError: "cannot format {description} with format '{format_str}'"),439)?;440let callback = move |item, buf: &mut Vec<u8>| {441let item = convert(item);442// We checked the format is valid above.443let _ = write!(buf, "{}", format_fn(&item, format.iter()));444};445date_and_time_final_serializer(array, callback, options)446},447None => {448let callback = move |item, buf: &mut Vec<u8>| {449let item = convert(item);450// Formatting dates into `Vec<u8>` cannot fail.451let _ = write!(buf, "{item}");452};453date_and_time_final_serializer(array, callback, options)454},455};456Ok(serializer)457}458459#[cfg(any(460feature = "dtype-date",461feature = "dtype-time",462feature = "dtype-datetime"463))]464fn date_and_time_final_serializer<'a, T: NativeType>(465array: &'a PrimitiveArray<T>,466callback: impl FnMut(T, &mut Vec<u8>) + Send + 'a,467options: &SerializeOptions,468) -> Box<dyn Serializer<'a> + Send + 'a> {469match options.quote_style {470QuoteStyle::Always => Box::new(quote_serializer(callback_serializer::<T, false>(471array, callback,472))) as Box<dyn Serializer + Send>,473QuoteStyle::NonNumeric => Box::new(callback_serializer::<T, true>(array, callback)),474_ => Box::new(callback_serializer::<T, false>(array, callback)),475}476}477478pub(super) fn string_serializer<'a, Iter: Send + 'a>(479mut f: impl FnMut(&mut Iter) -> Option<&str> + Send + 'a,480options: &SerializeOptions,481mut update: impl FnMut(&'a dyn Array) -> Iter + Send + 'a,482array: &'a dyn Array,483) -> Box<dyn Serializer<'a> + 'a + Send> {484const LF: u8 = b'\n';485const CR: u8 = b'\r';486487struct StringSerializer<F, Iter, Update> {488serialize: F,489update: Update,490iter: Iter,491}492493impl<'a, F, Iter, Update> Serializer<'a> for StringSerializer<F, Iter, Update>494where495F: FnMut(&mut Iter, &mut Vec<u8>, 
&SerializeOptions),496Update: FnMut(&'a dyn Array) -> Iter,497{498fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {499(self.serialize)(&mut self.iter, buf, options);500}501502fn update_array(&mut self, array: &'a dyn Array) {503self.iter = (self.update)(array);504}505}506507fn serialize_str_escaped(buf: &mut Vec<u8>, s: &[u8], quote_char: u8, quoted: bool) {508let mut iter = memchr_iter(quote_char, s);509let first_quote = iter.next();510match first_quote {511None => buf.extend_from_slice(s),512Some(mut quote_pos) => {513if !quoted {514buf.push(quote_char);515}516let mut start_pos = 0;517loop {518buf.extend_from_slice(&s[start_pos..quote_pos]);519buf.extend_from_slice(&[quote_char, quote_char]);520match iter.next() {521Some(quote) => {522start_pos = quote_pos + 1;523quote_pos = quote;524},525None => {526buf.extend_from_slice(&s[quote_pos + 1..]);527break;528},529}530}531if !quoted {532buf.push(quote_char);533}534},535}536}537538let iter = update(array);539match options.quote_style {540QuoteStyle::Always => {541let serialize =542move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {543let quote_char = options.quote_char;544buf.push(quote_char);545let Some(s) = f(iter) else {546buf.extend_from_slice(options.null.as_bytes());547buf.push(quote_char);548return;549};550serialize_str_escaped(buf, s.as_bytes(), quote_char, true);551buf.push(quote_char);552};553Box::new(StringSerializer {554serialize,555update,556iter,557})558},559QuoteStyle::NonNumeric => {560let serialize =561move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {562let Some(s) = f(iter) else {563buf.extend_from_slice(options.null.as_bytes());564return;565};566let quote_char = options.quote_char;567buf.push(quote_char);568serialize_str_escaped(buf, s.as_bytes(), quote_char, true);569buf.push(quote_char);570};571Box::new(StringSerializer {572serialize,573update,574iter,575})576},577QuoteStyle::Necessary => {578let serialize =579move |iter: &mut Iter, 
buf: &mut Vec<u8>, options: &SerializeOptions| {580let Some(s) = f(iter) else {581buf.extend_from_slice(options.null.as_bytes());582return;583};584let quote_char = options.quote_char;585// An empty string conflicts with null, so it is necessary to quote.586if s.is_empty() {587buf.extend_from_slice(&[quote_char, quote_char]);588return;589}590let needs_quote = memchr3(options.separator, LF, CR, s.as_bytes()).is_some();591if needs_quote {592buf.push(quote_char);593}594serialize_str_escaped(buf, s.as_bytes(), quote_char, needs_quote);595if needs_quote {596buf.push(quote_char);597}598};599Box::new(StringSerializer {600serialize,601update,602iter,603})604},605QuoteStyle::Never => {606let serialize =607move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {608let Some(s) = f(iter) else {609buf.extend_from_slice(options.null.as_bytes());610return;611};612buf.extend_from_slice(s.as_bytes());613};614Box::new(StringSerializer {615serialize,616update,617iter,618})619},620}621}622623fn quote_serializer<'a>(serializer: impl Serializer<'a>) -> impl Serializer<'a> {624struct QuoteSerializer<S>(S);625impl<'a, S: Serializer<'a>> Serializer<'a> for QuoteSerializer<S> {626fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {627buf.push(options.quote_char);628self.0.serialize(buf, options);629buf.push(options.quote_char);630}631632fn update_array(&mut self, array: &'a dyn Array) {633self.0.update_array(array);634}635}636QuoteSerializer(serializer)637}638639pub(super) fn serializer_for<'a>(640array: &'a dyn Array,641options: &'a SerializeOptions,642dtype: &'a DataType,643_datetime_format: &'a str,644_time_zone: Option<Tz>,645) -> PolarsResult<Box<dyn Serializer<'a> + Send + 'a>> {646// The needs_quotes flag captures the quote logic for the quote_wrapper! 
macro647// It is targeted at numerical types primarily; other types may required additional logic648let needs_quotes = match dtype {649DataType::Float32 | DataType::Float64 => {650// When comma is used as both the field separator and decimal separator, quoting651// may be required. Specifically, when:652// - quote_style is Always, or653// - quote_style is Necessary or Non-Numeric, the field separator is also a comma,654// and the float string field contains a comma character (no precision or precision > 0)655//656// In some rare cases, a field may get quoted when it is not strictly necessary657// (e.g., in scientific notation when only the first digit is non-zero such as '1e12',658// or null values in 'non_numeric' quote_style).659660let mut should_quote = options.decimal_comma && options.separator == b',';661if let Some(precision) = options.float_precision {662should_quote &= precision > 0;663}664665match options.quote_style {666QuoteStyle::Always => true,667QuoteStyle::Necessary | QuoteStyle::NonNumeric => should_quote,668QuoteStyle::Never => false,669}670},671_ => options.quote_style == QuoteStyle::Always,672};673674macro_rules! 
quote_wrapper {675($make_serializer:path, $($arg:tt)*) => {{676let serializer = $make_serializer(array.as_any().downcast_ref().unwrap(), $($arg)*);677if needs_quotes {678Box::new(quote_serializer(serializer)) as Box<dyn Serializer + Send>679} else {680Box::new(serializer)681}682}};683($make_serializer:path) => { quote_wrapper!($make_serializer,) };684}685686let serializer = match dtype {687DataType::Int8 => quote_wrapper!(integer_serializer::<i8>),688DataType::UInt8 => quote_wrapper!(integer_serializer::<u8>),689DataType::Int16 => quote_wrapper!(integer_serializer::<i16>),690DataType::UInt16 => quote_wrapper!(integer_serializer::<u16>),691DataType::Int32 => quote_wrapper!(integer_serializer::<i32>),692DataType::UInt32 => quote_wrapper!(integer_serializer::<u32>),693DataType::Int64 => quote_wrapper!(integer_serializer::<i64>),694DataType::UInt64 => quote_wrapper!(integer_serializer::<u64>),695DataType::Int128 => quote_wrapper!(integer_serializer::<i128>),696DataType::Float32 => {697match (698options.decimal_comma,699options.float_precision,700options.float_scientific,701) {702// standard decimal separator (period)703(false, Some(precision), Some(true)) => {704quote_wrapper!(float_serializer_with_precision_scientific::<f32>, precision)705},706(false, Some(precision), _) => {707quote_wrapper!(float_serializer_with_precision_positional::<f32>, precision)708},709(false, None, Some(true)) => {710quote_wrapper!(float_serializer_no_precision_scientific::<f32>)711},712(false, None, Some(false)) => {713quote_wrapper!(float_serializer_no_precision_positional::<f32>)714},715(false, None, None) => {716quote_wrapper!(float_serializer_no_precision_autoformat::<f32>)717},718719// comma as the decimal separator720(true, Some(precision), Some(true)) => quote_wrapper!(721float_serializer_with_precision_scientific_decimal_comma::<f32>,722precision723),724(true, Some(precision), _) => 
quote_wrapper!(725float_serializer_with_precision_positional_decimal_comma::<f32>,726precision727),728(true, None, Some(true)) => {729quote_wrapper!(float_serializer_no_precision_scientific_decimal_comma::<f32>)730},731(true, None, Some(false)) => {732quote_wrapper!(float_serializer_no_precision_positional_decimal_comma::<f32>)733},734(true, None, None) => {735quote_wrapper!(float_serializer_no_precision_autoformat_decimal_comma::<f32>)736},737}738},739DataType::Float64 => {740match (741options.decimal_comma,742options.float_precision,743options.float_scientific,744) {745// standard decimal separator (period)746(false, Some(precision), Some(true)) => {747quote_wrapper!(float_serializer_with_precision_scientific::<f64>, precision)748},749(false, Some(precision), _) => {750quote_wrapper!(float_serializer_with_precision_positional::<f64>, precision)751},752(false, None, Some(true)) => {753quote_wrapper!(float_serializer_no_precision_scientific::<f64>)754},755(false, None, Some(false)) => {756quote_wrapper!(float_serializer_no_precision_positional::<f64>)757},758(false, None, None) => {759quote_wrapper!(float_serializer_no_precision_autoformat::<f64>)760},761762// comma as the decimal separator763(true, Some(precision), Some(true)) => quote_wrapper!(764float_serializer_with_precision_scientific_decimal_comma::<f64>,765precision766),767(true, Some(precision), _) => quote_wrapper!(768float_serializer_with_precision_positional_decimal_comma::<f64>,769precision770),771(true, None, Some(true)) => {772quote_wrapper!(float_serializer_no_precision_scientific_decimal_comma::<f64>)773},774(true, None, Some(false)) => {775quote_wrapper!(float_serializer_no_precision_positional_decimal_comma::<f64>)776},777(true, None, None) => {778quote_wrapper!(float_serializer_no_precision_autoformat_decimal_comma::<f64>)779},780}781},782DataType::Null => quote_wrapper!(null_serializer),783DataType::Boolean => {784let array = array.as_any().downcast_ref().unwrap();785match options.quote_style 
{786QuoteStyle::Always => Box::new(quote_serializer(bool_serializer::<false>(array)))787as Box<dyn Serializer + Send>,788QuoteStyle::NonNumeric => Box::new(bool_serializer::<true>(array)),789_ => Box::new(bool_serializer::<false>(array)),790}791},792#[cfg(feature = "dtype-date")]793DataType::Date => date_and_time_serializer(794&options.date_format,795"NaiveDate",796array,797chrono::NaiveDate::MAX,798arrow::temporal_conversions::date32_to_date,799|date, items| date.format_with_items(items),800options,801)?,802#[cfg(feature = "dtype-time")]803DataType::Time => date_and_time_serializer(804&options.time_format,805"NaiveTime",806array,807chrono::NaiveTime::MIN,808arrow::temporal_conversions::time64ns_to_time,809|time, items| time.format_with_items(items),810options,811)?,812#[cfg(feature = "dtype-datetime")]813DataType::Datetime(time_unit, _) => {814let format = chrono::format::StrftimeItems::new(_datetime_format)815.parse()816.map_err(|_| {817polars_err!(818ComputeError: "cannot format {} with format '{_datetime_format}'",819if _time_zone.is_some() { "DateTime" } else { "NaiveDateTime" },820)821})?;822use std::fmt::Write;823let sample_datetime = match _time_zone {824#[cfg(feature = "timezones")]825Some(time_zone) => time_zone826.from_utc_datetime(&chrono::NaiveDateTime::MAX)827.format_with_items(format.iter()),828#[cfg(not(feature = "timezones"))]829Some(_) => panic!("activate 'timezones' feature"),830None => chrono::NaiveDateTime::MAX.format_with_items(format.iter()),831};832// Fail fast for invalid format. This return error faster to the user, and allows us to not return833// `Result` from `serialize()`.834write!(IgnoreFmt, "{sample_datetime}").map_err(|_| {835polars_err!(836ComputeError: "cannot format {} with format '{_datetime_format}'",837if _time_zone.is_some() { "DateTime" } else { "NaiveDateTime" },838)839})?;840841let array = array.as_any().downcast_ref().unwrap();842843macro_rules! 
time_unit_serializer {844($convert:ident) => {845match _time_zone {846#[cfg(feature = "timezones")]847Some(time_zone) => {848let callback = move |item, buf: &mut Vec<u8>| {849let item = arrow::temporal_conversions::$convert(item);850let item = time_zone.from_utc_datetime(&item);851// We checked the format is valid above.852let _ = write!(buf, "{}", item.format_with_items(format.iter()));853};854date_and_time_final_serializer(array, callback, options)855},856#[cfg(not(feature = "timezones"))]857Some(_) => panic!("activate 'timezones' feature"),858None => {859let callback = move |item, buf: &mut Vec<u8>| {860let item = arrow::temporal_conversions::$convert(item);861// We checked the format is valid above.862let _ = write!(buf, "{}", item.format_with_items(format.iter()));863};864date_and_time_final_serializer(array, callback, options)865},866}867};868}869870match time_unit {871TimeUnit::Nanoseconds => time_unit_serializer!(timestamp_ns_to_datetime),872TimeUnit::Microseconds => time_unit_serializer!(timestamp_us_to_datetime),873TimeUnit::Milliseconds => time_unit_serializer!(timestamp_ms_to_datetime),874}875},876DataType::String => string_serializer(877|iter| Iterator::next(iter).expect(TOO_MANY_MSG),878options,879|arr| {880arr.as_any()881.downcast_ref::<Utf8ViewArray>()882.expect(ARRAY_MISMATCH_MSG)883.iter()884},885array,886),887#[cfg(feature = "dtype-categorical")]888DataType::Categorical(_, mapping) | DataType::Enum(_, mapping) => {889polars_core::with_match_categorical_physical_type!(dtype.cat_physical().unwrap(), |$C| {890string_serializer(891|iter| {892let &idx: &<$C as PolarsCategoricalType>::Native = Iterator::next(iter).expect(TOO_MANY_MSG)?;893Some(unsafe { mapping.cat_to_str_unchecked(idx.as_cat()) })894},895options,896|arr| {897arr.as_any()898.downcast_ref::<PrimitiveArray<<$C as PolarsCategoricalType>::Native>>()899.expect(ARRAY_MISMATCH_MSG)900.iter()901},902array,903)904})905},906#[cfg(feature = "dtype-decimal")]907DataType::Decimal(_, scale) => 
{908quote_wrapper!(decimal_serializer, scale.unwrap_or(0))909},910_ => {911polars_bail!(ComputeError: "datatype {dtype} cannot be written to CSV\n\nConsider using JSON or a binary format.")912},913};914Ok(serializer)915}916917#[cfg(test)]918mod test {919use arrow::array::NullArray;920use polars_core::prelude::ArrowDataType;921922use super::string_serializer;923use crate::csv::write::options::{QuoteStyle, SerializeOptions};924925// It is the most complex serializer with most edge cases, it definitely needs a comprehensive test.926#[test]927fn test_string_serializer() {928#[track_caller]929fn check_string_serialization(options: &SerializeOptions, s: Option<&str>, expected: &str) {930let fake_array = NullArray::new(ArrowDataType::Null, 0);931let mut serializer = string_serializer(|s| *s, options, |_| s, &fake_array);932let mut buf = Vec::new();933serializer.serialize(&mut buf, options);934let serialized = std::str::from_utf8(&buf).unwrap();935// Don't use `assert_eq!()` because it prints debug format and it's hard to read with all the escapes.936if serialized != expected {937panic!(938"CSV string {s:?} wasn't serialized correctly: expected: `{expected}`, got: `{serialized}`"939);940}941}942943let always_quote = SerializeOptions {944quote_style: QuoteStyle::Always,945..SerializeOptions::default()946};947check_string_serialization(&always_quote, None, r#""""#);948check_string_serialization(&always_quote, Some(""), r#""""#);949check_string_serialization(&always_quote, Some("a"), r#""a""#);950check_string_serialization(&always_quote, Some("\""), r#""""""#);951check_string_serialization(&always_quote, Some("a\"\"b"), r#""a""""b""#);952953let necessary_quote = SerializeOptions {954quote_style: QuoteStyle::Necessary,955..SerializeOptions::default()956};957check_string_serialization(&necessary_quote, None, r#""#);958check_string_serialization(&necessary_quote, Some(""), r#""""#);959check_string_serialization(&necessary_quote, Some("a"), 
r#"a"#);960check_string_serialization(&necessary_quote, Some("\""), r#""""""#);961check_string_serialization(&necessary_quote, Some("a\"\"b"), r#""a""""b""#);962check_string_serialization(&necessary_quote, Some("a b"), r#"a b"#);963check_string_serialization(&necessary_quote, Some("a,b"), r#""a,b""#);964check_string_serialization(&necessary_quote, Some("a\nb"), "\"a\nb\"");965check_string_serialization(&necessary_quote, Some("a\rb"), "\"a\rb\"");966967let never_quote = SerializeOptions {968quote_style: QuoteStyle::Never,969..SerializeOptions::default()970};971check_string_serialization(&never_quote, None, "");972check_string_serialization(&never_quote, Some(""), "");973check_string_serialization(&never_quote, Some("a"), "a");974check_string_serialization(&never_quote, Some("\""), "\"");975check_string_serialization(&never_quote, Some("a\"\"b"), "a\"\"b");976check_string_serialization(&never_quote, Some("a b"), "a b");977check_string_serialization(&never_quote, Some("a,b"), "a,b");978check_string_serialization(&never_quote, Some("a\nb"), "a\nb");979check_string_serialization(&never_quote, Some("a\rb"), "a\rb");980981let non_numeric_quote = SerializeOptions {982quote_style: QuoteStyle::NonNumeric,983..SerializeOptions::default()984};985check_string_serialization(&non_numeric_quote, None, "");986check_string_serialization(&non_numeric_quote, Some(""), r#""""#);987check_string_serialization(&non_numeric_quote, Some("a"), r#""a""#);988check_string_serialization(&non_numeric_quote, Some("\""), r#""""""#);989check_string_serialization(&non_numeric_quote, Some("a\"\"b"), r#""a""""b""#);990check_string_serialization(&non_numeric_quote, Some("a b"), r#""a b""#);991check_string_serialization(&non_numeric_quote, Some("a,b"), r#""a,b""#);992check_string_serialization(&non_numeric_quote, Some("a\nb"), "\"a\nb\"");993check_string_serialization(&non_numeric_quote, Some("a\rb"), "\"a\rb\"");994}995}996997998