Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-io/src/csv/read/options.rs
6939 views
1
#![allow(unsafe_op_in_unsafe_fn)]
2
use std::path::PathBuf;
3
use std::sync::Arc;
4
5
use polars_core::datatypes::{DataType, Field};
6
use polars_core::schema::{Schema, SchemaRef};
7
use polars_error::PolarsResult;
8
use polars_utils::pl_str::PlSmallStr;
9
#[cfg(feature = "serde")]
10
use serde::{Deserialize, Serialize};
11
12
use crate::RowIndex;
13
14
/// Options for reading a CSV file, covering IO-level and row/column-selection
/// configuration. Format-parsing options live in [`CsvParseOptions`].
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct CsvReadOptions {
    /// Optional path of the CSV file to read.
    pub path: Option<PathBuf>,
    // Performance related options
    /// Make the columns contiguous in memory after reading.
    pub rechunk: bool,
    /// Number of threads to use for reading; `None` defaults to the size of
    /// the polars thread pool.
    pub n_threads: Option<usize>,
    /// Reduce memory consumption at the expense of performance.
    pub low_memory: bool,
    // Row-wise options
    /// Limits the number of rows to read.
    pub n_rows: Option<usize>,
    /// Adds a row index column when set.
    pub row_index: Option<RowIndex>,
    // Column-wise options
    /// Columns to select, by name.
    pub columns: Option<Arc<[PlSmallStr]>>,
    /// Columns to select, by zero-based index (i.e. [0, 4] selects the 1st
    /// and 5th column).
    pub projection: Option<Arc<Vec<usize>>>,
    /// Schema to use for the file; must match the number of columns. When
    /// `None`, the schema is inferred from the file.
    pub schema: Option<SchemaRef>,
    /// Overwrites data types in the schema, matched by column name.
    pub schema_overwrite: Option<SchemaRef>,
    /// Overwrites dtypes positionally, in the order of the slice.
    pub dtype_overwrite: Option<Arc<Vec<DataType>>>,
    // CSV-specific options
    /// Options controlling how the CSV format itself is parsed.
    pub parse_options: Arc<CsvParseOptions>,
    /// Whether the file has a header row.
    pub has_header: bool,
    /// Chunk size used by the parser; influences performance.
    pub chunk_size: usize,
    /// Skip rows according to the CSV spec.
    pub skip_rows: usize,
    /// Skip lines according to newline char (e.g. escaping will be ignored)
    pub skip_lines: usize,
    /// Number of rows to skip after the header row.
    pub skip_rows_after_header: usize,
    /// Number of rows used to infer the schema; `None` does a full table scan.
    pub infer_schema_length: Option<usize>,
    /// Raise an error if the frame is empty instead of returning an empty
    /// DataFrame.
    pub raise_if_empty: bool,
    /// Continue with the next batch when a ParserError is encountered.
    pub ignore_errors: bool,
    /// Extra casts to apply to parsed fields. NOTE(review): no builder method
    /// sets this — presumably populated internally; verify at call sites.
    pub fields_to_cast: Vec<Field>,
}
46
47
/// Options controlling how the CSV format itself is parsed (separators,
/// quoting, encoding, null handling, etc.).
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct CsvParseOptions {
    /// Byte used to separate fields; most often a comma `b','`.
    pub separator: u8,
    /// Byte used for field quoting, most often `b'"'`. `None` disables quote
    /// parsing.
    pub quote_char: Option<u8>,
    /// Byte indicating an end-of-line.
    pub eol_char: u8,
    /// Encoding of the file; see [`CsvEncoding`].
    pub encoding: CsvEncoding,
    /// Values interpreted as missing/null. Matched before quote-parsing, so
    /// quoted null values must include their quotes here.
    pub null_values: Option<NullValues>,
    /// Treat missing fields as null.
    pub missing_is_null: bool,
    /// Truncate lines that are longer than the schema.
    pub truncate_ragged_lines: bool,
    /// Lines starting with this prefix are ignored as comments.
    pub comment_prefix: Option<CommentPrefix>,
    /// Try to parse dates/datetimes/time; columns stay [`DataType::String`]
    /// on failure.
    pub try_parse_dates: bool,
    /// Parse floats with a comma as decimal separator.
    pub decimal_comma: bool,
}
62
63
impl Default for CsvReadOptions {
64
fn default() -> Self {
65
Self {
66
path: None,
67
68
rechunk: false,
69
n_threads: None,
70
low_memory: false,
71
72
n_rows: None,
73
row_index: None,
74
75
columns: None,
76
projection: None,
77
schema: None,
78
schema_overwrite: None,
79
dtype_overwrite: None,
80
81
parse_options: Default::default(),
82
has_header: true,
83
chunk_size: 1 << 18,
84
skip_rows: 0,
85
skip_lines: 0,
86
skip_rows_after_header: 0,
87
infer_schema_length: Some(100),
88
raise_if_empty: true,
89
ignore_errors: false,
90
fields_to_cast: vec![],
91
}
92
}
93
}
94
95
/// Options related to parsing the CSV format.
96
impl Default for CsvParseOptions {
97
fn default() -> Self {
98
Self {
99
separator: b',',
100
quote_char: Some(b'"'),
101
eol_char: b'\n',
102
encoding: Default::default(),
103
null_values: None,
104
missing_is_null: true,
105
truncate_ragged_lines: false,
106
comment_prefix: None,
107
try_parse_dates: false,
108
decimal_comma: false,
109
}
110
}
111
}
112
113
impl CsvReadOptions {
114
pub fn get_parse_options(&self) -> Arc<CsvParseOptions> {
115
self.parse_options.clone()
116
}
117
118
pub fn with_path<P: Into<PathBuf>>(mut self, path: Option<P>) -> Self {
119
self.path = path.map(|p| p.into());
120
self
121
}
122
123
/// Whether to makes the columns contiguous in memory.
124
pub fn with_rechunk(mut self, rechunk: bool) -> Self {
125
self.rechunk = rechunk;
126
self
127
}
128
129
/// Number of threads to use for reading. Defaults to the size of the polars
130
/// thread pool.
131
pub fn with_n_threads(mut self, n_threads: Option<usize>) -> Self {
132
self.n_threads = n_threads;
133
self
134
}
135
136
/// Reduce memory consumption at the expense of performance
137
pub fn with_low_memory(mut self, low_memory: bool) -> Self {
138
self.low_memory = low_memory;
139
self
140
}
141
142
/// Limits the number of rows to read.
143
pub fn with_n_rows(mut self, n_rows: Option<usize>) -> Self {
144
self.n_rows = n_rows;
145
self
146
}
147
148
/// Adds a row index column.
149
pub fn with_row_index(mut self, row_index: Option<RowIndex>) -> Self {
150
self.row_index = row_index;
151
self
152
}
153
154
/// Which columns to select.
155
pub fn with_columns(mut self, columns: Option<Arc<[PlSmallStr]>>) -> Self {
156
self.columns = columns;
157
self
158
}
159
160
/// Which columns to select denoted by their index. The index starts from 0
161
/// (i.e. [0, 4] would select the 1st and 5th column).
162
pub fn with_projection(mut self, projection: Option<Arc<Vec<usize>>>) -> Self {
163
self.projection = projection;
164
self
165
}
166
167
/// Set the schema to use for CSV file. The length of the schema must match
168
/// the number of columns in the file. If this is [None], the schema is
169
/// inferred from the file.
170
pub fn with_schema(mut self, schema: Option<SchemaRef>) -> Self {
171
self.schema = schema;
172
self
173
}
174
175
/// Overwrites the data types in the schema by column name.
176
pub fn with_schema_overwrite(mut self, schema_overwrite: Option<SchemaRef>) -> Self {
177
self.schema_overwrite = schema_overwrite;
178
self
179
}
180
181
/// Overwrite the dtypes in the schema in the order of the slice that's given.
182
/// This is useful if you don't know the column names beforehand
183
pub fn with_dtype_overwrite(mut self, dtype_overwrite: Option<Arc<Vec<DataType>>>) -> Self {
184
self.dtype_overwrite = dtype_overwrite;
185
self
186
}
187
188
/// Sets the CSV parsing options. See [map_parse_options][Self::map_parse_options]
189
/// for an easier way to mutate them in-place.
190
pub fn with_parse_options(mut self, parse_options: CsvParseOptions) -> Self {
191
self.parse_options = Arc::new(parse_options);
192
self
193
}
194
195
/// Sets whether the CSV file has a header row.
196
pub fn with_has_header(mut self, has_header: bool) -> Self {
197
self.has_header = has_header;
198
self
199
}
200
201
/// Sets the chunk size used by the parser. This influences performance.
202
pub fn with_chunk_size(mut self, chunk_size: usize) -> Self {
203
self.chunk_size = chunk_size;
204
self
205
}
206
207
/// Start reading after ``skip_rows`` rows. The header will be parsed at this
208
/// offset. Note that we respect CSV escaping/comments when skipping rows.
209
/// If you want to skip by newline char only, use `skip_lines`.
210
pub fn with_skip_rows(mut self, skip_rows: usize) -> Self {
211
self.skip_rows = skip_rows;
212
self
213
}
214
215
/// Start reading after `skip_lines` lines. The header will be parsed at this
216
/// offset. Note that CSV escaping will not be respected when skipping lines.
217
/// If you want to skip valid CSV rows, use ``skip_rows``.
218
pub fn with_skip_lines(mut self, skip_lines: usize) -> Self {
219
self.skip_lines = skip_lines;
220
self
221
}
222
223
/// Number of rows to skip after the header row.
224
pub fn with_skip_rows_after_header(mut self, skip_rows_after_header: usize) -> Self {
225
self.skip_rows_after_header = skip_rows_after_header;
226
self
227
}
228
229
/// Set the number of rows to use when inferring the csv schema.
230
/// The default is 100 rows.
231
/// Setting to [None] will do a full table scan, which is very slow.
232
pub fn with_infer_schema_length(mut self, infer_schema_length: Option<usize>) -> Self {
233
self.infer_schema_length = infer_schema_length;
234
self
235
}
236
237
/// Whether to raise an error if the frame is empty. By default an empty
238
/// DataFrame is returned.
239
pub fn with_raise_if_empty(mut self, raise_if_empty: bool) -> Self {
240
self.raise_if_empty = raise_if_empty;
241
self
242
}
243
244
/// Continue with next batch when a ParserError is encountered.
245
pub fn with_ignore_errors(mut self, ignore_errors: bool) -> Self {
246
self.ignore_errors = ignore_errors;
247
self
248
}
249
250
/// Apply a function to the parse options.
251
pub fn map_parse_options<F: Fn(CsvParseOptions) -> CsvParseOptions>(
252
mut self,
253
map_func: F,
254
) -> Self {
255
let parse_options = Arc::unwrap_or_clone(self.parse_options);
256
self.parse_options = Arc::new(map_func(parse_options));
257
self
258
}
259
}
260
261
impl CsvParseOptions {
262
/// The character used to separate fields in the CSV file. This
263
/// is most often a comma ','.
264
pub fn with_separator(mut self, separator: u8) -> Self {
265
self.separator = separator;
266
self
267
}
268
269
/// Set the character used for field quoting. This is most often double
270
/// quotes '"'. Set this to [None] to disable quote parsing.
271
pub fn with_quote_char(mut self, quote_char: Option<u8>) -> Self {
272
self.quote_char = quote_char;
273
self
274
}
275
276
/// Set the character used to indicate an end-of-line (eol).
277
pub fn with_eol_char(mut self, eol_char: u8) -> Self {
278
self.eol_char = eol_char;
279
self
280
}
281
282
/// Set the encoding used by the file.
283
pub fn with_encoding(mut self, encoding: CsvEncoding) -> Self {
284
self.encoding = encoding;
285
self
286
}
287
288
/// Set values that will be interpreted as missing/null.
289
///
290
/// Note: These values are matched before quote-parsing, so if the null values
291
/// are quoted then those quotes also need to be included here.
292
pub fn with_null_values(mut self, null_values: Option<NullValues>) -> Self {
293
self.null_values = null_values;
294
self
295
}
296
297
/// Treat missing fields as null.
298
pub fn with_missing_is_null(mut self, missing_is_null: bool) -> Self {
299
self.missing_is_null = missing_is_null;
300
self
301
}
302
303
/// Truncate lines that are longer than the schema.
304
pub fn with_truncate_ragged_lines(mut self, truncate_ragged_lines: bool) -> Self {
305
self.truncate_ragged_lines = truncate_ragged_lines;
306
self
307
}
308
309
/// Sets the comment prefix for this instance. Lines starting with this
310
/// prefix will be ignored.
311
pub fn with_comment_prefix<T: Into<CommentPrefix>>(
312
mut self,
313
comment_prefix: Option<T>,
314
) -> Self {
315
self.comment_prefix = comment_prefix.map(Into::into);
316
self
317
}
318
319
/// Automatically try to parse dates/datetimes and time. If parsing fails,
320
/// columns remain of dtype [`DataType::String`].
321
pub fn with_try_parse_dates(mut self, try_parse_dates: bool) -> Self {
322
self.try_parse_dates = try_parse_dates;
323
self
324
}
325
326
/// Parse floats with a comma as decimal separator.
327
pub fn with_decimal_comma(mut self, decimal_comma: bool) -> Self {
328
self.decimal_comma = decimal_comma;
329
self
330
}
331
}
332
333
/// Supported encodings for CSV input.
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum CsvEncoding {
    /// Utf8 encoding.
    #[default]
    Utf8,
    /// Utf8 encoding and unknown bytes are replaced with �.
    LossyUtf8,
}
343
344
/// Prefix that marks a line as a comment to be skipped by the parser.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum CommentPrefix {
    /// A single byte character that indicates the start of a comment line.
    Single(u8),
    /// A string that indicates the start of a comment line.
    /// This allows for multiple characters to be used as a comment identifier.
    Multi(PlSmallStr),
}
354
355
impl CommentPrefix {
356
/// Creates a new `CommentPrefix` for the `Single` variant.
357
pub fn new_single(prefix: u8) -> Self {
358
CommentPrefix::Single(prefix)
359
}
360
361
/// Creates a new `CommentPrefix` for the `Multi` variant.
362
pub fn new_multi(prefix: PlSmallStr) -> Self {
363
CommentPrefix::Multi(prefix)
364
}
365
366
/// Creates a new `CommentPrefix` from a `&str`.
367
pub fn new_from_str(prefix: &str) -> Self {
368
if prefix.len() == 1 && prefix.chars().next().unwrap().is_ascii() {
369
let c = prefix.as_bytes()[0];
370
CommentPrefix::Single(c)
371
} else {
372
CommentPrefix::Multi(PlSmallStr::from_str(prefix))
373
}
374
}
375
}
376
377
impl From<&str> for CommentPrefix {
    /// Delegates to [`CommentPrefix::new_from_str`].
    fn from(value: &str) -> Self {
        Self::new_from_str(value)
    }
}
382
383
/// User-specified values that should be read as null.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum NullValues {
    /// A single value that's used for all columns
    AllColumnsSingle(PlSmallStr),
    /// Multiple values that are used for all columns
    AllColumns(Vec<PlSmallStr>),
    /// Tuples that map column names to null value of that column
    Named(Vec<(PlSmallStr, PlSmallStr)>),
}
394
395
impl NullValues {
    /// Resolves these null values against `schema` into a form that can be
    /// matched per column by index ([`NullValuesCompiled`]).
    ///
    /// Propagates an error if a column name in [`NullValues::Named`] cannot
    /// be resolved by `schema.try_index_of`.
    pub fn compile(self, schema: &Schema) -> PolarsResult<NullValuesCompiled> {
        Ok(match self {
            NullValues::AllColumnsSingle(v) => NullValuesCompiled::AllColumnsSingle(v),
            NullValues::AllColumns(v) => NullValuesCompiled::AllColumns(v),
            NullValues::Named(v) => {
                // Columns without an explicit entry fall back to the empty
                // string as their per-column null value.
                let mut null_values = vec![PlSmallStr::from_static(""); schema.len()];
                for (name, null_value) in v {
                    let i = schema.try_index_of(&name)?;
                    null_values[i] = null_value;
                }
                NullValuesCompiled::Columns(null_values)
            },
        })
    }
}
411
412
/// A [`NullValues`] specification resolved against a schema
/// (see [`NullValues::compile`]), ready for per-column lookups.
#[derive(Debug, Clone)]
pub enum NullValuesCompiled {
    /// A single value that's used for all columns
    AllColumnsSingle(PlSmallStr),
    /// Multiple null values that are null for all columns
    AllColumns(Vec<PlSmallStr>),
    /// A different null value per column, computed from `NullValues::Named`
    Columns(Vec<PlSmallStr>),
}
421
422
impl NullValuesCompiled {
    /// Returns whether the raw bytes of `field`, belonging to the column at
    /// `index`, match a configured null value.
    ///
    /// # Safety
    ///
    /// The caller must ensure that `index` is in bounds
    /// (only relevant for the [`NullValuesCompiled::Columns`] variant).
    pub(super) unsafe fn is_null(&self, field: &[u8], index: usize) -> bool {
        use NullValuesCompiled::*;
        match self {
            // One shared null value: compare the field bytes directly.
            AllColumnsSingle(v) => v.as_bytes() == field,
            // Several shared null values: any match counts.
            AllColumns(v) => v.iter().any(|v| v.as_bytes() == field),
            Columns(v) => {
                debug_assert!(index < v.len());
                // SAFETY: the caller guarantees `index < v.len()`.
                v.get_unchecked(index).as_bytes() == field
            },
        }
    }
}
438
439