Path: blob/main/crates/polars-lazy/src/scan/file_list_reader.rs
6939 views
use std::sync::Arc;12use polars_core::prelude::*;3use polars_io::RowIndex;4use polars_io::cloud::CloudOptions;5use polars_plan::prelude::UnionArgs;6use polars_utils::plpath::PlPath;78use crate::prelude::*;910/// Reads [LazyFrame] from a filesystem or a cloud storage.11/// Supports glob patterns.12///13/// Use [LazyFileListReader::finish] to get the final [LazyFrame].14pub trait LazyFileListReader: Clone {15/// Get the final [LazyFrame].16fn finish(self) -> PolarsResult<LazyFrame> {17if !self.glob() {18return self.finish_no_glob();19}2021let ScanSources::Paths(paths) = self.sources() else {22unreachable!("opened-files or in-memory buffers should never be globbed");23};2425let lfs = paths26.iter()27.map(|path| {28self.clone()29// Each individual reader should not apply a row limit.30.with_n_rows(None)31// Each individual reader should not apply a row index.32.with_row_index(None)33.with_paths([path.clone()].into())34.with_rechunk(false)35.finish_no_glob()36.map_err(|e| {37polars_err!(38ComputeError: "error while reading {}: {}", path.display(), e39)40})41})42.collect::<PolarsResult<Vec<_>>>()?;4344polars_ensure!(45!lfs.is_empty(),46ComputeError: "no matching files found in {:?}", paths.iter().map(|x| x.to_str()).collect::<Vec<_>>()47);4849let mut lf = self.concat_impl(lfs)?;50if let Some(n_rows) = self.n_rows() {51lf = lf.slice(0, n_rows as IdxSize)52};53if let Some(rc) = self.row_index() {54lf = lf.with_row_index(rc.name.clone(), Some(rc.offset))55};5657Ok(lf)58}5960/// Recommended concatenation of [LazyFrame]s from many input files.61///62/// This method should not take into consideration [LazyFileListReader::n_rows]63/// nor [LazyFileListReader::row_index].64fn concat_impl(&self, lfs: Vec<LazyFrame>) -> PolarsResult<LazyFrame> {65let args = UnionArgs {66rechunk: self.rechunk(),67parallel: true,68to_supertypes: false,69from_partitioned_ds: true,70..Default::default()71};72concat_impl(&lfs, args)73}7475/// Get the final [LazyFrame].76/// This method assumes, that path is *not* a glob.77///78/// It is recommended to always use [LazyFileListReader::finish] method.79fn finish_no_glob(self) -> PolarsResult<LazyFrame>;8081fn glob(&self) -> bool {82true83}8485/// Get the sources for this reader.86fn sources(&self) -> &ScanSources;8788/// Set sources of the scanned files.89#[must_use]90fn with_sources(self, source: ScanSources) -> Self;9192/// Set paths of the scanned files.93#[must_use]94fn with_paths(self, paths: Arc<[PlPath]>) -> Self {95self.with_sources(ScanSources::Paths(paths))96}9798/// Configure the row limit.99fn with_n_rows(self, n_rows: impl Into<Option<usize>>) -> Self;100101/// Configure the row index.102fn with_row_index(self, row_index: impl Into<Option<RowIndex>>) -> Self;103104/// Rechunk the memory to contiguous chunks when parsing is done.105fn rechunk(&self) -> bool;106107/// Rechunk the memory to contiguous chunks when parsing is done.108#[must_use]109fn with_rechunk(self, toggle: bool) -> Self;110111/// Try to stop parsing when `n` rows are parsed. During multithreaded parsing the upper bound `n` cannot112/// be guaranteed.113fn n_rows(&self) -> Option<usize>;114115/// Add a row index column.116fn row_index(&self) -> Option<&RowIndex>;117118/// [CloudOptions] used to list files.119fn cloud_options(&self) -> Option<&CloudOptions> {120None121}122}123124125