Path: blob/main/docs/source/src/python/user-guide/io/hive.py
7890 views
# --8<-- [start:init_paths]
import polars as pl
from pathlib import Path

# Four small frames that will be spread over four hive partitions.
dfs = [
    pl.DataFrame({"x": [1, 2]}),
    pl.DataFrame({"x": [3, 4, 5]}),
    pl.DataFrame({"x": [6, 7]}),
    pl.DataFrame({"x": [8, 9, 10, 11]}),
]

# Hive-style partition directories (key=value segments).
parts = [
    "year=2023/month=11",
    "year=2023/month=12",
    "year=2024/month=01",
    "year=2024/month=02",
]

for df, part in zip(dfs, parts):
    # Clean hive dataset: only parquet files under the partition dirs.
    path = Path("docs/assets/data/hive/") / part / "data.parquet"
    path.parent.mkdir(exist_ok=True, parents=True)
    df.write_parquet(path)

    # "Mixed" dataset: same layout, plus a stray non-parquet file (added below).
    path = Path("docs/assets/data/hive_mixed/") / part / "data.parquet"
    path.parent.mkdir(exist_ok=True, parents=True)
    df.write_parquet(path)

# Make sure the file is not empty because path expansion ignores empty files.
Path("docs/assets/data/hive_mixed/description.txt").write_text("A")


def print_paths(path: str) -> None:
    """Print a sorted one-column table of every file found under ``path``."""

    def dir_recurse(path: Path):
        # Yields each regular file beneath `path`, depth-first.
        if path.is_dir():
            for p in path.iterdir():
                yield from dir_recurse(p)
        else:
            yield path

    df = (
        pl.Series(
            "File path",
            (str(x) for x in dir_recurse(Path(path))),
            dtype=pl.String,
        )
        .sort()
        .to_frame()
    )

    # Hide dtypes/shape and widen string output so full paths are shown.
    with pl.Config(
        tbl_hide_column_data_types=True,
        tbl_hide_dataframe_shape=True,
        fmt_str_lengths=999,
    ):
        print(df)


print_paths("docs/assets/data/hive/")
# --8<-- [end:init_paths]

# --8<-- [start:show_mixed_paths]
print_paths("docs/assets/data/hive_mixed/")
# --8<-- [end:show_mixed_paths]

# --8<-- [start:scan_dir]
import polars as pl

df = pl.scan_parquet("docs/assets/data/hive/").collect()

with pl.Config(tbl_rows=99):
    print(df)
# --8<-- [end:scan_dir]

# --8<-- [start:scan_dir_err]
from pathlib import Path

# Scanning a directory that contains non-parquet files raises an error.
try:
    pl.scan_parquet("docs/assets/data/hive_mixed/").collect()
except Exception as e:
    print(e)

# --8<-- [end:scan_dir_err]

# --8<-- [start:scan_glob]
df = pl.scan_parquet(
    # Glob to match all files ending in `.parquet`
    "docs/assets/data/hive_mixed/**/*.parquet",
    hive_partitioning=True,
).collect()

with pl.Config(tbl_rows=99):
    print(df)

# --8<-- [end:scan_glob]

# --8<-- [start:scan_file_no_hive]
# Explicit file paths: hive partition columns are NOT parsed by default.
df = pl.scan_parquet(
    [
        "docs/assets/data/hive/year=2024/month=01/data.parquet",
        "docs/assets/data/hive/year=2024/month=02/data.parquet",
    ],
).collect()

print(df)

# --8<-- [end:scan_file_no_hive]

# --8<-- [start:scan_file_hive]
# Opting in with `hive_partitioning=True` adds `year`/`month` columns.
df = pl.scan_parquet(
    [
        "docs/assets/data/hive/year=2024/month=01/data.parquet",
        "docs/assets/data/hive/year=2024/month=02/data.parquet",
    ],
    hive_partitioning=True,
).collect()

print(df)

# --8<-- [end:scan_file_hive]

# --8<-- [start:write_parquet_partitioned_show_data]
df = pl.DataFrame({"a": [1, 1, 2, 2, 3], "b": [1, 1, 1, 2, 2], "c": 1})
print(df)
# --8<-- [end:write_parquet_partitioned_show_data]

# --8<-- [start:write_parquet_partitioned]
df.write_parquet("docs/assets/data/hive_write/", partition_by=["a", "b"])
# --8<-- [end:write_parquet_partitioned]

# --8<-- [start:write_parquet_partitioned_show_paths]
print_paths("docs/assets/data/hive_write/")
# --8<-- [end:write_parquet_partitioned_show_paths]