Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/io/test_other.py
6939 views
1
from __future__ import annotations
2
3
import copy
4
import sys
5
from pathlib import Path
6
from typing import Any, Callable, cast
7
8
import pytest
9
10
import polars as pl
11
from polars.testing import assert_frame_equal, assert_series_equal
12
13
14
@pytest.mark.parametrize(
    "read_function",
    [
        pl.read_csv,
        pl.read_ipc,
        pl.read_json,
        pl.read_parquet,
        pl.read_avro,
        pl.scan_csv,
        pl.scan_ipc,
        pl.scan_parquet,
    ],
)
def test_read_missing_file(read_function: Callable[[Any], pl.DataFrame]) -> None:
    """Every reader and scanner raises ``FileNotFoundError`` for a missing path."""
    # The message associated with OS error 2 may differ per platform, so the
    # human-readable prefix is only pinned on Linux.
    platform_prefix = "No such file or directory " if sys.platform == "linux" else ""
    expected_message = platform_prefix + "\\(os error 2\\): fake_file_path"
    needs_collect = "scan" in read_function.__name__

    with pytest.raises(FileNotFoundError, match=expected_message):
        result = read_function("fake_file_path")
        if needs_collect:
            # ``scan_*`` results must be collected for the query to run.
            result.collect()  # type: ignore[attr-defined]
39
40
41
@pytest.mark.parametrize(
    "write_method_name",
    [
        # "write_excel" not included
        # because it already raises a FileCreateError
        # from the underlying library dependency
        "write_csv",
        "write_ipc",
        "write_ipc_stream",
        "write_json",
        "write_ndjson",
        "write_parquet",
        "write_avro",
    ],
)
def test_write_missing_directory(write_method_name: str) -> None:
    """Writing to a path whose parent directories are missing must fail."""
    frame = pl.DataFrame({"a": [1]})
    target = Path("non", "existing", "path")

    # The test is only meaningful if the target really is absent.
    if target.exists():
        pytest.fail(
            "Testing on a non existing path failed because the path does exist."
        )

    writer = getattr(frame, write_method_name)
    with pytest.raises(FileNotFoundError):
        writer(target)
66
67
68
def test_read_missing_file_path_truncated() -> None:
    """The error for a long missing path shows only its tail plus a hint."""
    long_path = "lskdfj".join(map(str, range(25)))
    # Regex for the expected message: an ellipsis, the tail of the path, then
    # the hint about POLARS_VERBOSE.
    expected_message = "\\.\\.\\.lskdfj14lskdfj15lskdfj16lskdfj17lskdfj18lskdfj19lskdfj20lskdfj21lskdfj22lskdfj23lskdfj24 \\(set POLARS_VERBOSE=1 to see full path\\)"

    with pytest.raises(FileNotFoundError, match=expected_message):
        pl.read_csv(long_path)
76
77
78
def test_read_missing_file_path_expanded_when_polars_verbose_enabled(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ``POLARS_VERBOSE=1`` the error message contains the full path."""
    full_path = "lskdfj".join(map(str, range(25)))
    monkeypatch.setenv("POLARS_VERBOSE", "1")

    # The path is purely alphanumeric, so it doubles as the match regex.
    with pytest.raises(FileNotFoundError, match=full_path):
        pl.read_csv(full_path)
90
91
92
def test_copy() -> None:
    """Shallow and deep copies of a frame and a series equal the originals."""
    duplicators = (copy.copy, copy.deepcopy)

    frame = pl.DataFrame({"a": [1, 2], "b": ["a", None], "c": [True, False]})
    for duplicate in duplicators:
        assert_frame_equal(duplicate(frame), frame)

    series = pl.Series("a", [1, 2])
    for duplicate in duplicators:
        assert_series_equal(duplicate(series), series)
100
101
102
def test_categorical_round_trip() -> None:
    """A categorical column keeps its dtype through an Arrow round trip."""
    source = pl.DataFrame({"ints": [1, 2, 3], "cat": ["a", "b", "c"]}).with_columns(
        pl.col("cat").cast(pl.Categorical)
    )

    arrow_table = source.to_arrow()
    cat_arrow_type = str(arrow_table["cat"].type)
    assert "dictionary" in cat_arrow_type

    restored = cast(pl.DataFrame, pl.from_arrow(arrow_table))
    assert restored.dtypes == [pl.Int64, pl.Categorical]
111
112
113
def test_from_different_chunks() -> None:
    """Converting a frame built from an appended series must not panic."""
    col_a = pl.Series("a", [1, 2, 3, 4, None])
    col_b = pl.Series("b", [1, 2])
    # Extend "b" to five rows via append; per the test name this is meant to
    # leave the two columns with different chunk layouts.
    col_b.append(pl.Series("b", [1, 2, 3]))

    # check we don't panic
    pl.DataFrame([col_a, col_b]).to_arrow()

    pandas_frame = pl.DataFrame([col_a, col_b]).to_pandas()
    assert list(pandas_frame.columns) == ["a", "b"]
    assert pandas_frame.shape == (5, 2)
126
127
128
def test_unit_io_subdir_has_no_init() -> None:
    """Fail if an ``__init__.py`` has been added next to this test module."""
    # --------------------------------------------------------------------------------
    # A failure here means an '__init__.py' was added to 'tests/unit/io'.
    # See https://github.com/pola-rs/polars/pull/6889 for why this can cause issues.
    # --------------------------------------------------------------------------------
    # TLDR: it can mask the builtin 'io' module, causing a fatal python error.
    # --------------------------------------------------------------------------------
    this_dir = Path(__file__).parent
    assert this_dir.parts[-2:] == ("unit", "io")

    init_file = this_dir.joinpath("__init__.py")
    assert not init_file.exists(), (
        "Found undesirable '__init__.py' in the 'unit.io' tests subdirectory"
    )
140
141
142
@pytest.mark.write_disk
@pytest.mark.parametrize(
    ("scan_funcs", "write_func"),
    [
        ([pl.scan_parquet, pl.read_parquet], pl.DataFrame.write_parquet),
        ([pl.scan_csv, pl.read_csv], pl.DataFrame.write_csv),
    ],
)
@pytest.mark.parametrize("char", ["[", "*"])
def test_no_glob(
    scan_funcs: list[Callable[[Any], pl.LazyFrame | pl.DataFrame]],
    write_func: Callable[[pl.DataFrame, Path], None],
    char: str,
    tmp_path: Path,
) -> None:
    """With ``glob=False`` a file name made of a glob character is read literally."""
    if char == "*" and sys.platform == "win32":
        pytest.skip("unsupported glob char for windows")

    tmp_path.mkdir(exist_ok=True)
    expected = pl.DataFrame({"x": 1})

    # Two files are written: the one that is read back, and a sibling whose
    # name starts with the same character.
    literal_path = tmp_path / char
    sibling_path = tmp_path / (char + "1")
    for target in (literal_path, sibling_path):
        write_func(expected, target)

    for reader in scan_funcs:
        loaded = reader(literal_path, glob=False).lazy().collect()  # type: ignore[call-arg]
        assert_frame_equal(loaded, expected)
171
172