Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/test_polars_import.py
6939 views
1
from __future__ import annotations
2
3
import compileall
4
import subprocess
5
import sys
6
from pathlib import Path
7
8
import pytest
9
10
import polars as pl
11
from polars import selectors as cs
12
13
# Upper bound on an acceptable `import polars` time: 0.5 seconds. Typical runs
# come in far lower (roughly ~0.07s, hardware-dependent); the generous margin
# absorbs frequent noise from slow/contended CI machines.
MAX_ALLOWED_IMPORT_TIME = 500_000  # << microseconds
17
18
19
def _import_time_from_frame(tm: pl.DataFrame) -> int:
    """Return the cumulative import time (in μs) recorded for the 'polars' row."""
    is_polars_row = pl.col("import").str.strip_chars() == "polars"
    cumulative_time = tm.filter(is_polars_row).select("cumulative_time").item()
    return int(cumulative_time)
25
26
27
def _import_timings() -> bytes:
28
# assemble suitable command to get polars module import timing;
29
# run in a separate process to ensure clean timing results.
30
cmd = f'{sys.executable} -S -X importtime -c "import polars"'
31
output = subprocess.run(cmd, shell=True, capture_output=True).stderr
32
if b"Traceback" in output:
33
msg = f"measuring import timings failed\n\nCommand output:\n{output.decode()}"
34
raise RuntimeError(msg)
35
return output.replace(b"import time:", b"").strip()
36
37
38
def _import_timings_as_frame(n_tries: int) -> tuple[pl.DataFrame, int]:
    """Collect import-timing runs, returning a frame and its polars import time.

    Makes up to `n_tries` attempts, returning immediately on the first run
    that comes in under MAX_ALLOWED_IMPORT_TIME. If no run qualifies, prints
    all observed timings (to aid debugging) and returns the fastest run.
    """
    collected_frames: list[pl.DataFrame] = []
    for _ in range(n_tries):
        timing_frame = (
            pl.read_csv(
                source=_import_timings(),
                separator="|",
                has_header=True,
                new_columns=["own_time", "cumulative_time", "import"],
            )
            .with_columns(cs.ends_with("_time").str.strip_chars().cast(pl.UInt32))
            .select("import", "own_time", "cumulative_time")
            .reverse()
        )
        elapsed = _import_time_from_frame(timing_frame)
        if elapsed < MAX_ALLOWED_IMPORT_TIME:
            # qualifying time achieved; stop early rather than keep sampling
            return timing_frame, elapsed
        collected_frames.append(timing_frame)

    # note: if a qualifying import time was already achieved, we won't get here.
    # if we do, let's see all the failed timings to help see what's going on:
    observed_times = [_import_time_from_frame(df) for df in collected_frames]
    msg = "\n".join(f"({idx}) {tm:,}μs" for idx, tm in enumerate(observed_times))
    min_max = f"Min => {min(observed_times):,}μs, Max => {max(observed_times):,}μs)"
    print(f"\nImport times achieved over {n_tries} tries:\n{min_max}\n\n{msg}")

    fastest_frame = min(collected_frames, key=_import_time_from_frame)
    return fastest_frame, min(observed_times)
67
68
69
@pytest.mark.skipif(sys.platform == "win32", reason="Unreliable on Windows")
@pytest.mark.debug
@pytest.mark.slow
def test_polars_import() -> None:
    """Guard against import-time regressions: speed and broken lazy-loading."""
    # up-front compile '.py' -> '.pyc' so bytecode compilation isn't timed
    package_dir = Path(pl.__file__).parent
    compileall.compile_dir(package_dir, quiet=1)

    # note: reduce noise by allowing up to 'n' tries (but return immediately
    # if/when a qualifying time is achieved, so we don't waste time running
    # unnecessary tests)
    df_import, polars_import_time = _import_timings_as_frame(n_tries=10)

    # get a complete view of what's going on in case of failure
    display_cfg = pl.Config(
        tbl_rows=250,
        fmt_str_lengths=100,
        tbl_hide_dataframe_shape=True,
    )
    with display_cfg:
        # ensure that we have not broken lazy-loading (numpy, pandas, pyarrow, etc).
        for mod in pl.dependencies.__all__:
            if mod.startswith("_"):
                continue
            not_imported = not df_import["import"].str.starts_with(mod).any()
            if_err = f"lazy-loading regression: found {mod!r} at import time"
            assert not_imported, f"{if_err}\n{df_import}"

        # ensure that we do not have an import speed regression.
        if polars_import_time > MAX_ALLOWED_IMPORT_TIME:
            import_time_ms = polars_import_time // 1_000
            msg = f"Possible import speed regression; took {import_time_ms}ms\n{df_import}"
            raise AssertionError(msg)
101
102