Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/meta/test_polars_import.py
8430 views
1
from __future__ import annotations
2
3
import compileall
4
import subprocess
5
import sys
6
from pathlib import Path
7
8
import pytest
9
10
import polars as pl
11
from polars import selectors as cs
12
13
# Upper bound on the cumulative time allowed for `import polars`, expressed in
# microseconds (500_000 microseconds == 0.5 secs). A typical import is far
# quicker (roughly 0.07 secs, depending on hardware); the generous limit leaves
# a margin of error for the noise that slow/contended CI frequently introduces.
MAX_ALLOWED_IMPORT_TIME = 500_000  # << microseconds
17
18
19
def _import_time_from_frame(tm: pl.DataFrame) -> int:
    """Return the cumulative import time recorded for the `polars` module.

    The frame is expected to carry an "import" column (module names, possibly
    padded with whitespace) and a "cumulative_time" column. The row whose
    stripped module name is exactly "polars" is selected and its cumulative
    time is returned as a plain `int`.
    """
    # module names may be whitespace-padded, so strip before comparing
    is_polars_row = pl.col("import").str.strip_chars() == "polars"
    polars_row = tm.filter(is_polars_row)
    cumulative = polars_row.select("cumulative_time").item()
    return int(cumulative)
25
26
27
def _import_timings() -> bytes:
    """Run `import polars` in a fresh interpreter and return its import timings.

    The child interpreter is started with `-X importtime`, which makes it write
    one timing line per imported module to stderr, and with `-S`. Running in a
    separate process ensures clean timing results that are unaffected by the
    modules already loaded in the current process.

    Returns
    -------
    bytes
        The captured stderr with every "import time:" prefix removed and the
        surrounding whitespace stripped.

    Raises
    ------
    RuntimeError
        If the child process exits with a non-zero status, or if its output
        contains a traceback.
    """
    import os

    # pass the command as an argument list (no shell involved), so that an
    # interpreter path containing spaces or shell metacharacters cannot break
    # (or alter) the invocation.
    cmd = [sys.executable, "-S", "-X", "importtime", "-c", "import polars"]

    # the child must be able to locate both `polars` and `polars._plr`; join
    # the two search directories with the platform's path-list separator.
    search_dirs = (
        Path(pl.__file__).parent.parent,
        Path(pl._plr.__file__).parent.parent,
    )
    python_path = os.pathsep.join(str(d) for d in search_dirs)

    result = subprocess.run(
        cmd,
        capture_output=True,
        # note: the environment is replaced (not extended); only PYTHONPATH is set
        env={"PYTHONPATH": python_path},
        check=False,
    )
    output = result.stderr

    # a failed import must not be mistaken for timing data
    if result.returncode != 0 or b"Traceback" in output:
        decoded = output.decode(errors="replace")
        msg = f"measuring import timings failed\n\nCommand output:\n{decoded}"
        raise RuntimeError(msg)
    return output.replace(b"import time:", b"").strip()
44
45
46
def _import_timings_as_frame(n_tries: int) -> tuple[pl.DataFrame, int]:
    """Measure the `polars` import time, retrying to reduce the effect of noise.

    Up to `n_tries` measurements are taken. The first one whose `polars` import
    time is below `MAX_ALLOWED_IMPORT_TIME` is returned immediately. If no
    attempt qualifies, all the achieved timings are printed (to help see what
    is going on) and the fastest attempt is returned.

    Parameters
    ----------
    n_tries
        Maximum number of measurements to take; must be at least 1.

    Returns
    -------
    tuple[pl.DataFrame, int]
        The timings frame (columns "import", "own_time", "cumulative_time",
        in reversed row order relative to the raw output) and the cumulative
        `polars` import time taken from that frame.

    Raises
    ------
    ValueError
        If `n_tries` is less than 1.
    """
    if n_tries < 1:
        msg = f"n_tries must be at least 1, got {n_tries!r}"
        raise ValueError(msg)

    # keep each failed attempt together with its already-computed import time,
    # so the time does not need to be re-extracted from the frame later on.
    failed_attempts: list[tuple[int, pl.DataFrame]] = []
    for _ in range(n_tries):
        df_import = (
            pl.read_csv(
                source=_import_timings(),
                separator="|",
                has_header=True,
                new_columns=["own_time", "cumulative_time", "import"],
            )
            # the timing columns are read as padded text; strip, then cast
            .with_columns(cs.ends_with("_time").str.strip_chars().cast(pl.UInt32))
            .select("import", "own_time", "cumulative_time")
            .reverse()
        )
        polars_import_time = _import_time_from_frame(df_import)
        if polars_import_time < MAX_ALLOWED_IMPORT_TIME:
            return df_import, polars_import_time

        failed_attempts.append((polars_import_time, df_import))

    # note: if a qualifying import time was already achieved, we won't get here.
    # if we do, let's see all the failed timings to help see what's going on:
    import_times = [tm for tm, _ in failed_attempts]
    msg = "\n".join(f"({idx}) {tm:,}μs" for idx, tm in enumerate(import_times))
    min_max = f"Min => {min(import_times):,}μs, Max => {max(import_times):,}μs"
    print(f"\nImport times achieved over {n_tries} tries:\n{min_max}\n\n{msg}")

    # `min` returns the first of any equally-fast attempts
    best_time, best_frame = min(failed_attempts, key=lambda attempt: attempt[0])
    return best_frame, best_time
75
76
77
@pytest.mark.skipif(sys.platform == "win32", reason="Unreliable on Windows")
@pytest.mark.debug
@pytest.mark.slow
def test_polars_import() -> None:
    """Guard against lazy-loading and import-speed regressions for `polars`.

    Two things are checked against the timings of a fresh `import polars`:

    * none of the public names listed in `pl._dependencies.__all__` appear
      among the modules imported at startup;
    * the cumulative `polars` import time does not exceed
      `MAX_ALLOWED_IMPORT_TIME`.
    """
    # up-front compile '.py' -> '.pyc' before timing
    polars_path = Path(pl.__file__).parent
    compileall.compile_dir(polars_path, quiet=1)

    # note: reduce noise by allowing up to 'n' tries (but return immediately if/when
    # a qualifying time is achieved, so we don't waste time running unnecessary tests)
    df_import, polars_import_time = _import_timings_as_frame(n_tries=10)

    with pl.Config(
        # get a complete view of what's going on in case of failure
        tbl_rows=250,
        fmt_str_lengths=100,
        tbl_hide_dataframe_shape=True,
    ):
        # the raw module names keep the leading whitespace/indentation of the
        # '-X importtime' output; strip it first, otherwise a prefix match
        # against a bare module name can never succeed and the check below
        # would pass vacuously.
        imported_modules = df_import["import"].str.strip_chars()

        # ensure that we have not broken lazy-loading (numpy, pandas, pyarrow, etc).
        lazy_modules = [
            dep for dep in pl._dependencies.__all__ if not dep.startswith("_")
        ]
        for mod in lazy_modules:
            not_imported = not imported_modules.str.starts_with(mod).any()
            if_err = f"lazy-loading regression: found {mod!r} at import time"
            assert not_imported, f"{if_err}\n{df_import}"

        # ensure that we do not have an import speed regression.
        if polars_import_time > MAX_ALLOWED_IMPORT_TIME:
            import_time_ms = polars_import_time // 1_000
            msg = f"Possible import speed regression; took {import_time_ms}ms\n{df_import}"
            raise AssertionError(msg)
109
110