# Path: blob/main/py-polars/tests/unit/meta/test_polars_import.py
from __future__ import annotations12import compileall3import subprocess4import sys5from pathlib import Path67import pytest89import polars as pl10from polars import selectors as cs1112# set a maximum cutoff at 0.5 secs; note that we are typically much faster13# than this (more like ~0.07 secs, depending on hardware), but we allow a14# margin of error to account for frequent noise from slow/contended CI.15MAX_ALLOWED_IMPORT_TIME = 500_000 # << microseconds161718def _import_time_from_frame(tm: pl.DataFrame) -> int:19return int(20tm.filter(pl.col("import").str.strip_chars() == "polars")21.select("cumulative_time")22.item()23)242526def _import_timings() -> bytes:27# assemble suitable command to get polars module import timing;28# run in a separate process to ensure clean timing results.29cmd = f'{sys.executable} -S -X importtime -c "import polars"'30python_path = (31f"{Path(pl.__file__).parent.parent}:{Path(pl._plr.__file__).parent.parent}"32)33output = subprocess.run(34cmd,35shell=True,36capture_output=True,37env={"PYTHONPATH": python_path},38).stderr39if b"Traceback" in output:40msg = f"measuring import timings failed\n\nCommand output:\n{output.decode()}"41raise RuntimeError(msg)42return output.replace(b"import time:", b"").strip()434445def _import_timings_as_frame(n_tries: int) -> tuple[pl.DataFrame, int]:46import_timings = []47for _ in range(n_tries):48df_import = (49pl.read_csv(50source=_import_timings(),51separator="|",52has_header=True,53new_columns=["own_time", "cumulative_time", "import"],54)55.with_columns(cs.ends_with("_time").str.strip_chars().cast(pl.UInt32))56.select("import", "own_time", "cumulative_time")57.reverse()58)59polars_import_time = _import_time_from_frame(df_import)60if polars_import_time < MAX_ALLOWED_IMPORT_TIME:61return df_import, polars_import_time6263import_timings.append(df_import)6465# note: if a qualifying import time was already achieved, we won't get here.66# if we do, let's see all the failed timings to help see what's going 
on:67import_times = [_import_time_from_frame(df) for df in import_timings]68msg = "\n".join(f"({idx}) {tm:,}μs" for idx, tm in enumerate(import_times))69min_max = f"Min => {min(import_times):,}μs, Max => {max(import_times):,}μs)"70print(f"\nImport times achieved over {n_tries} tries:\n{min_max}\n\n{msg}")7172sorted_timing_frames = sorted(import_timings, key=_import_time_from_frame)73return sorted_timing_frames[0], min(import_times)747576@pytest.mark.skipif(sys.platform == "win32", reason="Unreliable on Windows")77@pytest.mark.debug78@pytest.mark.slow79def test_polars_import() -> None:80# up-front compile '.py' -> '.pyc' before timing81polars_path = Path(pl.__file__).parent82compileall.compile_dir(polars_path, quiet=1)8384# note: reduce noise by allowing up to 'n' tries (but return immediately if/when85# a qualifying time is achieved, so we don't waste time running unnecessary tests)86df_import, polars_import_time = _import_timings_as_frame(n_tries=10)8788with pl.Config(89# get a complete view of what's going on in case of failure90tbl_rows=250,91fmt_str_lengths=100,92tbl_hide_dataframe_shape=True,93):94# ensure that we have not broken lazy-loading (numpy, pandas, pyarrow, etc).95lazy_modules = [96dep for dep in pl._dependencies.__all__ if not dep.startswith("_")97]98for mod in lazy_modules:99not_imported = not df_import["import"].str.starts_with(mod).any()100if_err = f"lazy-loading regression: found {mod!r} at import time"101assert not_imported, f"{if_err}\n{df_import}"102103# ensure that we do not have an import speed regression.104if polars_import_time > MAX_ALLOWED_IMPORT_TIME:105import_time_ms = polars_import_time // 1_000106msg = f"Possible import speed regression; took {import_time_ms}ms\n{df_import}"107raise AssertionError(msg)108109110