From 65bfa488bc8d5abcbe2fe8335ca3753c9abc69ed Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 15 Nov 2023 19:03:25 +0400 Subject: [PATCH] test(python): Improved `import polars` timing test; now much more consistent/reliable (#12478) --- py-polars/tests/unit/test_polars_import.py | 93 ++++++++++++++-------- 1 file changed, 59 insertions(+), 34 deletions(-) diff --git a/py-polars/tests/unit/test_polars_import.py b/py-polars/tests/unit/test_polars_import.py index 1012cb7ff236..7093f69d59e1 100644 --- a/py-polars/tests/unit/test_polars_import.py +++ b/py-polars/tests/unit/test_polars_import.py @@ -1,57 +1,83 @@ +from __future__ import annotations + +import compileall import subprocess import sys +from pathlib import Path import pytest import polars as pl +from polars import selectors as cs + +# set a maximum cutoff at 0.2 secs; note that we are typically much faster +# than this (more like ~0.07 secs, depending on hardware), but we allow a +# margin of error to account for frequent noise from slow/contended CI. +MAX_ALLOWED_IMPORT_TIME = 200_000 # << microseconds + + +def _import_time_from_frame(tm: pl.DataFrame) -> int: + return int( + tm.filter(pl.col("import").str.strip_chars() == "polars") + .select("cumulative_time") + .item() + ) def _import_timings() -> bytes: # assemble suitable command to get polars module import timing; # run in a separate process to ensure clean timing results. cmd = f'{sys.executable} -X importtime -c "import polars"' - return ( + output = ( subprocess.run(cmd, shell=True, capture_output=True) .stderr.replace(b"import time:", b"") .strip() ) + return output -def _import_timings_as_frame(best_of: int) -> pl.DataFrame: - # create master frame as minimum of 'best_of' timings. - import_timings = [ - pl.read_csv( - source=_import_timings(), - separator="|", - has_header=True, - new_columns=["own_time", "cumulative_time", "import"], - ).with_columns( - pl.col(["own_time", "cumulative_time"]).str.strip_chars().cast(pl.Int32) +def _import_timings_as_frame(n_tries: int) -> tuple[pl.DataFrame, int]: + import_timings = [] + for _ in range(n_tries): + df_import = ( + pl.read_csv( + source=_import_timings(), + separator="|", + has_header=True, + new_columns=["own_time", "cumulative_time", "import"], + ) + .with_columns(cs.ends_with("_time").str.strip_chars().cast(pl.UInt32)) + .select("import", "own_time", "cumulative_time") + .reverse() ) - for _ in range(best_of) - ] - return ( - sorted( - import_timings, - key=lambda tm: int(tm["cumulative_time"].max()), # type: ignore[arg-type] - )[0] - .reverse() - .select("import", "own_time", "cumulative_time") - ) + polars_import_time = _import_time_from_frame(df_import) + if polars_import_time < MAX_ALLOWED_IMPORT_TIME: + return df_import, polars_import_time + + import_timings.append(df_import) + + # note: if a qualifying import time was already achieved, we won't get here + df_fastest_import = sorted(import_timings, key=_import_time_from_frame)[0] + return df_fastest_import, _import_time_from_frame(df_fastest_import) @pytest.mark.skipif(sys.platform == "win32", reason="Unreliable on Windows") @pytest.mark.slow() def test_polars_import() -> None: - # note: take the fastest of several runs to reduce noise. - df_import = _import_timings_as_frame(best_of=3) + # up-front compile '.py' -> '.pyc' before timing + polars_path = Path(pl.__file__).parent + compileall.compile_dir(polars_path, quiet=1) - with pl.Config() as cfg: - # get a complete view of what's going on in case of failure - cfg.set_tbl_rows(250) - cfg.set_fmt_str_lengths(100) - cfg.set_tbl_hide_dataframe_shape() + # note: reduce noise by allowing up to 'n' tries (but return immediately if/when + # a qualifying time is achieved, so we don't waste time running unnecessary tests) + df_import, polars_import_time = _import_timings_as_frame(n_tries=5) + with pl.Config( + # get a complete view of what's going on in case of failure + tbl_rows=250, + fmt_str_lengths=100, + tbl_hide_dataframe_shape=True, + ): # ensure that we have not broken lazy-loading (numpy, pandas, pyarrow, etc). lazy_modules = [ dep for dep in pl.dependencies.__all__ if not dep.startswith("_") @@ -62,9 +88,8 @@ def test_polars_import() -> None: assert not_imported, f"{if_err}\n{df_import}" # ensure that we do not have an import speed regression. - polars_import = df_import.filter(pl.col("import").str.strip_chars() == "polars") - polars_import_time = polars_import["cumulative_time"].item() - assert isinstance(polars_import_time, int) - - if_err = f"Possible import speed regression; took {polars_import_time//1_000}ms" - assert polars_import_time < 200_000, f"{if_err}\n{df_import}" + if polars_import_time > MAX_ALLOWED_IMPORT_TIME: + import_time_ms = polars_import_time // 1_000 + raise AssertionError( + f"Possible import speed regression; took {import_time_ms}ms\n{df_import}" + )