Skip to content

Commit

Permalink
speeding up check_windspeed_drift (test mean 980us -> 482us) (#43)
Browse files Browse the repository at this point in the history
* speeding up check_windspeed_drift (test mean 980us -> 482us)

* remove benchmarking call and run linting

* fix untested func

* speeding up pre_post_pp_analysis_with_reversal from 143ms to ~100ms by improving `pp_raw_df`
  • Loading branch information
gabrielecalvo authored Jan 15, 2025
1 parent f5d0a55 commit 7982f9b
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 58 deletions.
26 changes: 25 additions & 1 deletion tests/test_windspeed_drift.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from pathlib import Path

import numpy as np
import pandas as pd
import pytest

from wind_up.constants import REANALYSIS_WS_COL
from wind_up.models import WindUpConfig
from wind_up.windspeed_drift import check_windspeed_drift
from wind_up.windspeed_drift import _calculate_rolling_windspeed_diff, check_windspeed_drift


def test_check_windspeed_drift(test_lsa_t13_config: WindUpConfig) -> None:
Expand All @@ -23,3 +24,26 @@ def test_check_windspeed_drift(test_lsa_t13_config: WindUpConfig) -> None:
)
assert test_max_ws_drift == pytest.approx(0.45289044075068974)
assert test_max_ws_drift_pp_period == pytest.approx(0.42913942378401204)


def test_calc_rolling_windspeed_diff() -> None:
    """Pin `_calculate_rolling_windspeed_diff` on a synthetic 6-hourly ramp.

    The measured wind speed ramps linearly from 5 to 15 over 50 samples and
    the reanalysis column is the same ramp reversed. The test asserts that
    only the last 17 samples of the result are non-NaN, that they match a
    fixed linear sequence, and that the input frame is left unmodified.
    """
    n_values = 50
    n_valid_tail = 17
    timestep = pd.Timedelta("6h")
    ts_index = pd.date_range("2020-01-01", periods=n_values, freq=timestep)
    ramp = np.linspace(5, 15, n_values)
    test_df = pd.DataFrame({"ws_col": ramp, "reanalysis_ws_col": ramp[::-1]}, index=ts_index)

    # snapshot taken before the call so that input mutation can be detected afterwards
    snapshot = test_df.copy()

    actual = _calculate_rolling_windspeed_diff(
        wtg_df=test_df,
        ws_col="ws_col",
        reanalysis_ws_col="reanalysis_ws_col",
        timebase_s=int(timestep.total_seconds()),
    )

    # leading samples are expected to be NaN; only the tail carries values
    expected_values = np.concatenate(
        [
            np.full(n_values - n_valid_tail, np.nan),
            np.linspace(-2.2448979591836746, 1.0204081632653068, n_valid_tail),
        ]
    )
    expected = pd.Series(expected_values, index=ts_index)
    pd.testing.assert_series_equal(actual, expected)

    # checking original dataframe is not modified
    pd.testing.assert_frame_equal(test_df, snapshot)
4 changes: 2 additions & 2 deletions wind_up/plots/windspeed_drift_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@

def plot_rolling_windspeed_diff_one_wtg(
*,
wtg_df: pd.DataFrame,
ser: pd.Series,
wtg_name: str,
ws_col: str,
plot_cfg: PlotConfig,
sub_dir: str | None,
) -> None:
plt.figure()
plt.plot(wtg_df["rolling_windspeed_diff"])
plt.plot(ser)
plot_title = f"{wtg_name} rolling {ws_col} diff to reanalysis"
plt.title(plot_title)
plt.xlabel("datetime")
Expand Down
45 changes: 24 additions & 21 deletions wind_up/pp_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,34 +27,37 @@ def pp_raw_df(
pw_col: str,
timebase_s: int,
) -> pd.DataFrame:
pp_df = (
pre_or_post_df.dropna(subset=[pw_col, ws_col])
return (
pre_or_post_df.loc[:, [pw_col, ws_col]]
.dropna()
.groupby(
by=pd.cut(pre_or_post_df[ws_col], bins=ws_bin_edges, retbins=False),
observed=False,
)
.agg(
count=pd.NamedAgg(column=pw_col, aggfunc=lambda x: len(x)),
ws_mean=pd.NamedAgg(column=ws_col, aggfunc=lambda x: x.mean()),
ws_std=pd.NamedAgg(column=ws_col, aggfunc=lambda x: x.std()),
pw_mean=pd.NamedAgg(column=pw_col, aggfunc=lambda x: x.mean()),
pw_std=pd.NamedAgg(column=pw_col, aggfunc=lambda x: x.std()),
count=pd.NamedAgg(column=pw_col, aggfunc=len),
ws_mean=pd.NamedAgg(column=ws_col, aggfunc="mean"),
ws_std=pd.NamedAgg(column=ws_col, aggfunc="std"),
pw_mean=pd.NamedAgg(column=pw_col, aggfunc="mean"),
pw_std=pd.NamedAgg(column=pw_col, aggfunc="std"),
)
.assign(
ws_std=lambda x: x["ws_std"].fillna(0),
pw_std=lambda x: x["pw_std"].fillna(0),
hours=lambda x: x["count"] * timebase_s / 3600,
ws_sem=lambda x: x["ws_std"] / np.sqrt(x["count"].clip(lower=1)),
pw_sem=lambda x: x["pw_std"] / np.sqrt(x["count"].clip(lower=1)),
)
.pipe(lambda d: d.set_axis(d.columns.map(lambda x: f"{x}_{pre_or_post}"), axis="columns"))
.assign(
bin_left=lambda x: [i.left for i in x.index],
bin_mid=lambda x: [i.mid for i in x.index],
bin_right=lambda x: [i.right for i in x.index],
bin_closed_right=lambda x: [i.closed_right for i in x.index],
)
.set_index("bin_mid", drop=False, verify_integrity=True)
.rename_axis(f"{ws_col}_bin_mid", axis=0)
)
pp_df["ws_std"] = pp_df["ws_std"].fillna(0)
pp_df["pw_std"] = pp_df["pw_std"].fillna(0)
rows_per_hour = 3600 / timebase_s
pp_df["hours"] = pp_df["count"] / rows_per_hour
pp_df["ws_sem"] = pp_df["ws_std"] / np.sqrt(pp_df["count"].clip(lower=1))
pp_df["pw_sem"] = pp_df["pw_std"] / np.sqrt(pp_df["count"].clip(lower=1))
pp_df.columns = [x + f"_{pre_or_post}" for x in pp_df.columns]
pp_df["bin_left"] = [x.left for x in pp_df.index]
pp_df["bin_mid"] = [x.mid for x in pp_df.index]
pp_df["bin_right"] = [x.right for x in pp_df.index]
pp_df["bin_closed_right"] = [x.closed_right for x in pp_df.index]
pp_df = pp_df.set_index("bin_mid", drop=False, verify_integrity=True)
pp_df.index.name = f"{ws_col}_bin_mid"
return pp_df


def _calc_rated_ws(*, pp_df: pd.DataFrame, pw_col: str, rated_power: float) -> float:
Expand Down
72 changes: 38 additions & 34 deletions wind_up/windspeed_drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,39 +13,43 @@
from wind_up.models import PlotConfig, WindUpConfig


def add_rolling_windspeed_diff(
wtg_df: pd.DataFrame, *, ws_col: str, reanalysis_ws_col: str, timebase_s: int
) -> pd.DataFrame:
wtg_df = wtg_df.copy()

# check for ws drift issue
wtg_df["ws_diff_to_renalysis"] = wtg_df[ws_col] - wtg_df[reanalysis_ws_col]
ws_ll = 6
ws_ul = 15
wtg_df.loc[wtg_df[ws_col] < ws_ll, "ws_diff_to_renalysis"] = np.nan
wtg_df.loc[wtg_df[ws_col] > ws_ul, "ws_diff_to_renalysis"] = np.nan
rolling_days = 90
def _calculate_rolling_windspeed_diff(
wtg_df: pd.DataFrame,
*,
ws_col: str,
reanalysis_ws_col: str,
timebase_s: int,
ws_ll: float = 6,
ws_ul: float = 15,
rolling_period: float = 90,
min_roll_days: float = 14,
min_rolling_coverage: float = 1 / 3,
) -> pd.Series:
ws_diff_to_renalysis = wtg_df[ws_col] - wtg_df[reanalysis_ws_col]
ws_diff_to_renalysis.loc[(wtg_df[ws_col] < ws_ll) | (wtg_df[ws_col] > ws_ul)] = np.nan

rows_per_day = 24 * 3600 / timebase_s
wtg_df["rolling_windspeed_diff"] = (
wtg_df["ws_diff_to_renalysis"]
.rolling(window=round(rolling_days * rows_per_day), min_periods=round(rolling_days * rows_per_day // 3))
.median()
)
min_roll_days = 14
while rolling_days >= (min_roll_days * 2) and len(wtg_df["rolling_windspeed_diff"].dropna()) == 0:
rolling_days = rolling_days // 2
wtg_df["rolling_windspeed_diff"] = (
wtg_df["ws_diff_to_renalysis"]
.rolling(window=round(rolling_days * rows_per_day), min_periods=round(rolling_days * rows_per_day // 3))
.median()
)
if len(wtg_df["rolling_windspeed_diff"].dropna()) == 0:

def _rolling_specs(rolling_period: float) -> dict[str, int]:
return {
"window": round(rolling_period * rows_per_day),
"min_periods": round(rolling_period * rows_per_day * min_rolling_coverage),
}

rolling_windspeed_diff = ws_diff_to_renalysis.rolling(**_rolling_specs(rolling_period)).median()

while rolling_period >= (min_roll_days * 2) and len(rolling_windspeed_diff.dropna()) == 0:
rolling_period = rolling_period // 2
rolling_windspeed_diff = ws_diff_to_renalysis.rolling(**_rolling_specs(rolling_period)).median()

if len(rolling_windspeed_diff.dropna()) == 0:
result_manager.warning("could not calculate rolling windspeed diff")
return wtg_df

return rolling_windspeed_diff


def calc_max_abs_relative_rolling_windspeed_diff(ser: pd.Series) -> float:
    """Return the largest absolute deviation of ``ser`` from its own median.

    NaN entries are skipped by both the median and the max; if ``ser`` has no
    non-NaN values the result is NaN.

    :param ser: rolling windspeed difference series
    :return: ``max(|ser - median(ser)|)``
    """
    deviation_from_median = ser - ser.median()
    return deviation_from_median.abs().max()


def check_windspeed_drift(
Expand All @@ -58,18 +62,18 @@ def check_windspeed_drift(
plot_cfg: PlotConfig | None,
sub_dir: str | None = None,
) -> tuple[float, float]:
wtg_df = wtg_df.copy()
wtg_df = add_rolling_windspeed_diff(
rolling_windspeed_diff = _calculate_rolling_windspeed_diff(
wtg_df, ws_col=ws_col, reanalysis_ws_col=reanalysis_ws_col, timebase_s=cfg.timebase_s
)

if plot_cfg is not None:
plot_rolling_windspeed_diff_one_wtg(
wtg_df=wtg_df, wtg_name=wtg_name, ws_col=ws_col, plot_cfg=plot_cfg, sub_dir=sub_dir
ser=rolling_windspeed_diff, wtg_name=wtg_name, ws_col=ws_col, plot_cfg=plot_cfg, sub_dir=sub_dir
)

max_abs_rel_diff = calc_max_abs_relative_rolling_windspeed_diff(wtg_df)
max_abs_rel_diff = calc_max_abs_relative_rolling_windspeed_diff(rolling_windspeed_diff)
max_abs_rel_diff_pp_period = calc_max_abs_relative_rolling_windspeed_diff(
wtg_df.loc[cfg.analysis_first_dt_utc_start : cfg.analysis_last_dt_utc_start], # type: ignore[misc]
rolling_windspeed_diff.loc[cfg.analysis_first_dt_utc_start : cfg.analysis_last_dt_utc_start], # type: ignore[misc]
)

ws_diff_ul = 1
Expand Down

0 comments on commit 7982f9b

Please sign in to comment.