diff --git a/src/pipeline.py b/src/pipeline.py index 6aa80dd..f279e98 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -55,20 +55,30 @@ def run_exp( rt_df: DataFrame = BUS_654_FEATURES_ADDED_RUNNING_TIMES.dataframe dt_df: DataFrame = BUS_654_FEATURES_ADDED_DWELL_TIMES.dataframe + rt_df["start_datetime"] = to_datetime(rt_df["date"] + " " + rt_df["start_time"]) dt_df["arrival_datetime"] = to_datetime(dt_df["date"] + " " + dt_df["arrival_time"]) base_model: Optional[MME4BAT] = None model: Optional[MME4BAT] = None + result_rt_df = DataFrame( + columns=rt_df.columns.tolist() + + ["true_prediction", "base_model_prediction", "model_prediction"] + ) + result_dt_df = DataFrame( columns=dt_df.columns.tolist() - + ["true_prediction", "base_model_prediction", "model_prediction"] + + ["true_prediction", "base_model_prediction", "model_prediction"] ) - - true_predictions = [] - base_model_predictions = [] - model_predictions = [] - + rt_true_predictions = [] + rt_base_model_predictions = [] + rt_model_predictions = [] + dt_true_predictions = [] + dt_base_model_predictions = [] + dt_model_predictions = [] + + rt_x_buffer: Optional[DataFrame] = None + rt_y_buffer: Optional[DataFrame] = None dt_x_buffer: Optional[DataFrame] = None dt_y_buffer: Optional[DataFrame] = None @@ -82,21 +92,34 @@ def run_exp( flush=True, ) + rt_chunk: DataFrame = rt_df.loc[ + (from_date_time <= rt_df["start_datetime"]) + & (rt_df["start_datetime"] < to_date_time), + :, + ].reset_index(drop=True) + dt_chunk: DataFrame = dt_df.loc[ - (from_date_time <= dt_df["arrival_datetime"]) - & (dt_df["arrival_datetime"] < to_date_time), - :, - ].reset_index(drop=True) + (from_date_time <= dt_df["arrival_datetime"]) + & (dt_df["arrival_datetime"] < to_date_time), + :, + ].reset_index(drop=True) from_date_time = stream_start if from_date_time == hist_start else to_date_time to_date_time = from_date_time + timedelta(minutes=interval_min) + if len(rt_chunk) == 0: + print(" | NO INSTANCES FOR RUNNING TIME") + continue + if len(dt_chunk) == 0: - print(" | NO INSTANCES") + print(" | NO INSTANCES FOR DWELL TIME") continue + numeric_rt_chunk = rt_chunk.select_dtypes(include="number") numeric_dt_chunk = dt_chunk.select_dtypes(include="number") + rt_x: DataFrame = numeric_rt_chunk.drop(columns=["run_time_in_seconds"]) + rt_y: DataFrame = numeric_rt_chunk[["run_time_in_seconds"]] dt_x: DataFrame = numeric_dt_chunk.drop(columns=["dwell_time_in_seconds"]) dt_y: DataFrame = numeric_dt_chunk[["dwell_time_in_seconds"]] @@ -104,30 +127,59 @@ def run_exp( base_model = MME4BAT() model = MME4BAT(cdd_strategy=cdd_strategy) - base_model.fit(rt_x=None, rt_y=None, dt_x=dt_x, dt_y=dt_y) - model.fit(rt_x=None, rt_y=None, dt_x=dt_x, dt_y=dt_y) + base_model.fit(rt_x=rt_x, rt_y=rt_y, dt_x=dt_x, dt_y=dt_y) + model.fit(rt_x=rt_x, rt_y=rt_y, dt_x=dt_x, dt_y=dt_y) print(" | MODEL INITIATED") else: - true_prediction = dt_y["dwell_time_in_seconds"].tolist() - base_model_prediction = base_model.predict(rt_x=None, dt_x=dt_x)[ + true_prediction_rt = rt_y["run_time_in_seconds"].tolist() + true_prediction_dt = dt_y["dwell_time_in_seconds"].tolist() + + # getting base model prediction + base_model_prediction = base_model.predict(rt_x=rt_x, dt_x=dt_x) + base_model_prediction_rt = base_model_prediction[0][ + "prediction" + ].tolist() + base_model_prediction_dt = base_model_prediction[1][ + "prediction" + ].tolist() + + # getting current model prediction + model_prediction = model.predict(rt_x=rt_x, dt_x=dt_x) + model_prediction_rt = model_prediction[0][ "prediction" ].tolist() - model_prediction = model.predict(rt_x=None, dt_x=dt_x)[ + model_prediction_dt = model_prediction[1][ "prediction" ].tolist() - true_predictions.extend(true_prediction) - base_model_predictions.extend(base_model_prediction) - model_predictions.extend(model_prediction) + rt_true_predictions.extend(true_prediction_rt) + rt_base_model_predictions.extend(base_model_prediction_rt) + rt_model_predictions.extend(model_prediction_rt) - dt_chunk["true_prediction"] = true_prediction - dt_chunk["base_model_prediction"] = base_model_prediction - dt_chunk["model_prediction"] = model_prediction + dt_true_predictions.extend(true_prediction_dt) + dt_base_model_predictions.extend(base_model_prediction_dt) + dt_model_predictions.extend(model_prediction_dt) + rt_chunk["true_prediction"] = true_prediction_rt + rt_chunk["base_model_prediction"] = base_model_prediction_rt + rt_chunk["model_prediction"] = model_prediction_rt + + dt_chunk["true_prediction"] = true_prediction_dt + dt_chunk["base_model_prediction"] = base_model_prediction_dt + dt_chunk["model_prediction"] = model_prediction_dt + + result_rt_df = concat([result_rt_df, rt_chunk], ignore_index=True) result_dt_df = concat([result_dt_df, dt_chunk], ignore_index=True) if active_strategy: + if (rt_x_buffer is None) or (rt_y_buffer is None): + rt_x_buffer = rt_x + rt_y_buffer = rt_y + else: + rt_x_buffer = concat([rt_x_buffer, rt_x], ignore_index=True) + rt_y_buffer = concat([rt_y_buffer, rt_y], ignore_index=True) + if (dt_x_buffer is None) or (dt_y_buffer is None): dt_x_buffer = dt_x dt_y_buffer = dt_y @@ -136,12 +188,12 @@ def run_exp( dt_y_buffer = concat([dt_y_buffer, dt_y], ignore_index=True) is_detected = model.is_concept_drift_detected( - ni_rt_x=None, ni_rt_y=None, ni_dt_x=dt_x, ni_dt_y=dt_y + ni_rt_x=rt_x, ni_rt_y=rt_y, ni_dt_x=dt_x, ni_dt_y=dt_y ) if is_detected: model.incremental_fit( - ni_rt_x=None, - ni_rt_y=None, + ni_rt_x=rt_x_buffer, + ni_rt_y=rt_y_buffer, ni_dt_x=dt_x_buffer, ni_dt_y=dt_y_buffer, ) @@ -149,7 +201,7 @@ def run_exp( dt_y_buffer = None else: model.incremental_fit( - ni_rt_x=None, ni_rt_y=None, ni_dt_x=dt_x, ni_dt_y=dt_y + ni_rt_x=rt_x, ni_rt_y=rt_y, ni_dt_x=dt_x, ni_dt_y=dt_y ) print("\rDATA STREAMING ENDED.", flush=True) @@ -180,12 +232,19 @@ def run_exp( - Concept drift detection algorithm: {cdd_strategy.__class__.__name__ if active_strategy else None} {cdd_strategy_content} -## Results +## Results for running time prediction ### Model Performance Metrics | Model | MAE (s) | MAPE (%) | RMSE (s) | |-------------------------------------|-----------|------------|------------| -| Base Model (XGBoost) | {mean_absolute_error(true_predictions, base_model_predictions)} | {mean_absolute_percentage_error(true_predictions, base_model_predictions) * 100} | {mean_squared_error(true_predictions, base_model_predictions, squared=False)} | -| Base Model with Incremental Learning | {mean_absolute_error(true_predictions, model_predictions)} | {mean_absolute_percentage_error(true_predictions, model_predictions) * 100} | {mean_squared_error(true_predictions, model_predictions, squared=False)} | +| Base Model (XGBoost) | {mean_absolute_error(rt_true_predictions, rt_base_model_predictions)} | {mean_absolute_percentage_error(rt_true_predictions, rt_base_model_predictions) * 100} | {mean_squared_error(rt_true_predictions, rt_base_model_predictions, squared=False)} | +| Base Model with Incremental Learning | {mean_absolute_error(rt_true_predictions, rt_model_predictions)} | {mean_absolute_percentage_error(rt_true_predictions, rt_model_predictions) * 100} | {mean_squared_error(rt_true_predictions, rt_model_predictions, squared=False)} | + +## Results for dwell time prediction +### Model Performance Metrics +| Model | MAE (s) | MAPE (%) | RMSE (s) | +|-------------------------------------|-----------|------------|------------| +| Base Model (XGBoost) | {mean_absolute_error(dt_true_predictions, dt_base_model_predictions)} | {mean_absolute_percentage_error(dt_true_predictions, dt_base_model_predictions) * 100} | {mean_squared_error(dt_true_predictions, dt_base_model_predictions, squared=False)} | +| Base Model with Incremental Learning | {mean_absolute_error(dt_true_predictions, dt_model_predictions)} | {mean_absolute_percentage_error(dt_true_predictions, dt_model_predictions) * 100} | {mean_squared_error(dt_true_predictions, dt_model_predictions, squared=False)} | """ @@ -258,18 +317,62 @@ def run_exp( # plt.legend() # plt.savefig("dt.png", dpi=150) - df = result_dt_df - df["date"] = to_datetime(df["date"]) - df["arrival_time"] = to_datetime(df["arrival_time"], format="%H:%M:%S") + df_rt = result_rt_df + df_rt["date"] = to_datetime(df_rt["date"]) + df_rt["start_time"] = to_datetime(df_rt["start_time"], format="%H:%M:%S") + df_dt = result_dt_df + df_dt["date"] = to_datetime(df_dt["date"]) + df_dt["arrival_time"] = to_datetime(df_dt["arrival_time"], format="%H:%M:%S") - directions = df["direction"].unique() - bus_stops = df["bus_stop"].unique() + directions_rt = df_rt["direction"].unique() + segments = df_rt["segment"].unique() + directions_dt = df_dt["direction"].unique() + bus_stops = df_dt["bus_stop"].unique() starting_time = datetime.strptime("06:00:00", "%H:%M:%S") ending_time = datetime.strptime("18:00:00", "%H:%M:%S") from_time = starting_time + while from_time < ending_time: + to_time = from_time + timedelta(minutes=interval_min) + print( + f"\rGENERATING & EXPORTING PLOTS FOR EACH SEGMENTS: [{from_time.strftime('%H:%M:%S')} - {to_time.strftime('%H:%M:%S')})", + end="", + flush=True, + ) + + for direction in directions_rt: + + for segment in segments: + filtered_df = df_rt[ + (df_rt["direction"] == direction) + & (df_rt["segment"] == segment) + & (df_rt["start_time"].dt.time >= from_time.time()) + & (df_rt["start_time"].dt.time < to_time.time()) + ] + + export_at = os.path.join( + output_dir, + f"rt-d-{direction}-ti-{from_time.strftime('%H-%M-%S')}_{to_time.strftime('%H-%M-%S')}", + ) + os.makedirs(export_at, exist_ok=True) + + if len(filtered_df) > 0: + export_mean_rt_plot_as_image( + df=filtered_df, + segment=segment, + starting_time=from_time, + ending_time=to_time, + export_at=export_at, + ) + + from_time = to_time + + print("\rGENERATING & EXPORTING PLOTS FOR EACH SEGMENTS ENDED.", flush=True) + + from_time = starting_time + while from_time < ending_time: to_time = from_time + timedelta(minutes=interval_min) print( @@ -278,14 +381,15 @@ def run_exp( flush=True, ) - for direction in directions: + for direction in directions_dt: + for bus_stop in bus_stops: - filtered_df = df[ - (df["direction"] == direction) - & (df["bus_stop"] == bus_stop) - & (df["arrival_time"].dt.time >= from_time.time()) - & (df["arrival_time"].dt.time < to_time.time()) - ] + filtered_df = df_dt[ + (df_dt["direction"] == direction) + & (df_dt["bus_stop"] == bus_stop) + & (df_dt["arrival_time"].dt.time >= from_time.time()) + & (df_dt["arrival_time"].dt.time < to_time.time()) + ] export_at = os.path.join( output_dir, @@ -308,11 +412,11 @@ def run_exp( def export_mean_dt_plot_as_image( - df: DataFrame, - bus_stop: int, - starting_time, - ending_time, - export_at: str, + df: DataFrame, + bus_stop: int, + starting_time, + ending_time, + export_at: str, ) -> None: dt_in_seconds_df = ( df.groupby("date")["dwell_time_in_seconds"] @@ -373,3 +477,71 @@ def export_mean_dt_plot_as_image( plt.tight_layout() plt.savefig(f"{export_at}/{bus_stop}.png", dpi=200) plt.close() + + +def export_mean_rt_plot_as_image( + df: DataFrame, + segment: int, + starting_time, + ending_time, + export_at: str, +) -> None: + rt_in_seconds_df = ( + df.groupby("date")["run_time_in_seconds"] + .mean() + .reset_index() + .sort_values(by="date") + ) + + base_model_prediction_df = ( + df.groupby("date")["base_model_prediction"] + .mean() + .reset_index() + .sort_values(by="date") + ) + + model_prediction_df = ( + df.groupby("date")["model_prediction"] + .mean() + .reset_index() + .sort_values(by="date") + ) + + x = model_prediction_df["date"] + + plt.figure(figsize=(13, 5)) + plt.plot( + x, + rt_in_seconds_df["run_time_in_seconds"], + label="True running time", + marker="o", + linestyle="-", + color="blue", + ) + plt.plot( + x, + base_model_prediction_df["base_model_prediction"], + label="Base model prediction", + marker="o", + linestyle="--", + color="green", + ) + plt.plot( + x, + model_prediction_df["model_prediction"], + label="Prediction with incremental online learning", + marker="o", + linestyle="--", + color="orange", + ) + plt.xlabel("Date") + plt.ylabel("Mean running time (s)") + plt.title( + f"The mean running time at segment {segment} for each day between {starting_time.time()} and {ending_time.time()}." + ) + plt.xticks(rotation=45) + plt.grid(True) + plt.legend() + plt.tight_layout() + plt.savefig(f"{export_at}/{segment}.png", dpi=200) + plt.close()