feat: Adding running time pipeline implementation #25

Open · wants to merge 1 commit into running_time
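
This PR extends the streaming pipeline in src/pipeline.py so that each data chunk is also used for running time prediction, alongside the existing dwell time prediction. Below is a minimal, self-contained sketch of the per-chunk preprocessing the diff adds for the running time stream; the column names follow the diff, while the toy values are made up for illustration:

from pandas import DataFrame, to_datetime

# Toy running time chunk with the columns used in the diff (values are made up).
rt_chunk = DataFrame(
    {
        "date": ["2022-10-01", "2022-10-01"],
        "start_time": ["06:05:00", "06:20:00"],
        "segment": [1, 2],
        "run_time_in_seconds": [310.0, 295.0],
    }
)

# Combine the date and time columns into a single datetime column used to window the stream.
rt_chunk["start_datetime"] = to_datetime(rt_chunk["date"] + " " + rt_chunk["start_time"])

# Keep only numeric columns, then split the chunk into features (x) and target (y).
numeric_rt_chunk = rt_chunk.select_dtypes(include="number")
rt_x = numeric_rt_chunk.drop(columns=["run_time_in_seconds"])
rt_y = numeric_rt_chunk[["run_time_in_seconds"]]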
src/pipeline.py: 264 changes (218 additions & 46 deletions)
@@ -55,20 +55,30 @@ def run_exp(
rt_df: DataFrame = BUS_654_FEATURES_ADDED_RUNNING_TIMES.dataframe
dt_df: DataFrame = BUS_654_FEATURES_ADDED_DWELL_TIMES.dataframe

rt_df["start_datetime"] = to_datetime(rt_df["date"] + " " + rt_df["start_time"])
dt_df["arrival_datetime"] = to_datetime(dt_df["date"] + " " + dt_df["arrival_time"])

base_model: Optional[MME4BAT] = None
model: Optional[MME4BAT] = None

result_rt_df = DataFrame(
columns=rt_df.columns.tolist()
+ ["true_prediction", "base_model_prediction", "model_prediction"]
)

result_dt_df = DataFrame(
columns=dt_df.columns.tolist()
+ ["true_prediction", "base_model_prediction", "model_prediction"]
+ ["true_prediction", "base_model_prediction", "model_prediction"]
)

rt_true_predictions = []
rt_base_model_predictions = []
rt_model_predictions = []
dt_true_predictions = []
dt_base_model_predictions = []
dt_model_predictions = []

rt_x_buffer: Optional[DataFrame] = None
rt_y_buffer: Optional[DataFrame] = None
dt_x_buffer: Optional[DataFrame] = None
dt_y_buffer: Optional[DataFrame] = None

@@ -82,52 +92,94 @@ def run_exp(
flush=True,
)

rt_chunk: DataFrame = rt_df.loc[
(from_date_time <= rt_df["start_datetime"])
& (rt_df["start_datetime"] < to_date_time),
:,
].reset_index(drop=True)

dt_chunk: DataFrame = dt_df.loc[
(from_date_time <= dt_df["arrival_datetime"])
& (dt_df["arrival_datetime"] < to_date_time),
:,
].reset_index(drop=True)

from_date_time = stream_start if from_date_time == hist_start else to_date_time
to_date_time = from_date_time + timedelta(minutes=interval_min)

if len(rt_chunk) == 0:
print(" | NO INSTANCES FOR RUNNING TIME")
continue

if len(dt_chunk) == 0:
print(" | NO INSTANCES")
print(" | NO INSTANCES FOR DWELL TIME")
continue

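# Keep only the numeric columns of each chunk, then split into
# features (x) and targets (y) for the running time and dwell time models.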
numeric_rt_chunk = rt_chunk.select_dtypes(include="number")
numeric_dt_chunk = dt_chunk.select_dtypes(include="number")

rt_x: DataFrame = numeric_rt_chunk.drop(columns=["run_time_in_seconds"])
rt_y: DataFrame = numeric_rt_chunk[["run_time_in_seconds"]]
dt_x: DataFrame = numeric_dt_chunk.drop(columns=["dwell_time_in_seconds"])
dt_y: DataFrame = numeric_dt_chunk[["dwell_time_in_seconds"]]

if not model:
base_model = MME4BAT()
model = MME4BAT(cdd_strategy=cdd_strategy)

base_model.fit(rt_x=rt_x, rt_y=rt_y, dt_x=dt_x, dt_y=dt_y)
model.fit(rt_x=rt_x, rt_y=rt_y, dt_x=dt_x, dt_y=dt_y)

print(" | MODEL INITIATED")
else:
true_prediction_rt = rt_y["run_time_in_seconds"].tolist()
true_prediction_dt = dt_y["dwell_time_in_seconds"].tolist()

# getting base model prediction
base_model_prediction = base_model.predict(rt_x=rt_x, dt_x=dt_x)
base_model_prediction_rt = base_model_prediction[0][
"prediction"
].tolist()
base_model_prediction_dt = base_model_prediction[1][
"prediction"
].tolist()

# getting current model prediction
model_prediction = model.predict(rt_x=rt_x, dt_x=dt_x)
model_prediction_rt = model_prediction[0][
"prediction"
].tolist()
model_prediction_dt = model_prediction[1][
"prediction"
].tolist()

rt_true_predictions.extend(true_prediction_rt)
rt_base_model_predictions.extend(base_model_prediction_rt)
rt_model_predictions.extend(model_prediction_rt)

dt_chunk["true_prediction"] = true_prediction
dt_chunk["base_model_prediction"] = base_model_prediction
dt_chunk["model_prediction"] = model_prediction
dt_true_predictions.extend(true_prediction_dt)
dt_base_model_predictions.extend(base_model_prediction_dt)
dt_model_predictions.extend(model_prediction_dt)

rt_chunk["true_prediction"] = true_prediction_rt
rt_chunk["base_model_prediction"] = base_model_prediction_rt
rt_chunk["model_prediction"] = model_prediction_rt

dt_chunk["true_prediction"] = true_prediction_dt
dt_chunk["base_model_prediction"] = base_model_prediction_dt
dt_chunk["model_prediction"] = model_prediction_dt

result_rt_df = concat([result_rt_df, rt_chunk], ignore_index=True)
result_dt_df = concat([result_dt_df, dt_chunk], ignore_index=True)

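# Active strategy: accumulate the incoming running time and dwell time instances
# in buffers and, once concept drift is detected, retrain the model incrementally
# on the buffered data.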
if active_strategy:
if (rt_x_buffer is None) or (rt_y_buffer is None):
rt_x_buffer = rt_x
rt_y_buffer = rt_y
else:
rt_x_buffer = concat([rt_x_buffer, rt_x], ignore_index=True)
rt_y_buffer = concat([rt_y_buffer, rt_y], ignore_index=True)

if (dt_x_buffer is None) or (dt_y_buffer is None):
dt_x_buffer = dt_x
dt_y_buffer = dt_y
@@ -136,20 +188,20 @@ def run_exp(
dt_y_buffer = concat([dt_y_buffer, dt_y], ignore_index=True)

is_detected = model.is_concept_drift_detected(
ni_rt_x=rt_x, ni_rt_y=rt_y, ni_dt_x=dt_x, ni_dt_y=dt_y
)
if is_detected:
model.incremental_fit(
ni_rt_x=rt_x_buffer,
ni_rt_y=rt_y_buffer,
ni_dt_x=dt_x_buffer,
ni_dt_y=dt_y_buffer,
)
# Reset both the running time and dwell time buffers once the model has been retrained on them.
rt_x_buffer = None
rt_y_buffer = None
dt_x_buffer = None
dt_y_buffer = None
else:
model.incremental_fit(
ni_rt_x=rt_x, ni_rt_y=rt_y, ni_dt_x=dt_x, ni_dt_y=dt_y
)

print("\rDATA STREAMING ENDED.", flush=True)
@@ -180,12 +232,19 @@ def run_exp(
- Concept drift detection algorithm: {cdd_strategy.__class__.__name__ if active_strategy else None}
{cdd_strategy_content}

## Results for running time prediction
### Model Performance Metrics
| Model | MAE (s) | MAPE (%) | RMSE (s) |
|-------------------------------------|-----------|------------|------------|
| Base Model (XGBoost) | {mean_absolute_error(rt_true_predictions, rt_base_model_predictions)} | {mean_absolute_percentage_error(rt_true_predictions, rt_base_model_predictions) * 100} | {mean_squared_error(rt_true_predictions, rt_base_model_predictions, squared=False)} |
| Base Model with Incremental Learning | {mean_absolute_error(rt_true_predictions, rt_model_predictions)} | {mean_absolute_percentage_error(rt_true_predictions, rt_model_predictions) * 100} | {mean_squared_error(rt_true_predictions, rt_model_predictions, squared=False)} |

## Results for dwell time prediction
### Model Performance Metrics
| Model | MAE (s) | MAPE (%) | RMSE (s) |
|-------------------------------------|-----------|------------|------------|
| Base Model (XGBoost) | {mean_absolute_error(dt_true_predictions, dt_base_model_predictions)} | {mean_absolute_percentage_error(dt_true_predictions, dt_base_model_predictions) * 100} | {mean_squared_error(dt_true_predictions, dt_base_model_predictions, squared=False)} |
| Base Model with Incremental Learning | {mean_absolute_error(dt_true_predictions, dt_model_predictions)} | {mean_absolute_percentage_error(dt_true_predictions, dt_model_predictions) * 100} | {mean_squared_error(dt_true_predictions, dt_model_predictions, squared=False)} |

"""

@@ -258,18 +317,62 @@ def run_exp(
# plt.legend()
# plt.savefig("dt.png", dpi=150)

df_rt = result_rt_df
df_rt["date"] = to_datetime(df_rt["date"])
df_rt["start_time"] = to_datetime(df_rt["start_time"], format="%H:%M:%S")
df_dt = result_dt_df
df_dt["date"] = to_datetime(df_dt["date"])
df_dt["arrival_time"] = to_datetime(df_dt["arrival_time"], format="%H:%M:%S")

directions_rt = df_rt["direction"].unique()
segments = df_rt["segment"].unique()
directions_dt = df_dt["direction"].unique()
bus_stops = df_dt["bus_stop"].unique()

starting_time = datetime.strptime("06:00:00", "%H:%M:%S")
ending_time = datetime.strptime("18:00:00", "%H:%M:%S")

from_time = starting_time

while from_time < ending_time:
to_time = from_time + timedelta(minutes=interval_min)
print(
f"\rGENERATING & EXPORTING PLOTS FOR EACH SEGMENTS: [{from_time.strftime('%H:%M:%S')} - {to_time.strftime('%H:%M:%S')})",
end="",
flush=True,
)

for direction in directions_rt:

for segment in segments:
filtered_df = df_rt[
(df_rt["direction"] == direction)
& (df_rt["segment"] == segment)
& (df_rt["start_time"].dt.time >= from_time.time())
& (df_rt["start_time"].dt.time < to_time.time())
]

export_at = os.path.join(
output_dir,
f"rt-d-{direction}-ti-{from_time.strftime('%H-%M-%S')}_{to_time.strftime('%H-%M-%S')}",
)
os.makedirs(export_at, exist_ok=True)

if len(filtered_df) > 0:
export_mean_rt_plot_as_image(
df=filtered_df,
segment=segment,
starting_time=from_time,
ending_time=to_time,
export_at=export_at,
)

from_time = to_time

print("\rGENERATING & EXPORTING PLOTS FOR EACH SEGMENTS ENDED.", flush=True)

from_time = starting_time

while from_time < ending_time:
to_time = from_time + timedelta(minutes=interval_min)
print(
@@ -278,14 +381,15 @@ def run_exp(
flush=True,
)

for direction in directions_dt:

for bus_stop in bus_stops:
filtered_df = df_dt[
(df_dt["direction"] == direction)
& (df_dt["bus_stop"] == bus_stop)
& (df_dt["arrival_time"].dt.time >= from_time.time())
& (df_dt["arrival_time"].dt.time < to_time.time())
]

export_at = os.path.join(
output_dir,
@@ -308,11 +412,11 @@ def run_exp(


def export_mean_dt_plot_as_image(
df: DataFrame,
bus_stop: int,
starting_time,
ending_time,
export_at: str,
) -> None:
dt_in_seconds_df = (
df.groupby("date")["dwell_time_in_seconds"]
@@ -373,3 +477,71 @@ def export_mean_dt_plot_as_image(
plt.tight_layout()
plt.savefig(f"{export_at}/{bus_stop}.png", dpi=200)
plt.close()


def export_mean_rt_plot_as_image(
df: DataFrame,
segment: int,
starting_time,
ending_time,
export_at: str,
) -> None:
rt_in_seconds_df = (
df.groupby("date")["run_time_in_seconds"]
.mean()
.reset_index()
.sort_values(by="date")
)

base_model_prediction_df = (
df.groupby("date")["base_model_prediction"]
.mean()
.reset_index()
.sort_values(by="date")
)

model_prediction_df = (
df.groupby("date")["model_prediction"]
.mean()
.reset_index()
.sort_values(by="date")
)

x = model_prediction_df["date"]

plt.figure(figsize=(13, 5))
plt.plot(
x,
rt_in_seconds_df["run_time_in_seconds"],
label="True running time",
marker="o",
linestyle="-",
color="blue",
)
plt.plot(
x,
base_model_prediction_df["base_model_prediction"],
label="Base model prediction",
marker="o",
linestyle="--",
color="green",
)
plt.plot(
x,
model_prediction_df["model_prediction"],
label="Prediction with incremental online learning",
marker="o",
linestyle="--",
color="orange",
)
plt.xlabel("Date")
plt.ylabel("Mean running time (s)")
plt.title(
f"The mean running time at segment {segment} for each day between {starting_time.time()} and {ending_time.time()}."
)
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.savefig(f"{export_at}/{segment}.png", dpi=200)
plt.close()
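
For reference, a short usage sketch of the new export_mean_rt_plot_as_image helper; the import path and the toy data are assumptions for illustration, while the keyword arguments follow the signature added above:

import os
from datetime import datetime

from pandas import DataFrame

# Assumed import path for the module this PR changes.
from src.pipeline import export_mean_rt_plot_as_image

# Toy result frame with the columns the helper expects (values are made up).
df = DataFrame(
    {
        "date": ["2022-10-01", "2022-10-02", "2022-10-03"],
        "run_time_in_seconds": [310.0, 298.0, 325.0],
        "base_model_prediction": [305.0, 301.0, 318.0],
        "model_prediction": [309.0, 297.0, 322.0],
    }
)

export_at = os.path.join("outputs", "rt-demo")
os.makedirs(export_at, exist_ok=True)

export_mean_rt_plot_as_image(
    df=df,
    segment=3,  # hypothetical segment id; used only in the plot title and file name
    starting_time=datetime.strptime("06:00:00", "%H:%M:%S"),
    ending_time=datetime.strptime("06:15:00", "%H:%M:%S"),
    export_at=export_at,
)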