Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

220 df index sort mgtfs #249

Merged
merged 7 commits into from
Feb 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 39 additions & 9 deletions src/transport_performance/gtfs/multi_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,12 +379,21 @@ def filter_to_bbox(
self._raise_empty_feed_error(bbox)
return None

def _summary_col_sorter(self, df: pd.DataFrame) -> pd.DataFrame:
SergioRec marked this conversation as resolved.
Show resolved Hide resolved
"""Sort columns for summary dfs."""
presets = ["day", "route_type"]
for col in sorted(df.columns.values):
if col not in presets:
presets.append(col)
return df.reindex(presets, axis=1).copy()

def _summarise_core(
self,
which: str = "trips",
summ_ops: list = [np.min, np.max, np.mean, np.median],
return_summary: bool = True,
to_days: bool = False,
sort_by_route_type: bool = False,
) -> pd.DataFrame:
"""Summarise the MultiGtfsInstance by either trip_id or route_id.

Expand All @@ -399,12 +408,16 @@ def _summarise_core(
by default [np.min, np.max, np.mean, np.median]
return_summary : bool, optional
When set to False, full data for each trip on each date will be
returned. Defaults to True.
returned, by default True.
to_days : bool, optional
Whether or not to aggregate to days, or to just return counts for
trips/routes for each date. When False, summ_ops becomes useless,
and should therefore nothing should be passed when calling this
function (so it remains as the default). Defaults to False.
function (so it remains as the default), by default False.
sort_by_route_type : bool, optional
Whether or not to sort the resulting dataframe by route_type.
This only impacts the resulting df when to_days=True,
by default False.

Returns
-------
Expand All @@ -422,12 +435,12 @@ def _summarise_core(
_type_defence(which, "which", str)
_type_defence(return_summary, "return_summary", bool)
_type_defence(to_days, "to_days", bool)
_type_defence(sort_by_route_type, "sort_by_route_type", bool)
which = which.lower().strip()
if which not in ["trips", "routes"]:
raise ValueError(
f"'which' must be one of ['trips', 'routes']. Got {which}"
)

# choose summary
if which == "trips":
group_col = "trip_id"
Expand Down Expand Up @@ -476,14 +489,20 @@ def _summarise_core(
column.replace("amin", "min").replace("amax", "max")
for column in trip_counts.columns.values
]
# Always sort by day so that route_type sorts are more organised
trip_counts = self.instances[0]._order_dataframe_by_day(trip_counts)
return trip_counts
if sort_by_route_type:
trip_counts = trip_counts.sort_values("route_type")
if to_days:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that if to_days is set to True, the index after sorting will be messed up. This is nitpicking, but perhaps we could add a reset_index somewhere.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good shout. Added this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've changed the reset_index to the return as this wasn't fixing the issue when to_days was True. Let me know if you're not happy with this!

trip_counts = self._summary_col_sorter(trip_counts)
return trip_counts.reset_index(drop=True)

def summarise_trips(
self,
summ_ops: list = [np.min, np.max, np.mean, np.median],
return_summary: bool = True,
to_days: bool = False,
sort_by_route_type: bool = False,
) -> pd.DataFrame:
"""Summarise the combined GTFS data by trip_id.

Expand All @@ -496,12 +515,16 @@ def summarise_trips(
,by default [np.min, np.max, np.mean, np.median]
return_summary: bool, optional
When set to False, full data for each trip on each date will be
returned. Defaults to True.
returned, by default True.
to_days : bool, optional
Whether or not to aggregate to days, or to just return counts for
trips/routes for each date. When False, summ_ops becomes useless,
and should therefore nothing should be passed when calling this
function (so it remains as the default). Defaults to False.
function (so it remains as the default), by default False.
sort_by_route_type : bool, optional
Whether or not to sort the resulting dataframe by route_type.
This only impacts the resulting df when to_days=True,
by default False.

Returns
-------
Expand All @@ -514,6 +537,7 @@ def summarise_trips(
summ_ops=summ_ops,
return_summary=return_summary,
to_days=to_days,
sort_by_route_type=sort_by_route_type,
)
return self.daily_trip_summary.copy()

Expand All @@ -522,6 +546,7 @@ def summarise_routes(
summ_ops: list = [np.min, np.max, np.mean, np.median],
return_summary: bool = True,
to_days: bool = False,
sort_by_route_type: bool = False,
) -> pd.DataFrame:
"""Summarise the combined GTFS data by route_id.

Expand All @@ -534,12 +559,16 @@ def summarise_routes(
by default [np.min, np.max, np.mean, np.median].
return_summary: bool, optional
When set to False, full data for each trip on each date will be
returned. Defaults to True.
returned, by default True.
to_days : bool, optional
Whether or not to aggregate to days, or to just return counts for
trips/routes for each date. When False, summ_ops becomes useless,
and should therefore nothing should be passed when calling this
function (so it remains as the default). Defaults to False.
function (so it remains as the default), by default False.
sort_by_route_type : bool, optional
Whether or not to sort the resulting dataframe by route_type.
This only impacts the resulting df when to_days=True,
by default False.

Returns
-------
Expand All @@ -552,6 +581,7 @@ def summarise_routes(
summ_ops=summ_ops,
return_summary=return_summary,
to_days=to_days,
sort_by_route_type=sort_by_route_type,
)
return self.daily_route_summary.copy()

Expand All @@ -571,7 +601,7 @@ def viz_stops(
Whether or not to return the folium map object, by default True.
filtered_only : bool, optional
Whether to filter the stops that are plotted to only stop_id's that
are present in the stop_times table.
are present in the stop_times table, by default True.

Returns
-------
Expand Down
36 changes: 31 additions & 5 deletions tests/gtfs/test_multi_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,24 +453,42 @@ def test_filter_to_bbox(self, multi_gtfs_fixture):
), "Gtfs inst[1] not as expected after filter"

@pytest.mark.parametrize(
"which, summ_ops, raises, match",
"which, summ_ops, sort_by_route_type, raises, match",
(
["route", True, TypeError, ".*summ_ops.*list.*bool"],
[True, [np.max], TypeError, ".*which.*str.*bool"],
["route", True, True, TypeError, ".*summ_ops.*list.*bool"],
[True, [np.max], True, TypeError, ".*which.*str.*bool"],
[
"not_which",
[np.max],
True,
ValueError,
".*which.*trips.*routes.*not_which.*",
],
[
"trips",
[np.max],
"not_sort",
TypeError,
".*sort_by_route_type.*bool.*str",
],
),
)
def test__summarise_core_defence(
self, multi_gtfs_fixture, which, summ_ops, raises, match
self,
multi_gtfs_fixture,
which,
summ_ops,
sort_by_route_type,
raises,
match,
):
"""Defensive tests for _summarise_core()."""
with pytest.raises(raises, match=match):
multi_gtfs_fixture._summarise_core(which=which, summ_ops=summ_ops)
multi_gtfs_fixture._summarise_core(
which=which,
summ_ops=summ_ops,
sort_by_route_type=sort_by_route_type,
)

def test__summarise_core(self, multi_gtfs_fixture):
"""General tests for _summarise_core()."""
Expand Down Expand Up @@ -532,6 +550,14 @@ def test__summarise_core(self, multi_gtfs_fixture):
assert (
dated_sum[dated_sum.date == "2024-04-06"].route_count.iloc[0] == 9
), "Unexpecteed number of routes on 2024-04-06"
# test sorting to route_type
route_sort = multi_gtfs_fixture._summarise_core(
which="routes", to_days=True, sort_by_route_type=True
)
first_three_types = route_sort.route_type[:3]
assert np.array_equal(
first_three_types, [3, 3, 3]
), "Summary not sorted by route_type"

def test_summarise_trips(self, multi_gtfs_fixture):
"""General tests for summarise_trips()."""
Expand Down
Loading