From 7cb934dac57371ad2c6d1e718c9c9d875a133f1c Mon Sep 17 00:00:00 2001 From: Browning Date: Fri, 9 Feb 2024 09:30:28 +0000 Subject: [PATCH 1/6] feat: sorting params for mgtfs summaries --- .../gtfs/multi_validation.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/src/transport_performance/gtfs/multi_validation.py b/src/transport_performance/gtfs/multi_validation.py index e48582fa..7b471e67 100644 --- a/src/transport_performance/gtfs/multi_validation.py +++ b/src/transport_performance/gtfs/multi_validation.py @@ -379,12 +379,21 @@ def filter_to_bbox( self._raise_empty_feed_error(bbox) return None + def _summary_col_sorter(self, df: pd.DataFrame) -> pd.DataFrame: + """Sort columns for summary dfs.""" + presets = ["day", "route_type"] + for col in sorted(df.columns.values): + if col not in presets: + presets.append(col) + return df.reindex(presets, axis=1).copy() + def _summarise_core( self, which: str = "trips", summ_ops: list = [np.min, np.max, np.mean, np.median], return_summary: bool = True, to_days: bool = False, + sort_by: str = "days", ) -> pd.DataFrame: """Summarise the MultiGtfsInstance by either trip_id or route_id. @@ -405,6 +414,12 @@ def _summarise_core( trips/routes for each date. When False, summ_ops becomes useless, and should therefore nothing should be passed when calling this function (so it remains as the default). Defaults to False. + sort_by : str, optional + What to sort the dataframe by. Options include: ["days", + "route_type"]. + -days will sort the dataframe by day. E.g., Monday, Tuesday, Wed... + -route_type will sort the dataframe by route_type. E.g, 3, 200..., + by default "days" Returns ------- @@ -422,11 +437,18 @@ def _summarise_core( _type_defence(which, "which", str) _type_defence(return_summary, "return_summary", bool) _type_defence(to_days, "to_days", bool) + _type_defence(sort_by, "sort_by", str) which = which.lower().strip() if which not in ["trips", "routes"]: raise ValueError( f"'which' must be one of ['trips', 'routes']. Got {which}" ) + sort_by = sort_by.lower().strip() + if sort_by not in ["days", "route_type"]: + raise ValueError( + "'sort_by' must be on of ['days', 'route_type']. " + f"Got {sort_by}" + ) # choose summary if which == "trips": @@ -476,7 +498,15 @@ def _summarise_core( column.replace("amin", "min").replace("amax", "max") for column in trip_counts.columns.values ] + # sorting + # NOTE: More convoluted sorts don't work well with the custom + # _order_dataframe_by_day func. + # Always sort by day so that route_type sorts are more organised trip_counts = self.instances[0]._order_dataframe_by_day(trip_counts) + if sort_by == "route_type": + trip_counts = trip_counts.sort_values("route_type") + if to_days: + trip_counts = self._summary_col_sorter(trip_counts) return trip_counts def summarise_trips( @@ -484,6 +514,7 @@ def summarise_trips( summ_ops: list = [np.min, np.max, np.mean, np.median], return_summary: bool = True, to_days: bool = False, + sort_by: Union[str, list] = "days", ) -> pd.DataFrame: """Summarise the combined GTFS data by trip_id. @@ -502,6 +533,12 @@ def summarise_trips( trips/routes for each date. When False, summ_ops becomes useless, and should therefore nothing should be passed when calling this function (so it remains as the default). Defaults to False. + sort_by : Union[str, list], optional + What to sort the dataframe by. Options include: ["days", + "route_type"]. + -days will sort the dataframe by day. E.g., Monday, Tuesday, Wed... + -route_type will sort the dataframe by route_type. E.g, 3, 200..., + by default "days" Returns ------- @@ -514,6 +551,7 @@ def summarise_trips( summ_ops=summ_ops, return_summary=return_summary, to_days=to_days, + sort_by=sort_by, ) return self.daily_trip_summary.copy() @@ -522,6 +560,7 @@ def summarise_routes( summ_ops: list = [np.min, np.max, np.mean, np.median], return_summary: bool = True, to_days: bool = False, + sort_by: str = "days", ) -> pd.DataFrame: """Summarise the combined GTFS data by route_id. @@ -540,6 +579,12 @@ def summarise_routes( trips/routes for each date. When False, summ_ops becomes useless, and should therefore nothing should be passed when calling this function (so it remains as the default). Defaults to False. + sort_by : str, optional + What to sort the dataframe by. Options include: ["days", + "route_type"]. + -days will sort the dataframe by day. E.g., Monday, Tuesday, Wed... + -route_type will sort the dataframe by route_type. E.g, 3, 200..., + by default "days" Returns ------- @@ -552,6 +597,7 @@ def summarise_routes( summ_ops=summ_ops, return_summary=return_summary, to_days=to_days, + sort_by=sort_by, ) return self.daily_route_summary.copy() From e53e09c65a48142b2e1cc69191f8c378c8597787 Mon Sep 17 00:00:00 2001 From: Browning Date: Fri, 9 Feb 2024 09:49:17 +0000 Subject: [PATCH 2/6] feat: additional tests for new sorting features --- .../gtfs/multi_validation.py | 2 +- tests/gtfs/test_multi_validation.py | 28 +++++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/transport_performance/gtfs/multi_validation.py b/src/transport_performance/gtfs/multi_validation.py index 7b471e67..5d07790e 100644 --- a/src/transport_performance/gtfs/multi_validation.py +++ b/src/transport_performance/gtfs/multi_validation.py @@ -446,7 +446,7 @@ def _summarise_core( sort_by = sort_by.lower().strip() if sort_by not in ["days", "route_type"]: raise ValueError( - "'sort_by' must be on of ['days', 'route_type']. " + "'sort_by' must be one of ['days', 'route_type']. " f"Got {sort_by}" ) diff --git a/tests/gtfs/test_multi_validation.py b/tests/gtfs/test_multi_validation.py index ba3b8939..95dcd032 100644 --- a/tests/gtfs/test_multi_validation.py +++ b/tests/gtfs/test_multi_validation.py @@ -453,24 +453,34 @@ def test_filter_to_bbox(self, multi_gtfs_fixture): ), "Gtfs inst[1] not as expected after filter" @pytest.mark.parametrize( - "which, summ_ops, raises, match", + "which, summ_ops, sort_by, raises, match", ( - ["route", True, TypeError, ".*summ_ops.*list.*bool"], - [True, [np.max], TypeError, ".*which.*str.*bool"], + ["route", True, "days", TypeError, ".*summ_ops.*list.*bool"], + [True, [np.max], "days", TypeError, ".*which.*str.*bool"], [ "not_which", [np.max], + "days", ValueError, ".*which.*trips.*routes.*not_which.*", ], + [ + "trips", + [np.max], + "not_sort", + ValueError, + ".*sort_by.*days.*route_type.*not_sort.*", + ], ), ) def test__summarise_core_defence( - self, multi_gtfs_fixture, which, summ_ops, raises, match + self, multi_gtfs_fixture, which, summ_ops, sort_by, raises, match ): """Defensive tests for _summarise_core().""" with pytest.raises(raises, match=match): - multi_gtfs_fixture._summarise_core(which=which, summ_ops=summ_ops) + multi_gtfs_fixture._summarise_core( + which=which, summ_ops=summ_ops, sort_by=sort_by + ) def test__summarise_core(self, multi_gtfs_fixture): """General tests for _summarise_core().""" @@ -532,6 +542,14 @@ def test__summarise_core(self, multi_gtfs_fixture): assert ( dated_sum[dated_sum.date == "2024-04-06"].route_count.iloc[0] == 9 ), "Unexpecteed number of routes on 2024-04-06" + # test sorting to route_type + route_sort = multi_gtfs_fixture._summarise_core( + which="routes", to_days=True, sort_by="route_type" + ) + first_three_types = route_sort.route_type[:3] + assert np.array_equal( + first_three_types, [3, 3, 3] + ), "Summary not sorted by route_type" def test_summarise_trips(self, multi_gtfs_fixture): """General tests for summarise_trips().""" From 45af51bd4082802ae37133dd429069727c4b77f4 Mon Sep 17 00:00:00 2001 From: Browning Date: Mon, 19 Feb 2024 05:43:19 +0000 Subject: [PATCH 3/6] fix: restructured param to remove unnecessary sort options --- .../gtfs/multi_validation.py | 65 +++++++------------ tests/gtfs/test_multi_validation.py | 26 +++++--- 2 files changed, 40 insertions(+), 51 deletions(-) diff --git a/src/transport_performance/gtfs/multi_validation.py b/src/transport_performance/gtfs/multi_validation.py index 5d07790e..0a4e6a9a 100644 --- a/src/transport_performance/gtfs/multi_validation.py +++ b/src/transport_performance/gtfs/multi_validation.py @@ -393,7 +393,7 @@ def _summarise_core( summ_ops: list = [np.min, np.max, np.mean, np.median], return_summary: bool = True, to_days: bool = False, - sort_by: str = "days", + sort_by_route_type: bool = False, ) -> pd.DataFrame: """Summarise the MultiGtfsInstance by either trip_id or route_id. @@ -408,18 +408,15 @@ def _summarise_core( by default [np.min, np.max, np.mean, np.median] return_summary : bool, optional When set to False, full data for each trip on each date will be - returned. Defaults to True. + returned, by default True. to_days : bool, optional Whether or not to aggregate to days, or to just return counts for trips/routes for each date. When False, summ_ops becomes useless, and should therefore nothing should be passed when calling this - function (so it remains as the default). Defaults to False. - sort_by : str, optional - What to sort the dataframe by. Options include: ["days", - "route_type"]. - -days will sort the dataframe by day. E.g., Monday, Tuesday, Wed... - -route_type will sort the dataframe by route_type. E.g, 3, 200..., - by default "days" + function (so it remains as the default), by default False. + sort_by_route_type : bool, optional + Whether or not to sort the resulting dataframe by route_type, + by default False. Returns ------- @@ -437,19 +434,12 @@ def _summarise_core( _type_defence(which, "which", str) _type_defence(return_summary, "return_summary", bool) _type_defence(to_days, "to_days", bool) - _type_defence(sort_by, "sort_by", str) + _type_defence(sort_by_route_type, "sort_by_route_type", bool) which = which.lower().strip() if which not in ["trips", "routes"]: raise ValueError( f"'which' must be one of ['trips', 'routes']. Got {which}" ) - sort_by = sort_by.lower().strip() - if sort_by not in ["days", "route_type"]: - raise ValueError( - "'sort_by' must be one of ['days', 'route_type']. " - f"Got {sort_by}" - ) - # choose summary if which == "trips": group_col = "trip_id" @@ -498,12 +488,9 @@ def _summarise_core( column.replace("amin", "min").replace("amax", "max") for column in trip_counts.columns.values ] - # sorting - # NOTE: More convoluted sorts don't work well with the custom - # _order_dataframe_by_day func. # Always sort by day so that route_type sorts are more organised trip_counts = self.instances[0]._order_dataframe_by_day(trip_counts) - if sort_by == "route_type": + if sort_by_route_type: trip_counts = trip_counts.sort_values("route_type") if to_days: trip_counts = self._summary_col_sorter(trip_counts) @@ -514,7 +501,7 @@ def summarise_trips( summ_ops: list = [np.min, np.max, np.mean, np.median], return_summary: bool = True, to_days: bool = False, - sort_by: Union[str, list] = "days", + sort_by_route_type: bool = False, ) -> pd.DataFrame: """Summarise the combined GTFS data by trip_id. @@ -527,18 +514,15 @@ def summarise_trips( ,by default [np.min, np.max, np.mean, np.median] return_summary: bool, optional When set to False, full data for each trip on each date will be - returned. Defaults to True. + returned, by default True. to_days : bool, optional Whether or not to aggregate to days, or to just return counts for trips/routes for each date. When False, summ_ops becomes useless, and should therefore nothing should be passed when calling this - function (so it remains as the default). Defaults to False. - sort_by : Union[str, list], optional - What to sort the dataframe by. Options include: ["days", - "route_type"]. - -days will sort the dataframe by day. E.g., Monday, Tuesday, Wed... - -route_type will sort the dataframe by route_type. E.g, 3, 200..., - by default "days" + function (so it remains as the default), by default False. + sort_by_route_type : bool, optional + Whether or not to sort the resulting dataframe by route_type, + by default False. Returns ------- @@ -551,7 +535,7 @@ def summarise_trips( summ_ops=summ_ops, return_summary=return_summary, to_days=to_days, - sort_by=sort_by, + sort_by_route_type=sort_by_route_type, ) return self.daily_trip_summary.copy() @@ -560,7 +544,7 @@ def summarise_routes( summ_ops: list = [np.min, np.max, np.mean, np.median], return_summary: bool = True, to_days: bool = False, - sort_by: str = "days", + sort_by_route_type: bool = False, ) -> pd.DataFrame: """Summarise the combined GTFS data by route_id. @@ -573,18 +557,15 @@ def summarise_routes( by default [np.min, np.max, np.mean, np.median]. return_summary: bool, optional When set to False, full data for each trip on each date will be - returned. Defaults to True. + returned, by default True. to_days : bool, optional Whether or not to aggregate to days, or to just return counts for trips/routes for each date. When False, summ_ops becomes useless, and should therefore nothing should be passed when calling this - function (so it remains as the default). Defaults to False. - sort_by : str, optional - What to sort the dataframe by. Options include: ["days", - "route_type"]. - -days will sort the dataframe by day. E.g., Monday, Tuesday, Wed... - -route_type will sort the dataframe by route_type. E.g, 3, 200..., - by default "days" + function (so it remains as the default), by default False. + sort_by_route_type : bool, optional + Whether or not to sort the resulting dataframe by route_type, + by default False. Returns ------- @@ -597,7 +578,7 @@ def summarise_routes( summ_ops=summ_ops, return_summary=return_summary, to_days=to_days, - sort_by=sort_by, + sort_by_route_type=sort_by_route_type, ) return self.daily_route_summary.copy() @@ -617,7 +598,7 @@ def viz_stops( Whether or not to return the folium map object, by default True. filtered_only : bool, optional Whether to filter the stops that are plotted to only stop_id's that - are present in the stop_times table. + are present in the stop_times table, by default True. Returns ------- diff --git a/tests/gtfs/test_multi_validation.py b/tests/gtfs/test_multi_validation.py index 95dcd032..35ed7334 100644 --- a/tests/gtfs/test_multi_validation.py +++ b/tests/gtfs/test_multi_validation.py @@ -453,14 +453,14 @@ def test_filter_to_bbox(self, multi_gtfs_fixture): ), "Gtfs inst[1] not as expected after filter" @pytest.mark.parametrize( - "which, summ_ops, sort_by, raises, match", + "which, summ_ops, sort_by_route_type, raises, match", ( - ["route", True, "days", TypeError, ".*summ_ops.*list.*bool"], - [True, [np.max], "days", TypeError, ".*which.*str.*bool"], + ["route", True, True, TypeError, ".*summ_ops.*list.*bool"], + [True, [np.max], True, TypeError, ".*which.*str.*bool"], [ "not_which", [np.max], - "days", + True, ValueError, ".*which.*trips.*routes.*not_which.*", ], @@ -468,18 +468,26 @@ def test_filter_to_bbox(self, multi_gtfs_fixture): "trips", [np.max], "not_sort", - ValueError, - ".*sort_by.*days.*route_type.*not_sort.*", + TypeError, + ".*sort_by_route_type.*bool.*str", ], ), ) def test__summarise_core_defence( - self, multi_gtfs_fixture, which, summ_ops, sort_by, raises, match + self, + multi_gtfs_fixture, + which, + summ_ops, + sort_by_route_type, + raises, + match, ): """Defensive tests for _summarise_core().""" with pytest.raises(raises, match=match): multi_gtfs_fixture._summarise_core( - which=which, summ_ops=summ_ops, sort_by=sort_by + which=which, + summ_ops=summ_ops, + sort_by_route_type=sort_by_route_type, ) def test__summarise_core(self, multi_gtfs_fixture): @@ -544,7 +552,7 @@ def test__summarise_core(self, multi_gtfs_fixture): ), "Unexpecteed number of routes on 2024-04-06" # test sorting to route_type route_sort = multi_gtfs_fixture._summarise_core( - which="routes", to_days=True, sort_by="route_type" + which="routes", to_days=True, sort_by_route_type=True ) first_three_types = route_sort.route_type[:3] assert np.array_equal( From e3c7c1fff1195290edd0d7cf178090a0ae69a8ed Mon Sep 17 00:00:00 2001 From: Browning Date: Mon, 19 Feb 2024 05:51:37 +0000 Subject: [PATCH 4/6] fix: add reset index after sorting by route type --- src/transport_performance/gtfs/multi_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transport_performance/gtfs/multi_validation.py b/src/transport_performance/gtfs/multi_validation.py index 0a4e6a9a..cf533431 100644 --- a/src/transport_performance/gtfs/multi_validation.py +++ b/src/transport_performance/gtfs/multi_validation.py @@ -491,7 +491,7 @@ def _summarise_core( # Always sort by day so that route_type sorts are more organised trip_counts = self.instances[0]._order_dataframe_by_day(trip_counts) if sort_by_route_type: - trip_counts = trip_counts.sort_values("route_type") + trip_counts = trip_counts.sort_values("route_type").reset_index() if to_days: trip_counts = self._summary_col_sorter(trip_counts) return trip_counts From 146fff85ee8c1aecaf549c1121b041c5ba4d6eda Mon Sep 17 00:00:00 2001 From: Browning Date: Mon, 19 Feb 2024 05:52:38 +0000 Subject: [PATCH 5/6] fix: update docstring to address sorting conditions --- src/transport_performance/gtfs/multi_validation.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transport_performance/gtfs/multi_validation.py b/src/transport_performance/gtfs/multi_validation.py index cf533431..3759c4c4 100644 --- a/src/transport_performance/gtfs/multi_validation.py +++ b/src/transport_performance/gtfs/multi_validation.py @@ -415,7 +415,8 @@ def _summarise_core( and should therefore nothing should be passed when calling this function (so it remains as the default), by default False. sort_by_route_type : bool, optional - Whether or not to sort the resulting dataframe by route_type, + Whether or not to sort the resulting dataframe by route_type. + This only impacts the resulting df when to_days=True, by default False. Returns @@ -521,7 +522,8 @@ def summarise_trips( and should therefore nothing should be passed when calling this function (so it remains as the default), by default False. sort_by_route_type : bool, optional - Whether or not to sort the resulting dataframe by route_type, + Whether or not to sort the resulting dataframe by route_type. + This only impacts the resulting df when to_days=True, by default False. Returns @@ -564,7 +566,8 @@ def summarise_routes( and should therefore nothing should be passed when calling this function (so it remains as the default), by default False. sort_by_route_type : bool, optional - Whether or not to sort the resulting dataframe by route_type, + Whether or not to sort the resulting dataframe by route_type. + This only impacts the resulting df when to_days=True, by default False. Returns From 6c9b359d5b8a2b2d71a849b5168e22b3e2173e64 Mon Sep 17 00:00:00 2001 From: Sergio Recio Date: Mon, 19 Feb 2024 09:22:33 +0000 Subject: [PATCH 6/6] fix: moved reset_index to return statement --- src/transport_performance/gtfs/multi_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transport_performance/gtfs/multi_validation.py b/src/transport_performance/gtfs/multi_validation.py index 3759c4c4..84319b2b 100644 --- a/src/transport_performance/gtfs/multi_validation.py +++ b/src/transport_performance/gtfs/multi_validation.py @@ -492,10 +492,10 @@ def _summarise_core( # Always sort by day so that route_type sorts are more organised trip_counts = self.instances[0]._order_dataframe_by_day(trip_counts) if sort_by_route_type: - trip_counts = trip_counts.sort_values("route_type").reset_index() + trip_counts = trip_counts.sort_values("route_type") if to_days: trip_counts = self._summary_col_sorter(trip_counts) - return trip_counts + return trip_counts.reset_index(drop=True) def summarise_trips( self,