From d76e23cc0a5cb64160e866c8e3f73dc1c05ff062 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Sat, 4 Feb 2023 18:25:21 +0100 Subject: [PATCH 1/8] =?UTF-8?q?add=20--stats-by-route-date=20option=20?= =?UTF-8?q?=F0=9F=93=9D=E2=9C=85=E2=9A=A1=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Supports analyzing a feed by route and/or date and/or day of the week. part of #33 --- benchmark/index.sql | 1 + benchmark/stats_by_route_id_date.sql | 8 +++ cli.js | 10 ++++ docs/analysis/feed-by-route-date.md | 50 +++++++++++++++++++ index.js | 8 +++ lib/index.js | 1 + lib/stats_by_route_date.js | 74 ++++++++++++++++++++++++++++ package.json | 1 + readme.md | 12 +++++ test/amtrak-gtfs-2021-10-06.sh | 14 ++++++ 10 files changed, 179 insertions(+) create mode 100644 benchmark/stats_by_route_id_date.sql create mode 100644 docs/analysis/feed-by-route-date.md create mode 100644 lib/stats_by_route_date.js diff --git a/benchmark/index.sql b/benchmark/index.sql index 589011c..08869d0 100644 --- a/benchmark/index.sql +++ b/benchmark/index.sql @@ -75,6 +75,7 @@ LANGUAGE plpgsql; \i connections_by_stop.sql \i connections_by_non_existent_stop.sql \i connections_by_time.sql +\i stats_by_route_id_and_date.sql SELECT * FROM _benchmark; diff --git a/benchmark/stats_by_route_id_date.sql b/benchmark/stats_by_route_id_date.sql new file mode 100644 index 0000000..6f191d8 --- /dev/null +++ b/benchmark/stats_by_route_id_date.sql @@ -0,0 +1,8 @@ +SELECT * from bench( +'SELECT * +FROM stats_by_route_date +WHERE route_id = ''17452_900'' -- M4 +AND date >= ''2022-08-08'' AND date <= ''2022-08-14'' +AND is_effective = true', +10 +); diff --git a/cli.js b/cli.js index ccd3967..b6a772e 100755 --- a/cli.js +++ b/cli.js @@ -44,6 +44,9 @@ const { 'stops-location-index': { type: 'boolean', }, + 'stats-by-route-date': { + type: 'string', + }, 'schema': { type: 'string', }, @@ -78,6 +81,12 @@ Options: Default if levels.txt has not been provided. 
--stops-location-index Create a spatial index on stops.stop_loc for efficient queries by geolocation. + --stats-by-route-date Whether to generate a stats_by_route_date view + letting you analyze all data per route and/or date: + - none: Don't generate a view. + - view: Fast generation, slow access. + - materialized-view: Slow generation, fast access. + Default: none --schema The schema to use for the database. Default: public --postgraphile Tweak generated SQL for PostGraphile usage. https://www.graphile.org/postgraphile/ @@ -117,6 +126,7 @@ const opt = { tripsWithoutShapeId: !!flags['trips-without-shape-id'], routesWithoutAgencyId: !!flags['routes-without-agency-id'], stopsLocationIndex: !!flags['stops-location-index'], + statsByRouteIdAndDate: flags['stats-by-route-date'] || 'none', schema: flags['schema'] || 'public', postgraphile: !!flags.postgraphile, importMetadata: !!flags['import-metadata'], diff --git a/docs/analysis/feed-by-route-date.md b/docs/analysis/feed-by-route-date.md new file mode 100644 index 0000000..afb08c7 --- /dev/null +++ b/docs/analysis/feed-by-route-date.md @@ -0,0 +1,50 @@ +# analysing a GTFS dataset by route ID and/or date + +Are you trying to answer a question like those below? + +- Are there certain dates or days of the week that have significantly fewer arrivals/departures (hereinafter "stop time events")? – This *may* indicate errors in the data, e.g. a faulty `calendar.txt` or `calendar_dates.txt` file. +- Has the number of stop time events decreased, compared to the last dataset version? +- Do specific routes stop running during certain time periods? + +`gtfs-via-postgres` optionally provides a **(materialized) view `stats_by_route_date` to help with such SQL queries. Use the `--stats-by-route-date` flag to enable it** in the generated SQL: + +- If you run `gtfs-to-sql` with `--stats-by-route-date=view`, `stats_by_route_date` will be a "regular" non-materialized view. 
Use this option if you want to import the GTFS data quickly, and if you only query `stats_by_route_date` rarely or in time-uncritical scenarios. +- If you pass `--stats-by-route-date=materialized-view`, the `stats_by_route_date` view will [be materialized](https://www.postgresql.org/docs/14/rules-materializedviews.html). Use this option if you need fast queries, and if you can tolerate significantly longer import times (3m for the 64 MB 2023-03-05 SNCB/NMBS GTFS feed, 1h15m for the 540 MB 2023-02-27 VBB GTFS feed). + +`stats_by_route_date` has the following columns: + +- `route_id` +- `date` +- `dow` – day of the week, following the [PostgreSQL notation `0` (Sunday) to `6` (Saturday)](https://www.postgresql.org/docs/14/functions-datetime.html#FUNCTIONS-DATETIME-EXTRACT) +- `nr_of_trips` – nr of trips starting on that date +- `nr_of_arrs_deps` – nr of arrivals/departures (stop time events) taking place on that date +- `is_effective` – whether `nr_of_trips` & `nr_of_arrs_deps` are calculated based on the *effective* date (i.e. the date that the stop time event actually happens on) or *schedule* date (i.e. the date which their `stop_time` rows refer to) + +So + +- if you want to take a customer-facing perspective on the data (as in "I don't care which trips are scheduled before midnight, I want to know if they run today"), filter for `is_effective = True` rows; +- if you're interested in the operational/planning perspective (e.g. if you're looking for data errors), filter for `is_effective = False` rows. 
+ +## example: nr of effective stop time events of a single route over a week + +```sql +-- using VBB's 2023-02-27 GTFS data +SELECT * +FROM stats_by_route_date stats +WHERE is_effective = True +AND route_id = '17438_900' -- M1 tram line +AND "date" >= '2023-03-19' -- Sunday, dow = 0 +AND "date" <= '2023-03-25' -- Saturday, dow = 6 +ORDER BY route_id, "date", is_effective DESC +``` + +```csv +route_id,date,dow,nr_of_trips,nr_of_arrs_deps,is_effective +17438_900,2023-03-19,0,258,5870,t +17438_900,2023-03-20,1,345,7831,t +17438_900,2023-03-21,2,345,7831,t +17438_900,2023-03-22,3,345,7831,t +17438_900,2023-03-23,4,345,7831,t +17438_900,2023-03-24,5,345,7831,t +17438_900,2023-03-25,6,326,9001,t +``` diff --git a/index.js b/index.js index 2cbba77..d3b8c16 100644 --- a/index.js +++ b/index.js @@ -19,6 +19,7 @@ const convertGtfsToSql = async function* (files, opt = {}) { routesWithoutAgencyId: false, stopsWithoutLevelId: !files.some(f => f.name === 'levels'), stopsLocationIndex: false, + statsByRouteIdAndDate: 'none', schema: 'public', postgraphile: false, importMetadata: false, @@ -31,6 +32,7 @@ const convertGtfsToSql = async function* (files, opt = {}) { requireDependencies, ignoreUnsupportedFiles, importMetadata, + statsByRouteIdAndDate, } = opt if (ignoreUnsupportedFiles) { @@ -81,6 +83,12 @@ const convertGtfsToSql = async function* (files, opt = {}) { dep: [], }, } : {}), + + ...(statsByRouteIdAndDate !== 'none' ? 
{ + 'stats_by_route_date': { + dep: ['stop_times'], + }, + } : {}), } for (const file of files) { diff --git a/lib/index.js b/lib/index.js index 524ea70..cd49742 100644 --- a/lib/index.js +++ b/lib/index.js @@ -20,4 +20,5 @@ module.exports = { levels: require('./levels'), translations: require('./translations'), import_metadata: require('./import_metadata'), + stats_by_route_date: require('./stats_by_route_date'), } diff --git a/lib/stats_by_route_date.js b/lib/stats_by_route_date.js new file mode 100644 index 0000000..ff816f9 --- /dev/null +++ b/lib/stats_by_route_date.js @@ -0,0 +1,74 @@ +'use strict' + +const afterAll = (opt) => { + let materialized = false + if (opt.statsByRouteIdAndDate === 'materialized-view') { + materialized = true + } else if (opt.statsByRouteIdAndDate !== 'view') { + throw new Error('invalid opt.statsByRouteIdAndDate, must be one of these: none, view, materialized-view.') + } + const createViewCmd = materialized + ? `CREATE MATERIALIZED VIEW` + : `CREATE OR REPLACE VIEW` + + return `\ +${createViewCmd} "${opt.schema}".stats_by_route_date AS +WITH + arrs_deps_with_svc_date AS NOT MATERIALIZED ( + SELECT + route_id, stop_sequence_consec, + "date"::date AS svc_date, + EXTRACT(DOW FROM "date") AS svc_dow + FROM "${opt.schema}".arrivals_departures + ), + by_svc_date AS NOT MATERIALIZED ( + SELECT DISTINCT ON (route_id, svc_date) + route_id, + svc_date AS "date", + svc_dow AS dow, + count(*) FILTER (WHERE stop_sequence_consec = 0) OVER (PARTITION BY route_id, svc_date) AS nr_of_trips, + count(*) OVER (PARTITION BY route_id, svc_date) AS nr_of_arrs_deps + FROM arrs_deps_with_svc_date + ), + arrs_deps_with_effective_date AS NOT MATERIALIZED ( + SELECT + route_id, stop_sequence_consec, + coalesce(t_departure, t_arrival)::date AS effective_date, + EXTRACT(DOW FROM coalesce(t_departure, t_arrival)) AS effective_dow + FROM "${opt.schema}".arrivals_departures + ), + by_effective_date AS NOT MATERIALIZED ( + SELECT DISTINCT ON (route_id, 
effective_date) + route_id, + effective_date AS "date", + effective_dow AS dow, + count(*) FILTER (WHERE stop_sequence_consec = 0) OVER (PARTITION BY route_id, effective_date) AS nr_of_trips, + count(*) OVER (PARTITION BY route_id, effective_date) AS nr_of_arrs_deps + FROM arrs_deps_with_effective_date + ) +SELECT + *, + True AS is_effective +FROM by_effective_date +UNION +SELECT + *, + False AS is_effective +FROM by_svc_date; + +${materialized ? `\ +CREATE INDEX ON "${opt.schema}".stats_by_route_date (route_id); +CREATE INDEX ON "${opt.schema}".stats_by_route_date ("date"); +CREATE INDEX ON "${opt.schema}".stats_by_route_date (route_id, "date", is_effective); +CREATE INDEX ON "${opt.schema}".stats_by_route_date (route_id, dow, is_effective); +` : ''} + +${opt.postgraphile ? `\ +COMMENT ON${materialized ? ' MATERIALIZED' : ''} VIEW "${opt.schema}".stats_by_route_date IS E'@name routeStats\\n@primaryKey route_id,date,is_effective\\n@foreignKey (route_id) references routes|@fieldName route|@foreignFieldName statsByDate'; +` : ''} +` +} + +module.exports = { + afterAll, +} diff --git a/package.json b/package.json index 41fe8f5..5fc2495 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,7 @@ "index.js", "scripts", "lib", + "docs", "example.sh", "LICENSE", "LICENSE-PROSPERITY.md", diff --git a/readme.md b/readme.md index 5515f80..1887d61 100644 --- a/readme.md +++ b/readme.md @@ -93,6 +93,7 @@ In addition to a table for each GTFS file, `gtfs-via-postgres` adds these views - `arrivals_departures` "applies" [`stop_times`](https://gtfs.org/reference/static/#stop_timestxt)/[`frequencies`](https://gtfs.org/reference/static/#frequenciestxt) to [`trips`](https://gtfs.org/reference/static/#tripstxt) and `service_days` to give you all arrivals/departures at each stop with their *absolute* dates & times. It also resolves each stop's parent station ID & name. 
- `connections` "applies" [`stop_times`](https://gtfs.org/reference/static/#stop_timestxt)/[`frequencies`](https://gtfs.org/reference/static/#frequenciestxt) to [`trips`](https://gtfs.org/reference/static/#tripstxt) and `service_days`, just like `arrivals_departures`, but gives you departure (at stop A) & arrival (at stop B) *pairs*. - `shapes_aggregates` aggregates individual shape points in [`shapes`](https://gtfs.org/reference/static/#shapestxt) into a [PostGIS `LineString`](http://postgis.net/workshops/postgis-intro/geometries.html#linestrings). +- `stats_by_route_date` provides the number of arrivals/departures by route ID and date. – [read more](docs/analysis/feed-by-route-date.md) As an example, we're going to use the `arrivals_departures` view to query all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30+01` and `2022-03-23T12:35+01`: @@ -155,6 +156,12 @@ Options: Default if levels.txt has not been provided. --stops-location-index Create a spatial index on stops.stop_loc for efficient queries by geolocation. + --stats-by-route-date Whether to generate a stats_by_route_date view + letting you analyze all data per route and/or date: + - none: Don't generate a view. + - view: Fast generation, slow access. + - materialized-view: Slow generation, fast access. + Default: none --schema The schema to use for the database. Default: public --postgraphile Tweak generated SQL for PostGraphile usage. https://www.graphile.org/postgraphile/ @@ -261,6 +268,10 @@ env NODE_ENV=development npm exec -- serve-gtfs-via-graphql **As an example for the GraphQL API, check out the [test query](test/sample-gtfs-feed-postgraphile-test.graphql)** or open the [GraphiQL UI](https://github.com/graphql/graphiql) served at [`localhost:3000/graphiql`](http://localhost:3000/graphiql). +### more guides + +The [`docs` directory](docs) contains more instructions on how to use `gtfs-via-postgres`. + ## Correctness vs. 
Speed regarding GTFS Time Values @@ -327,6 +338,7 @@ The following benchmarks were run with the [2022-07-01 VBB GTFS dataset](https:/ |
SELECT count(*)
FROM connections
WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
| 84.17 | 83.71 | 83.98 | 84.05 | 84.18 | 84.82 | 85.44 | 88.441 | 100 | |
SELECT count(*)
FROM connections
WHERE from_stop_id = 'definitely-non-existent'
| 15.53 | 15.404 | 15.5 | 15.52 | 15.54 | 15.6 | 15.9 | 15.915 | 100 | |
SELECT *
FROM connections
WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date > '2022-08-08' AND date <= '2022-08-09'
ORDER BY t_departure
LIMIT 100
| 8414.27 | 7885.369 | 7994.99 | 8364.33 | 8735.64 | 9147.52 | 9180.64 | 9188.92 | 7 | +|
SELECT *
FROM stats_by_route_date
WHERE route_id = '17452_900' -- M4
AND date >= '2022-08-08' AND date <= '2022-08-14'
AND is_effective = true
| 2900.64 | 2888.196 | 2891.65 | 2893.07 | 2905.69 | 2927.44 | 2936.58 | 2938.86 | 10 | ## Related Projects diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh index a41897f..f30cd44 100755 --- a/test/amtrak-gtfs-2021-10-06.sh +++ b/test/amtrak-gtfs-2021-10-06.sh @@ -15,6 +15,7 @@ export PGDATABASE='amtrak_2021_10_06' ../cli.js -d --trips-without-shape-id --schema amtrak \ --import-metadata \ + --stats-by-route-date=view \ -- amtrak-gtfs-2021-10-06/*.txt | psql -b query=$(cat << EOF @@ -50,3 +51,16 @@ if [[ "$fMin" != "2021-11-24" ]]; then echo "invalid dates_filter_min(…): $fMin" 1>&2 exit 1 fi + +acelaStatQuery=$(cat << EOF +SELECT nr_of_trips, nr_of_arrs_deps +FROM amtrak.stats_by_route_date +WHERE route_id = '40751' -- Acela +AND date = '2021-11-26' +AND is_effective = True +EOF) +acelaStat=$(psql --csv -t -c "$acelaStatQuery" | tail -n 1) +if [[ "$acelaStat" != "16,190" ]]; then + echo "invalid stats for route 40751 (Acela) on 2021-11-26: $acelaStat" 1>&2 + exit 1 +fi From 7a063ad41e96d0897dadb68c0005d4b2788b3194 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Thu, 23 Mar 2023 12:56:21 +0100 Subject: [PATCH 2/8] =?UTF-8?q?benchmarks:=20use=20date=5Ffilter=5F{min,ma?= =?UTF-8?q?x}()=20=E2=9A=A1=EF=B8=8F=F0=9F=93=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../arrs_deps_by_route_name_and_time.sql | 3 +- benchmark/arrs_deps_by_station_and_time.sql | 3 +- .../arrs_deps_by_station_and_time_seq_0.sql | 3 +- benchmark/arrs_deps_by_stop_and_time.sql | 3 +- benchmark/arrs_deps_by_time.sql | 3 +- benchmark/arrs_deps_by_time_manual.sql | 8 ++++ .../connections_by_route_name_and_time.sql | 3 +- benchmark/connections_by_station_and_time.sql | 3 +- .../connections_by_station_and_time_seq_0.sql | 3 +- benchmark/connections_by_stop_and_time.sql | 3 +- benchmark/connections_by_time.sql | 3 +- benchmark/connections_by_time_manual.sql | 10 +++++ benchmark/index.sql | 2 + readme.md | 38 
++++++++++--------- 14 files changed, 60 insertions(+), 28 deletions(-) create mode 100644 benchmark/arrs_deps_by_time_manual.sql create mode 100644 benchmark/connections_by_time_manual.sql diff --git a/benchmark/arrs_deps_by_route_name_and_time.sql b/benchmark/arrs_deps_by_route_name_and_time.sql index 625a3ff..5a6bace 100644 --- a/benchmark/arrs_deps_by_route_name_and_time.sql +++ b/benchmark/arrs_deps_by_route_name_and_time.sql @@ -3,5 +3,6 @@ SELECT * from bench( FROM arrivals_departures WHERE route_short_name = ''S1'' AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02'' -AND date > ''2022-08-08'' AND date <= ''2022-08-09''' +AND date >= dates_filter_min(''2022-08-09T07:10+02'') +AND date <= dates_filter_max(''2022-08-09T07:30+02'')' ); diff --git a/benchmark/arrs_deps_by_station_and_time.sql b/benchmark/arrs_deps_by_station_and_time.sql index bba6def..0a96cd4 100644 --- a/benchmark/arrs_deps_by_station_and_time.sql +++ b/benchmark/arrs_deps_by_station_and_time.sql @@ -3,6 +3,7 @@ SELECT * from bench( FROM arrivals_departures WHERE station_id = ''de:11000:900100001'' -- S+U Friedrichstr. (Berlin) AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02'' -AND date > ''2022-08-08'' AND date <= ''2022-08-09''', +AND date >= dates_filter_min(''2022-08-09T07:10+02'') +AND date <= dates_filter_max(''2022-08-09T07:30+02'')', 40 ); diff --git a/benchmark/arrs_deps_by_station_and_time_seq_0.sql b/benchmark/arrs_deps_by_station_and_time_seq_0.sql index 5d2a391..c171d81 100644 --- a/benchmark/arrs_deps_by_station_and_time_seq_0.sql +++ b/benchmark/arrs_deps_by_station_and_time_seq_0.sql @@ -3,7 +3,8 @@ SELECT * from bench( FROM arrivals_departures WHERE station_id = ''de:11000:900100001'' -- S+U Friedrichstr. 
(Berlin) AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02'' -AND date > ''2022-08-08'' AND date <= ''2022-08-09'' +AND date >= dates_filter_min(''2022-08-09T07:10+02'') +AND date <= dates_filter_max(''2022-08-09T07:30+02'') AND stop_sequence = 0', 50 ); diff --git a/benchmark/arrs_deps_by_stop_and_time.sql b/benchmark/arrs_deps_by_stop_and_time.sql index 0b8a5f1..a2c147e 100644 --- a/benchmark/arrs_deps_by_stop_and_time.sql +++ b/benchmark/arrs_deps_by_stop_and_time.sql @@ -3,5 +3,6 @@ SELECT * from bench( FROM arrivals_departures WHERE stop_id = ''de:11000:900100001::4'' -- S+U Friedrichstr. (Berlin) AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02'' -AND date > ''2022-08-08'' AND date <= ''2022-08-09''' +AND date >= dates_filter_min(''2022-08-09T07:10+02'') +AND date <= dates_filter_max(''2022-08-09T07:30+02'')' ); diff --git a/benchmark/arrs_deps_by_time.sql b/benchmark/arrs_deps_by_time.sql index 48623a4..ad82cc3 100644 --- a/benchmark/arrs_deps_by_time.sql +++ b/benchmark/arrs_deps_by_time.sql @@ -2,6 +2,7 @@ SELECT * from bench( 'SELECT * FROM arrivals_departures WHERE t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02'' -AND date > ''2022-08-08'' AND date <= ''2022-08-09''', +AND date >= dates_filter_min(''2022-08-09T07:10+02''::timestamp with time zone) +AND date <= dates_filter_max(''2022-08-09T07:30+02''::timestamp with time zone)', 10 ); diff --git a/benchmark/arrs_deps_by_time_manual.sql b/benchmark/arrs_deps_by_time_manual.sql new file mode 100644 index 0000000..9707b28 --- /dev/null +++ b/benchmark/arrs_deps_by_time_manual.sql @@ -0,0 +1,8 @@ +SELECT * from bench( +'SELECT * +FROM arrivals_departures +WHERE t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02'' +AND date >= ''2022-08-08'' +AND date <= ''2022-08-09''', +10 +); diff --git a/benchmark/connections_by_route_name_and_time.sql 
b/benchmark/connections_by_route_name_and_time.sql index a5728ed..e754e96 100644 --- a/benchmark/connections_by_route_name_and_time.sql +++ b/benchmark/connections_by_route_name_and_time.sql @@ -3,5 +3,6 @@ SELECT * from bench( FROM connections WHERE route_short_name = ''S1'' AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02'' -AND date > ''2022-08-08'' AND date <= ''2022-08-09''' +AND date >= dates_filter_min(''2022-08-09T07:10+02'') +AND date <= dates_filter_max(''2022-08-09T07:30+02'')' ); diff --git a/benchmark/connections_by_station_and_time.sql b/benchmark/connections_by_station_and_time.sql index c5fa3c9..d468673 100644 --- a/benchmark/connections_by_station_and_time.sql +++ b/benchmark/connections_by_station_and_time.sql @@ -3,6 +3,7 @@ SELECT * from bench( FROM connections WHERE from_station_id = ''de:11000:900100001'' -- S+U Friedrichstr. (Berlin) AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02'' -AND date > ''2022-08-08'' AND date <= ''2022-08-09''', +AND date >= dates_filter_min(''2022-08-09T07:10+02'') +AND date <= dates_filter_max(''2022-08-09T07:30+02'')', 40 ); diff --git a/benchmark/connections_by_station_and_time_seq_0.sql b/benchmark/connections_by_station_and_time_seq_0.sql index b6749d9..5606e50 100644 --- a/benchmark/connections_by_station_and_time_seq_0.sql +++ b/benchmark/connections_by_station_and_time_seq_0.sql @@ -3,7 +3,8 @@ SELECT * from bench( FROM connections WHERE from_station_id = ''de:11000:900100001'' -- S+U Friedrichstr. 
(Berlin) AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02'' -AND date > ''2022-08-08'' AND date <= ''2022-08-09'' +AND date >= dates_filter_min(''2022-08-09T07:10+02'') +AND date <= dates_filter_max(''2022-08-09T07:30+02'') AND from_stop_sequence = 0', 50 ); diff --git a/benchmark/connections_by_stop_and_time.sql b/benchmark/connections_by_stop_and_time.sql index ad4aa26..41eb953 100644 --- a/benchmark/connections_by_stop_and_time.sql +++ b/benchmark/connections_by_stop_and_time.sql @@ -3,5 +3,6 @@ SELECT * from bench( FROM connections WHERE from_stop_id = ''de:11000:900100001::4'' -- S+U Friedrichstr. (Berlin) AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02'' -AND date > ''2022-08-08'' AND date <= ''2022-08-09''' +AND date >= dates_filter_min(''2022-08-09T07:10+02'') +AND date <= dates_filter_max(''2022-08-09T07:30+02'')' ); diff --git a/benchmark/connections_by_time.sql b/benchmark/connections_by_time.sql index e48e42e..c40aab9 100644 --- a/benchmark/connections_by_time.sql +++ b/benchmark/connections_by_time.sql @@ -2,7 +2,8 @@ SELECT * from bench( 'SELECT * FROM connections WHERE t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02'' -AND date > ''2022-08-08'' AND date <= ''2022-08-09'' +AND date >= dates_filter_min(''2022-08-09T07:10+02''::timestamp with time zone) +AND date <= dates_filter_max(''2022-08-09T07:30+02''::timestamp with time zone) ORDER BY t_departure LIMIT 100', 7 diff --git a/benchmark/connections_by_time_manual.sql b/benchmark/connections_by_time_manual.sql new file mode 100644 index 0000000..4a0bfc0 --- /dev/null +++ b/benchmark/connections_by_time_manual.sql @@ -0,0 +1,10 @@ +SELECT * from bench( +'SELECT * +FROM connections +WHERE t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02'' +AND date >= ''2022-08-08'' +AND date <= ''2022-08-09'' +ORDER BY t_departure +LIMIT 100', +7 +); diff --git a/benchmark/index.sql 
b/benchmark/index.sql index 08869d0..eca2b02 100644 --- a/benchmark/index.sql +++ b/benchmark/index.sql @@ -67,6 +67,7 @@ LANGUAGE plpgsql; \i arrs_deps_by_stop.sql \i arrs_deps_by_non_existent_stop.sql \i arrs_deps_by_time.sql +\i arrs_deps_by_time_manual.sql \i connections_by_route_name_and_time.sql \i connections_by_station_and_time.sql \i connections_by_station_and_time_seq_0.sql @@ -75,6 +76,7 @@ LANGUAGE plpgsql; \i connections_by_stop.sql \i connections_by_non_existent_stop.sql \i connections_by_time.sql +\i connections_by_time_manual.sql \i stats_by_route_id_and_date.sql SELECT * FROM _benchmark; diff --git a/readme.md b/readme.md index 1887d61..b24e776 100644 --- a/readme.md +++ b/readme.md @@ -321,24 +321,26 @@ The following benchmarks were run with the [2022-07-01 VBB GTFS dataset](https:/ | query | avg | min | p25 | p50 | p75 | p95 | p99 | max | iterations | | - | - | - | - | - | - | - | - | - | - | -|
SELECT *
FROM stops
ORDER BY ST_Distance(stop_loc::geometry, ST_SetSRID(ST_MakePoint(9.7, 50.547), 4326)) ASC
LIMIT 100
| 15.51 | 15.154 | 15.52 | 15.54 | 15.57 | 15.6 | 15.62 | 15.639 | 100 | -|
SELECT *
FROM arrivals_departures
WHERE route_short_name = 'S1'
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 25.61 | 25.108 | 25.43 | 25.56 | 25.83 | 26.05 | 26.2 | 26.286 | 100 | -|
SELECT *
FROM arrivals_departures
WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 226.79 | 225.335 | 226.16 | 226.45 | 226.74 | 230.48 | 231.47 | 231.561 | 40 | -|
SELECT *
FROM arrivals_departures
WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date > '2022-08-08' AND date <= '2022-08-09'
AND stop_sequence = 0
| 184.96 | 182.642 | 183.52 | 183.85 | 184.11 | 189.78 | 203.34 | 208.909 | 50 | -|
SELECT *
FROM arrivals_departures
WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 26.7 | 26.398 | 26.66 | 26.7 | 26.76 | 26.82 | 26.9 | 26.966 | 100 | -|
SELECT *
FROM arrivals_departures
WHERE trip_id = '168977951'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 1.77 | 1.753 | 1.77 | 1.77 | 1.78 | 1.81 | 1.81 | 1.811 | 100 | -|
SELECT count(*)
FROM arrivals_departures
WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
| 85.37 | 57.008 | 85.34 | 86.37 | 87.49 | 90.02 | 92.91 | 95.309 | 100 | -|
SELECT count(*)
FROM arrivals_departures
WHERE stop_id = 'definitely-non-existent'
| 1.82 | 1.804 | 1.81 | 1.82 | 1.82 | 1.83 | 1.84 | 1.856 | 100 | -|
SELECT *
FROM arrivals_departures
WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 1140.03 | 1138.579 | 1139.2 | 1140.01 | 1140.34 | 1141.85 | 1142.64 | 1142.831 | 10 | -|
SELECT *
FROM connections
WHERE route_short_name = 'S1'
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 75.46 | 69.94 | 72.02 | 78.32 | 78.5 | 78.66 | 78.82 | 78.874 | 100 | -|
SELECT *
FROM connections
WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 242 | 234.683 | 235.54 | 244.37 | 244.67 | 245.02 | 247.83 | 249.496 | 40 | -|
SELECT *
FROM connections
WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date > '2022-08-08' AND date <= '2022-08-09'
AND from_stop_sequence = 0
| 184.49 | 183.285 | 183.85 | 183.99 | 184.25 | 188.98 | 190.42 | 191.213 | 50 | -|
SELECT *
FROM connections
WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 10.66 | 10.57 | 10.62 | 10.65 | 10.68 | 10.76 | 10.96 | 10.976 | 100 | -|
SELECT *
FROM connections
WHERE trip_id = '168977951'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 2.78 | 2.758 | 2.77 | 2.78 | 2.79 | 2.81 | 2.83 | 2.829 | 100 | -|
SELECT count(*)
FROM connections
WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
| 84.17 | 83.71 | 83.98 | 84.05 | 84.18 | 84.82 | 85.44 | 88.441 | 100 | -|
SELECT count(*)
FROM connections
WHERE from_stop_id = 'definitely-non-existent'
| 15.53 | 15.404 | 15.5 | 15.52 | 15.54 | 15.6 | 15.9 | 15.915 | 100 | -|
SELECT *
FROM connections
WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date > '2022-08-08' AND date <= '2022-08-09'
ORDER BY t_departure
LIMIT 100
| 8414.27 | 7885.369 | 7994.99 | 8364.33 | 8735.64 | 9147.52 | 9180.64 | 9188.92 | 7 | -|
SELECT *
FROM stats_by_route_date
WHERE route_id = '17452_900' -- M4
AND date >= '2022-08-08' AND date <= '2022-08-14'
AND is_effective = true
| 2900.64 | 2888.196 | 2891.65 | 2893.07 | 2905.69 | 2927.44 | 2936.58 | 2938.86 | 10 | +|
SELECT *
FROM stops
ORDER BY ST_Distance(stop_loc::geometry, ST_SetSRID(ST_MakePoint(9.7, 50.547), 4326)) ASC
LIMIT 100
| 15.03 | 14.972 | 15.01 | 15.03 | 15.05 | 15.09 | 15.12 | 15.162 | 100 | +|
SELECT *
FROM arrivals_departures
WHERE route_short_name = 'S1'
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
| 73.32 | 72.183 | 72.7 | 73.04 | 74.01 | 74.41 | 75.21 | 76.991 | 100 | +|
SELECT *
FROM arrivals_departures
WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
| 25.44 | 25.326 | 25.38 | 25.44 | 25.5 | 25.58 | 25.64 | 25.669 | 40 | +|
SELECT *
FROM arrivals_departures
WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
AND stop_sequence = 0
| 4.81 | 4.779 | 4.79 | 4.8 | 4.82 | 4.85 | 4.88 | 4.881 | 50 | +|
SELECT *
FROM arrivals_departures
WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
| 6.9 | 6.844 | 6.87 | 6.89 | 6.91 | 6.95 | 7.06 | 7.38 | 100 | +|
SELECT *
FROM arrivals_departures
WHERE trip_id = '168977951'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 1.95 | 1.93 | 1.94 | 1.95 | 1.96 | 1.98 | 2.01 | 2.017 | 100 | +|
SELECT count(*)
FROM arrivals_departures
WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
| 57.12 | 56.909 | 57.04 | 57.07 | 57.15 | 57.52 | 57.63 | 58.027 | 100 | +|
SELECT count(*)
FROM arrivals_departures
WHERE stop_id = 'definitely-non-existent'
| 1.88 | 1.857 | 1.87 | 1.87 | 1.88 | 1.91 | 1.94 | 1.957 | 100 | +|
SELECT *
FROM arrivals_departures
WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02'::timestamp with time zone)
AND date <= dates_filter_max('2022-08-09T07:30+02'::timestamp with time zone)
| 9029.56 | 8989.411 | 9008.11 | 9023.53 | 9045.53 | 9073.78 | 9081.32 | 9083.21 | 10 | +|
SELECT *
FROM arrivals_departures
WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= '2022-08-08'
AND date <= '2022-08-09'
| 3228.52 | 3166.56 | 3178.56 | 3207.68 | 3255.47 | 3341.53 | 3360.18 | 3364.837 | 10 | +|
SELECT *
FROM connections
WHERE route_short_name = 'S1'
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
| 175.24 | 166.412 | 175.91 | 176.73 | 177.19 | 177.94 | 178.24 | 180.551 | 100 | +|
SELECT *
FROM connections
WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
| 48.97 | 48.668 | 48.87 | 48.99 | 49.08 | 49.19 | 49.25 | 49.27 | 40 | +|
SELECT *
FROM connections
WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
AND from_stop_sequence = 0
| 10.2 | 10.129 | 10.18 | 10.2 | 10.22 | 10.24 | 10.26 | 10.26 | 50 | +|
SELECT *
FROM connections
WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
| 13.06 | 12.983 | 13.04 | 13.06 | 13.08 | 13.11 | 13.14 | 13.162 | 100 | +|
SELECT *
FROM connections
WHERE trip_id = '168977951'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 2.78 | 2.758 | 2.77 | 2.78 | 2.79 | 2.8 | 2.82 | 2.831 | 100 | +|
SELECT count(*)
FROM connections
WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
| 84.33 | 83.86 | 84.15 | 84.3 | 84.45 | 84.74 | 84.95 | 85.255 | 100 | +|
SELECT count(*)
FROM connections
WHERE from_stop_id = 'definitely-non-existent'
| 15.56 | 15.482 | 15.52 | 15.54 | 15.57 | 15.63 | 15.94 | 16.033 | 100 | +|
SELECT *
FROM connections
WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02'::timestamp with time zone)
AND date <= dates_filter_max('2022-08-09T07:30+02'::timestamp with time zone)
ORDER BY t_departure
LIMIT 100
| 21717.58 | 21221.577 | 21550.43 | 21695.35 | 21903.85 | 22121.88 | 22182.42 | 22197.556 | 7 | +|
SELECT *
FROM connections
WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= '2022-08-08'
AND date <= '2022-08-09'
ORDER BY t_departure
LIMIT 100
| 9566.47 | 9276.008 | 9372.12 | 9513.29 | 9790.32 | 9850.55 | 9850.99 | 9851.094 | 7 | +|
SELECT *
FROM stats_by_route_date
WHERE route_id = '17452_900' -- M4
AND date >= '2022-08-08' AND date <= '2022-08-14'
AND is_effective = true
| 2896.36 | 2891.601 | 2893.68 | 2895.13 | 2897.72 | 2904.03 | 2904.62 | 2904.77 | 10 | ## Related Projects From d747d9a7cad3fb897972a1cfe9f2136ba4d86f1a Mon Sep 17 00:00:00 2001 From: Jannis R Date: Mon, 27 Mar 2023 00:22:32 +0200 Subject: [PATCH 3/8] =?UTF-8?q?add=20--stats-by-agency-route-stop-hour=20o?= =?UTF-8?q?ption=20=F0=9F=93=9D=E2=9C=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cli.js | 8 ++++ .../feed-by-agency-route-stop-and-hour.md | 5 +++ index.js | 7 ++++ lib/index.js | 1 + lib/stats_by_agency_route_stop_hour.js | 38 +++++++++++++++++++ lib/stop_times.js | 2 + readme.md | 5 +++ test/amtrak-gtfs-2021-10-06.sh | 14 +++++++ 8 files changed, 80 insertions(+) create mode 100644 docs/analysis/feed-by-agency-route-stop-and-hour.md create mode 100644 lib/stats_by_agency_route_stop_hour.js diff --git a/cli.js b/cli.js index b6a772e..dded271 100755 --- a/cli.js +++ b/cli.js @@ -47,6 +47,9 @@ const { 'stats-by-route-date': { type: 'string', }, + 'stats-by-agency-route-stop-hour': { + type: 'string', + }, 'schema': { type: 'string', }, @@ -87,6 +90,10 @@ Options: - view: Fast generation, slow access. - materialized-view: Slow generation, fast access. Default: none + --stats-by-agency-route-stop-hour + Generate a view letting you analyze arrivals/ + departures per route, stop and hour. + The flag works like --stats-by-route-date. --schema The schema to use for the database. Default: public --postgraphile Tweak generated SQL for PostGraphile usage. 
https://www.graphile.org/postgraphile/ @@ -127,6 +134,7 @@ const opt = { routesWithoutAgencyId: !!flags['routes-without-agency-id'], stopsLocationIndex: !!flags['stops-location-index'], statsByRouteIdAndDate: flags['stats-by-route-date'] || 'none', + statsByAgencyIdAndRouteIdAndStopAndHour: flags['stats-by-agency-route-stop-hour'] || 'none', schema: flags['schema'] || 'public', postgraphile: !!flags.postgraphile, importMetadata: !!flags['import-metadata'], diff --git a/docs/analysis/feed-by-agency-route-stop-and-hour.md b/docs/analysis/feed-by-agency-route-stop-and-hour.md new file mode 100644 index 0000000..521482d --- /dev/null +++ b/docs/analysis/feed-by-agency-route-stop-and-hour.md @@ -0,0 +1,5 @@ +# analysing a GTFS dataset by route ID, stop ID and/or hour + +With the `--stats-by-agency-route-stop-hour` option, `gtfs-via-postgres` provides a view `stats_by_agency_route_stop_hour`. Just like [`stats_by_route_date`](feed-by-route-date.md), it aggregates all arrivals by `agency_id`, `route_id`, `stop_id` and `effective_hour`. + +Note: As a materialized view, `stats_by_agency_route_stop_hour` takes up a significant amount of space, e.g. 13GB with the 2023-05-02 VBB GTFS feed. 
diff --git a/index.js b/index.js index d3b8c16..438895f 100644 --- a/index.js +++ b/index.js @@ -20,6 +20,7 @@ const convertGtfsToSql = async function* (files, opt = {}) { stopsWithoutLevelId: !files.some(f => f.name === 'levels'), stopsLocationIndex: false, statsByRouteIdAndDate: 'none', + statsByAgencyIdAndRouteIdAndStopAndHour: 'none', schema: 'public', postgraphile: false, importMetadata: false, @@ -33,6 +34,7 @@ const convertGtfsToSql = async function* (files, opt = {}) { ignoreUnsupportedFiles, importMetadata, statsByRouteIdAndDate, + statsByAgencyIdAndRouteIdAndStopAndHour, } = opt if (ignoreUnsupportedFiles) { @@ -89,6 +91,11 @@ const convertGtfsToSql = async function* (files, opt = {}) { dep: ['stop_times'], }, } : {}), + ...(statsByAgencyIdAndRouteIdAndStopAndHour !== 'none' ? { + 'stats_by_agency_route_stop_hour': { + dep: ['stop_times'], + }, + } : {}), } for (const file of files) { diff --git a/lib/index.js b/lib/index.js index cd49742..79c43e9 100644 --- a/lib/index.js +++ b/lib/index.js @@ -21,4 +21,5 @@ module.exports = { translations: require('./translations'), import_metadata: require('./import_metadata'), stats_by_route_date: require('./stats_by_route_date'), + stats_by_agency_route_stop_hour: require('./stats_by_agency_route_stop_hour'), } diff --git a/lib/stats_by_agency_route_stop_hour.js b/lib/stats_by_agency_route_stop_hour.js new file mode 100644 index 0000000..c753d31 --- /dev/null +++ b/lib/stats_by_agency_route_stop_hour.js @@ -0,0 +1,38 @@ +'use strict' + +const afterAll = (opt) => { + let materialized = false + if (opt.statsByAgencyIdAndRouteIdAndStopAndHour === 'materialized-view') { + materialized = true + } else if (opt.statsByAgencyIdAndRouteIdAndStopAndHour !== 'view') { + throw new Error('invalid opt.statsByAgencyIdAndRouteIdAndStopAndHour, must be one of these: none, view, materialized-view.') + } + const createViewCmd = materialized + ? 
`CREATE MATERIALIZED VIEW` + : `CREATE OR REPLACE VIEW` + + return `\ +${createViewCmd} "${opt.schema}".stats_by_agency_route_stop_hour AS +SELECT DISTINCT ON (agency_id, route_id, stop_id, effective_hour) + agency_id, route_id, stop_id, station_id, + "date" as service_date, + date_trunc('hour', t_arrival) AS effective_hour, + count(*) OVER (PARTITION BY route_id, stop_id, date_trunc('hour', t_arrival)) AS nr_of_arrs +FROM "${opt.schema}".arrivals_departures; + +${materialized ? `\ +CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (route_id); +CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (stop_id); +CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (station_id); +CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (effective_hour); +` : ''} + +${opt.postgraphile ? `\ +COMMENT ON${materialized ? ' MATERIALIZED' : ''} VIEW "${opt.schema}".stats_by_agency_route_stop_hour IS E'@name hourlyStats\\n@primaryKey route_id,stop_id,effective_hour\\n@foreignKey (route_id) references routes|@fieldName route|@foreignFieldName statsByStopIdAndHour\\n@foreignKey (stop_id) references stops|@fieldName stop|@foreignFieldName statsByRouteIdAndHour'; +` : ''} +` +} + +module.exports = { + afterAll, +} diff --git a/lib/stop_times.js b/lib/stop_times.js index d70faa6..7379fe2 100644 --- a/lib/stop_times.js +++ b/lib/stop_times.js @@ -161,6 +161,7 @@ $$ LANGUAGE SQL IMMUTABLE; CREATE OR REPLACE VIEW "${opt.schema}".arrivals_departures AS WITH stop_times_based AS NOT MATERIALIZED ( SELECT + routes.agency_id, trips.route_id, route_short_name, route_long_name, @@ -256,6 +257,7 @@ SELECT -- stop_times_based.* except t_arrival & t_departure, duh -- todo: find a way to use all columns without explicitly enumerating them here + agency_id, route_id, route_short_name, route_long_name, route_type, trip_id, direction_id, trip_headsign, service_id, diff --git a/readme.md b/readme.md index b24e776..a36cbad 100644 --- a/readme.md +++ 
b/readme.md @@ -94,6 +94,7 @@ In addition to a table for each GTFS file, `gtfs-via-postgres` adds these views - `connections` "applies" [`stop_times`](https://gtfs.org/reference/static/#stop_timestxt)/[`frequencies`](https://gtfs.org/reference/static/#frequenciestxt) to [`trips`](https://gtfs.org/reference/static/#tripstxt) and `service_days`, just like `arrivals_departures`, but gives you departure (at stop A) & arrival (at stop B) *pairs*. - `shapes_aggregates` aggregates individual shape points in [`shapes`](https://gtfs.org/reference/static/#shapestxt) into a [PostGIS `LineString`](http://postgis.net/workshops/postgis-intro/geometries.html#linestrings). - `stats_by_route_date` provides the number of arrivals/departures by route ID and date. – [read more](docs/analysis/feed-by-route-date.md) +- `stats_by_agency_route_stop_hour` provides the number of arrivals/departures by agency ID, route ID, stop ID & hour. – [read more](docs/analysis/feed-by-agency-route-stop-and-hour.md) As an example, we're going to use the `arrivals_departures` view to query all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30+01` and `2022-03-23T12:35+01`: @@ -162,6 +163,10 @@ Options: - view: Fast generation, slow access. - materialized-view: Slow generation, fast access. Default: none + --stats-by-agency-route-stop-hour + Generate a view letting you analyze arrivals/ + departures per route, stop and hour. + The flag works like --stats-by-route-date. --schema The schema to use for the database. Default: public --postgraphile Tweak generated SQL for PostGraphile usage. 
https://www.graphile.org/postgraphile/ diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh index f30cd44..7232280 100755 --- a/test/amtrak-gtfs-2021-10-06.sh +++ b/test/amtrak-gtfs-2021-10-06.sh @@ -16,6 +16,7 @@ export PGDATABASE='amtrak_2021_10_06' ../cli.js -d --trips-without-shape-id --schema amtrak \ --import-metadata \ --stats-by-route-date=view \ + --stats-by-agency-route-stop-hour=view \ -- amtrak-gtfs-2021-10-06/*.txt | psql -b query=$(cat << EOF @@ -64,3 +65,16 @@ if [[ "$acelaStat" != "16,190" ]]; then echo "invalid stats for route 40751 (Acela) on 2021-11-26: $acelaStat" 1>&2 exit 1 fi + +acelaPhillyStatQuery=$(cat << EOF +SELECT nr_of_arrs +FROM amtrak.stats_by_agency_route_stop_hour +WHERE route_id = '40751' -- Acela +AND stop_id = 'PHL' -- Philadelphia +AND effective_hour = '2022-07-24T09:00-05' +EOF) +acelaPhillyStat=$(psql --csv -t -c "$acelaPhillyStatQuery" | tail -n 1) +if [[ "$acelaPhillyStat" != "2" ]]; then + echo "invalid stats for route 40751 (Acela) at PHL (Philadelphia) on 2021-11-26: $acelaPhillyStat" 1>&2 + exit 1 +fi From a48ecabcdffa811cf2a5804b390ae389618a849a Mon Sep 17 00:00:00 2001 From: Jannis R Date: Wed, 29 Mar 2023 19:47:33 +0200 Subject: [PATCH 4/8] benchmark script: report rounded milliseconds --- benchmark/index.sql | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmark/index.sql b/benchmark/index.sql index eca2b02..01f292c 100644 --- a/benchmark/index.sql +++ b/benchmark/index.sql @@ -42,13 +42,13 @@ BEGIN INSERT INTO _benchmark SELECT _query, - round(avg(elapsed)::numeric, 2), + round(avg(elapsed)::numeric, 0), min(elapsed), - round((percentile_cont(0.25) WITHIN GROUP (ORDER BY elapsed))::numeric, 2), - round((percentile_cont(0.50) WITHIN GROUP (ORDER BY elapsed))::numeric, 2), - round((percentile_cont(0.75) WITHIN GROUP (ORDER BY elapsed))::numeric, 2), - round((percentile_cont(0.95) WITHIN GROUP (ORDER BY elapsed))::numeric, 2), - round((percentile_cont(0.99) 
WITHIN GROUP (ORDER BY elapsed))::numeric, 2), + round((percentile_cont(0.25) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), + round((percentile_cont(0.50) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), + round((percentile_cont(0.75) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), + round((percentile_cont(0.95) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), + round((percentile_cont(0.99) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), max(elapsed), _iterations FROM _bench_results; From 8edb2b7c630dc00a1fc524271e0765d7d9f56e7c Mon Sep 17 00:00:00 2001 From: Jannis R Date: Mon, 10 Apr 2023 16:24:24 +0200 Subject: [PATCH 5/8] =?UTF-8?q?tests:=20buffer=20generated=20SQL=20using?= =?UTF-8?q?=20sponge=20=E2=9C=85=F0=9F=92=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/publish.yml | 2 ++ .github/workflows/smoke-test.yml | 2 ++ .github/workflows/test.yml | 2 ++ test/amtrak-gtfs-2021-10-06.sh | 3 ++- test/calendar-dates-only.sh | 4 +++- test/postgraphile.sh | 2 +- test/sample-gtfs-feed.sh | 2 +- 7 files changed, 13 insertions(+), 4 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index f5bb0d3..bddeb77 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -22,6 +22,8 @@ jobs: uses: actions/setup-node@v1 with: node-version: ${{ matrix.node-version }} + - name: install sponge (moreutils) + run: sudo apt install -y moreutils - name: install & start PostgreSQL with PostGIS # todo: currently, it uses mdillon, which doesn't have PostgreSQL 14 diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index a34cc71..dee9d49 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -31,6 +31,8 @@ jobs: uses: actions/setup-node@v1 with: node-version: '16.x' + - name: install sponge (moreutils) + run: sudo apt install -y moreutils - name: install & start PostgreSQL with PostGIS # todo: currently, it 
uses mdillon, which doesn't have PostgreSQL 14 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2b0f965..225aa19 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,6 +27,8 @@ jobs: uses: actions/setup-node@v1 with: node-version: ${{ matrix.node-version }} + - name: install sponge (moreutils) + run: sudo apt install -y moreutils - name: install & start PostgreSQL with PostGIS # todo: currently, it uses mdillon, which doesn't have PostgreSQL 14 diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh index 7232280..c1b5151 100755 --- a/test/amtrak-gtfs-2021-10-06.sh +++ b/test/amtrak-gtfs-2021-10-06.sh @@ -17,7 +17,8 @@ export PGDATABASE='amtrak_2021_10_06' --import-metadata \ --stats-by-route-date=view \ --stats-by-agency-route-stop-hour=view \ - -- amtrak-gtfs-2021-10-06/*.txt | psql -b + -- amtrak-gtfs-2021-10-06/*.txt \ + | sponge | psql -b query=$(cat << EOF select extract(epoch from t_arrival)::integer as t_arrival diff --git a/test/calendar-dates-only.sh b/test/calendar-dates-only.sh index d431c43..edca85d 100755 --- a/test/calendar-dates-only.sh +++ b/test/calendar-dates-only.sh @@ -10,7 +10,9 @@ env | grep '^PG' || true psql -c 'create database calendar_dates_only' export PGDATABASE='calendar_dates_only' -../cli.js -d --trips-without-shape-id -- calendar-dates-only/*.txt | psql -b +../cli.js -d --trips-without-shape-id -- \ + calendar-dates-only/*.txt \ + | sponge | psql -b query=$(cat << EOF select extract(epoch from t_arrival)::integer as t_arrival diff --git a/test/postgraphile.sh b/test/postgraphile.sh index 6bb6426..3c8e423 100755 --- a/test/postgraphile.sh +++ b/test/postgraphile.sh @@ -12,7 +12,7 @@ export PGDATABASE='postgraphile' ../cli.js -d --trips-without-shape-id --postgraphile -- \ ../node_modules/sample-gtfs-feed/gtfs/*.txt \ - | psql -b + | sponge | psql -b # kill child processes on exit # 
https://stackoverflow.com/questions/360201/how-do-i-kill-background-processes-jobs-when-my-shell-script-exits/2173421#2173421 diff --git a/test/sample-gtfs-feed.sh b/test/sample-gtfs-feed.sh index b5fc835..2b2b412 100755 --- a/test/sample-gtfs-feed.sh +++ b/test/sample-gtfs-feed.sh @@ -22,7 +22,7 @@ export PGDATABASE='sample_gtfs_feed' ../node_modules/sample-gtfs-feed/gtfs/levels.txt \ ../node_modules/sample-gtfs-feed/gtfs/pathways.txt \ ../node_modules/sample-gtfs-feed/gtfs/translations.txt \ - | psql -b + | sponge | psql -b query=$(cat << EOF select extract(epoch from t_arrival)::integer as t_arrival From 36f848aabc7b6811de775acaf57c3b3ca733271c Mon Sep 17 00:00:00 2001 From: Jannis R Date: Mon, 10 Apr 2023 16:25:35 +0200 Subject: [PATCH 6/8] =?UTF-8?q?Dockerfile,=20docs:=20use=20sponge=20too=20?= =?UTF-8?q?=F0=9F=93=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 6 ++++-- cli.js | 4 ++-- example.sh | 3 ++- readme.md | 12 ++++++------ 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index c098e7f..aa53467 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,8 +9,10 @@ LABEL org.opencontainers.image.licenses="(Apache-2.0 AND Prosperity-3.0.0)" WORKDIR /app -# add psql CLI tool -RUN apk add --no-cache postgresql-client +# Both moreutils (providing sponge) and postgresql-client (providing psql) are not required but come in handy for users. 
+RUN apk add --no-cache \ + postgresql-client \ + moreutils ADD package.json /app RUN npm install --production && npm cache clean --force diff --git a/cli.js b/cli.js index dded271..2fe9e21 100755 --- a/cli.js +++ b/cli.js @@ -102,8 +102,8 @@ Options: - gtfs_via_postgres_version (text) - gtfs_via_postgres_options (jsonb) Examples: - gtfs-to-sql some-gtfs/*.txt | psql -b # import into PostgreSQL - gtfs-to-sql -u -- some-gtfs/*.txt | gzip >gtfs.sql # generate a gzipped SQL dump + gtfs-to-sql some-gtfs/*.txt | sponge | psql -b # import into PostgreSQL + gtfs-to-sql -u -- some-gtfs/*.txt | gzip >gtfs.sql.gz # generate a gzipped SQL dump [1] https://developers.google.com/transit/gtfs/reference/extended-route-types [2] https://groups.google.com/g/gtfs-changes/c/keT5rTPS7Y0/m/71uMz2l6ke0J diff --git a/example.sh b/example.sh index 3448d8a..dabc52d 100755 --- a/example.sh +++ b/example.sh @@ -5,7 +5,8 @@ set -o pipefail 2>&1 echo "importing into PostgreSQL:" ./cli.js --ignore-unsupported --require-dependencies --trips-without-shape-id --silent \ - node_modules/sample-gtfs-feed/gtfs/*.txt | psql -b + node_modules/sample-gtfs-feed/gtfs/*.txt \ + | sponge | psql -b 2>&1 echo "\nfetching a connection during DST switch:" psql -c "$(cat <<- EOM diff --git a/readme.md b/readme.md index a36cbad..7cc23f0 100644 --- a/readme.md +++ b/readme.md @@ -72,7 +72,7 @@ Install `gtfs-via-postgres` and use it to import the GTFS data: ```sh npm install -D gtfs-via-postgres -npm exec -- gtfs-to-sql --require-dependencies -- gtfs/*.csv | psql -b +npm exec -- gtfs-to-sql --require-dependencies -- gtfs/*.csv | sponge | psql -b # agency # calendar # CREATE EXTENSION @@ -175,8 +175,8 @@ Options: - gtfs_via_postgres_version (text) - gtfs_via_postgres_options (jsonb) Examples: - gtfs-to-sql some-gtfs/*.txt | psql -b # import into PostgreSQL - gtfs-to-sql -u -- some-gtfs/*.txt | gzip >gtfs.sql # generate a gzipped SQL dump + gtfs-to-sql some-gtfs/*.txt | sponge | psql -b # import into PostgreSQL + 
gtfs-to-sql -u -- some-gtfs/*.txt | gzip >gtfs.sql.gz # generate a gzipped SQL dump [1] https://developers.google.com/transit/gtfs/reference/extended-route-types [2] https://groups.google.com/g/gtfs-changes/c/keT5rTPS7Y0/m/71uMz2l6ke0J @@ -200,7 +200,7 @@ Instead of installing via `npm`, you can use [the `ghcr.io/public-transport/gtfs # variant A: use Docker image just to convert GTFS to SQL docker run --rm --volume /path/to/gtfs:/gtfs \ ghcr.io/public-transport/gtfs-via-postgres --require-dependencies -- '/gtfs/*.csv' \ - | psql -b + | sponge | psql -b ``` *Note:* Remember to pass the `/gtfs/*.csv` glob as a string (with `'`), so that it gets evaluated *inside* the Docker container. @@ -214,7 +214,7 @@ FROM ghcr.io/public-transport/gtfs-via-postgres ENV PGPORT=5432 PGUSER=postgres WORKDIR /gtfs # pass all arguments into gtfs-via-postgres, pipe output into psql: -ENTRYPOINT ["/bin/sh", "-c", "gtfs-via-postgres $0 $@ | psql -b"] +ENTRYPOINT ["/bin/sh", "-c", "gtfs-via-postgres $0 $@ | sponge | psql -b"] ``` ```shell @@ -248,7 +248,7 @@ The `--postgraphile` flag changes the SQL generated by `gtfs-via-postgres` sligh ```shell # import data into PostgreSQL with PostGraphile tweaks -npm exec -- gtfs-to-sql -d --postgraphile -- gtfs/*.csv | psql -b +npm exec -- gtfs-to-sql -d --postgraphile -- gtfs/*.csv | sponge | psql -b ``` `gtfs-via-postgres` *doesn't* specify PostGraphile as a regular dependency, but as `peerDependencies`, in order to stay lightweight for users who don't need the GraphQL interface. Some versions of some package managers install unmet peer dependencies, some don't. 
Let's make sure that PostGraphile (and its plugins) are installed: From 79b6983acf42d0152151076386fc7076f8bb1d76 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Tue, 2 May 2023 23:08:54 +0200 Subject: [PATCH 7/8] =?UTF-8?q?add=20--stats-active-trips-by-hour=20option?= =?UTF-8?q?=20=E2=9C=85=F0=9F=93=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cli.js | 8 ++ docs/analysis/active-trips-by-hour.md | 32 +++++++ index.js | 7 ++ lib/index.js | 1 + lib/stats_active_trips_by_hour.js | 116 ++++++++++++++++++++++++++ readme.md | 5 ++ test/amtrak-gtfs-2021-10-06.sh | 18 ++++ 7 files changed, 187 insertions(+) create mode 100644 docs/analysis/active-trips-by-hour.md create mode 100644 lib/stats_active_trips_by_hour.js diff --git a/cli.js b/cli.js index 2fe9e21..7dd2d53 100755 --- a/cli.js +++ b/cli.js @@ -50,6 +50,9 @@ const { 'stats-by-agency-route-stop-hour': { type: 'string', }, + 'stats-active-trips-by-hour': { + type: 'string', + }, 'schema': { type: 'string', }, @@ -94,6 +97,10 @@ Options: Generate a view letting you analyze arrivals/ departures per route, stop and hour. The flag works like --stats-by-route-date. + --stats-active-trips-by-hour Generate a view letting you analyze the number of + currently running trips over time, by hour. + Like --stats-by-route-date, this flag accepts + none, view & materialized-view. --schema The schema to use for the database. Default: public --postgraphile Tweak generated SQL for PostGraphile usage. 
https://www.graphile.org/postgraphile/ @@ -135,6 +142,7 @@ const opt = { stopsLocationIndex: !!flags['stops-location-index'], statsByRouteIdAndDate: flags['stats-by-route-date'] || 'none', statsByAgencyIdAndRouteIdAndStopAndHour: flags['stats-by-agency-route-stop-hour'] || 'none', + statsActiveTripsByHour: flags['stats-active-trips-by-hour'] || 'none', schema: flags['schema'] || 'public', postgraphile: !!flags.postgraphile, importMetadata: !!flags['import-metadata'], diff --git a/docs/analysis/active-trips-by-hour.md b/docs/analysis/active-trips-by-hour.md new file mode 100644 index 0000000..70ac6a2 --- /dev/null +++ b/docs/analysis/active-trips-by-hour.md @@ -0,0 +1,32 @@ +# analysing the no. of active trips + +Do you want to know how many trips are running at a specific point in time? + +`gtfs-via-postgres` optionally provides a **(materialized) view `stats_active_trips_by_hour` to answer this. Use the `--stats-active-trips-by-hour` flag to enable it**: + +- If you run `gtfs-to-sql` with `--stats-active-trips-by-hour=view`, `stats_active_trips_by_hour` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_active_trips_by_hour` rarely or in time-uncritical scenarios. +- If you pass `--stats-active-trips-by-hour=materialized-view`, the `stats_active_trips_by_hour` view will [be materialized](https://www.postgresql.org/docs/14/rules-materializedviews.html). Use this option if you need fast queries, and if you can tolerate significantly longer import times (a minute for small feeds, many hours for large feeds). 
+ +## example: number of active trips over the course of a day + +```sql +-- using VBB's 2023-05-02 GTFS data +SELECT * +FROM stats_active_trips_by_hour stats +WHERE "hour" >= '2023-05-20T22:00+02:00' +AND "hour" <= '2023-05-21T08:00+02:00' +``` + +`hour` | `nr_of_active_trips` +-|- +`2023-05-20T22:00+02:00` | `2715` +`2023-05-20T23:00+02:00` | `2401` +`2023-05-21T00:00+02:00` | `1827` +`2023-05-21T01:00+02:00` | `974` +`2023-05-21T02:00+02:00` | `813` +`2023-05-21T03:00+02:00` | `818` +`2023-05-21T04:00+02:00` | `887` +`2023-05-21T05:00+02:00` | `1118` +`2023-05-21T06:00+02:00` | `1598` +`2023-05-21T07:00+02:00` | `2318` +`2023-05-21T08:00+02:00` | `2615` diff --git a/index.js b/index.js index 438895f..f0993be 100644 --- a/index.js +++ b/index.js @@ -21,6 +21,7 @@ const convertGtfsToSql = async function* (files, opt = {}) { stopsLocationIndex: false, statsByRouteIdAndDate: 'none', statsByAgencyIdAndRouteIdAndStopAndHour: 'none', + statsActiveTripsByHour: 'none', schema: 'public', postgraphile: false, importMetadata: false, @@ -35,6 +36,7 @@ const convertGtfsToSql = async function* (files, opt = {}) { importMetadata, statsByRouteIdAndDate, statsByAgencyIdAndRouteIdAndStopAndHour, + statsActiveTripsByHour, } = opt if (ignoreUnsupportedFiles) { @@ -96,6 +98,11 @@ const convertGtfsToSql = async function* (files, opt = {}) { dep: ['stop_times'], }, } : {}), + ...(statsActiveTripsByHour !== 'none' ? 
{ + 'stats_active_trips_by_hour': { + dep: ['stop_times'], + }, + } : {}), } for (const file of files) { diff --git a/lib/index.js b/lib/index.js index 79c43e9..35b1158 100644 --- a/lib/index.js +++ b/lib/index.js @@ -22,4 +22,5 @@ module.exports = { import_metadata: require('./import_metadata'), stats_by_route_date: require('./stats_by_route_date'), stats_by_agency_route_stop_hour: require('./stats_by_agency_route_stop_hour'), + stats_active_trips_by_hour: require('./stats_active_trips_by_hour'), } diff --git a/lib/stats_active_trips_by_hour.js b/lib/stats_active_trips_by_hour.js new file mode 100644 index 0000000..c8fdd3f --- /dev/null +++ b/lib/stats_active_trips_by_hour.js @@ -0,0 +1,116 @@ +'use strict' + +const afterAll = (opt) => { + let materialized = false + if (opt.statsActiveTripsByHour === 'materialized-view') { + materialized = true + } else if (opt.statsActiveTripsByHour !== 'view') { + throw new Error('invalid opt.statsActiveTripsByHour, must be one of these: none, view, materialized-view.') + } + const createViewCmd = materialized + ? 
`CREATE MATERIALIZED VIEW` + : `CREATE OR REPLACE VIEW` + + return `\ +CREATE MATERIALIZED VIEW "${opt.schema}".feed_time_frame AS +WITH + dates AS ( + SELECT + min("date") AS min, + max("date") AS max + FROM "${opt.schema}".service_days + ), + date_offset AS ( + SELECT greatest( + "${opt.schema}".largest_arrival_time(), + "${opt.schema}".largest_departure_time() + ) AS o + ), + date_min_max AS ( + SELECT + date_trunc('day', min + o) AS min, + date_trunc('day', max - o) AS max + FROM dates, date_offset + ), + min_dep AS ( + SELECT t_departure AS t + FROM "${opt.schema}".arrivals_departures, date_min_max + WHERE date <= date_min_max.min + ORDER BY t_departure ASC + LIMIT 1 + ), + min_arr AS ( + SELECT t_arrival AS t + FROM "${opt.schema}".arrivals_departures, date_min_max + WHERE date <= date_min_max.min + ORDER BY t_arrival ASC + LIMIT 1 + ), + max_dep AS ( + SELECT t_departure AS t + FROM "${opt.schema}".arrivals_departures, date_min_max + WHERE date >= date_min_max.max + ORDER BY t_departure DESC + LIMIT 1 + ), + max_arr AS ( + SELECT t_arrival AS t + FROM "${opt.schema}".arrivals_departures, date_min_max + WHERE date >= date_min_max.max + ORDER BY t_arrival DESC + LIMIT 1 + ) +SELECT + least(min_dep.t, min_arr.t) as min, + greatest(max_dep.t, max_arr.t) as max -- the feed ends at the later of the last departure/arrival +FROM min_dep, min_arr, max_dep, max_arr; + +CREATE OR REPLACE FUNCTION "${opt.schema}".feed_time_series( + time_unit TEXT +) +RETURNS SETOF timestamptz +AS $$ + SELECT + generate_series( + date_trunc(time_unit, min), + date_trunc(time_unit, max), + ('1 ' || time_unit)::interval + ) as t + FROM "${opt.schema}".feed_time_frame +$$ LANGUAGE sql STABLE; + +${createViewCmd} "${opt.schema}".stats_active_trips_by_hour AS +WITH + all_hours AS NOT MATERIALIZED ( + SELECT "${opt.schema}".feed_time_series('hour') AS "hour" + ) +SELECT DISTINCT ON ("hour") + "hour", + count(*) OVER (PARTITION BY "hour") as nr_of_active_trips +FROM ( + -- only keep one arrival/departure per trip + SELECT DISTINCT ON ("hour", route_id, 
trip_id) + * + FROM ( + SELECT * + FROM all_hours + LEFT JOIN "${opt.schema}".connections ON ( + date_trunc('hour', t_departure) <= "hour" + AND date_trunc('hour', t_arrival) >= "hour" + ) + ) t +) cons; + +${materialized ? `\ +CREATE INDEX ON "${opt.schema}".stats_active_trips_by_hour ("hour"); +` : ''} + +${opt.postgraphile ? `\ +COMMENT ON${materialized ? ' MATERIALIZED' : ''} VIEW "${opt.schema}".stats_active_trips_by_hour IS E'@name hourlyActiveTripsStats\\n@primaryKey hour'; +` : ''} +` +} + +module.exports = { + afterAll, +} diff --git a/readme.md b/readme.md index 7cc23f0..7ca3022 100644 --- a/readme.md +++ b/readme.md @@ -95,6 +95,7 @@ In addition to a table for each GTFS file, `gtfs-via-postgres` adds these views - `shapes_aggregates` aggregates individual shape points in [`shapes`](https://gtfs.org/reference/static/#shapestxt) into a [PostGIS `LineString`](http://postgis.net/workshops/postgis-intro/geometries.html#linestrings). - `stats_by_route_date` provides the number of arrivals/departures by route ID and date. – [read more](docs/analysis/feed-by-route-date.md) - `stats_by_agency_route_stop_hour` provides the number of arrivals/departures by agency ID, route ID, stop ID & hour. – [read more](docs/analysis/feed-by-agency-route-stop-and-hour.md) +- In contrast to `stats_by_route_date` & `stats_by_agency_route_stop_hour`, `stats_active_trips_by_hour` provides the number of *currently running* trips for each hour in the feed's period of time. – [read more](docs/analysis/active-trips-by-hour.md) As an example, we're going to use the `arrivals_departures` view to query all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30+01` and `2022-03-23T12:35+01`: @@ -167,6 +168,10 @@ Options: Generate a view letting you analyze arrivals/ departures per route, stop and hour. The flag works like --stats-by-route-date. + --stats-active-trips-by-hour Generate a view letting you analyze the number of + currently running trips over time, by hour. 
+ Like --stats-by-route-date, this flag accepts + none, view & materialized-view. --schema The schema to use for the database. Default: public --postgraphile Tweak generated SQL for PostGraphile usage. https://www.graphile.org/postgraphile/ diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh index c1b5151..8580432 100755 --- a/test/amtrak-gtfs-2021-10-06.sh +++ b/test/amtrak-gtfs-2021-10-06.sh @@ -17,6 +17,7 @@ export PGDATABASE='amtrak_2021_10_06' --import-metadata \ --stats-by-route-date=view \ --stats-by-agency-route-stop-hour=view \ + --stats-active-trips-by-hour=view \ -- amtrak-gtfs-2021-10-06/*.txt \ | sponge | psql -b @@ -79,3 +80,20 @@ if [[ "$acelaPhillyStat" != "2" ]]; then echo "invalid stats for route 40751 (Acela) at PHL (Philadelphia) on 2021-11-26: $acelaPhillyStat" 1>&2 exit 1 fi + +nrOfActiveTripsQuery=$(cat << EOF +SELECT nr_of_active_trips +FROM amtrak.stats_active_trips_by_hour +WHERE "hour" = '2021-11-26T04:00-05' +EOF) +# Note: I'm not sure if 146 is correct, but it is in the right ballpark. 🙈 +# The following query yields 175 connections, and it doesn't contain those who depart earlier and arrive later. +# SELECT DISTINCT ON (trip_id) * +# FROM amtrak.connections +# WHERE t_departure >= '2021-11-26T02:00-05' +# AND t_arrival <= '2021-11-26T06:00-05' +nrOfActiveTrips=$(psql --csv -t -c "$nrOfActiveTripsQuery" | tail -n 1) +if [[ "$nrOfActiveTrips" != "146" ]]; then + echo "unexpected no. 
of active trips at 2021-11-26T04:00-05: $nrOfActiveTrips" 1>&2 + exit 1 +fi From 514d53229f320f0ca72a907602e67c598a7e8af6 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Thu, 4 May 2023 00:46:14 +0200 Subject: [PATCH 8/8] =?UTF-8?q?docs:=20no.=20of=20active=20trips=20with=20?= =?UTF-8?q?a=20custom=20temporal=20resolution=20=F0=9F=93=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/analysis/active-trips-by-hour.md | 46 +++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/docs/analysis/active-trips-by-hour.md b/docs/analysis/active-trips-by-hour.md index 70ac6a2..9983331 100644 --- a/docs/analysis/active-trips-by-hour.md +++ b/docs/analysis/active-trips-by-hour.md @@ -30,3 +30,49 @@ AND "hour" <= '2023-05-21T08:00+02:00' `2023-05-21T06:00+02:00` | `1598` `2023-05-21T07:00+02:00` | `2318` `2023-05-21T08:00+02:00` | `2615` + +## example: custom temporal resolution + +As an example, let's query active trips *per minute* by just adapting `stats_active_trips_by_hour`'s underlying query: + +```sql +WITH all_minutes AS NOT MATERIALIZED ( + SELECT feed_time_series('minute') AS "minute" +) +SELECT DISTINCT ON ("minute") + "minute", + count(*) OVER (PARTITION BY "minute") as nr_of_active_trips +FROM ( + -- only keep one arrival/departure per trip + SELECT DISTINCT ON ("minute", route_id, trip_id) + * + FROM ( + SELECT * + FROM all_minutes + LEFT JOIN connections ON ( + date_trunc('minute', t_departure) <= "minute" + AND date_trunc('minute', t_arrival) >= "minute" + ) + ) t +) cons +WHERE "minute" >= '2023-05-20T22:00+02:00' +AND "minute" < '2023-05-20T22:15+02:00' +``` + +`minute` | `nr_of_active_trips` +-|- +`2023-05-20T22:00+02:00` | `959` +`2023-05-20T22:01+02:00` | `960` +`2023-05-20T22:02+02:00` | `966` +`2023-05-20T22:03+02:00` | `978` +`2023-05-20T22:04+02:00` | `976` +`2023-05-20T22:05+02:00` | `982` +`2023-05-20T22:06+02:00` | `991` +`2023-05-20T22:07+02:00` | `980` +`2023-05-20T22:08+02:00` | 
`975` +`2023-05-20T22:09+02:00` | `967` +`2023-05-20T22:10+02:00` | `983` +`2023-05-20T22:11+02:00` | `976` +`2023-05-20T22:12+02:00` | `982` +`2023-05-20T22:13+02:00` | `970` +`2023-05-20T22:14+02:00` | `958`