add reports making analysis easier #34

Merged
merged 8 commits on May 18, 2023
2 changes: 2 additions & 0 deletions .github/workflows/publish.yml
@@ -22,6 +22,8 @@ jobs:
uses: actions/setup-node@v1
with:
node-version: ${{ matrix.node-version }}
- name: install sponge (moreutils)
run: sudo apt install -y moreutils

- name: install & start PostgreSQL with PostGIS
# todo: currently, it uses mdillon, which doesn't have PostgreSQL 14
2 changes: 2 additions & 0 deletions .github/workflows/smoke-test.yml
@@ -31,6 +31,8 @@ jobs:
uses: actions/setup-node@v1
with:
node-version: '16.x'
- name: install sponge (moreutils)
run: sudo apt install -y moreutils

- name: install & start PostgreSQL with PostGIS
# todo: currently, it uses mdillon, which doesn't have PostgreSQL 14
2 changes: 2 additions & 0 deletions .github/workflows/test.yml
@@ -27,6 +27,8 @@ jobs:
uses: actions/setup-node@v1
with:
node-version: ${{ matrix.node-version }}
- name: install sponge (moreutils)
run: sudo apt install -y moreutils

- name: install & start PostgreSQL with PostGIS
# todo: currently, it uses mdillon, which doesn't have PostgreSQL 14
6 changes: 4 additions & 2 deletions Dockerfile
@@ -9,8 +9,10 @@ LABEL org.opencontainers.image.licenses="(Apache-2.0 AND Prosperity-3.0.0)"

WORKDIR /app

-# add psql CLI tool
-RUN apk add --no-cache postgresql-client
+# Both moreutils (providing sponge) and postgresql-client (providing psql) are not required but come in handy for users.
+RUN apk add --no-cache \
+	postgresql-client \
+	moreutils

ADD package.json /app
RUN npm install --production && npm cache clean --force
3 changes: 2 additions & 1 deletion benchmark/arrs_deps_by_route_name_and_time.sql
@@ -3,5 +3,6 @@ SELECT * from bench(
FROM arrivals_departures
WHERE route_short_name = ''S1''
AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02''
-AND date > ''2022-08-08'' AND date <= ''2022-08-09'''
+AND date >= dates_filter_min(''2022-08-09T07:10+02'')
+AND date <= dates_filter_max(''2022-08-09T07:30+02'')'
);
3 changes: 2 additions & 1 deletion benchmark/arrs_deps_by_station_and_time.sql
@@ -3,6 +3,7 @@ SELECT * from bench(
FROM arrivals_departures
WHERE station_id = ''de:11000:900100001'' -- S+U Friedrichstr. (Berlin)
AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02''
-AND date > ''2022-08-08'' AND date <= ''2022-08-09''',
+AND date >= dates_filter_min(''2022-08-09T07:10+02'')
+AND date <= dates_filter_max(''2022-08-09T07:30+02'')',
40
);
3 changes: 2 additions & 1 deletion benchmark/arrs_deps_by_station_and_time_seq_0.sql
@@ -3,7 +3,8 @@ SELECT * from bench(
FROM arrivals_departures
WHERE station_id = ''de:11000:900100001'' -- S+U Friedrichstr. (Berlin)
AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02''
-AND date > ''2022-08-08'' AND date <= ''2022-08-09''
+AND date >= dates_filter_min(''2022-08-09T07:10+02'')
+AND date <= dates_filter_max(''2022-08-09T07:30+02'')
AND stop_sequence = 0',
50
);
3 changes: 2 additions & 1 deletion benchmark/arrs_deps_by_stop_and_time.sql
@@ -3,5 +3,6 @@ SELECT * from bench(
FROM arrivals_departures
WHERE stop_id = ''de:11000:900100001::4'' -- S+U Friedrichstr. (Berlin)
AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02''
-AND date > ''2022-08-08'' AND date <= ''2022-08-09'''
+AND date >= dates_filter_min(''2022-08-09T07:10+02'')
+AND date <= dates_filter_max(''2022-08-09T07:30+02'')'
);
3 changes: 2 additions & 1 deletion benchmark/arrs_deps_by_time.sql
@@ -2,6 +2,7 @@ SELECT * from bench(
'SELECT *
FROM arrivals_departures
WHERE t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02''
-AND date > ''2022-08-08'' AND date <= ''2022-08-09''',
+AND date >= dates_filter_min(''2022-08-09T07:10+02''::timestamp with time zone)
+AND date <= dates_filter_max(''2022-08-09T07:30+02''::timestamp with time zone)',
10
);
8 changes: 8 additions & 0 deletions benchmark/arrs_deps_by_time_manual.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
SELECT * from bench(
'SELECT *
FROM arrivals_departures
WHERE t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02''
AND date >= ''2022-08-08''
AND date <= ''2022-08-09''',
10
);
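The `_manual` benchmark above hard-codes the `date` bounds, presumably to compare against the `dates_filter_min`/`dates_filter_max` helpers used by the other benchmarks. Conceptually, those helpers turn a timestamp into a conservative service-date bound. The following Python sketch only illustrates the idea: the 36h overhang is a made-up constant, whereas the real helpers derive their bound from the imported feed.

```python
from datetime import datetime, timedelta

# Made-up bound on how far a trip may run past its service date's midnight
# (GTFS stop times may exceed 24:00:00, e.g. 26:30:00). The real
# dates_filter_* helpers derive such a bound from the imported feed.
MAX_TRIP_OVERHANG = timedelta(hours=36)

def dates_filter_min(t_min: datetime):
    # earliest service date whose trips might still be running at t_min
    return (t_min - MAX_TRIP_OVERHANG).date()

def dates_filter_max(t_max: datetime):
    # latest service date whose trips might already be running at t_max
    return t_max.date()

window_start = datetime.fromisoformat('2022-08-09T07:10:00+02:00')
window_end = datetime.fromisoformat('2022-08-09T07:30:00+02:00')
print(dates_filter_min(window_start))  # → 2022-08-07
print(dates_filter_max(window_end))    # → 2022-08-09
```

Filtering on `date` with such bounds lets PostgreSQL prune partitions/index ranges instead of scanning all service dates.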
3 changes: 2 additions & 1 deletion benchmark/connections_by_route_name_and_time.sql
@@ -3,5 +3,6 @@ SELECT * from bench(
FROM connections
WHERE route_short_name = ''S1''
AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02''
-AND date > ''2022-08-08'' AND date <= ''2022-08-09'''
+AND date >= dates_filter_min(''2022-08-09T07:10+02'')
+AND date <= dates_filter_max(''2022-08-09T07:30+02'')'
);
3 changes: 2 additions & 1 deletion benchmark/connections_by_station_and_time.sql
@@ -3,6 +3,7 @@ SELECT * from bench(
FROM connections
WHERE from_station_id = ''de:11000:900100001'' -- S+U Friedrichstr. (Berlin)
AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02''
-AND date > ''2022-08-08'' AND date <= ''2022-08-09''',
+AND date >= dates_filter_min(''2022-08-09T07:10+02'')
+AND date <= dates_filter_max(''2022-08-09T07:30+02'')',
40
);
3 changes: 2 additions & 1 deletion benchmark/connections_by_station_and_time_seq_0.sql
@@ -3,7 +3,8 @@ SELECT * from bench(
FROM connections
WHERE from_station_id = ''de:11000:900100001'' -- S+U Friedrichstr. (Berlin)
AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02''
-AND date > ''2022-08-08'' AND date <= ''2022-08-09''
+AND date >= dates_filter_min(''2022-08-09T07:10+02'')
+AND date <= dates_filter_max(''2022-08-09T07:30+02'')
AND from_stop_sequence = 0',
50
);
3 changes: 2 additions & 1 deletion benchmark/connections_by_stop_and_time.sql
@@ -3,5 +3,6 @@ SELECT * from bench(
FROM connections
WHERE from_stop_id = ''de:11000:900100001::4'' -- S+U Friedrichstr. (Berlin)
AND t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02''
-AND date > ''2022-08-08'' AND date <= ''2022-08-09'''
+AND date >= dates_filter_min(''2022-08-09T07:10+02'')
+AND date <= dates_filter_max(''2022-08-09T07:30+02'')'
);
3 changes: 2 additions & 1 deletion benchmark/connections_by_time.sql
@@ -2,7 +2,8 @@ SELECT * from bench(
'SELECT *
FROM connections
WHERE t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02''
-AND date > ''2022-08-08'' AND date <= ''2022-08-09''
+AND date >= dates_filter_min(''2022-08-09T07:10+02''::timestamp with time zone)
+AND date <= dates_filter_max(''2022-08-09T07:30+02''::timestamp with time zone)
ORDER BY t_departure
LIMIT 100',
7
10 changes: 10 additions & 0 deletions benchmark/connections_by_time_manual.sql
@@ -0,0 +1,10 @@
SELECT * from bench(
'SELECT *
FROM connections
WHERE t_departure >= ''2022-08-09T07:10+02'' AND t_departure <= ''2022-08-09T07:30+02''
AND date >= ''2022-08-08''
AND date <= ''2022-08-09''
ORDER BY t_departure
LIMIT 100',
7
);
15 changes: 9 additions & 6 deletions benchmark/index.sql
@@ -42,13 +42,13 @@ BEGIN
INSERT INTO _benchmark
SELECT
_query,
-round(avg(elapsed)::numeric, 2),
+round(avg(elapsed)::numeric, 0),
min(elapsed),
-round((percentile_cont(0.25) WITHIN GROUP (ORDER BY elapsed))::numeric, 2),
-round((percentile_cont(0.50) WITHIN GROUP (ORDER BY elapsed))::numeric, 2),
-round((percentile_cont(0.75) WITHIN GROUP (ORDER BY elapsed))::numeric, 2),
-round((percentile_cont(0.95) WITHIN GROUP (ORDER BY elapsed))::numeric, 2),
-round((percentile_cont(0.99) WITHIN GROUP (ORDER BY elapsed))::numeric, 2),
+round((percentile_cont(0.25) WITHIN GROUP (ORDER BY elapsed))::numeric, 0),
+round((percentile_cont(0.50) WITHIN GROUP (ORDER BY elapsed))::numeric, 0),
+round((percentile_cont(0.75) WITHIN GROUP (ORDER BY elapsed))::numeric, 0),
+round((percentile_cont(0.95) WITHIN GROUP (ORDER BY elapsed))::numeric, 0),
+round((percentile_cont(0.99) WITHIN GROUP (ORDER BY elapsed))::numeric, 0),
max(elapsed),
_iterations
FROM _bench_results;
@@ -67,6 +67,7 @@ LANGUAGE plpgsql;
\i arrs_deps_by_stop.sql
\i arrs_deps_by_non_existent_stop.sql
\i arrs_deps_by_time.sql
\i arrs_deps_by_time_manual.sql
\i connections_by_route_name_and_time.sql
\i connections_by_station_and_time.sql
\i connections_by_station_and_time_seq_0.sql
@@ -75,6 +76,8 @@ LANGUAGE plpgsql;
\i connections_by_stop.sql
\i connections_by_non_existent_stop.sql
\i connections_by_time.sql
\i connections_by_time_manual.sql
\i stats_by_route_id_and_date.sql

SELECT * FROM _benchmark;

8 changes: 8 additions & 0 deletions benchmark/stats_by_route_id_date.sql
@@ -0,0 +1,8 @@
SELECT * from bench(
'SELECT *
FROM stats_by_route_date
WHERE route_id = ''17452_900'' -- M4
AND date >= ''2022-08-08'' AND date <= ''2022-08-14''
AND is_effective = true',
10
);
30 changes: 28 additions & 2 deletions cli.js
@@ -44,6 +44,15 @@ const {
'stops-location-index': {
type: 'boolean',
},
'stats-by-route-date': {
type: 'string',
},
'stats-by-agency-route-stop-hour': {
type: 'string',
},
'stats-active-trips-by-hour': {
type: 'string',
},
'schema': {
type: 'string',
},
@@ -78,6 +87,20 @@ Options:
Default if levels.txt has not been provided.
--stops-location-index Create a spatial index on stops.stop_loc for efficient
queries by geolocation.
--stats-by-route-date Whether to generate a stats_by_route_date view
letting you analyze all data per route and/or date:
- none: Don't generate a view.
- view: Fast generation, slow access.
- materialized-view: Slow generation, fast access.
Default: none
--stats-by-agency-route-stop-hour
Generate a view letting you analyze arrivals/
departures per route, stop and hour.
The flag works like --stats-by-route-date.
--stats-active-trips-by-hour Generate a view letting you analyze the number of
currently running trips over time, by hour.
Like --stats-by-route-date, this flag accepts
none, view & materialized-view.
--schema The schema to use for the database. Default: public
--postgraphile Tweak generated SQL for PostGraphile usage.
https://www.graphile.org/postgraphile/
@@ -86,8 +109,8 @@ Options:
- gtfs_via_postgres_version (text)
- gtfs_via_postgres_options (jsonb)
Examples:
-gtfs-to-sql some-gtfs/*.txt | psql -b # import into PostgreSQL
-gtfs-to-sql -u -- some-gtfs/*.txt | gzip >gtfs.sql # generate a gzipped SQL dump
+gtfs-to-sql some-gtfs/*.txt | sponge | psql -b # import into PostgreSQL
+gtfs-to-sql -u -- some-gtfs/*.txt | gzip >gtfs.sql.gz # generate a gzipped SQL dump

[1] https://developers.google.com/transit/gtfs/reference/extended-route-types
[2] https://groups.google.com/g/gtfs-changes/c/keT5rTPS7Y0/m/71uMz2l6ke0J
@@ -117,6 +140,9 @@ const opt = {
tripsWithoutShapeId: !!flags['trips-without-shape-id'],
routesWithoutAgencyId: !!flags['routes-without-agency-id'],
stopsLocationIndex: !!flags['stops-location-index'],
statsByRouteIdAndDate: flags['stats-by-route-date'] || 'none',
statsByAgencyIdAndRouteIdAndStopAndHour: flags['stats-by-agency-route-stop-hour'] || 'none',
statsActiveTripsByHour: flags['stats-active-trips-by-hour'] || 'none',
schema: flags['schema'] || 'public',
postgraphile: !!flags.postgraphile,
importMetadata: !!flags['import-metadata'],
78 changes: 78 additions & 0 deletions docs/analysis/active-trips-by-hour.md
@@ -0,0 +1,78 @@
# analysing the number of active trips

Do you want to know how many trips are running at a specific point in time?

`gtfs-via-postgres` optionally provides a **(materialized) view `stats_active_trips_by_hour` to answer this. Use the `--stats-active-trips-by-hour` flag to enable it**:

- If you run `gtfs-to-sql` with `--stats-active-trips-by-hour=view`, `stats_active_trips_by_hour` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_active_trips_by_hour` rarely or in time-uncritical scenarios.
- If you pass `--stats-active-trips-by-hour=materialized-view`, the `stats_active_trips_by_hour` view will [be materialized](https://www.postgresql.org/docs/14/rules-materializedviews.html). Use this option if you need fast queries, and if you can tolerate significantly longer import times (a minute for small feeds, many hours for large feeds).
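Note that a materialized view is a snapshot of the data as of import time. If the underlying GTFS data is re-imported or modified, it can be brought up to date with PostgreSQL's standard refresh command:

```sql
-- re-computes the stored result of the materialized view
REFRESH MATERIALIZED VIEW stats_active_trips_by_hour;
```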

## example: number of active trips over the course of a day

```sql
-- using VBB's 2023-05-02 GTFS data
SELECT *
FROM stats_active_trips_by_hour stats
WHERE "hour" >= '2023-05-20T22:00+02:00'
AND "hour" <= '2023-05-21T08:00+02:00'
```

`hour` | `nr_of_active_trips`
-|-
`2023-05-20T22:00+02:00` | `2715`
`2023-05-20T23:00+02:00` | `2401`
`2023-05-21T00:00+02:00` | `1827`
`2023-05-21T01:00+02:00` | `974`
`2023-05-21T02:00+02:00` | `813`
`2023-05-21T03:00+02:00` | `818`
`2023-05-21T04:00+02:00` | `887`
`2023-05-21T05:00+02:00` | `1118`
`2023-05-21T06:00+02:00` | `1598`
`2023-05-21T07:00+02:00` | `2318`
`2023-05-21T08:00+02:00` | `2615`

## example: custom temporal resolution

As an example, let's query active trips *per minute* by just adapting `stats_active_trips_by_hour`'s underlying query:

```sql
WITH all_minutes AS NOT MATERIALIZED (
SELECT feed_time_series('minute') AS "minute"
)
SELECT DISTINCT ON ("minute")
"minute",
count(*) OVER (PARTITION BY "minute") as nr_of_active_trips
FROM (
-- only keep one arrival/departure per trip
SELECT DISTINCT ON ("minute", route_id, trip_id)
*
FROM (
SELECT *
FROM all_minutes
LEFT JOIN connections ON (
date_trunc('minute', t_departure) <= "minute"
AND date_trunc('minute', t_arrival) >= "minute"
)
) t
) cons
WHERE "minute" >= '2023-05-20T22:00+02:00'
AND "minute" < '2023-05-20T22:15+02:00'
```

`minute` | `nr_of_active_trips`
-|-
`2023-05-20T22:00+02:00` | `959`
`2023-05-20T22:01+02:00` | `960`
`2023-05-20T22:02+02:00` | `966`
`2023-05-20T22:03+02:00` | `978`
`2023-05-20T22:04+02:00` | `976`
`2023-05-20T22:05+02:00` | `982`
`2023-05-20T22:06+02:00` | `991`
`2023-05-20T22:07+02:00` | `980`
`2023-05-20T22:08+02:00` | `975`
`2023-05-20T22:09+02:00` | `967`
`2023-05-20T22:10+02:00` | `983`
`2023-05-20T22:11+02:00` | `976`
`2023-05-20T22:12+02:00` | `982`
`2023-05-20T22:13+02:00` | `970`
`2023-05-20T22:14+02:00` | `958`
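For resolutions other than whole hours or minutes, PostgreSQL 14's `date_bin()` can snap timestamps into arbitrary-width buckets. This is just a sketch of the building block, not a query shipped with `gtfs-via-postgres`:

```sql
-- date_bin(width, ts, origin) truncates ts down to the nearest bucket
-- boundary, where boundaries lie a whole number of widths from the
-- (arbitrary) origin timestamp.
SELECT date_bin(
	'10 minutes',
	'2023-05-20T22:07:31+02:00'::timestamptz,
	'2000-01-01T00:00+02:00'::timestamptz
);
-- yields 2023-05-20 22:00 in the +02 offset (display depends on the
-- session's TimeZone setting)
```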
5 changes: 5 additions & 0 deletions docs/analysis/feed-by-agency-route-stop-and-hour.md
@@ -0,0 +1,5 @@
# analysing a GTFS dataset by route ID, stop ID and/or hour

With the `--stats-by-agency-route-stop-hour` option, `gtfs-via-postgres` provides a view `stats_by_agency_route_stop_hour`. Just like [`stats_by_route_date`](feed-by-route-date.md), it aggregates all arrivals by `agency_id`, `route_id`, `stop_id` and `effective_hour`.

Note: When materialized, `stats_by_agency_route_stop_hour` takes up a significant amount of space, e.g. 13GB with the 2023-05-02 VBB GTFS feed.
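To measure the actual size within your database, PostgreSQL's built-in size functions can be used:

```sql
-- total on-disk size of the materialized view, incl. indexes & TOAST data
SELECT pg_size_pretty(
	pg_total_relation_size('stats_by_agency_route_stop_hour')
);
```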
50 changes: 50 additions & 0 deletions docs/analysis/feed-by-route-date.md
@@ -0,0 +1,50 @@
# analysing a GTFS dataset by route ID and/or date

Are you trying to answer a question like those below?

- Are there certain dates or days of the week that have significantly fewer arrivals/departures (hereinafter "stop time events")? – This *may* indicate errors in the data, e.g. a faulty `calendar.txt` or `calendar_dates.txt` file.
- Has the number of stop time events decreased, compared to the last dataset version?
- Do specific routes stop running during certain time periods?

`gtfs-via-postgres` optionally provides a **(materialized) view `stats_by_route_date` to help with such SQL queries. Use the `--stats-by-route-date` flag to enable it** in the generated SQL:

- If you run `gtfs-to-sql` with `--stats-by-route-date=view`, `stats_by_route_date` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_by_route_date` rarely or in time-uncritical scenarios.
- If you pass `--stats-by-route-date=materialized-view`, the `stats_by_route_date` view will [be materialized](https://www.postgresql.org/docs/14/rules-materializedviews.html). Use this option if you need fast queries, and if you can tolerate significantly longer import times (3m for the 64mb 2023-03-05 SNCB/NMBS GTFS feed, 1h15m for the 540mb 2023-02-27 VBB GTFS feed).

`stats_by_route_date` has the following columns:

- `route_id`
- `date`
- `dow` – day of the week, following the [PostgreSQL notation `0` (Sunday) to `6` (Saturday)](https://www.postgresql.org/docs/14/functions-datetime.html#FUNCTIONS-DATETIME-EXTRACT)
- `nr_of_trips` – nr of trips starting on that date
- `nr_of_arrs_deps` – nr of arrivals/departures (stop time events) taking place on that date
- `is_effective` – whether `nr_of_trips` & `nr_of_arrs_deps` are calculated based on the *effective* date (i.e. the date that the stop time event actually happens on) or the *schedule* date (i.e. the date that their `stop_times` rows refer to)

So

- if you want to take a customer-facing perspective on the data (as in "I don't care which trips are scheduled before midnight, I want to know if they run today"), filter for `is_effective = True` rows;
- if you're interested in the operational/planning perspective (e.g. if you're looking for data errors), filter for `is_effective = False` rows.

## example: nr of effective stop time events of a single route over a week

```sql
-- using VBB's 2023-02-27 GTFS data
SELECT *
FROM stats_by_route_date stats
WHERE is_effective = True
AND route_id = '17438_900' -- M1 tram line
AND "date" >= '2023-03-19' -- Sunday, dow = 0
AND "date" <= '2023-03-25' -- Saturday, dow = 6
ORDER BY route_id, "date", is_effective DESC
```

```csv
route_id,date,dow,nr_of_trips,nr_of_arrs_deps,is_effective
17438_900,2023-03-19,0,258,5870,t
17438_900,2023-03-20,1,345,7831,t
17438_900,2023-03-21,2,345,7831,t
17438_900,2023-03-22,3,345,7831,t
17438_900,2023-03-23,4,345,7831,t
17438_900,2023-03-24,5,345,7831,t
17438_900,2023-03-25,6,326,9001,t
```
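The first question listed at the top (are certain days of the week unusually quiet?) can be sketched as a simple aggregate over `stats_by_route_date`, using only the columns documented above:

```sql
-- average nr of stop time events per day of week,
-- customer-facing (effective) perspective
SELECT dow, round(avg(nr_of_arrs_deps)) AS avg_arrs_deps
FROM stats_by_route_date
WHERE is_effective = True
GROUP BY dow
ORDER BY dow;
```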
3 changes: 2 additions & 1 deletion example.sh
@@ -5,7 +5,8 @@ set -o pipefail

2>&1 echo "importing into PostgreSQL:"
./cli.js --ignore-unsupported --require-dependencies --trips-without-shape-id --silent \
-node_modules/sample-gtfs-feed/gtfs/*.txt | psql -b
+node_modules/sample-gtfs-feed/gtfs/*.txt \
+| sponge | psql -b

2>&1 echo "\nfetching a connection during DST switch:"
psql -c "$(cat <<- EOM
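A note on the `sponge` usage above: `sponge` (from moreutils) soaks up its entire input before writing anything out, presumably so that `psql` only starts executing once the whole SQL dump has been generated. A minimal Python sketch of this soak-then-flush behaviour (not a replacement for moreutils' `sponge`, which can e.g. also write a file in-place):

```python
import sys

def sponge(chunks):
    # Soak up *all* input before emitting any of it, like moreutils' sponge:
    # the consumer only starts reading once the producer has finished.
    return list(chunks)

if __name__ == '__main__':
    for chunk in sponge(sys.stdin):
        sys.stdout.write(chunk)
```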