Merge pull request #209 from HTTPArchive/upgrade-to-new-datamodel
Upgrade to new datamodel
tunetheweb authored Jan 16, 2025
2 parents 30201f7 + 477d54f commit 5230702
Showing 129 changed files with 1,133 additions and 1,429 deletions.
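Nearly every query below follows the same migration pattern: the old per-crawl wildcard tables (httparchive.pages.${YYYY_MM_DD}_*, httparchive.summary_pages.${YYYY_MM_DD}_*, and so on), where the client was encoded in the table suffix and metrics sat in flat columns or JSON strings, are replaced by the consolidated httparchive.crawl.pages table filtered on date and is_root_page, with metrics read from the JSON columns (summary, payload, lighthouse) via BigQuery's FLOAT64()/INT64() accessors. A minimal before/after sketch of the inner aggregation, based on the bytesCss query below (the ${YYYY_MM_DD} and ${YYYY-MM-DD} placeholders are presumably substituted by generate_reports.sh):

-- Old data model: one wildcard table per crawl date; the client comes from
-- the table suffix and summary metrics are plain columns.
SELECT
  _TABLE_SUFFIX AS client,
  COUNT(0) AS volume,
  CAST(FLOOR(bytesCSS / 10240) * 10 AS INT64) AS bin
FROM
  `httparchive.summary_pages.${YYYY_MM_DD}_*`
GROUP BY
  bin,
  client

-- New data model: a single table keyed by date, with client as an ordinary
-- column and metrics in the summary JSON column, converted with FLOAT64().
SELECT
  client,
  COUNT(0) AS volume,
  CAST(FLOOR(FLOAT64(summary.bytesCss) / 10240) * 10 AS INT64) AS bin
FROM
  `httparchive.crawl.pages`
WHERE
  date = '${YYYY-MM-DD}' AND
  is_root_page
GROUP BY
  bin,
  client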
4 changes: 3 additions & 1 deletion sql/.sqlfluff
@@ -11,16 +11,18 @@ templater = jinja
## Comma separated list of rules to check, or None for all
rules = None
## Comma separated list of rules to exclude, or None
- exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,LT05,LT09,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07
+ exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,CV12,LT05,LT09,LT14,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07
# AL04 - Asks for unique table aliases meaning it complains if selecting from two 2021_07_01 tables as implicit alias is table name (not fully qualified) so same.
# AL07 - Avoid aliases in from and join - why?
# AM03 - if using DESC in one ORDER BY column, then insist on ASC/DESC for all.
# AM05 - INNER JOIN must be fully qualified. Probably should use this but not our style.
# CP02 - Unquoted identifiers (e.g. column names) will be mixed case so don't enforce case
# CP03 - Function names will be mixed case so don't enforce case
# CV02 - Use COALESCE instead of IFNULL or NVL. We think ISNULL is clearer.
+ # CV12 - Doesn't work with UNNEST. https://github.com/sqlfluff/sqlfluff/issues/6558
# LT05 - We allow longer lines as some of our queries are complex. Maybe should limit in future?
# LT09 - Select targets should be on new lines but sub clauses don't always obey this. Maybe revisit in future?
+ # LT14 - Keywords on newline. We have some simple, single line joins
# RF01 - BigQuery uses STRUCTS which can look like incorrect table references
# RF02 - Asks for qualified columns for ambiguous ones, but we not qualify our columns, and they are not really ambiguous (or BigQuery would complain)
# RF03 - Insists on references in column names even if not ambiguous. Bit OTT.
3 changes: 3 additions & 0 deletions sql/.sqlfluffignore
@@ -1 +1,4 @@
/lens/*/crux_histograms.sql
/lens/*/crux_timeseries.sql
/lens/*/histograms.sql
/lens/*/timeseries.sql
193 changes: 95 additions & 98 deletions sql/generate_reports.sh

Large diffs are not rendered by default.

9 changes: 6 additions & 3 deletions sql/histograms/bootupJs.sql
@@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- FLOOR(CAST(IFNULL(JSON_EXTRACT(report, '$.audits.bootup-time.numericValue'), JSON_EXTRACT(report, '$.audits.bootup-time.rawValue')) AS FLOAT64) / 100) / 10 AS bin
+ FLOOR(FLOAT64(IFNULL(lighthouse.audits['bootup-time'].numericValue, lighthouse.audits['bootup-time'].rawValue)) / 100) / 10 AS bin
FROM
- `httparchive.lighthouse.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client
9 changes: 6 additions & 3 deletions sql/histograms/bytesCss.sql
@@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- CAST(FLOOR(bytesCSS / 10240) * 10 AS INT64) AS bin
+ CAST(FLOOR(FLOAT64(summary.bytesCss) / 10240) * 10 AS INT64) AS bin
FROM
- `httparchive.summary_pages.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client
9 changes: 6 additions & 3 deletions sql/histograms/bytesFont.sql
@@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- CAST(FLOOR(bytesFont / 10240) * 10 AS INT64) AS bin
+ CAST(FLOOR(FLOAT64(summary.bytesFont) / 10240) * 10 AS INT64) AS bin
FROM
- `httparchive.summary_pages.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client
9 changes: 6 additions & 3 deletions sql/histograms/bytesHtml.sql
@@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- CAST(FLOOR(bytesHtml / 10240) * 10 AS INT64) AS bin
+ CAST(FLOOR(FLOAT64(summary.bytesHtml) / 10240) * 10 AS INT64) AS bin
FROM
- `httparchive.summary_pages.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client
9 changes: 6 additions & 3 deletions sql/histograms/bytesImg.sql
@@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- CAST(FLOOR(bytesImg / 102400) * 100 AS INT64) AS bin
+ CAST(FLOOR(FLOAT64(summary.bytesImg) / 102400) * 100 AS INT64) AS bin
FROM
- `httparchive.summary_pages.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client
9 changes: 6 additions & 3 deletions sql/histograms/bytesJs.sql
@@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- CAST(FLOOR(bytesJS / 10240) * 10 AS INT64) AS bin
+ CAST(FLOOR(FLOAT64(summary.bytesJS) / 10240) * 10 AS INT64) AS bin
FROM
- `httparchive.summary_pages.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client
9 changes: 6 additions & 3 deletions sql/histograms/bytesOther.sql
@@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- CAST(FLOOR(bytesOther / 10240) * 10 AS INT64) AS bin
+ CAST(FLOOR(FLOAT64(summary.bytesOther) / 10240) * 10 AS INT64) AS bin
FROM
- `httparchive.summary_pages.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client
9 changes: 6 additions & 3 deletions sql/histograms/bytesTotal.sql
@@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- CAST(FLOOR(bytesTotal / 102400) * 100 AS INT64) AS bin
+ CAST(FLOOR(FLOAT64(summary.bytesTotal) / 102400) * 100 AS INT64) AS bin
FROM
- `httparchive.summary_pages.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client
9 changes: 6 additions & 3 deletions sql/histograms/bytesVideo.sql
@@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- CAST(FLOOR(bytesVideo / 10240) * 10 AS INT64) AS bin
+ CAST(FLOOR(FLOAT64(summary.bytesVideo) / 10240) * 10 AS INT64) AS bin
FROM
- `httparchive.summary_pages.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client
9 changes: 6 additions & 3 deletions sql/histograms/compileJs.sql
@@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- CAST(JSON_EXTRACT(payload, "$['_cpu.v8.compile']") AS INT64) AS bin
+ INT64(payload['_cpu.v8.compile']) AS bin
FROM
- `httparchive.pages.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client
49 changes: 0 additions & 49 deletions sql/histograms/cruxFid.sql

This file was deleted.

22 changes: 15 additions & 7 deletions sql/histograms/cruxShopifyThemes.sql
@@ -12,19 +12,27 @@ CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor F
good + needs_improvement + poor > 0
);

+ -- Test CrUX data exists
+ WITH crux_test AS ( -- noqa: ST03
+ SELECT
+ 1
+ FROM
+ `chrome-ux-report.all.${YYYYMM}`
+ ),

-- All Shopify shops in HTTPArchive
- WITH archive_pages AS (
+ archive_pages AS (
SELECT
client,
page AS url,
- JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.name') AS theme_name,
- JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.theme_store_id') AS theme_store_id
+ JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) AS theme_name,
+ JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.theme_store_id) AS theme_store_id
FROM
- `httparchive.all.pages`
+ `httparchive.crawl.pages`
WHERE
- date = DATE(REPLACE('${YYYY_MM_DD}', '_', '-')) AND
+ date = '${YYYY-MM-DD}' AND
is_root_page AND
- JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.name') IS NOT NULL --first grab all shops for market share
+ JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) IS NOT NULL --first grab all shops for market share
)

SELECT
@@ -176,7 +184,7 @@ JOIN (
-- Include null theme store ids so that we can get full market share within CrUX
ON IFNULL(theme_names.theme_store_id, 'N/A') = IFNULL(archive_pages.theme_store_id, 'N/A')
WHERE
- date = DATE(REPLACE('${YYYY_MM_DD}', '_', '-')) AND
+ date = '${YYYY-MM-DD}' AND
theme_names.rank = 1
GROUP BY
client,
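Two smaller changes recur in this file: custom_metrics is now a JSON column, so nested values are read with direct field access inside JSON_VALUE() rather than a JSONPath string, and the date filter uses the ${YYYY-MM-DD} placeholder directly instead of rewriting ${YYYY_MM_DD} with REPLACE(). The unused crux_test CTE (hence the -- noqa: ST03 marker) appears to exist only so the query errors early if the chrome-ux-report.all.${YYYYMM} table for the month is not yet published. A minimal sketch of the new custom-metrics access, with illustrative aliases and grouping rather than the full market-share query:

-- Hedged sketch: read a nested custom metric from crawl.pages in the new model.
-- custom_metrics is JSON, so fields are addressed directly and extracted with JSON_VALUE().
SELECT
  client,
  JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) AS theme_name,
  COUNT(0) AS pages
FROM
  `httparchive.crawl.pages`
WHERE
  date = '${YYYY-MM-DD}' AND
  is_root_page AND
  JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) IS NOT NULL
GROUP BY
  client,
  theme_name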
10 changes: 6 additions & 4 deletions sql/histograms/dcl.sql
@@ -8,13 +8,15 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- FLOOR(onContentLoaded / 1000) AS bin
+ FLOOR(FLOAT64(summary.onContentLoaded) / 1000) AS bin
FROM
- `httparchive.summary_pages.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
WHERE
- onContentLoaded > 0
+ date = '${YYYY-MM-DD}' AND
+ is_root_page AND
+ FLOAT64(summary.onContentLoaded) > 0
GROUP BY
bin,
client
12 changes: 9 additions & 3 deletions sql/histograms/evalJs.sql
@@ -8,11 +8,17 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- CAST(CAST(JSON_EXTRACT(payload, "$['_cpu.EvaluateScript']") AS FLOAT64) / 20 AS INT64) * 20 AS bin
+ CAST(FLOAT64(r.payload['_cpu.EvaluateScript']) / 20 AS INT64) * 20 AS bin
FROM
- `httparchive.requests.${YYYY_MM_DD}_*`
+ `httparchive.crawl.requests` r
+ INNER JOIN
+ `httparchive.crawl.pages`
+ USING (date, client, is_root_page, rank, page)
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client
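evalJs.sql reads request-level data, so it cannot rely on the pages table alone: the old httparchive.requests.${YYYY_MM_DD}_* wildcard becomes httparchive.crawl.requests, joined to httparchive.crawl.pages so the date and is_root_page filters constrain both sides. A minimal standalone sketch of that join, mirroring the hunk above (the alias r and the 20-unit binning follow the query itself):

-- Bin the per-request _cpu.EvaluateScript timing into buckets of 20,
-- restricted to root pages of a single crawl via a join to crawl.pages.
SELECT
  client,
  COUNT(0) AS volume,
  CAST(FLOAT64(r.payload['_cpu.EvaluateScript']) / 20 AS INT64) * 20 AS bin
FROM
  `httparchive.crawl.requests` r
INNER JOIN
  `httparchive.crawl.pages`
USING (date, client, is_root_page, rank, page)
WHERE
  date = '${YYYY-MM-DD}' AND
  is_root_page
GROUP BY
  bin,
  client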
9 changes: 6 additions & 3 deletions sql/histograms/fcp.sql
@@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- CAST(FLOOR(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64) / 1000) AS INT64) AS bin
+ CAST(FLOOR(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']) / 1000) AS INT64) AS bin
FROM
- `httparchive.pages.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client
9 changes: 6 additions & 3 deletions sql/histograms/gzipSavings.sql
@@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
- _TABLE_SUFFIX AS client,
+ client,
COUNT(0) AS volume,
- CAST(FLOOR(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64) / (1024 * 2)) * 2 AS INT64) AS bin
+ CAST(FLOOR(FLOAT64(payload._gzip_savings) / (1024 * 2)) * 2 AS INT64) AS bin
FROM
- `httparchive.pages.${YYYY_MM_DD}_*`
+ `httparchive.crawl.pages`
+ WHERE
+ date = '${YYYY-MM-DD}' AND
+ is_root_page
GROUP BY
bin,
client