Tech Report: Technologies - major.minor versions granularity #48

Status: Merged (34 commits, Feb 14, 2025). The diff below shows the changes from 15 of the 34 commits.

Commits (34):
cf33544  versions (max-ostapenko, Jan 12, 2025)
547f63e  tech filter (max-ostapenko, Jan 12, 2025)
e1f0e60  Merge branch 'main' into central-flyingfish (max-ostapenko, Jan 14, 2025)
3ec247c  Merge branch 'main' into central-flyingfish (max-ostapenko, Jan 21, 2025)
65acf68  Merge branch 'main' into central-flyingfish (max-ostapenko, Jan 26, 2025)
e8580d0  new table with versions (max-ostapenko, Jan 26, 2025)
bfac4f6  typo (max-ostapenko, Jan 26, 2025)
ac2f597  versions table (max-ostapenko, Jan 26, 2025)
bdd46b8  fix (max-ostapenko, Jan 26, 2025)
23864b9  no retries (max-ostapenko, Jan 26, 2025)
4d42453  tech_report_* tables (max-ostapenko, Jan 26, 2025)
4dd3f9b  clusters renamed (max-ostapenko, Jan 26, 2025)
8032aab  lint (max-ostapenko, Jan 26, 2025)
a41ed32  adjust export config (max-ostapenko, Jan 26, 2025)
2aec142  fix clustering (max-ostapenko, Jan 26, 2025)
396d664  origin renamed (max-ostapenko, Jan 27, 2025)
e9b666e  deduplicated good_cwv (max-ostapenko, Jan 27, 2025)
ff2f5a4  Merge branch 'main' into central-flyingfish (max-ostapenko, Jan 27, 2025)
58eea31  include minor (max-ostapenko, Jan 30, 2025)
747a18f  Merge branch 'main' into main (max-ostapenko, Jan 30, 2025)
8c0455c  Merge branch 'central-flyingfish' into central-flyingfish (max-ostapenko, Jan 30, 2025)
c88ef18  fix (max-ostapenko, Jan 30, 2025)
3268e28  Merge branch 'central-flyingfish' into central-flyingfish (max-ostapenko, Jan 30, 2025)
bd07f78  cleanup (max-ostapenko, Jan 30, 2025)
5967524  pattern fix (max-ostapenko, Jan 30, 2025)
146978d  Merge branch 'central-flyingfish' into central-flyingfish (max-ostapenko, Jan 30, 2025)
7ff9151  tech detections only (max-ostapenko, Jan 31, 2025)
718e3c4  fix (max-ostapenko, Jan 31, 2025)
34a4bb7  relaxed pattern (max-ostapenko, Jan 31, 2025)
15e5555  Merge branch 'main' into central-flyingfish (max-ostapenko, Feb 1, 2025)
8bde2fb  remove similar_technologies (max-ostapenko, Feb 1, 2025)
faa2339  cleanup (max-ostapenko, Feb 2, 2025)
a7afc25  Merge branch 'main' into central-flyingfish (max-ostapenko, Feb 5, 2025)
f23cffc  Merge branch 'main' into central-flyingfish (max-ostapenko, Feb 14, 2025)
2 changes: 1 addition & 1 deletion definitions/output/reports/cwv_tech_adoption.js
@@ -8,7 +8,7 @@ publish('cwv_tech_adoption', {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
- tags: ['crux_ready', 'tech_report']
+ tags: ['crux_ready']
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
2 changes: 1 addition & 1 deletion definitions/output/reports/cwv_tech_categories.js
@@ -3,7 +3,7 @@ const pastMonth = constants.fnPastMonth(constants.currentMonth)
publish('cwv_tech_categories', {
schema: 'reports',
type: 'table',
- tags: ['crux_ready', 'tech_report']
+ tags: ['crux_ready']
}).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "name": "categories", "type": "dict"} */
WITH pages AS (
2 changes: 1 addition & 1 deletion definitions/output/reports/cwv_tech_core_web_vitals.js
@@ -8,7 +8,7 @@ publish('cwv_tech_core_web_vitals', {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
- tags: ['crux_ready', 'tech_report']
+ tags: ['tech_report']
}).preOps(ctx => `
CREATE TEMPORARY FUNCTION GET_VITALS(
records ARRAY<STRUCT<
2 changes: 1 addition & 1 deletion definitions/output/reports/cwv_tech_lighthouse.js
@@ -8,7 +8,7 @@ publish('cwv_tech_lighthouse', {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
- tags: ['crux_ready', 'tech_report']
+ tags: ['crux_ready']
}).preOps(ctx => `
CREATE TEMPORARY FUNCTION GET_LIGHTHOUSE(
records ARRAY<STRUCT<
2 changes: 1 addition & 1 deletion definitions/output/reports/cwv_tech_page_weight.js
@@ -8,7 +8,7 @@ publish('cwv_tech_page_weight', {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
- tags: ['crux_ready', 'tech_report']
+ tags: ['crux_ready']
}).preOps(ctx => `
CREATE TEMPORARY FUNCTION GET_PAGE_WEIGHT(
records ARRAY<STRUCT<
2 changes: 1 addition & 1 deletion definitions/output/reports/cwv_tech_technologies.js
@@ -3,7 +3,7 @@ const pastMonth = constants.fnPastMonth(constants.currentMonth)
publish('cwv_tech_technologies', {
schema: 'reports',
type: 'table',
- tags: ['crux_ready', 'tech_report']
+ tags: ['crux_ready']
}).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "name": "technologies", "type": "dict"} */
WITH pages AS (
297 changes: 297 additions & 0 deletions definitions/output/reports/tech_crux.js
@@ -0,0 +1,297 @@
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('tech_crux', {
schema: 'reports',
type: 'incremental',
protected: true,
bigquery: {
partitionBy: 'date',
clusterBy: ['geo', 'client', 'rank', 'technology'],
requirePartitionFilter: true
},
tags: ['tech_report'],
dependOnDependencyAssertions: true
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
CREATE TEMP FUNCTION IS_GOOD(
good FLOAT64,
needs_improvement FLOAT64,
poor FLOAT64
) RETURNS BOOL AS (
SAFE_DIVIDE(good, good + needs_improvement + poor) >= 0.75
);
CREATE TEMP FUNCTION IS_NON_ZERO(
good FLOAT64,
needs_improvement FLOAT64,
poor FLOAT64
) RETURNS BOOL AS (
good + needs_improvement + poor > 0
);
`).query(ctx => `
WITH pages AS (
SELECT
client,
page,
root_page AS origin,
technologies,
summary,
lighthouse
FROM ${ctx.ref('crawl', 'pages')}
WHERE
date = '${pastMonth}'
${constants.devRankFilter}
), geo_summary AS (
SELECT
\`chrome-ux-report\`.experimental.GET_COUNTRY(country_code) AS geo,
rank,
device,
origin,
avg_fcp,
avg_fid,
avg_inp,
avg_lcp,
avg_ttfb,
fast_fcp,
fast_fid,
fast_inp,
fast_lcp,
fast_ttfb,
slow_fcp,
slow_fid,
slow_inp,
slow_lcp,
slow_ttfb,
small_cls,
medium_cls,
large_cls
FROM ${ctx.ref('chrome-ux-report', 'materialized', 'country_summary')}
WHERE
yyyymm = CAST(FORMAT_DATE('%Y%m', '${pastMonth}') AS INT64) AND
device IN ('desktop', 'phone')
UNION ALL
SELECT
'ALL' AS geo,
rank,
device,
origin,
avg_fcp,
avg_fid,
avg_inp,
avg_lcp,
avg_ttfb,
fast_fcp,
fast_fid,
fast_inp,
fast_lcp,
fast_ttfb,
slow_fcp,
slow_fid,
slow_inp,
slow_lcp,
slow_ttfb,
small_cls,
medium_cls,
large_cls
FROM ${ctx.ref('chrome-ux-report', 'materialized', 'device_summary')}
WHERE
date = '${pastMonth}' AND
device IN ('desktop', 'phone')
),
crux AS (
SELECT
geo,
CASE _rank
WHEN 100000000 THEN 'ALL'
WHEN 10000000 THEN 'Top 10M'
WHEN 1000000 THEN 'Top 1M'
WHEN 100000 THEN 'Top 100k'
WHEN 10000 THEN 'Top 10k'
WHEN 1000 THEN 'Top 1k'
END AS rank,
CONCAT(origin, '/') AS origin,
IF(device = 'desktop', 'desktop', 'mobile') AS client,
# CWV
IS_NON_ZERO(fast_fid, avg_fid, slow_fid) AS any_fid,
IS_GOOD(fast_fid, avg_fid, slow_fid) AS good_fid,
IS_NON_ZERO(small_cls, medium_cls, large_cls) AS any_cls,
IS_GOOD(small_cls, medium_cls, large_cls) AS good_cls,
IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AS any_lcp,
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_lcp,
(IS_GOOD(fast_inp, avg_inp, slow_inp) OR fast_inp IS NULL) AND
IS_GOOD(small_cls, medium_cls, large_cls) AND
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2024,
(IS_GOOD(fast_fid, avg_fid, slow_fid) OR fast_fid IS NULL) AND
IS_GOOD(small_cls, medium_cls, large_cls) AND
IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AS good_cwv_2023,
# WV
IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp) AS any_fcp,
IS_GOOD(fast_fcp, avg_fcp, slow_fcp) AS good_fcp,
IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb) AS any_ttfb,
IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb) AS good_ttfb,
IS_NON_ZERO(fast_inp, avg_inp, slow_inp) AS any_inp,
IS_GOOD(fast_inp, avg_inp, slow_inp) AS good_inp
FROM geo_summary,
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS _rank
WHERE rank <= _rank
),
technologies AS (
SELECT
tech.technology,
REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] AS version,
client,
page
FROM pages,
UNNEST(technologies) AS tech,
UNNEST(tech.info) AS version
WHERE
tech.technology IS NOT NULL AND
tech.technology != '' AND
REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] IS NOT NULL
UNION ALL
SELECT
tech.technology,
'ALL' AS version,
client,
page
FROM pages,
UNNEST(technologies) AS tech
UNION ALL
SELECT
'ALL' AS technology,
'ALL' AS version,
client,
page
FROM pages
),
categories AS (
SELECT
tech.technology,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
FROM pages,
UNNEST(technologies) AS tech,
UNNEST(tech.categories) AS category
GROUP BY technology
UNION ALL
SELECT
'ALL' AS technology,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category
FROM pages,
UNNEST(technologies) AS tech,
UNNEST(tech.categories) AS category
WHERE
client = 'mobile'
),
lab_metrics AS (
SELECT
client,
page,
origin,
SAFE.INT64(summary.bytesTotal) AS bytesTotal,
SAFE.INT64(summary.bytesJS) AS bytesJS,
SAFE.INT64(summary.bytesImg) AS bytesImg,
SAFE.FLOAT64(lighthouse.categories.accessibility.score) AS accessibility,
SAFE.FLOAT64(lighthouse.categories['best-practices'].score) AS best_practices,
SAFE.FLOAT64(lighthouse.categories.performance.score) AS performance,
SAFE.FLOAT64(lighthouse.categories.pwa.score) AS pwa,
SAFE.FLOAT64(lighthouse.categories.seo.score) AS seo
FROM pages
),
lab_data AS (
SELECT
client,
origin,
technology,
version,
ANY_VALUE(category) AS category,
AVG(bytesTotal) AS bytesTotal,
AVG(bytesJS) AS bytesJS,
AVG(bytesImg) AS bytesImg,
AVG(accessibility) AS accessibility,
AVG(best_practices) AS best_practices,
AVG(performance) AS performance,
AVG(pwa) AS pwa,
AVG(seo) AS seo
FROM lab_metrics
INNER JOIN technologies
USING (client, page)
INNER JOIN categories
USING (technology)
GROUP BY
client,
origin,
technology,
version
)
SELECT
DATE('${pastMonth}') AS date,
geo,
client,
rank,
technology,
version,
COUNT(DISTINCT origin) AS origins,
# CrUX data
COUNTIF(good_fid) AS origins_with_good_fid,
COUNTIF(good_cls) AS origins_with_good_cls,
COUNTIF(good_lcp) AS origins_with_good_lcp,
COUNTIF(good_fcp) AS origins_with_good_fcp,
COUNTIF(good_ttfb) AS origins_with_good_ttfb,
COUNTIF(good_inp) AS origins_with_good_inp,
COUNTIF(any_fid) AS origins_with_any_fid,
COUNTIF(any_cls) AS origins_with_any_cls,
COUNTIF(any_lcp) AS origins_with_any_lcp,
COUNTIF(any_fcp) AS origins_with_any_fcp,
COUNTIF(any_ttfb) AS origins_with_any_ttfb,
COUNTIF(any_inp) AS origins_with_any_inp,
COUNTIF(good_cwv_2024) AS origins_with_good_cwv,
COUNTIF(good_cwv_2024) AS origins_with_good_cwv_2024,
COUNTIF(good_cwv_2023) AS origins_with_good_cwv_2023,
COUNTIF(any_lcp AND any_cls) AS origins_eligible_for_cwv,
SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv,
SAFE_DIVIDE(COUNTIF(good_cwv_2024), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2024,
SAFE_DIVIDE(COUNTIF(good_cwv_2023), COUNTIF(any_lcp AND any_cls)) AS pct_eligible_origins_with_good_cwv_2023,
# Lighthouse data
SAFE_CAST(APPROX_QUANTILES(accessibility, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_accessibility,
SAFE_CAST(APPROX_QUANTILES(best_practices, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_best_practices,
SAFE_CAST(APPROX_QUANTILES(performance, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_performance,
SAFE_CAST(APPROX_QUANTILES(pwa, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_pwa,
SAFE_CAST(APPROX_QUANTILES(seo, 1000)[OFFSET(500)] AS NUMERIC) AS median_lighthouse_score_seo,
# Page weight stats
SAFE_CAST(APPROX_QUANTILES(bytesTotal, 1000)[OFFSET(500)] AS INT64) AS median_bytes_total,
SAFE_CAST(APPROX_QUANTILES(bytesJS, 1000)[OFFSET(500)] AS INT64) AS median_bytes_js,
SAFE_CAST(APPROX_QUANTILES(bytesImg, 1000)[OFFSET(500)] AS INT64) AS median_bytes_image
FROM lab_data
INNER JOIN crux
USING (client, origin)
GROUP BY
geo,
client,
rank,
technology,
version
`)
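A note on the version pattern used in the `technologies` CTE above: only the leading numeric component sits in a capturing group (the minor and patch parts are non-capturing), and strings without a full x.y.z shape do not match at all, which is what the `IS NOT NULL` filter relies on. A quick standalone check of the equivalent pattern in Node.js, for illustration only (BigQuery evaluates the same expression with RE2, which behaves the same way for this pattern):

```js
// Single-escaped copy of the pattern embedded (double-escaped) in tech_crux.js.
const SEMVER = /(0|[1-9]\d*)\.(?:0|[1-9]\d*)\.(?:0|[1-9]\d*)/;

console.log('5.9.3'.match(SEMVER)?.[1]);        // '5'  (only the capturing group is kept)
console.log('v10.2.1-beta'.match(SEMVER)?.[1]); // '10'
console.log('4.2'.match(SEMVER));               // null (no patch segment, dropped upstream)
```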
35 changes: 35 additions & 0 deletions definitions/output/reports/tech_report_adoption.js
@@ -0,0 +1,35 @@
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('tech_report_adoption', {
schema: 'reports',
type: 'incremental',
protected: true,
bigquery: {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
tags: ['tech_report']
}).preOps(ctx => `
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`).query(ctx => `
/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "adoption", "type": "report"} */
SELECT
date,
geo,
rank,
technology,
version,
STRUCT(
COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop,
COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile
) AS adoption
FROM ${ctx.ref('reports', 'tech_crux')}
WHERE date = '${pastMonth}'
GROUP BY
date,
geo,
rank,
technology,
version
`)
94 changes: 94 additions & 0 deletions definitions/output/reports/tech_report_categories.js
@@ -0,0 +1,94 @@
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('tech_report_categories', {
schema: 'reports',
type: 'table',
tags: ['tech_report']
}).query(ctx => `
/* {"dataform_trigger": "tech_report_complete", "name": "categories", "type": "dict"} */
WITH pages AS (
SELECT DISTINCT
client,
root_page,
technologies
FROM ${ctx.ref('crawl', 'pages')}
WHERE
date = '${pastMonth}'
${constants.devRankFilter}
),
category_descriptions AS (
SELECT
name AS category,
description
FROM ${ctx.ref('wappalyzer', 'categories')}
),
category_stats AS (
SELECT
category,
STRUCT(
COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop,
COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile
) AS origins
FROM (
SELECT
client,
category,
COUNT(DISTINCT root_page) AS origins
FROM pages
INNER JOIN pages.technologies AS tech
INNER JOIN tech.categories AS category
WHERE
category IS NOT NULL
GROUP BY
client,
category
)
GROUP BY category
),
technology_stats AS (
SELECT
technology,
category_obj AS categories,
SUM(origins) AS total_origins
FROM ${ctx.ref('reports', 'tech_report_technologies')}
GROUP BY
technology,
categories
)
SELECT
category,
description,
origins,
ARRAY_AGG(technology IGNORE NULLS ORDER BY technology_stats.total_origins DESC) AS technologies
FROM category_stats
INNER JOIN technology_stats
ON category_stats.category IN UNNEST(technology_stats.categories)
INNER JOIN category_descriptions
USING (category)
GROUP BY
category,
description,
origins
UNION ALL
SELECT
'ALL' AS category,
NULL AS description,
STRUCT(
COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop,
COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile
) AS origins,
NULL AS technologies
FROM (
SELECT
client,
COUNT(DISTINCT root_page) AS origins
FROM pages
GROUP BY client
)
`)
103 changes: 103 additions & 0 deletions definitions/output/reports/tech_report_core_web_vitals.js
@@ -0,0 +1,103 @@
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('tech_report_core_web_vitals', {
schema: 'reports',
type: 'incremental',
protected: true,
bigquery: {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
tags: ['tech_report']
}).preOps(ctx => `
CREATE TEMPORARY FUNCTION GET_VITALS(
records ARRAY<STRUCT<
client STRING,
origins_with_good_fid INT64,
origins_with_good_cls INT64,
origins_with_good_lcp INT64,
origins_with_good_fcp INT64,
origins_with_good_ttfb INT64,
origins_with_good_inp INT64,
origins_with_any_fid INT64,
origins_with_any_cls INT64,
origins_with_any_lcp INT64,
origins_with_any_fcp INT64,
origins_with_any_ttfb INT64,
origins_with_any_inp INT64,
origins_with_good_cwv INT64,
origins_eligible_for_cwv INT64
>>)
RETURNS ARRAY<STRUCT<
name STRING,
desktop STRUCT<
good_number INT64,
tested INT64
>,
mobile STRUCT<
good_number INT64,
tested INT64
>>>
LANGUAGE js AS '''
const METRIC_MAP = {
overall: ['origins_with_good_cwv', 'origins_eligible_for_cwv'],
LCP: ['origins_with_good_lcp', 'origins_with_any_lcp'],
CLS: ['origins_with_good_cls', 'origins_with_any_cls'],
FID: ['origins_with_good_fid', 'origins_with_any_fid'],
FCP: ['origins_with_good_fcp', 'origins_with_any_fcp'],
TTFB: ['origins_with_good_ttfb', 'origins_with_any_ttfb'],
INP: ['origins_with_good_inp', 'origins_with_any_inp']
};
// Initialize the vitals map.
const vitals = Object.fromEntries(
Object.keys(METRIC_MAP).map(metricName => {
return [metricName, {name: metricName}]
}));
// Populate each client record.
records.forEach(record => {
Object.entries(METRIC_MAP).forEach(
([metricName, [good_number, tested]]) => {
vitals[metricName][record.client] = {good_number: record[good_number], tested: record[tested]}
})})
return Object.values(vitals)
''';
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`).query(ctx => `
/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "core_web_vitals", "type": "report"} */
SELECT
date,
geo,
rank,
technology,
version,
GET_VITALS(ARRAY_AGG(STRUCT(
client,
origins_with_good_fid,
origins_with_good_cls,
origins_with_good_lcp,
origins_with_good_fcp,
origins_with_good_ttfb,
origins_with_good_inp,
origins_with_any_fid,
origins_with_any_cls,
origins_with_any_lcp,
origins_with_any_fcp,
origins_with_any_ttfb,
origins_with_any_inp,
origins_with_good_cwv,
origins_eligible_for_cwv
))) AS vitals
FROM ${ctx.ref('reports', 'tech_crux')}
WHERE date = '${pastMonth}'
GROUP BY
date,
geo,
rank,
technology,
version
`)
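To make the output shape of GET_VITALS concrete: it pivots the per-client rows of one technology, version, geo, and rank group into an array keyed by metric name, with desktop and mobile counts nested under each entry. A minimal Node.js sketch of the same pivot, trimmed to two metrics and fed hypothetical counts:

```js
// Same pivot as the GET_VITALS JS UDF above, reduced to two metrics for brevity.
const METRIC_MAP = {
  overall: ['origins_with_good_cwv', 'origins_eligible_for_cwv'],
  LCP: ['origins_with_good_lcp', 'origins_with_any_lcp']
};

function getVitals (records) {
  // One entry per metric, e.g. { name: 'LCP' }.
  const vitals = Object.fromEntries(
    Object.keys(METRIC_MAP).map(name => [name, { name }])
  );
  // Attach each client's counts under its own key (desktop / mobile).
  records.forEach(record => {
    Object.entries(METRIC_MAP).forEach(([name, [good, tested]]) => {
      vitals[name][record.client] = { good_number: record[good], tested: record[tested] };
    });
  });
  return Object.values(vitals);
}

// Hypothetical counts for one technology/version/geo/rank group.
console.log(getVitals([
  { client: 'desktop', origins_with_good_cwv: 120, origins_eligible_for_cwv: 200, origins_with_good_lcp: 150, origins_with_any_lcp: 200 },
  { client: 'mobile', origins_with_good_cwv: 90, origins_eligible_for_cwv: 180, origins_with_good_lcp: 110, origins_with_any_lcp: 180 }
]));
// => [ { name: 'overall', desktop: { good_number: 120, tested: 200 }, mobile: {...} },
//      { name: 'LCP', desktop: { good_number: 150, tested: 200 }, mobile: {...} } ]
```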
80 changes: 80 additions & 0 deletions definitions/output/reports/tech_report_lighthouse.js
@@ -0,0 +1,80 @@
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('tech_report_lighthouse', {
schema: 'reports',
type: 'incremental',
protected: true,
bigquery: {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
tags: ['tech_report']
}).preOps(ctx => `
CREATE TEMPORARY FUNCTION GET_LIGHTHOUSE(
records ARRAY<STRUCT<
client STRING,
median_lighthouse_score_accessibility NUMERIC,
median_lighthouse_score_best_practices NUMERIC,
median_lighthouse_score_performance NUMERIC,
median_lighthouse_score_pwa NUMERIC,
median_lighthouse_score_seo NUMERIC
>>)
RETURNS ARRAY<STRUCT<
name STRING,
desktop STRUCT<
median_score FLOAT64
>,
mobile STRUCT<
median_score FLOAT64
>>>
LANGUAGE js AS '''
const METRIC_MAP = {
accessibility: 'median_lighthouse_score_accessibility',
best_practices: 'median_lighthouse_score_best_practices',
performance: 'median_lighthouse_score_performance',
pwa: 'median_lighthouse_score_pwa',
seo: 'median_lighthouse_score_seo',
}
// Initialize the Lighthouse map.
const lighthouse = Object.fromEntries(Object.keys(METRIC_MAP).map(metricName => {
return [metricName, {name: metricName}]
}));
// Populate each client record.
records.forEach(record => {
Object.entries(METRIC_MAP).forEach(([metricName, median_score]) => {
lighthouse[metricName][record.client] = {median_score: record[median_score]}
});
});
return Object.values(lighthouse)
''';
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`).query(ctx => `
/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "lighthouse", "type": "report"} */
SELECT
date,
geo,
rank,
technology,
version,
GET_LIGHTHOUSE(ARRAY_AGG(STRUCT(
client,
median_lighthouse_score_accessibility,
median_lighthouse_score_best_practices,
median_lighthouse_score_performance,
median_lighthouse_score_pwa,
median_lighthouse_score_seo
))) AS lighthouse
FROM ${ctx.ref('reports', 'tech_crux')}
WHERE date = '${pastMonth}'
GROUP BY
date,
geo,
rank,
technology,
version
`)
70 changes: 70 additions & 0 deletions definitions/output/reports/tech_report_page_weight.js
@@ -0,0 +1,70 @@
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('tech_report_page_weight', {
schema: 'reports',
type: 'incremental',
protected: true,
bigquery: {
partitionBy: 'date',
clusterBy: ['rank', 'geo']
},
tags: ['tech_report']
}).preOps(ctx => `
CREATE TEMPORARY FUNCTION GET_PAGE_WEIGHT(
records ARRAY<STRUCT<
client STRING,
total INT64,
js INT64,
images INT64
>>)
RETURNS ARRAY<STRUCT<
name STRING,
mobile STRUCT<
median_bytes INT64
>,
desktop STRUCT<
median_bytes INT64
>>>
LANGUAGE js AS '''
const METRICS = ['total', 'js', 'images']
// Initialize the page weight map.
const pageWeight = Object.fromEntries(METRICS.map(metricName => {
return [metricName, {name: metricName}]
}))
// Populate each client record.
records.forEach(record => {
METRICS.forEach(metricName => {
pageWeight[metricName][record.client] = {median_bytes: record[metricName]}
})
})
return Object.values(pageWeight)
''';
DELETE FROM ${ctx.self()}
WHERE date = '${pastMonth}';
`).query(ctx => `
/* {"dataform_trigger": "tech_report_complete", "date": "${pastMonth}", "name": "page_weight", "type": "report"} */
SELECT
date,
geo,
rank,
technology,
version,
GET_PAGE_WEIGHT(ARRAY_AGG(STRUCT(
client,
median_bytes_total,
median_bytes_js,
median_bytes_image
))) AS pageWeight
FROM ${ctx.ref('reports', 'tech_crux')}
WHERE date = '${pastMonth}'
GROUP BY
date,
geo,
rank,
technology,
version
`)
79 changes: 79 additions & 0 deletions definitions/output/reports/tech_report_technologies.js
@@ -0,0 +1,79 @@
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('tech_report_technologies', {
schema: 'reports',
type: 'table',
tags: ['tech_report']
}).query(ctx => `
/* {"dataform_trigger": "tech_report_complete", "name": "technologies", "type": "dict"} */
WITH pages AS (
SELECT DISTINCT
client,
root_page,
tech.technology
FROM ${ctx.ref('crawl', 'pages')} AS pages
INNER JOIN pages.technologies AS tech
WHERE
date = '${pastMonth}'
${constants.devRankFilter} AND
tech.technology IS NOT NULL
),
tech_origins AS (
SELECT
client,
technology,
COUNT(DISTINCT root_page) AS origins
FROM pages
GROUP BY
client,
technology
),
technologies AS (
SELECT
name AS technology,
description,
STRING_AGG(DISTINCT category, ', ' ORDER BY category ASC) AS category,
categories AS category_obj,
NULL AS similar_technologies
FROM ${ctx.ref('wappalyzer', 'technologies')} AS technologies
INNER JOIN technologies.categories AS category
GROUP BY
technology,
description,
categories
),
total_pages AS (
SELECT
client,
COUNT(DISTINCT root_page) AS origins
FROM pages
GROUP BY client
)
SELECT
client,
technology,
description,
category,
category_obj,
similar_technologies,
origins
FROM tech_origins
INNER JOIN technologies
USING(technology)
UNION ALL
SELECT
client,
'ALL' AS technology,
NULL AS description,
NULL AS category,
NULL AS category_obj,
NULL AS similar_technologies,
origins
FROM total_pages
`)
64 changes: 64 additions & 0 deletions definitions/output/reports/tech_report_versions.js
@@ -0,0 +1,64 @@
const pastMonth = constants.fnPastMonth(constants.currentMonth)

publish('tech_report_versions', {
schema: 'reports',
type: 'table',
tags: ['tech_report']
}).query(ctx => `
/* {"dataform_trigger": "tech_report_complete", "name": "versions", "type": "dict"} */
WITH pages AS (
SELECT DISTINCT
client,
root_page,
tech.technology,
REGEXP_EXTRACT_ALL(version, r'(0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)\\.(?:0|[1-9]\\d*)')[SAFE_OFFSET(0)] AS version
FROM ${ctx.ref('crawl', 'pages')} AS pages
INNER JOIN pages.technologies AS tech
LEFT JOIN tech.info AS version
WHERE
date = '${pastMonth}'
${constants.devRankFilter} AND
tech.technology IS NOT NULL
),
version_origins AS (
SELECT
client,
technology,
version,
COUNT(DISTINCT root_page) AS origins
FROM pages
WHERE version IS NOT NULL
GROUP BY
client,
technology,
version
),
total_origins AS (
SELECT
client,
technology,
COUNT(DISTINCT root_page) AS origins
FROM pages
GROUP BY
client,
technology
)
SELECT
client,
technology,
version,
origins
FROM version_origins
UNION ALL
SELECT
client,
technology,
'ALL' AS version,
origins
FROM total_origins
`)
2 changes: 1 addition & 1 deletion infra/bigquery-export/index.js
@@ -17,7 +17,7 @@ async function main (exportConfig) {
console.log(exportConfig)
const reports = new ReportsExporter()
await reports.export(exportConfig)
- } else if (eventName === 'report_cwv_tech_complete') {
+ } else if (eventName === 'tech_report_complete') {
console.info('Tech Report export')
console.log(exportConfig)
const techReports = new TechReportsExporter()
6 changes: 3 additions & 3 deletions infra/bigquery-export/reports.js
@@ -57,7 +57,7 @@ export class TechReportsExporter {
}

async export (exportConfig) {
- if (exportConfig.dataform_trigger !== 'report_cwv_tech_complete') {
+ if (exportConfig.dataform_trigger !== 'tech_report_complete') {
console.error('Invalid dataform trigger')
return
}
@@ -68,13 +68,13 @@ export class TechReportsExporter {
SELECT
STRING(date) AS date,
* EXCEPT(date)
- FROM httparchive.reports.cwv_tech_${exportConfig.name}
+ FROM httparchive.reports.tech_report_${exportConfig.name}
WHERE date = '${exportConfig.date}'
`
} else if (exportConfig.type === 'dict') {
query = `
SELECT *
- FROM reports.cwv_tech_${exportConfig.name}
+ FROM reports.tech_report_${exportConfig.name}
`
} else {
console.error('Invalid export type')
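For reference, the exportConfig object this exporter receives mirrors the dataform_trigger JSON comments embedded in the report queries above; a hypothetical example:

```js
// Illustrative payload only; field values here are hypothetical.
const exportConfig = {
  dataform_trigger: 'tech_report_complete',
  name: 'adoption',   // table suffix: httparchive.reports.tech_report_adoption
  type: 'report',     // 'report' exports a single partition; 'dict' exports the whole table
  date: '2025-01-01'  // partition to export; used only when type is 'report'
};
```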
2 changes: 1 addition & 1 deletion infra/dataform-trigger/index.js
@@ -31,7 +31,7 @@ FROM crux, report;
action: 'runDataformRepo',
actionArgs: {
repoName: 'crawl-data',
- tags: ['crux_ready']
+ tags: ['tech_report']
}
},
crawl_complete: {
2 changes: 2 additions & 0 deletions infra/tf/bigquery_export/main.tf
@@ -32,6 +32,7 @@ resource "google_cloud_run_v2_job" "bigquery_export" {
deletion_protection = false

template {
+ parallelism = 5
template {
containers {
image = "${var.location}.gcr.io/${var.project}/cloud-run/${var.function_name}:latest"
@@ -48,6 +49,7 @@ resource "google_cloud_run_v2_job" "bigquery_export" {
}
timeout = "3600s"
service_account = var.function_identity
+ max_retries = 1
}
}
}
4 changes: 2 additions & 2 deletions infra/tf/dataform_trigger/main.tf
@@ -30,11 +30,11 @@ resource "google_storage_bucket_object" "source" {
}

resource "google_cloudfunctions2_function" "dataform_trigger" {
- name = "dataform-trigger"
+ name = var.function_name
location = var.region
build_config {
runtime = "nodejs20"
- entry_point = "dataform-trigger"
+ entry_point = var.function_name
source {
storage_source {
bucket = google_storage_bucket_object.source.bucket