Skip to content

Commit

Permalink
Retrieve WHATWG spec titles from WHATWG database (#1666)
Browse files Browse the repository at this point in the history
Build code used to rely on Specref to get the title of WHATWG specifications.
This update makes it fetch info for WHATWG specs from the WHATWG database
directly. To save one request, the code leverages the workstreams database,
also used by fetch-groups, instead of the biblio file.

On top of adding a new `whatwg` value to the `"source"` field, this update will
also fix the titles of the WHATWG specs: they end with "Standard" in Specref
but, while that matches the `<title>` tag, the actual spec title in the `<h1>`
and the title in the WHATWG database don't end with "Standard". #docallmeDOM

The update turns the `fetchJSON` function into a utility function. This is
going to save a few requests (not that many!) that are common between
fetch-info and fetch-groups. Specific functions in fetch-info were also
adjusted not to do anything when there are no specs of interest in the list
(this speeds up tests a bit, but has no impact on a full build since, by
definition, there are specs of interest in the full list...)
  • Loading branch information
tidoust authored Jan 24, 2025
1 parent 475c375 commit 04e8c41
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 161 deletions.
2 changes: 1 addition & 1 deletion schema/definitions.json
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@

"source": {
"type": "string",
"enum": ["w3c", "specref", "spec", "ietf"]
"enum": ["w3c", "specref", "spec", "ietf", "whatwg"]
},

"nightly": {
Expand Down
26 changes: 7 additions & 19 deletions src/fetch-groups.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import Octokit from "./octokit.js";
import parseSpecUrl from "./parse-spec-url.js";
import fetchJSON from "./fetch-json.js";


/**
Expand Down Expand Up @@ -84,19 +85,6 @@ export default async function (specs, options) {
// same fetch request again and again
const cache = {};

// Helper function to retrieve a JSON resource or return null if resource
// cannot be retrieved
async function fetchJSON(url, options) {
const body = cache[url] ?? await fetch(url, options).then(res => {
if (res.status !== 200) {
throw new Error(`W3C API returned an error for ${url}, status code is ${res.status}`);
}
return res.json();
});
cache[url] = body;
return body;
}

for (const spec of specs) {
if (spec.__last?.standing === 'discontinued' &&
(!spec.standing || spec.standing === 'discontinued')) {
Expand All @@ -113,7 +101,7 @@ export default async function (specs, options) {
if (ietfName) {
spec.organization = spec.organization ?? "IETF";
if (spec.groups) continue;
const ietfJson = await fetchJSON(`https://datatracker.ietf.org/doc/${ietfName[1]}/doc.json`);
const ietfJson = await fetchJSON(`https://datatracker.ietf.org/doc/${ietfName[1]}/doc.json`, options);
if (ietfJson.group?.type === "WG") {
spec.groups = [{
name: `${ietfJson.group.name} Working Group`,
Expand Down Expand Up @@ -152,7 +140,7 @@ export default async function (specs, options) {
}

if (info && info.owner === "whatwg") {
const workstreams = await fetchJSON("https://raw.githubusercontent.com/whatwg/sg/main/db.json");
const workstreams = await fetchJSON("https://raw.githubusercontent.com/whatwg/sg/main/db.json", options);
const workstream = workstreams.workstreams.find(ws => ws.standards.find(s => s.href === spec.url));
if (!workstream) {
throw new Error(`No WHATWG workstream found for ${spec.url}`);
Expand Down Expand Up @@ -214,11 +202,11 @@ export default async function (specs, options) {
else if (info.type === "tr") {
// Use the W3C API to find info about /TR specs
const url = `https://api.w3.org/specifications/${info.name}/versions/latest`;
let resp = await fetchJSON(url);
let resp = await fetchJSON(url, options);
if (!resp?._links?.deliverers) {
throw new Error(`W3C API did not return deliverers for the spec`);
}
resp = await fetchJSON(resp._links.deliverers.href);
resp = await fetchJSON(resp._links.deliverers.href, options);

if (!resp?._links?.deliverers) {
throw new Error(`W3C API did not return deliverers for the spec`);
Expand Down Expand Up @@ -250,7 +238,7 @@ export default async function (specs, options) {
url = new URL(spec.url);
url.pathname = "/w3c.json";
}
const body = await fetchJSON(url.toString());
const body = await fetchJSON(url.toString(), options);

// Note the "group" property is either an ID or an array of IDs
groups = [body?.group].flat().filter(g => !!g);
Expand All @@ -261,7 +249,7 @@ export default async function (specs, options) {
spec.groups = [];
for (const id of groups) {
const url = ('' + id).startsWith("https://") ? id : `https://api.w3.org/groups/${id}`;
const info = await fetchJSON(url);
const info = await fetchJSON(url, options);
spec.groups.push({
name: info.name,
url: info._links.homepage.href
Expand Down
184 changes: 76 additions & 108 deletions src/fetch-info.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import loadSpec from "./load-spec.js";
import computeShortname from "./compute-shortname.js";
import Octokit from "./octokit.js";
import ThrottledQueue from "./throttled-queue.js";
import fetchJSON from "./fetch-json.js";

// Map spec statuses returned by Specref to those used in specs
// Note we typically won't get /TR statuses from Specref, since all /TR URLs
Expand All @@ -55,8 +56,6 @@ const specrefStatusMapping = {
"cg-draft": "Draft Community Group Report"
};

const fetchQueue = new ThrottledQueue({ maxParallel: 2 });

async function useLastInfoForDiscontinuedSpecs(specs) {
const results = {};
for (const spec of specs) {
Expand Down Expand Up @@ -95,31 +94,22 @@ async function fetchInfoFromW3CApi(specs, options) {
}

const url = `https://api.w3.org/specifications/${spec.shortname}/versions/latest`;
const res = await fetchQueue.runThrottled(fetch, url, options);
if (res.status === 404) {
return;
}
if (res.status !== 200) {
throw new Error(`W3C API returned an error, status code is ${res.status}, url was ${url}`);
}

// Has the shortname changed from a W3C perspective?
if (res.redirected) {
const match = res.url.match(/\/specifications\/([^\/]+)\//);
const w3cShortname = match ? match[1] : '';
if (w3cShortname !== spec.shortname) {
throw new Error(`W3C API redirects "${spec.shortname}" to ` +
`"${w3cShortname}", update the shortname!`);
}
}

try {
const body = await res.json();
return body;
}
catch (err) {
throw new Error("W3C API returned invalid JSON");
}
const body = await fetchJSON(url, options);

// The shortname of the specification may have changed. In such cases, the
// W3C API silently redirects to the info for the new shortname, whereas we
// want to make sure we use the latest shortname in browser-specs. The
// actual shortname used by the W3C API does not appear explicitly in the
// response to a "/versions/latest" request, but it appears implicitly in
// the "_links/specification/href" URL.
const match = body._links.specification.href.match(/\/specifications\/([^\/]+)$/);
const shortname = match[1];
if (shortname !== spec.shortname) {
throw new Error(`W3C API redirects "${spec.shortname}" to ` +
`"${shortname}", update the shortname!`);
}

return body;
}));

const seriesShortnames = new Set();
Expand Down Expand Up @@ -153,28 +143,15 @@ async function fetchInfoFromW3CApi(specs, options) {
// Fetch info on the series
const seriesInfo = await Promise.all([...seriesShortnames].map(async shortname => {
const url = `https://api.w3.org/specification-series/${shortname}`;
const res = await fetchQueue.runThrottled(fetch, url, options);
if (res.status === 404) {
return;
}
if (res.status !== 200) {
throw new Error(`W3C API returned an error, status code is ${res.status}`);
}
try {
const body = await res.json();

// The CSS specs and the CSS snapshots have different series shortnames for
// us ("CSS" vs. "css"), but the W3C API is case-insensitive, mixes the two
// series, and claims that the series shortname is "CSS" or "css"
// depending on which spec got published last. Let's get back to the
// shortname we requested.
body.shortname = shortname;

return body;
}
catch (err) {
throw new Error("W3C API returned invalid JSON");
}
const body = await fetchJSON(url, options);

// The CSS specs and the CSS snapshots have different series shortnames for
// us ("CSS" vs. "css"), but the W3C API is case-insensitive, mixes the two
// series, and claims that the series shortname is "CSS" or "css"
// depending on which spec got published last. Let's get back to the
// shortname we requested.
body.shortname = shortname;
return body;
}));

results.__series = {};
Expand Down Expand Up @@ -207,6 +184,36 @@ async function fetchInfoFromW3CApi(specs, options) {
return results;
}

/**
 * Retrieve the title of WHATWG specs from the WHATWG workstreams database.
 *
 * Only specs whose URL matches `.whatwg.org` are considered. The database is
 * not fetched at all when the list contains no such spec.
 *
 * @param {Array<{url: string, shortname: string}>} specs Specs to look up.
 * @param {object} [options] Options passed as-is to `fetchJSON`.
 * @returns {Promise<object>} Info indexed by spec shortname. Each entry has a
 *   `nightly` object (spec URL and "Living Standard" status) and a `title`.
 *   WHATWG specs that are not in the database are reported through
 *   `console.warn` and left out of the result.
 * @throws {Error} When the database cannot be retrieved or does not have the
 *   expected `workstreams` list.
 */
async function fetchInfoFromWHATWG(specs, options) {
  const whatwgRe = /\.whatwg\.org/;
  const whatwgSpecs = specs.filter(spec => whatwgRe.test(spec.url));
  if (whatwgSpecs.length === 0) {
    return {};
  }

  // Note: The WHATWG biblio.json file could also be used, but we're going to
  // need the workstreams database in any case in fetch-groups, so let's fetch
  // the database directly (this will put it in cache for fetch-groups)
  const url = 'https://raw.githubusercontent.com/whatwg/sg/main/db.json';
  const db = await fetchJSON(url, options);

  // fetchJSON resolves with null on a 404: fail with an explicit message
  // rather than with a TypeError on `db.workstreams`.
  if (!Array.isArray(db?.workstreams)) {
    throw new Error(`Could not retrieve the WHATWG database from ${url}`);
  }

  // Index standards by URL once instead of scanning the list for each spec.
  // The first entry wins if a URL appears more than once, as a scan would.
  const standardsByHref = new Map();
  for (const standard of db.workstreams.flatMap(ws => ws.standards)) {
    if (!standardsByHref.has(standard.href)) {
      standardsByHref.set(standard.href, standard);
    }
  }

  const specInfo = {};
  for (const spec of whatwgSpecs) {
    const entry = standardsByHref.get(spec.url);
    if (!entry) {
      console.warn(`[warning] WHATWG spec at ${spec.url} not found in WHATWG database`);
      continue;
    }
    specInfo[spec.shortname] = {
      nightly: { url: spec.url, status: 'Living Standard' },
      title: entry.name
    };
  }
  return specInfo;
}

async function fetchInfoFromSpecref(specs, options) {
function chunkArray(arr, len) {
Expand All @@ -224,11 +231,7 @@ async function fetchInfoFromSpecref(specs, options) {
// API does not return the "source" field, so we need to retrieve the list
// ourselves from Specref's GitHub repository.
const specrefBrowserspecsUrl = "https://raw.githubusercontent.com/tobie/specref/main/refs/browser-specs.json";
const browserSpecsResponse = await fetch(specrefBrowserspecsUrl, options);
if (browserSpecsResponse.status !== 200) {
throw new Error(`Could not retrieve specs contributed by browser-specs to Speref, status code is ${browserSpecsResponse.status}`);
}
const browserSpecs = await browserSpecsResponse.json();
const browserSpecs = await fetchJSON(specrefBrowserspecsUrl, options);
specs = specs.filter(spec => !browserSpecs[spec.shortname.toUpperCase()]);

// Browser-specs now acts as source for Specref for the WICG specs and W3C
Expand All @@ -244,18 +247,7 @@ async function fetchInfoFromSpecref(specs, options) {
const chunksRes = await Promise.all(chunks.map(async chunk => {
let specrefUrl = "https://api.specref.org/bibrefs?refs=" +
chunk.map(spec => spec.shortname).join(',');

const res = await fetchQueue.runThrottled(fetch, specrefUrl, options);
if (res.status !== 200) {
throw new Error(`Could not query Specref, status code is ${res.status}`);
}
try {
const body = await res.json();
return body;
}
catch (err) {
throw new Error("Specref returned invalid JSON");
}
return fetchJSON(specrefUrl, options);
}));

const results = {};
Expand Down Expand Up @@ -315,54 +307,17 @@ async function fetchInfoFromSpecref(specs, options) {


async function fetchInfoFromIETF(specs, options) {
async function fetchJSONDoc(draftName) {
const url = `https://datatracker.ietf.org/doc/${draftName}/doc.json`;
const res = await fetchQueue.runThrottled(fetch, url, options);
if (res.status !== 200) {
throw new Error(`IETF datatracker returned an error for ${url}, status code is ${res.status}`);
}
try {
return await res.json();
}
catch (err) {
throw new Error(`IETF datatracker returned invalid JSON for ${url}`);
}
}

async function fetchRFCName(docUrl) {
const res = await fetchQueue.runThrottled(fetch, docUrl, options);
if (res.status !== 200) {
throw new Error(`IETF datatracker returned an error for ${url}, status code is ${res.status}`);
}
try {
const body = await res.json();
if (!body.rfc) {
throw new Error(`Could not find an RFC name in ${docUrl}`);
}
return `rfc${body.rfc}`;
}
catch (err) {
throw new Error(`IETF datatracker returned invalid JSON for ${url}`);
}
const body = await fetchJSON(docUrl, options);
return `rfc${body.rfc}`;
}

async function fetchObsoletedBy(draftName) {
if (!draftName.startsWith('rfc')) {
return [];
}
const url = `https://datatracker.ietf.org/api/v1/doc/relateddocument/?format=json&relationship__slug__in=obs&target__name__in=${draftName}`;
const res = await fetchQueue.runThrottled(fetch, url, options);
if (res.status !== 200) {
throw new Error(`IETF datatracker returned an error for ${url}, status code is ${res.status}`);
}
let body;
try {
body = await res.json();
}
catch (err) {
throw new Error(`IETF datatracker returned invalid JSON for ${url}`);
}

const body = await fetchJSON(url, options);
return Promise.all(body.objects
.map(obj => `https://datatracker.ietf.org${obj.source}`)
.map(fetchRFCName));
Expand All @@ -388,6 +343,15 @@ async function fetchInfoFromIETF(specs, options) {
return paths.filter(p => p.path.match(/^specs\/rfc\d+\.html$/))
.map(p => p.path.match(/(rfc\d+)\.html$/)[1]);
}

// IETF can only provide information about IETF specs, no need to fetch the
// list of RFCs of the HTTP WG if there's no IETF spec in the list.
if (!specs.find(spec =>
spec.url.match(/\.rfc-editor\.org/) ||
spec.url.match(/datatracker\.ietf\.org/))) {
return {};
}

const httpwgRFCs = await getHttpwgRFCs();

const info = await Promise.all(specs.map(async spec => {
Expand All @@ -404,7 +368,8 @@ async function fetchInfoFromIETF(specs, options) {
if (!draftName) {
throw new Error(`IETF document follows an unexpected URL pattern: ${spec.url}`);
}
const jsonDoc = await fetchJSONDoc(draftName[1]);
const draftUrl = `https://datatracker.ietf.org/doc/${draftName[1]}/doc.json`;
const jsonDoc = await fetchJSON(draftUrl, options);
const lastRevision = jsonDoc.rev_history.pop();
if (lastRevision.name !== draftName[1]) {
throw new Error(`IETF spec ${spec.url} published under a new name "${lastRevision.name}". Canonical URL must be updated accordingly.`);
Expand Down Expand Up @@ -645,13 +610,16 @@ async function fetchInfo(specs, options) {
{ name: 'discontinued', fn: useLastInfoForDiscontinuedSpecs },
{ name: 'w3c', fn: fetchInfoFromW3CApi },
{ name: 'ietf', fn: fetchInfoFromIETF },
{ name: 'whatwg', fn: fetchInfoFromWHATWG },
{ name: 'specref', fn: fetchInfoFromSpecref },
{ name: 'spec', fn: fetchInfoFromSpecs }
];
let remainingSpecs = specs;
for (let i = 0; i < steps.length ; i++) {
const step = steps[i];
info[step.name] = await step.fn(remainingSpecs, options);
info[step.name] = remainingSpecs.length > 0 ?
await step.fn(remainingSpecs, options) :
{};
remainingSpecs = remainingSpecs.filter(spec => !info[step.name][spec.shortname]);
}

Expand Down
33 changes: 33 additions & 0 deletions src/fetch-json.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import ThrottledQueue from "./throttled-queue.js";

// Make sure we remain "friendly" with servers: all requests go through a
// shared throttled queue (configured with maxParallel: 2)
const fetchQueue = new ThrottledQueue({ maxParallel: 2 });

// Maintain a cache of fetched JSON resources in memory to avoid sending the
// same fetch request again and again. A Map is used (rather than a truthiness
// check on an object property) so that valid falsy JSON bodies such as `0`,
// `false`, `""` or `null` are also served from the cache.
const cache = new Map();

// Requests that are currently in flight, indexed by URL, so that concurrent
// calls for the same URL share a single request instead of each sending one.
const pending = new Map();

/**
 * Send the request for the given URL through the throttled queue, parse the
 * response as JSON and store the result in the cache.
 *
 * @param {string} url URL to fetch.
 * @param {string} key Cache key for the URL.
 * @param {object} [options] Options passed as-is to `fetch`.
 * @returns {Promise<*>} Parsed JSON body, or null when the server returned a
 *   404 (404 responses are not cached).
 * @throws {Error} On any status other than 200 and 404, or on invalid JSON.
 */
async function fetchAndParse(url, key, options) {
  const res = await fetchQueue.runThrottled(fetch, url, options);
  if (res.status === 404) {
    return null;
  }
  if (res.status !== 200) {
    throw new Error(`Server returned an error for ${url}, status code is ${res.status}`);
  }

  let body;
  try {
    body = await res.json();
  }
  catch (err) {
    // Keep the parsing error around to ease debugging
    throw new Error(`Server returned invalid JSON for ${url}`, { cause: err });
  }
  cache.set(key, body);
  return body;
}

/**
 * Fetch a JSON URL
 *
 * Responses are cached in memory per URL for the lifetime of the module. The
 * cache key is the URL only: `options` are those of the first call that
 * actually triggers the request. Callers always receive their own deep copy
 * of the data and may thus freely mutate it.
 *
 * @param {string} url URL of the JSON resource.
 * @param {object} [options] Options passed as-is to `fetch`.
 * @returns {Promise<*>} Parsed JSON body, or null when the server returned a
 *   404.
 * @throws {Error} When the server returns a status other than 200 and 404, or
 *   when the response is not valid JSON.
 */
export default async function fetchJSON(url, options) {
  const key = String(url);
  if (cache.has(key)) {
    return structuredClone(cache.get(key));
  }

  let request = pending.get(key);
  if (!request) {
    // Failed requests and 404 responses are not cached: once the request
    // settles, it is forgotten so that a later call may try again.
    request = fetchAndParse(url, key, options)
      .finally(() => pending.delete(key));
    pending.set(key, request);
  }
  return structuredClone(await request);
}
Loading

0 comments on commit 04e8c41

Please sign in to comment.