Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
rmarx committed Dec 2, 2024
2 parents a5e57fb + d477c96 commit 3dd154f
Show file tree
Hide file tree
Showing 108 changed files with 1,402 additions and 140 deletions.
30 changes: 12 additions & 18 deletions sql/2024/seo/robots-txt-size-2024.sql
Original file line number Diff line number Diff line change
@@ -1,37 +1,31 @@
#standardSQL
# Robots.txt size
# getRobotsSize(payload): temporary JavaScript UDF used by the query below.
#
# Input:   payload - a STRING that is parsed as JSON.
# Process: reads the `_robots_txt` property of the parsed payload, parses that
#          value as JSON a second time, and divides its `size` property by 1024.
#          NOTE(review): presumably `size` is in bytes, making the result KiB - confirm
#          against the custom metric that produces `_robots_txt`.
# Returns: FLOAT64. Any exception thrown while parsing or reading (for example an
#          invalid payload or an unparseable `_robots_txt`) is caught and 0 is
#          returned instead, so a result of 0 does not by itself distinguish an
#          empty robots.txt from a failed lookup.
CREATE TEMPORARY FUNCTION getRobotsSize(payload STRING)
RETURNS FLOAT64 LANGUAGE js AS '''
try {
var $ = JSON.parse(payload);
var robots = JSON.parse($._robots_txt);
return robots['size']/1024;
} catch (e) {
return 0;
}
''';

SELECT
client,
COUNT(DISTINCT(site)) AS sites,
SAFE_DIVIDE(COUNTIF(robots_size = 0), COUNT(DISTINCT(site))) AS pct_0,
SAFE_DIVIDE(COUNTIF(robots_size > 0 AND robots_size <= 100), COUNT(DISTINCT(site))) AS pct_0_100,
SAFE_DIVIDE(COUNTIF(robots_size > 100 AND robots_size <= 200), COUNT(DISTINCT(site))) AS pct_100_200,
SAFE_DIVIDE(COUNTIF(robots_size > 200 AND robots_size <= 300), COUNT(DISTINCT(site))) AS pct_200_300,
SAFE_DIVIDE(COUNTIF(robots_size > 300 AND robots_size <= 400), COUNT(DISTINCT(site))) AS pct_300_400,
SAFE_DIVIDE(COUNTIF(robots_size > 400 AND robots_size <= 500), COUNT(DISTINCT(site))) AS pct_400_500,
SAFE_DIVIDE(COUNTIF(robots_size > 500), COUNT(DISTINCT(site))) AS pct_gt500,
SAFE_DIVIDE(COUNTIF(robots_size = 0), COUNT(DISTINCT(site))) AS pct_missing,
SAFE_DIVIDE(COUNTIF(robots_size IS NULL), COUNT(DISTINCT(site))) AS pct_missing,
COUNTIF(robots_size > 500) AS count_gt500,
COUNTIF(robots_size = 0) AS count_missing
COUNTIF(robots_size IS NULL) AS count_missing
FROM (
SELECT
client,
page AS site,
getRobotsSize(payload) AS robots_size
root_page AS site,
custom_metrics.robots_txt,
FLOAT64(custom_metrics.robots_txt.size_kib) AS robots_size
FROM
`httparchive.all.pages`
WHERE date = '2024-06-01'
) -- noqa: L062
`httparchive.crawl.pages`
WHERE
date = '2024-06-01' AND
is_root_page AND -- no need to crawl inner pages for this one
custom_metrics.robots_txt.status IS NOT NULL
)
GROUP BY
client
ORDER BY
Expand Down
3 changes: 1 addition & 2 deletions src/config/2024.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@
"part": "II",
"chapter_number": "9",
"title": "SEO",
"slug": "seo",
"todo": true
"slug": "seo"
},
{
"part": "II",
Expand Down
61 changes: 59 additions & 2 deletions src/config/contributors.json
Original file line number Diff line number Diff line change
Expand Up @@ -1194,7 +1194,10 @@
},
"dwsmart": {
"avatar_url": "11179452",
"bluesky": "tamethebots.com",
"github": "dwsmart",
"linkedin": "davewsmart",
"mastodon": "https://seocommunity.social/@dwsmart",
"name": "Dave Smart",
"teams": {
"2020": [
Expand All @@ -1206,9 +1209,11 @@
"2022": [
"authors",
"reviewers"
],
"2024": [
"authors"
]
},
"twitter": "davewsmart",
"website": "https://tamethebots.com"
},
"dsottimano": {
Expand Down Expand Up @@ -2047,7 +2052,9 @@
},
"fellowhuman1101": {
"avatar_url": "52051775",
"bluesky": "not-a-robot.com",
"github": "fellowhuman1101",
"linkedin": "jamie-indigo",
"name": "Jamie Indigo",
"teams": {
"2020": [
Expand All @@ -2059,6 +2066,10 @@
],
"2022": [
"authors"
],
"2024": [
"authors",
"leads"
]
},
"twitter": "Jammer_Volts",
Expand Down Expand Up @@ -3026,13 +3037,22 @@
},
"MichaelLewittes": {
"avatar_url": "96250205",
"bluesky": "michaellewittes.bsky.social",
"github": "MichaelLewittes",
"linkedin": "michael-lewittes-a22b831",
"mastodon": "https://seocommunity.social/@MichaelLewittes",
"name": "Michael Lewittes",
"teams": {
"2022": [
"editors"
],
"2024": [
"authors",
"editors"
]
}
},
"twitter": "MichaelLewittes",
"website": "https://www.ranktify.com/team"
},
"MichaelSolati": {
"avatar_url": "11811422",
Expand Down Expand Up @@ -4877,5 +4897,42 @@
"reviewers"
]
}
},
"mikaelaraujo": {
"avatar_url": "4764075",
"bluesky": "mikaelaraujo.bsky.social",
"github": "mikaelaraujo",
"linkedin": "mikael-araujo",
"name": "Mikael Araújo",
"teams": {
"2024": [
"authors"
]
},
"threads": "@mikaelaraujo",
"twitter": "miknaraujo",
"website": "https://www.mikaelaraujo.com"
},
"henryp25": {
"avatar_url": "62102954",
"github": "henryp25",
"linkedin": "henry-price-9ab362b4",
"name": "Henry Price",
"teams": {
"2024": [
"analysts"
]
}
},
"cnichols013": {
"avatar_url": "73146375",
"github": "cnichols013",
"linkedin": "chris-nichols",
"name": "Chris Nichols",
"teams": {
"2024": [
"analysts"
]
}
}
}
16 changes: 8 additions & 8 deletions src/config/last_updated.json
Original file line number Diff line number Diff line change
Expand Up @@ -753,8 +753,8 @@
},
"en/2024/chapters/accessibility.html": {
"date_published": "2024-11-11T00:00:00.000Z",
"date_modified": "2024-11-22T00:00:00.000Z",
"hash": "db405edaff762e930358c9471d3fd358"
"date_modified": "2024-11-30T00:00:00.000Z",
"hash": "d437f8d0a37f119170da4d0eb0b1bfe9"
},
"en/2024/chapters/cdn.html": {
"date_published": "2024-11-11T00:00:00.000Z",
Expand Down Expand Up @@ -837,8 +837,8 @@
"hash": "232a286d67940eaed02f935fd9ce1db6"
},
"en/2024/chapters/seo.html": {
"date_published": "2024-11-11T00:00:00.000Z",
"date_modified": "2024-11-16T00:00:00.000Z",
"date_published": "2024-12-02T00:00:00.000Z",
"date_modified": "2024-12-02T00:00:00.000Z",
"hash": "d7bb5659e4444ac7702888c7b11880cb"
},
"en/2024/chapters/structured-data.html": {
Expand All @@ -848,13 +848,13 @@
},
"en/2024/chapters/sustainability.html": {
"date_published": "2024-11-11T00:00:00.000Z",
"date_modified": "2024-11-18T00:00:00.000Z",
"hash": "bb49d876d3e33811819746edc96ed447"
"date_modified": "2024-11-30T00:00:00.000Z",
"hash": "48be7a85f0dfefe06c709a2ff9f5449b"
},
"en/2024/chapters/third-parties.html": {
"date_published": "2024-11-21T00:00:00.000Z",
"date_modified": "2024-11-21T00:00:00.000Z",
"hash": "075bec99b73be68c6fa7b97b97808182"
"date_modified": "2024-11-30T00:00:00.000Z",
"hash": "f5e703ed3f81f6969d2000e33d06a7fd"
},
"en/2024/chapters/webassembly.html": {
"date_published": "2024-11-11T00:00:00.000Z",
Expand Down
4 changes: 2 additions & 2 deletions src/content/en/2019/markup.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Names of elements on each page were collected from the DOM itself, after the ini

Looking at a raw frequency count isn't especially helpful, even for standard elements: About 25% of all elements encountered are `<div>`. About 17% are `<a>`, about 11% are `<span>` -- and those are the only elements that account for more than 10% of occurrences. Languages are <a hreflang="en" href="https://www.youtube.com/watch?v=fCn8zs912OE">generally like this</a>; a small number of terms are astoundingly used by comparison. Further, when we start looking at non-standard elements for uptake, this would be very misleading as one site could use a certain element a thousand times and thus make it look artificially very popular.

Instead, as in Hixie's original study, what we will look at is how many sites include each element at least once in their homepage.
Instead, as in Hixie's original study, what we will look at is how many sites include each element at least once in their home page.

<p class="note">Note: This is, itself, not without some potential biases. Popular products can be used by several sites, which introduce non-standard markup, even "invisibly" to individual authors. Thus, care must be taken to acknowledge that usage doesn't necessarily imply direct author knowledge and conscious adoption as much as it does the servicing of a common need, in a common way. During our research, we found several examples of this, some we will call out.</p>

Expand Down Expand Up @@ -249,7 +249,7 @@ So, are all of those elements used by less than 1% of pages "useless"? Definite

Many elements, even the native ones, appear on fewer than 1% of pages and are still very important and successful. `<code>`, for example, is an element that I both use and encounter a lot. It's definitely useful and important, and yet it is used on only 0.57% of these pages. Part of this is skewed based on what we are measuring; home pages are generally *less likely* to include certain kinds of things (like `<code>` for example). Home pages serve a less general purpose than, for example, headings, paragraphs, links and lists. However, the data is generally useful.

We also collected information about which pages contained an author-defined (not native) `.shadowRoot`. About 0.22% of desktop pages and 0.15% of mobile pages had a shadow root. This might not sound like a lot, but it is roughly 6.5k sites in the mobile dataset and 10k sites on the desktop and is more than several HTML elements. `<summary>` for example, has about equivalent use on the desktop and it is the 146th most popular element. `<datalist>` appears on 0.04% of homepages and it's the 201st most popular element.
We also collected information about which pages contained an author-defined (not native) `.shadowRoot`. About 0.22% of desktop pages and 0.15% of mobile pages had a shadow root. This might not sound like a lot, but it is roughly 6.5k sites in the mobile dataset and 10k sites on the desktop and is more than several HTML elements. `<summary>` for example, has about equivalent use on the desktop and it is the 146th most popular element. `<datalist>` appears on 0.04% of home pages and it's the 201st most popular element.

In fact, over 15% of elements we're counting as defined by HTML are outside the top 200 in the desktop dataset. `<meter>` is the least popular "HTML5 era" element, which we can define as 2004-2011, before HTML moved to a Living Standard model. It is around the 1,000th most popular element. `<slot>`, the most recently introduced element (April 2016), is only around the 1,400th most popular element.

Expand Down
8 changes: 4 additions & 4 deletions src/content/en/2019/security.md
Original file line number Diff line number Diff line change
Expand Up @@ -669,16 +669,16 @@ NEL provides incredibly valuable information and you can read more about the typ
### Clear Site Data
With the increasing ability to store data locally on a user's device, via cookies, caches and local storage to name but a few, site operators needed a reliable way to manage this data. The Clear Site Data header provides a means to ensure that all data of a particular type is removed from the device, though it is <a hreflang="en" href="https://caniuse.com/#feat=mdn-http_headers_clear-site-data">not yet supported in all browsers</a>.

Given the nature of the header, it is unsurprising to see almost no usage reported - just 9 desktop requests and 7 mobile requests. With our data only looking at the homepage of a site, we're unlikely to see the most common use of the header which would be on a logout endpoint. Upon logging out of a site, the site operator would return the Clear Site Data header and the browser would remove all data of the indicated types. This is unlikely to take place on the homepage of a site.
Given the nature of the header, it is unsurprising to see almost no usage reported - just 9 desktop requests and 7 mobile requests. With our data only looking at the home page of a site, we're unlikely to see the most common use of the header which would be on a logout endpoint. Upon logging out of a site, the site operator would return the Clear Site Data header and the browser would remove all data of the indicated types. This is unlikely to take place on the home page of a site.

## Cookies
Cookies have many security protections available and whilst some of those are long-standing, and have been available for years, some of them are really quite new, having been introduced only in the last couple of years.

### `Secure`
The `Secure` flag on a cookie instructs a browser to only send the cookie over a secure (HTTPS) connection and we find only a small % of sites (4.22% on desktop and 3.68% on mobile) issuing a cookie with the Secure flag set on their homepage. This is depressing considering the relative ease with which this feature can be used. Again, the high usage of analytics and advertisement [third-party](./third-parties) requests, which wish to collect data over both HTTP and HTTPS is likely skewing these numbers and it would be interesting research to see the usage on other cookies, like authentication cookies.
The `Secure` flag on a cookie instructs a browser to only send the cookie over a secure (HTTPS) connection and we find only a small % of sites (4.22% on desktop and 3.68% on mobile) issuing a cookie with the Secure flag set on their home page. This is depressing considering the relative ease with which this feature can be used. Again, the high usage of analytics and advertisement [third-party](./third-parties) requests, which wish to collect data over both HTTP and HTTPS is likely skewing these numbers and it would be interesting research to see the usage on other cookies, like authentication cookies.

### `HttpOnly`
The `HttpOnly` flag on a cookie instructs the browser to prevent JavaScript on the page from accessing the cookie. Many cookies are only used by the server so are not needed by the JavaScript on the page, so restricting access to a cookie is a great protection against XSS attacks from stealing the cookie. We find that a much larger % of sites issuing a cookie with this flag on their homepage at 24.24% on desktop and 22.23% on mobile.
The `HttpOnly` flag on a cookie instructs the browser to prevent JavaScript on the page from accessing the cookie. Many cookies are only used by the server so are not needed by the JavaScript on the page, so restricting access to a cookie is a great protection against XSS attacks stealing the cookie. We find a much larger % of sites issuing a cookie with this flag on their home page, at 24.24% on desktop and 22.23% on mobile.

### `SameSite`
As a much more recent addition to cookie protections, the `SameSite` flag is a powerful protection against [Cross-Site Request Forgery (CSRF)](https://en.wikipedia.org/wiki/Cross-site_request_forgery) attacks (often also known as XSRF).
Expand Down Expand Up @@ -787,7 +787,7 @@ As the web grows in capabilities and allows access to more and more sensitive da

In recent years, the web has made the most progress on the encryption of data in transit. As described in the [Transport Layer Security](#transport-layer-security) section, thanks to a range of efforts from browser vendors, developers and Certificate Authorities such as Let's Encrypt, the fraction of the web using HTTPS has steadily grown. At the time of writing, the majority of sites are available over HTTPS, ensuring confidentiality and integrity of traffic. Importantly, over 99% of websites which enable HTTPS use newer, more secure versions of the TLS protocol (TLSv1.2 and TLSv1.3). The use of strong [cipher suites](#cipher-suites) such as AES in GCM mode is also high, accounting for over 95% of requests on all platforms.

At the same time, gaps in TLS configurations are still fairly common. Over 15% of pages suffer from [mixed content](#mixed-content) issues, resulting in browser warnings, and 4% of sites contain active mixed content, blocked by modern browsers for security reasons. Similarly, the benefits of [HTTP Strict Transport Security](#http-strict-transport-security) only extend to a small subset of major sites, and the majority of websites don't enable the most secure HSTS configurations and are not eligible for [HSTS preloading](#hsts-preloading). Despite progress in HTTPS adoption, a large number of cookies is still set without the `Secure` flag; only 4% of homepages that set cookies prevent them from being sent over unencrypted HTTP.
At the same time, gaps in TLS configurations are still fairly common. Over 15% of pages suffer from [mixed content](#mixed-content) issues, resulting in browser warnings, and 4% of sites contain active mixed content, blocked by modern browsers for security reasons. Similarly, the benefits of [HTTP Strict Transport Security](#http-strict-transport-security) only extend to a small subset of major sites, and the majority of websites don't enable the most secure HSTS configurations and are not eligible for [HSTS preloading](#hsts-preloading). Despite progress in HTTPS adoption, a large number of cookies is still set without the `Secure` flag; only 4% of home pages that set cookies prevent them from being sent over unencrypted HTTP.

### Defending against common web vulnerabilities

Expand Down
2 changes: 1 addition & 1 deletion src/content/en/2020/accessibility.md
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ HTML5 introduced many new native elements, all which have <a hreflang="en" href=

#### Just use a button!

We found that 25.20% of desktop sites and 24.50% of mobile sites had homepages with at least one element with an explicitly assigned `role="button"`. This suggests that about a quarter of websites are using the `button` role on elements in order to change their semantics, with the exception of buttons that have been explicitly assigned the button role, which is redundant but harmless.
We found that 25.20% of desktop sites and 24.50% of mobile sites had home pages with at least one element with an explicitly assigned `role="button"`. This suggests that about a quarter of websites are using the `button` role on elements in order to change their semantics, with the exception of buttons that have been explicitly assigned the button role, which is redundant but harmless.

If non-interactive elements such as `<div>`s and `<span>`s have been given this role, there is a significant chance one or more of the 5 rules of ARIA have been broken.

Expand Down
2 changes: 1 addition & 1 deletion src/content/en/2020/capabilities.md
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ Over the course of 2020, the `getInstalledRelatedApps()` API shows a steady grow

Web apps can store content offline using various mechanisms, such as Cache Storage or IndexedDB. However, for users it's hard to discover which content is available offline. The <a hreflang="en" href="https://web.dev/content-indexing-api/">Content Indexing API</a> (<a hreflang="en" href="https://wicg.github.io/content-index/spec/">WICG Editor's Draft</a>) allows developers to expose content more prominently. Currently, Chrome on Android is the only browser to support this API. This browser shows a list of "Articles for you" in the Downloads menu. Content indexed via the Content Indexing API will appear there.

The Content Indexing API extends the Service Worker API by providing a new `ContentIndex` interface. This interface is available via the `index` property of the Service Worker's registration. The `add()` method allows developers to add content to the index: Each piece of content must have an ID, a URL, a launch URL, title, description, and a set of icons. Optionally, the content can be grouped into different categories such as articles, homepages, or videos. The `delete()` method allows for removing content from the index again, and the `getAll()` method returns a list of all indexed entries.
The Content Indexing API extends the Service Worker API by providing a new `ContentIndex` interface. This interface is available via the `index` property of the Service Worker's registration. The `add()` method allows developers to add content to the index: Each piece of content must have an ID, a URL, a launch URL, title, description, and a set of icons. Optionally, the content can be grouped into different categories such as articles, home pages, or videos. The `delete()` method allows for removing content from the index again, and the `getAll()` method returns a list of all indexed entries.

{{ figure_markup(
image="content_indexing_api.png",
Expand Down
Loading

0 comments on commit 3dd154f

Please sign in to comment.