Introduce “UnforgivingHtml” parser.

Goals of the parser: * Tighten control over things like double-quotes & closing tags. * Improve error messaging for malformed markup. * Improve performance. Closes #239.
Netflix · Dec 22, 2024 · ae0ea29 · ae0ea29
1 parent aaefbba
commit ae0ea29
Show file tree

Hide file tree

Showing 5 changed files with 1,697 additions and 521 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- The current “forgiving” html parser is replaced for a more “unforgiving” html
+  parser in the default template engine (#239).
+
+### Deprecated
+
+- The `<svg>` element and the `svg` tagged template function are deprecated and
+  will be removed in future versions. The spec and conventions for `svg`
+  differ a lot from `html` and a faster / more-maintainable parser can be built
+  if we stop supporting this (#236).
+- The `<style>` tag is deprecated and will be removed in future versions.
+  Authors should prefer to declare a separate stylesheet in a `.css` file now
+  that “import attributes” are supported in modern browsers (#237).
+
+### Removed
+
+- Support for the `<math>` element is removed from the default template engine.
+  This worked before because `innerHTML` was being used under-the-hood. But a
+  strict allow-list is now used to accomplish parsing (#238).
+- Support for `on*` event handlers bound to either attributes or properties is
+  removed. Authors should prefer to add event listeners via “addEventListener”.
+  This was implicitly supported previously, but is now deemed invalid with the
+  move to a more “unforgiving“ parser (#240).
+- Support for CDATA sections is removed. Authors should prefer to use character
+  references (html entities) instead. Previously, this was implicitly supported
+  due to underlying usage of `innerHTML`, but is now strictly forbidden (#241).
+- Reject JS-y unicode escapes in html template strings. E.g., you cannot write
+  something like `this\u2026` — instead, you would have to write something like
+  `this&hellip;`, or `this&#x2026;`, etc. This mirrors the html spec (#242).
+- Restrict element tags to an allow-list of what we’re willing to parse in the
+  default template engine. This causes us to reject elements like `<title>`,
+  `<body>`, `<link>`, `<script>`, `<canvas>`, `<meta>`, etc. (#239).
+
 ## [1.1.2] - 2024-12-16
 
 ### Added

diff --git a/test/forgiving.js b/test/forgiving.js
@@ -0,0 +1,275 @@
+// This is just kept here as an example alternative to our more “unforgiving”
+//  parsing solution. In particular, it could be interesting to try and keep the
+//  interfaces to both “forgiving” and “unforgiving” as similar as possible to
+//  enable us to show performance-testing deltas in the future.
+/** Forgiving HTML parser which leverages innerHTML. */
+export default class Forgiving {
+  // Special markers added to markup enabling discovery post-instantiation.
+  static #NEXT_MARKER = 'forgiving-next:'; // The ":" helps for debugging.
+  static #CONTENT_MARKER = 'forgiving-content';
+
+  // Types of bindings that we can have.
+  static #ATTRIBUTE = 'attribute';
+  static #BOOLEAN = 'boolean';
+  static #DEFINED = 'defined';
+  static #PROPERTY = 'property';
+
+  // TODO: Could be more forgiving here!
+  // Patterns to find special edges in original html strings.
+  static #OPEN_REGEX = /<[a-z][a-z0-9-]*(?=\s)/g;
+  static #STEP_REGEX = /(?:\s+[a-z][a-z0-9-]*(?=[\s>])|\s+[a-z][a-zA-Z0-9-]*="[^"]*")+/y;
+  static #ATTRIBUTE_OR_PROPERTY_REGEX = /\s+(?:(?<questions>\?{0,2})?(?<attribute>([a-z][a-zA-Z0-9-]*))|\.(?<property>[a-z][a-zA-Z0-9_]*))="$/y;
+  static #CLOSE_REGEX = />/g;
+
+  // Walk through each string from our tagged template function “strings” array
+  //  in a stateful way so that we know what kind of bindings are implied at
+  //  each interpolated value.
+  static #exhaustString(string, state, context) {
+    if (!state.inside) {
+      // We're outside the opening tag.
+      Forgiving.#OPEN_REGEX.lastIndex = state.index;
+      const openMatch = Forgiving.#OPEN_REGEX.exec(string);
+      if (openMatch) {
+        state.inside = true;
+        state.index = Forgiving.#OPEN_REGEX.lastIndex;
+        state.lastOpenContext = context;
+        state.lastOpenIndex = openMatch.index;
+        Forgiving.#exhaustString(string, state, context);
+      }
+    } else {
+      // We're inside the opening tag.
+      Forgiving.#STEP_REGEX.lastIndex = state.index;
+      if (Forgiving.#STEP_REGEX.test(string)) {
+        state.index = Forgiving.#STEP_REGEX.lastIndex;
+      }
+      Forgiving.#CLOSE_REGEX.lastIndex = state.index;
+      if (Forgiving.#CLOSE_REGEX.test(string)) {
+        state.inside = false;
+        state.index = Forgiving.#CLOSE_REGEX.lastIndex;
+        Forgiving.#exhaustString(string, state, context);
+      }
+    }
+  }
+
+  // Flesh out an html string from our tagged template function “strings” array
+  //  and add special markers that we can detect later, after instantiation.
+  //
+  // E.g., the user might have passed this interpolation:
+  //
+  // <div id="foo-bar-baz" foo="${foo}" bar="${bar}" .baz="${baz}">
+  //   ${content}
+  // </div>
+  //
+  // … and we would instrument it as follows:
+  //
+  // <!--forgiving-next:attribute=foo,attribute=bar,attribute=baz--><div id="foo-bar-baz">
+  //   <!--forgiving-content-->
+  // </div>
+  //
+  static #createHtml(language, strings) {
+    const keyToKeyState = new Map();
+    const htmlStrings = [];
+    const state = { inside: false, index: 0, lastOpenContext: 0, lastOpenIndex: 0 };
+    // We don’t have to test the last string since it is already on the other
+    //  side of the last interpolation, by definition. Hence the “- 1” below.
+    //  Note that this final string is added just after the loop completes.
+    for (let iii = 0; iii < strings.length - 1; iii++) {
+      // The index may be set to “1” here, which indicates we are slicing off a
+      //  trailing quote character from a attribute-or-property match. After
+      //  slicing, we reset the index to zero so regular expressions know to
+      //  match from the start in “exhaustString”.
+      let string = strings[iii];
+      if (state.index !== 0) {
+        string = string.slice(state.index);
+        state.index = 0;
+      }
+      Forgiving.#exhaustString(string, state, iii);
+      if (state.inside) {
+        Forgiving.#ATTRIBUTE_OR_PROPERTY_REGEX.lastIndex = state.index;
+        const match = Forgiving.#ATTRIBUTE_OR_PROPERTY_REGEX.exec(string);
+        if (match) {
+          const { questions, attribute, property } = match.groups;
+          if (attribute) {
+            // We found a match like this: html`<div hidden="${value}"></div>`.
+            //                  … or this: html`<div ?hidden="${value}"></div>`.
+            //                  … or this: html`<div ??hidden="${value}"></div>`.
+            // Syntax is 3-5 characters: `${questions}${attribute}="` + `"`.
+            let syntax = 3;
+            let kind = Forgiving.#ATTRIBUTE;
+            switch (questions) {
+              case '??': kind = Forgiving.#DEFINED; syntax = 5; break;
+              case '?': kind = Forgiving.#BOOLEAN; syntax = 4; break;
+            }
+            string = string.slice(0, -syntax - attribute.length);
+            const key = state.lastOpenContext;
+            const keyState = Forgiving.#setIfMissing(keyToKeyState, key, () => ({ index: state.lastOpenIndex, items: [] }));
+            keyState.items.push(`${kind}=${attribute}`);
+          } else {
+            // We found a match like this: html`<div .title="${value}"></div>`.
+            // Syntax is 4 characters: `.${property}="` + `"`.
+            const syntax = 4;
+            const kind = Forgiving.#PROPERTY;
+            string = string.slice(0, -syntax - property.length);
+            const key = state.lastOpenContext;
+            const keyState = Forgiving.#setIfMissing(keyToKeyState, key, () => ({ index: state.lastOpenIndex, items: [] }));
+            keyState.items.push(`${kind}=${property}`);
+          }
+          state.index = 1; // Accounts for an expected quote character next.
+        } else {
+          // It’s “on or after” because interpolated JS can span multiple lines.
+          const handled = [...strings.slice(0, iii), string.slice(0, state.index)].join('');
+          const lineCount = handled.split('\n').length;
+          throw new Error(`Found invalid template on or after line ${lineCount} in substring \`${string}\`. Failed to parse \`${string.slice(state.index)}\`.`);
+        }
+      } else {
+        // Assume it’s a match like this: html`<div>${value}</div>`.
+        string += `<!--${Forgiving.#CONTENT_MARKER}-->`;
+        state.index = 0; // No characters to account for. Reset to zero.
+      }
+      htmlStrings[iii] = string;
+    }
+    // Again, there might be a quote we need to slice off here still.
+    let lastString = strings.at(-1);
+    if (state.index > 0) {
+      lastString = lastString.slice(state.index);
+    }
+    htmlStrings.push(lastString);
+    for (const [iii, { index, items }] of keyToKeyState.entries()) {
+      const comment = `<!--${Forgiving.#NEXT_MARKER}${items.join(',')}-->`;
+      const htmlString = htmlStrings[iii];
+      htmlStrings[iii] = `${htmlString.slice(0, index)}${comment}${htmlString.slice(index)}`;
+    }
+    const html = htmlStrings.join('');
+    return language === Forgiving.svg
+      ? `<svg xmlns="http://www.w3.org/2000/svg">${html}</svg>`
+      : html;
+  }
+
+  static #createFragment(language, strings) {
+    const template = document.createElement('template');
+    const html = Forgiving.#createHtml(language, strings);
+    template.innerHTML = html;
+    return template.content;
+  }
+
+  // Walk through our fragment that we added special markers to and notify
+  //  integrator when we hit target “paths”. The integrator can use this with
+  //  a subsequent clone of the fragment to establish “targets”. And, while we
+  //  walk, clean up our bespoke markers.
+  // Note that we are always walking the interpolated strings and the resulting,
+  //  instantiated DOM _in the same depth-first manner_. This means that the
+  //  ordering is fairly reliable.
+  //
+  // For example, we walk this structure:
+  //
+  // <!--forgiving-next:attribute=foo,attribute=bar,attribute=baz--><div id="foo-bar-baz">
+  //   <!--forgiving-content-->
+  // </div>
+  //
+  // And end up with this (which is ready to be injected into a container):
+  //
+  // <div id="foo-bar-baz">
+  //   <!---->
+  //   <!---->
+  // </div>
+  //
+  static #walkFragment(
+    onBoolean,
+    onDefined,
+    onAttribute,
+    onProperty,
+    onContent,
+    onText,
+    node,
+    nodeType = Node.DOCUMENT_FRAGMENT_NODE,
+    path = [],
+  ) {
+    // @ts-ignore — TypeScript doesn’t seem to understand the nodeType param.
+    if (nodeType === Node.ELEMENT_NODE) {
+      // Special case to handle elements which only allow text content (no comments).
+      const { localName } = node;
+      if (
+        (localName === 'style' || localName === 'script') &&
+        node.textContent.includes(Forgiving.#CONTENT_MARKER)
+      ) {
+        throw new Error(`Interpolation of <${localName}> tags is not allowed.`);
+      } else if (localName === 'textarea' || localName === 'title') {
+        if (node.textContent.includes(Forgiving.#CONTENT_MARKER)) {
+          if (node.textContent === `<!--${Forgiving.#CONTENT_MARKER}-->`) {
+            node.textContent = '';
+            onText(path);
+          } else {
+            throw new Error(`Only basic interpolation of <${localName}> tags is allowed.`);
+          }
+        }
+      }
+    }
+    if (nodeType === Node.DOCUMENT_FRAGMENT_NODE || nodeType === Node.ELEMENT_NODE) {
+      // It’s expensive to make a copy of “childNodes”. Instead, we carefully
+      //  manage our index as we iterate over the live collection.
+      const childNodes = node.childNodes;
+      for (let iii = 0; iii < childNodes.length; iii++) {
+        const childNode = childNodes[iii];
+        const childNodeType = childNode.nodeType;
+        if (childNodeType === Node.COMMENT_NODE) {
+          const textContent = childNode.textContent;
+          if (textContent.startsWith(Forgiving.#CONTENT_MARKER)) {
+            childNode.textContent = '';
+            const startNode = document.createComment('');
+            node.insertBefore(startNode, childNode);
+            iii++;
+            onContent([...path, iii]);
+          } else if (textContent.startsWith(Forgiving.#NEXT_MARKER)) {
+            const data = textContent.slice(Forgiving.#NEXT_MARKER.length);
+            const items = data.split(',');
+            for (const item of items) {
+              const [binding, name] = item.split('=');
+              switch (binding) {
+                case Forgiving.#ATTRIBUTE: onAttribute(name, [...path, iii]); break;
+                case Forgiving.#BOOLEAN:   onBoolean(name, [...path, iii]); break;
+                case Forgiving.#DEFINED:   onDefined(name, [...path, iii]); break;
+                case Forgiving.#PROPERTY:  onProperty(name, [...path, iii]); break;
+              }
+            }
+            iii--;
+            node.removeChild(childNode);
+          }
+        } else if (childNodeType === Node.ELEMENT_NODE) {
+          Forgiving.#walkFragment(
+            onBoolean,
+            onDefined,
+            onAttribute,
+            onProperty,
+            onContent,
+            onText,
+            childNode,
+            childNodeType,
+            [...path, iii],
+          );
+        }
+      }
+    }
+  }
+
+  // TODO: Replace with Map.prototype.getOrInsert when TC39 proposal lands.
+  //  https://github.com/tc39/proposal-upsert
+  static #setIfMissing(map, key, callback) {
+    // Values set in this file are ALL truthy, so "get" is used (versus "has").
+    let value = map.get(key);
+    if (!value) {
+      value = callback();
+      map.set(key, value);
+    }
+    return value;
+  }
+
+  // Languages.
+  static html = 'html';
+  static svg = 'svg';
+
+  static parse(strings, onBoolean, onDefined, onAttribute, onProperty, onContent, onText, language) {
+    const fragment = Forgiving.#createFragment(language, strings);
+    Forgiving.#walkFragment(onBoolean, onDefined, onAttribute, onProperty, onContent, onText, fragment);
+    return fragment;
+  }
+}