From 694a19be998e7272f652711056b355d8bf300773 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 6 Sep 2018 08:31:42 +0100 Subject: [PATCH] Parser: Propose new hand-coded parser (#8083) * Parser: Propose new hand-coded PHP parser For some time we've needed a more performant PHP parser for the first stage of parsing the `post_content` document. See #1681 (early exploration) See #8044 (parser performance issue) See #1775 (parser performance, fixed in php-pegjs) I'm proposing this implementation of the spec parser as an alternative to the auto-generated parser from the PEG definition. This is not yet ready to go but I wanted to get the code in a branch so I can iterate on it and garner early feedback. This should eventually provide a setup fixture for #6831 wherein we are testing alternate parser implementations. - designed as a basic recursive-descent - but doesn't recurse on the call-stack, recurses via trampoline - moves linearly through document in one pass - relies on RegExp for tokenization - nested blocks include the nested content in their `innerHTML` this needs to go away - create test fixture - figure out where to save this file * Fix issue with containing the nested innerHTML * Also handle newlines as whitespace * Use classes for some static typing * add type hints * remove needless comment * space where space is due * meaningless rename * remove needless function call * harmonize with spec parser * don't forget freeform HTML before blocks * account for oddity in spec-parser * add some polish, fix a thing * comment it * add JS version too * Change `.` to `[^]` because `/s` isn't well supported in JS The `s` flag on the RegExp object informs the engine to treat a dot character as a class that includes the newline character. Without it newlines aren't considered in the dot. 
Since this flag is new to JavaScript and not well supported in different browsers I have removed it in favor of an explicit class of characters that _does_ include the newline, namely the open exclusion of `[^]` which permits all input characters. Hat-tip to @hywan for finding this. * Move code into `/packages` directory, prepare for review * take out names from RegExp pattern to not fail tests * Fix bug in parser: store HTML soup in stack frames while parsing Previously we were sending all "HTML soup" segments of HTML between blocks to the output list before any blocks were processed. We should have been tracking these segments during the parsing and only spit them out when closing a block at the top level. This change stores the index into the input document at which that soup starts if it exists and then produces the freeform block when adding a block to the output from the parse frame stack. * fix whitespace * fix oddity in spec * match styles * use class name filter on server-side parser class * fix whitespace * Document extensibility * fix typo in example code * Push failing parsing test * fix lazy/greedy bug in parser regexp * Docs: Fix typos, links, tweak style. * update from PR feedback * trim docs * Load default block parser, replacing PEG-generated one * Expand `?:` shorthand for PHP 5.2 compat * add fixtures test for default parser * spaces to tabs * could we need no assoc? 
* fill out return array * put that assoc back in there * isometrize * rename and add 0 * Conditionally include the parser class * Add docblocks * Standardize the package configuration --- docs/extensibility.md | 6 + docs/extensibility/parser.md | 36 ++ docs/manifest.json | 6 + lib/blocks.php | 16 +- lib/client-assets.php | 9 +- lib/load.php | 1 - package-lock.json | 7 + package.json | 1 + .../block-serialization-default-parser/.npmrc | 1 + .../CHANGELOG.md | 3 + .../README.md | 126 +++++ .../package.json | 29 ++ .../parser.php | 449 ++++++++++++++++++ .../src/index.js | 257 ++++++++++ .../test/index.js | 27 ++ packages/blocks/package.json | 1 + packages/blocks/src/api/parser.js | 4 +- phpunit/class-parsing-test.php | 30 +- .../full-content/full-content.spec.js | 2 +- webpack.config.js | 1 + 20 files changed, 1004 insertions(+), 8 deletions(-) create mode 100644 docs/extensibility/parser.md create mode 100644 packages/block-serialization-default-parser/.npmrc create mode 100644 packages/block-serialization-default-parser/CHANGELOG.md create mode 100644 packages/block-serialization-default-parser/README.md create mode 100644 packages/block-serialization-default-parser/package.json create mode 100644 packages/block-serialization-default-parser/parser.php create mode 100644 packages/block-serialization-default-parser/src/index.js create mode 100644 packages/block-serialization-default-parser/test/index.js diff --git a/docs/extensibility.md b/docs/extensibility.md index 60d9b073373305..26d69f4118a965 100644 --- a/docs/extensibility.md +++ b/docs/extensibility.md @@ -74,3 +74,9 @@ There are some advanced block features which require opt-in support in the theme ## Autocomplete Autocompleters within blocks may be extended and overridden. See [autocomplete](../docs/extensibility/autocomplete.md). + +## Block Parsing and Serialization + +Posts in the editor move through a couple of different stages between being stored in `post_content` and appearing in the editor. 
Since the blocks themselves are data structures that live in memory it takes a parsing and serialization step to transform out from and into the stored format in the database. + +Customizing the parser is an advanced topic that you can learn more about in the [Extending the Parser](../docs/extensibility/parser.md) section. diff --git a/docs/extensibility/parser.md b/docs/extensibility/parser.md new file mode 100644 index 00000000000000..7c1e5bc1be7c21 --- /dev/null +++ b/docs/extensibility/parser.md @@ -0,0 +1,36 @@ +# Extending the Parser + +When the editor is interacting with blocks, these are stored in memory as data structures comprising a few basic properties and attributes. Upon saving a working post we serialize these data structures into a specific HTML structure and save the resultant string into the `post_content` property of the post in the WordPress database. When we load that post back into the editor we have to make the reverse transformation to build those data structures from the serialized format in HTML. + +The process of loading the serialized HTML into the editor is performed by the _block parser_. The formal specification for this transformation is encoded in the parsing expression grammar (PEG) inside the `@wordpress/block-serialization-spec-parser` package. The editor provides a default parser implementation of this grammar but there may be various reasons for replacing that implementation with a custom implementation. We can inject our own custom parser implementation through the appropriate filter. + +## Server-side parser + +Plugins have access to the parser if they want to process posts in their structured form instead of a plain HTML-as-string representation. + +## Client-side parser + +The editor uses the client-side parser while interactively working in a post. The plain HTML-as-string representation is sent to the browser by the backend and then the editor performs the first parse to initialize itself. 
+ +## Filters + +To replace the server-side parser, use the `block_parser_class` filter. The filter transforms the string class name of a parser class. This class is expected to expose a `parse` method. + +_Example:_ + +```php +class EmptyParser { + public function parse( $post_content ) { + // return an empty document + return array(); + } +} + +function my_plugin_select_empty_parser( $prev_parser_class ) { + return 'EmptyParser'; +} + +add_filter( 'block_parser_class', 'my_plugin_select_empty_parser', 10, 1 ); +``` + +> **Note**: At the present time it's not possible to replace the client-side parser. diff --git a/docs/manifest.json b/docs/manifest.json index fccd97003d15e5..19b6171d9ee412 100644 --- a/docs/manifest.json +++ b/docs/manifest.json @@ -287,6 +287,12 @@ "markdown_source": "https://raw.githubusercontent.com/WordPress/gutenberg/master/packages/block-library/README.md", "parent": "packages" }, + { + "title": "@wordpress/block-serialization-default-parser", + "slug": "packages-block-serialization-default-parser", + "markdown_source": "https://raw.githubusercontent.com/WordPress/gutenberg/master/packages/block-serialization-default-parser/README.md", + "parent": "packages" + }, { "title": "@wordpress/block-serialization-spec-parser", "slug": "packages-block-serialization-spec-parser", diff --git a/lib/blocks.php b/lib/blocks.php index 30c7059dc265b1..92e89abddd6462 100644 --- a/lib/blocks.php +++ b/lib/blocks.php @@ -66,8 +66,20 @@ function gutenberg_parse_blocks( $content ) { ); } - $parser = new Gutenberg_PEG_Parser; - return $parser->parse( _gutenberg_utf8_split( $content ) ); + /** + * Filter to allow plugins to replace the server-side block parser + * + * @since 3.8.0 + * + * @param string $parser_class Name of block parser class + */ + $parser_class = apply_filters( 'block_parser_class', 'WP_Block_Parser' ); + // Load default block parser for server-side parsing if the default parser class is being used. 
+ if ( 'WP_Block_Parser' === $parser_class ) { + require_once dirname( __FILE__ ) . '/../packages/block-serialization-default-parser/parser.php'; + } + $parser = new $parser_class(); + return $parser->parse( $content ); } /** diff --git a/lib/client-assets.php b/lib/client-assets.php index 7ad3546c5fb131..e00c39586cc733 100644 --- a/lib/client-assets.php +++ b/lib/client-assets.php @@ -275,6 +275,13 @@ function gutenberg_register_scripts_and_styles() { filemtime( gutenberg_dir_path() . 'build/dom/index.js' ), true ); + wp_register_script( + 'wp-block-serialization-default-parser', + gutenberg_url( 'build/block-serialization-default-parser/index.js' ), + array(), + filemtime( gutenberg_dir_path() . 'build/block-serialization-default-parser/index.js' ), + true + ); wp_register_script( 'wp-block-serialization-spec-parser', gutenberg_url( 'build/block-serialization-spec-parser/index.js' ), @@ -386,7 +393,7 @@ function gutenberg_register_scripts_and_styles() { array( 'wp-autop', 'wp-blob', - 'wp-block-serialization-spec-parser', + 'wp-block-serialization-default-parser', 'wp-data', 'wp-deprecated', 'wp-dom', diff --git a/lib/load.php b/lib/load.php index 0f18f6e4d15f9e..d049d5bb8ad436 100644 --- a/lib/load.php +++ b/lib/load.php @@ -29,7 +29,6 @@ require dirname( __FILE__ ) . '/compat.php'; require dirname( __FILE__ ) . '/plugin-compat.php'; require dirname( __FILE__ ) . '/i18n.php'; -require dirname( __FILE__ ) . '/parser.php'; require dirname( __FILE__ ) . 
'/register.php'; diff --git a/package-lock.json b/package-lock.json index baff199cd7a332..ec63890b85e0f8 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2080,6 +2080,12 @@ "url": "^0.11.0" } }, + "@wordpress/block-serialization-default-parser": { + "version": "file:packages/block-serialization-default-parser", + "requires": { + "@babel/runtime": "^7.0.0" + } + }, "@wordpress/block-serialization-spec-parser": { "version": "file:packages/block-serialization-spec-parser" }, @@ -2089,6 +2095,7 @@ "@babel/runtime": "^7.0.0", "@wordpress/autop": "file:packages/autop", "@wordpress/blob": "file:packages/blob", + "@wordpress/block-serialization-default-parser": "file:packages/block-serialization-default-parser", "@wordpress/block-serialization-spec-parser": "file:packages/block-serialization-spec-parser", "@wordpress/data": "file:packages/data", "@wordpress/deprecated": "file:packages/deprecated", diff --git a/package.json b/package.json index 0b78cbcabfe64e..3948ca1f760162 100644 --- a/package.json +++ b/package.json @@ -20,6 +20,7 @@ "@wordpress/autop": "file:packages/autop", "@wordpress/blob": "file:packages/blob", "@wordpress/block-library": "file:packages/block-library", + "@wordpress/block-serialization-default-parser": "file:packages/block-serialization-default-parser", "@wordpress/block-serialization-spec-parser": "file:packages/block-serialization-spec-parser", "@wordpress/blocks": "file:packages/blocks", "@wordpress/components": "file:packages/components", diff --git a/packages/block-serialization-default-parser/.npmrc b/packages/block-serialization-default-parser/.npmrc new file mode 100644 index 00000000000000..43c97e719a5a82 --- /dev/null +++ b/packages/block-serialization-default-parser/.npmrc @@ -0,0 +1 @@ +package-lock=false diff --git a/packages/block-serialization-default-parser/CHANGELOG.md b/packages/block-serialization-default-parser/CHANGELOG.md new file mode 100644 index 00000000000000..21e1ef9aa757ec --- /dev/null +++ 
b/packages/block-serialization-default-parser/CHANGELOG.md @@ -0,0 +1,3 @@ +## 1.0.0 + +- Initial release. diff --git a/packages/block-serialization-default-parser/README.md b/packages/block-serialization-default-parser/README.md new file mode 100644 index 00000000000000..bdf933f6f4fa9b --- /dev/null +++ b/packages/block-serialization-default-parser/README.md @@ -0,0 +1,126 @@ +# Block Serialization Default Parser + +This library contains the default block serialization parser implementations for WordPress documents. It provides native PHP and JavaScript parsers that implement the specification from `@wordpress/block-serialization-spec-parser` and which normally operate on the document stored in `post_content`. + +## Installation + +Install the module + +```bash +npm install @wordpress/block-serialization-default-parser --save +``` + +_This package assumes that your code will run in an **ES2015+** environment. If you're using an environment that has limited or no support for ES2015+ such as lower versions of IE then using [core-js](https://github.com/zloirock/core-js) or [@babel/polyfill](https://babeljs.io/docs/en/next/babel-polyfill) will add support for these methods. Learn more about it in [Babel docs](https://babeljs.io/docs/en/next/caveats)._ + +## Usage + +Input post: +```html + +
+
+

Left

+
+ + + +
+

Middle

+
+ + + +
+
+ +``` + +Parsing code: +```js +import { parse } from '@wordpress/block-serialization-default-parser'; + +parse( post ) === [ + { + blockName: "core/columns", + attrs: { + columns: 3 + }, + innerBlocks: [ + { + blockName: "core/column", + attrs: null, + innerBlocks: [ + { + blockName: "core/paragraph", + attrs: null, + innerBlocks: [], + innerHTML: "\n

Left

\n" + } + ], + innerHTML: '\n
\n' + }, + { + blockName: "core/column", + attrs: null, + innerBlocks: [ + { + blockName: "core/paragraph", + attrs: null, + innerBlocks: [], + innerHTML: "\n

Middle

\n" + } + ], + innerHTML: '\n
\n' + }, + { + blockName: "core/column", + attrs: null, + innerBlocks: [], + innerHTML: '\n
\n' + } + ], + innerHTML: '\n
\n\n\n\n
\n' + } +]; +``` + +## Theory + +### What is different about this one from the spec-parser? + +This is a recursive-descent parser that scans linearly once through the input document. Instead of directly recursing it utilizes a trampoline mechanism to prevent stack overflow. It minimizes data copying and passing through the use of globals for tracking state through the parse. Between every token (a block comment delimiter) we can instrument the parser and intervene should we want to; for example we might put a hard limit on how long we can be parsing a document or provide additional debugging diagnostics for a document. + +The spec parser is defined via a _Parsing Expression Grammar_ (PEG) which answers many questions inherently that we must answer explicitly in this parser. The goal for this implementation is to match the characteristics of the PEG so that it can be directly swapped out and so that the only changes are better runtime performance and memory usage. + +### How does it work? + +Every serialized Gutenberg document is nominally an HTML document which, in addition to normal HTML, may also contain specially designed HTML comments -- the block comment delimiters -- which separate and isolate the blocks serialized in the document. + +This parser attempts to create a state-machine around the transitions triggered from those delimiters -- the "tokens" of the grammar. Every time we find one we should only be doing either of: + + - enter a new block; + - exit out of a block. + +Those actions have different effects depending on the context; for instance, when we exit a block we either need to add it to the output block list _or_ we need to append it as the next `innerBlock` on the parent block below it in the block stack (the place where we track open blocks). The details are documented below. 
+ +The biggest challenge in this parser is making the right accounting of indices required to construct the `innerHTML` values for each block at every level of nesting depth. We take a simple approach: + + - Start each newly opened block with an empty `innerHTML`. + - Whenever we push a first block into the `innerBlocks` list, add the content from where the content of the parent block started to where this inner block starts. + - Whenever we push another block into the `innerBlocks` list, add the content from where the previous inner block ended to where this inner block starts. + - When we close out an open block, add the content from where the last inner block ended to where the closing block delimiter starts. + - If there are no inner blocks then we take the entire content between the opening and closing block comment delimiters as the `innerHTML`. + +### I meant, how does it perform? + +This parser operates much faster than the generated parser from the specification. Because we know more about the parsing than the PEG does we can take advantage of several tricks to improve our speed and memory usage: + + - We only have one or two distinct tokens, depending on how you look at it, and they are all readily matched via a regular expression. Instead of parsing on a character-per-character basis we can allow the PCRE RegExp engine to skip over large swaths of the document for us in order to find those tokens. + - Since `preg_match()` takes an `offset` parameter we can crawl through the input without passing copies of the input text on every step. We can track our position in the string and only pass a number instead. + - Not copying all those strings means that we'll also skip many memory allocations. + +Further, tokenizing with a RegExp brings an additional advantage. 
The parser generated by the PEG provides predictable performance characteristics in exchange for control over tokenization rules -- it doesn't allow us to define RegExp patterns in the rules so as to guard against _e.g._ catastrophic backtracking that would break the PEG guarantees. + +However, since our "token language" of the block comment delimiters is _regular_ and _can_ be trivially matched with RegExp patterns, we can do that here and then something magical happens: we jump out of PHP or JavaScript and into a highly optimized RegExp engine written in C or C++ on the host system. We thereby leave the virtual machine and its overhead. + +

Code is Poetry.

diff --git a/packages/block-serialization-default-parser/package.json b/packages/block-serialization-default-parser/package.json new file mode 100644 index 00000000000000..4daf27eb065f9e --- /dev/null +++ b/packages/block-serialization-default-parser/package.json @@ -0,0 +1,29 @@ +{ + "name": "@wordpress/block-serialization-default-parser", + "version": "1.0.0-rc.0", + "description": "Block serialization specification parser for WordPress posts.", + "author": "The WordPress Contributors", + "license": "GPL-2.0-or-later", + "keywords": [ + "wordpress", + "block", + "parser" + ], + "homepage": "https://github.com/WordPress/gutenberg/tree/master/packages/block-serialization-default-parser/README.md", + "repository": { + "type": "git", + "url": "https://github.com/WordPress/gutenberg.git" + }, + "bugs": { + "url": "https://github.com/WordPress/gutenberg/issues" + }, + "main": "build/index.js", + "module": "build-module/index.js", + "react-native": "src/index", + "dependencies": { + "@babel/runtime": "^7.0.0" + }, + "publishConfig": { + "access": "public" + } +} diff --git a/packages/block-serialization-default-parser/parser.php b/packages/block-serialization-default-parser/parser.php new file mode 100644 index 00000000000000..ba124340d70643 --- /dev/null +++ b/packages/block-serialization-default-parser/parser.php @@ -0,0 +1,449 @@ + 3 ) + * + * @since 3.8.0 + * @var array|null + */ + public $attrs; + + /** + * List of inner blocks (of this same class) + * + * @since 3.8.0 + * @var WP_Block_Parser_Block[] + */ + public $innerBlocks; + + /** + * Resultant HTML from inside block comment delimiters + * after removing inner blocks + * + * @example "...Just testing..." -> "Just testing..." 
+ * + * @since 3.8.0 + * @var string + */ + public $innerHTML; + + function __construct( $name, $attrs, $innerBlocks, $innerHTML ) { + $this->blockName = $name; + $this->attrs = $attrs; + $this->innerBlocks = $innerBlocks; + $this->innerHTML = $innerHTML; + } +} + +/** + * Class WP_Block_Parser_Frame + * + * Holds partial blocks in memory while parsing + * + * @internal + * @since 3.8.0 + */ +class WP_Block_Parser_Frame { + /** + * Full or partial block + * + * @since 3.8.0 + * @var WP_Block_Parser_Block + */ + public $block; + + /** + * Byte offset into document for start of parse token + * + * @since 3.8.0 + * @var int + */ + public $token_start; + + /** + * Byte length of entire parse token string + * + * @since 3.8.0 + * @var int + */ + public $token_length; + + /** + * Byte offset into document for after parse token ends + * (used during reconstruction of stack into parse production) + * + * @since 3.8.0 + * @var int + */ + public $prev_offset; + + /** + * Byte offset into document where leading HTML before token starts + * + * @since 3.8.0 + * @var int + */ + public $leading_html_start; + + function __construct( $block, $token_start, $token_length, $prev_offset = null, $leading_html_start = null ) { + $this->block = $block; + $this->token_start = $token_start; + $this->token_length = $token_length; + $this->prev_offset = isset($prev_offset) ? $prev_offset : $token_start + $token_length; + $this->leading_html_start = $leading_html_start; + } +} + +/** + * Class WP_Block_Parser + * + * Parses a document and constructs a list of parsed block objects + * + * @since 3.8.0 + */ +class WP_Block_Parser { + /** + * Input document being parsed + * + * @example "Pre-text\nThis is inside a block!" 
+ * + * @since 3.8.0 + * @var string + */ + public $document; + + /** + * Tracks parsing progress through document + * + * @since 3.8.0 + * @var int + */ + public $offset; + + /** + * List of parsed blocks + * + * @since 3.8.0 + * @var WP_Block_Parser_Block[] + */ + public $output; + + /** + * Stack of partially-parsed structures in memory during parse + * + * @since 3.8.0 + * @var WP_Block_Parser_Frame[] + */ + public $stack; + + /** + * Parses a document and returns a list of block structures + * + * When encountering an invalid parse will return a best-effort + * parse. In contrast to the specification parser this does not + * return an error on invalid inputs. + * + * @since 3.8.0 + * + * @param string $document + * @return WP_Block_Parser_Block[] + */ + function parse( $document ) { + $this->document = $document; + $this->offset = 0; + $this->output = array(); + $this->stack = array(); + + do { + // twiddle our thumbs + } while ( $this->proceed() ); + + return $this->output; + } + + /** + * Processes the next token from the input document + * and returns whether to proceed eating more tokens + * + * This is the "next step" function that essentially + * takes a token as its input and decides what to do + * with that token before descending deeper into a + * nested block tree or continuing along the document + * or breaking out of a level of nesting. 
+ * + * @internal + * @since 3.8.0 + * @return bool + */ + function proceed() { + list( $token_type, $block_name, $attrs, $start_offset, $token_length ) = $this->next_token(); + $stack_depth = count( $this->stack ); + + switch ( $token_type ) { + case 'no-more-tokens': + // if not in a block then flush output + if ( 0 === $stack_depth ) { + $this->add_freeform(); + return false; + } + + /* + * Otherwise we have a problem + * This is an error + * + * we have options + * - treat it all as freeform text + * - assume an implicit closer (easiest when not nesting) + */ + + // for the easy case we'll assume an implicit closer + if ( 1 === $stack_depth ) { + $this->add_block_from_stack(); + return false; + } + + /* + * for the nested case where it's more difficult we'll + * have to assume that multiple closers are missing + * and so we'll collapse the whole stack piecewise + */ + while ( 0 < count( $this->stack ) ) { + $this->add_block_from_stack(); + } + return false; + + case 'void-block': + /* + * easy case is if we stumbled upon a void block + * in the top-level of the document + */ + if ( 0 === $stack_depth ) { + $this->output[] = new WP_Block_Parser_Block( $block_name, $attrs, array(), '' ); + $this->offset = $start_offset + $token_length; + return true; + } + + // otherwise we found an inner block + $this->add_inner_block( + new WP_Block_Parser_Block( $block_name, $attrs, array(), '' ), + $start_offset, + $token_length + ); + $this->offset = $start_offset + $token_length; + return true; + + case 'block-opener': + // we may have some HTML soup before the next block + $leading_html_start = $start_offset > $this->offset ? 
$this->offset : null; + + // track all newly-opened blocks on the stack + array_push( $this->stack, new WP_Block_Parser_Frame( + new WP_Block_Parser_Block( $block_name, $attrs, array(), '' ), + $start_offset, + $token_length, + $start_offset + $token_length, + $leading_html_start + ) ); + $this->offset = $start_offset + $token_length; + return true; + + case 'block-closer': + /* + * if we're missing an opener we're in trouble + * This is an error + */ + if ( 0 === $stack_depth ) { + /* + * we have options + * - assume an implicit opener + * - assume _this_ is the opener + * - give up and close out the document + */ + $this->add_freeform(); + return false; + } + + // if we're not nesting then this is easy - close the block + if ( 1 === $stack_depth ) { + $this->add_block_from_stack( $start_offset ); + $this->offset = $start_offset + $token_length; + return true; + } + + /* + * otherwise we're nested and we have to close out the current + * block and add it as a new innerBlock to the parent + */ + $stack_top = array_pop( $this->stack ); + $stack_top->block->innerHTML .= substr( $this->document, $stack_top->prev_offset, $start_offset - $stack_top->prev_offset ); + $stack_top->prev_offset = $start_offset + $token_length; + + $this->add_inner_block( + $stack_top->block, + $stack_top->token_start, + $stack_top->token_length, + $start_offset + $token_length + ); + $this->offset = $start_offset + $token_length; + return true; + + default: + // This is an error + $this->add_freeform(); + return false; + } + } + + /** + * Scans the document from where we last left off + * and finds the next valid token to parse if it exists + * + * Returns the type of the find: kind of find, block information, attributes + * + * @internal + * @since 3.8.0 + * @return array + */ + function next_token() { + $matches = null; + + /* + * aye the magic + * we're using a single RegExp to tokenize the block comment delimiters + * we're also using a trick here because the only difference between a + 
* block opener and a block closer is the leading `/` before `wp:` (and + * a closer has no attributes). we can trap them both and process the + * match back in PHP to see which one it was. + */ + $has_match = preg_match( + '/).)+?}\s+)?(?\/)?-->/s', + $this->document, + $matches, + PREG_OFFSET_CAPTURE, + $this->offset + ); + + // we have no more tokens + if ( 0 === $has_match ) { + return array( 'no-more-tokens', null, null, null, null ); + } + + list( $match, $started_at ) = $matches[ 0 ]; + + $length = strlen( $match ); + $is_closer = isset( $matches[ 'closer' ] ) && -1 !== $matches[ 'closer' ][ 1 ]; + $is_void = isset( $matches[ 'void' ] ) && -1 !== $matches[ 'void' ][ 1 ]; + $namespace = $matches[ 'namespace' ]; + $namespace = ( isset( $namespace ) && -1 !== $namespace[ 1 ] ) ? $namespace[ 0 ] : 'core/'; + $name = $namespace . $matches[ 'name' ][ 0 ]; + $has_attrs = isset( $matches[ 'attrs' ] ) && -1 !== $matches[ 'attrs' ][ 1 ]; + $attrs = $has_attrs ? json_decode( $matches[ 'attrs' ][ 0 ] ) : null; + + /* + * This state isn't allowed + * This is an error + */ + if ( $is_closer && ( $is_void || $has_attrs ) ) { + // we can ignore them since they don't hurt anything + } + + if ( $is_void ) { + return array( 'void-block', $name, $attrs, $started_at, $length ); + } + + if ( $is_closer ) { + return array( 'block-closer', $name, null, $started_at, $length ); + } + + return array( 'block-opener', $name, $attrs, $started_at, $length ); + } + + /** + * Pushes a length of text from the input document + * to the output list as a freeform block + * + * @internal + * @since 3.8.0 + * @param null $length how many bytes of document text to output + */ + function add_freeform( $length = null ) { + $length = $length ? 
$length : strlen( $this->document ) - $this->offset; + + if ( 0 === $length ) { + return; + } + + $this->output[] = array( + 'attrs' => new stdClass(), + 'innerHTML' => substr( $this->document, $this->offset, $length ), + ); + } + + /** + * Given a block structure from memory pushes + * a new block to the output list + * + * @internal + * @since 3.8.0 + * @param WP_Block_Parser_Block $block the block to add to the output + * @param int $token_start byte offset into the document where the first token for the block starts + * @param int $token_length byte length of entire block from start of opening token to end of closing token + * @param int|null $last_offset last byte offset into document if continuing from earlier output + */ + function add_inner_block(WP_Block_Parser_Block $block, $token_start, $token_length, $last_offset = null ) { + $parent = $this->stack[ count( $this->stack ) - 1 ]; + $parent->block->innerBlocks[] = $block; + $parent->block->innerHTML .= substr( $this->document, $parent->prev_offset, $token_start - $parent->prev_offset ); + $parent->prev_offset = $last_offset ? $last_offset : $token_start + $token_length; + } + + /** + * Pushes the top block from the parsing stack to the output list + * + * @internal + * @since 3.8.0 + * @param int|null $end_offset byte offset into document for where we should stop sending text output as HTML + */ + function add_block_from_stack( $end_offset = null ) { + $stack_top = array_pop( $this->stack ); + $prev_offset = $stack_top->prev_offset; + + $stack_top->block->innerHTML .= isset( $end_offset ) + ? 
substr( $this->document, $prev_offset, $end_offset - $prev_offset ) + : substr( $this->document, $prev_offset ); + + if ( isset( $stack_top->leading_html_start ) ) { + $this->output[] = array( + 'attrs' => array(), + 'innerHTML' => substr( + $this->document, + $stack_top->leading_html_start, + $stack_top->token_start - $stack_top->leading_html_start + ), + ); + } + + $this->output[] = $stack_top->block; + } +} diff --git a/packages/block-serialization-default-parser/src/index.js b/packages/block-serialization-default-parser/src/index.js new file mode 100644 index 00000000000000..4f4fdb38cd19e1 --- /dev/null +++ b/packages/block-serialization-default-parser/src/index.js @@ -0,0 +1,257 @@ +let document; +let offset; +let output; +let stack; +const tokenizer = /)[^])+?}\s+)?(\/)?-->/g; + +function Block( blockName, attrs, innerBlocks, innerHTML ) { + return { + blockName, + attrs, + innerBlocks, + innerHTML, + }; +} + +function Frame( block, tokenStart, tokenLength, prevOffset, leadingHtmlStart ) { + return { + block, + tokenStart, + tokenLength, + prevOffset: prevOffset || tokenStart + tokenLength, + leadingHtmlStart, + }; +} + +export const parse = ( doc ) => { + document = doc; + offset = 0; + output = []; + stack = []; + tokenizer.lastIndex = 0; + + do { + // twiddle our thumbs + } while ( proceed() ); + + return output; +}; + +function proceed() { + const next = nextToken(); + const [ tokenType, blockName, attrs, startOffset, tokenLength ] = next; + const stackDepth = stack.length; + + switch ( tokenType ) { + case 'no-more-tokens': + // if not in a block then flush output + if ( 0 === stackDepth ) { + addFreeform(); + return false; + } + + // Otherwise we have a problem + // This is an error + // we have options + // - treat it all as freeform text + // - assume an implicit closer (easiest when not nesting) + + // for the easy case we'll assume an implicit closer + if ( 1 === stackDepth ) { + addBlockFromStack(); + return false; + } + + // for the nested case 
where it's more difficult we'll + // have to assume that multiple closers are missing + // and so we'll collapse the whole stack piecewise + while ( 0 < stack.length ) { + addBlockFromStack(); + } + return false; + + case 'void-block': + // easy case is if we stumbled upon a void block + // in the top-level of the document + if ( 0 === stackDepth ) { + output.push( Block( blockName, attrs, [], '' ) ); + offset = startOffset + tokenLength; + return true; + } + + // otherwise we found an inner block + addInnerBlock( + Block( blockName, attrs, [], '' ), + startOffset, + tokenLength, + ); + offset = startOffset + tokenLength; + return true; + + case 'block-opener': + // we may have some HTML soup before the next block + const leadingHtmlStart = ( startOffset > offset ) ? offset : null; + + // track all newly-opened blocks on the stack + stack.push( + Frame( + Block( blockName, attrs, [], '' ), + startOffset, + tokenLength, + startOffset + tokenLength, + leadingHtmlStart, + ), + ); + offset = startOffset + tokenLength; + return true; + + case 'block-closer': + // if we're missing an opener we're in trouble + // This is an error + if ( 0 === stackDepth ) { + // we have options + // - assume an implicit opener + // - assume _this_ is the opener + // - give up and close out the document + addFreeform(); + return false; + } + + // if we're not nesting then this is easy - close the block + if ( 1 === stackDepth ) { + addBlockFromStack( startOffset ); + offset = startOffset + tokenLength; + return true; + } + + // otherwise we're nested and we have to close out the current + // block and add it as an innerBlock to the parent + const stackTop = stack.pop(); + stackTop.block.innerHTML += document.substr( + stackTop.prevOffset, + startOffset - stackTop.prevOffset, + ); + stackTop.prevOffset = startOffset + tokenLength; + + addInnerBlock( + stackTop.block, + stackTop.tokenStart, + stackTop.tokenLength, + startOffset + tokenLength, + ); + offset = startOffset + tokenLength; + 
return true; + + default: + // This is an error + addFreeform(); + return false; + } +} + +/** + * Parse JSON if valid, otherwise return null + * + * Note that JSON coming from the block comment + * delimiters is constrained to be an object + * and cannot be things like `true` or `null` + * + * @param {string} input JSON input string to parse + * @return {Object|null} parsed JSON if valid + */ +function parseJSON( input ) { + try { + return JSON.parse( input ); + } catch ( e ) { + return null; + } +} + +function nextToken() { + // aye the magic + // we're using a single RegExp to tokenize the block comment delimiters + // we're also using a trick here because the only difference between a + // block opener and a block closer is the leading `/` before `wp:` (and + // a closer has no attributes). we can trap them both and process the + // match back in Javascript to see which one it was. + const matches = tokenizer.exec( document ); + + // we have no more tokens + if ( null === matches ) { + return [ 'no-more-tokens' ]; + } + + const startedAt = matches.index; + const [ match, closerMatch, namespaceMatch, nameMatch, attrsMatch, voidMatch ] = matches; + + const length = match.length; + const isCloser = !! closerMatch; + const isVoid = !! voidMatch; + const namespace = namespaceMatch || 'core/'; + const name = namespace + nameMatch; + const hasAttrs = !! attrsMatch; + const attrs = hasAttrs ? parseJSON( attrsMatch ) : null; + + // This state isn't allowed + // This is an error + if ( isCloser && ( isVoid || hasAttrs ) ) { + // we can ignore them since they don't hurt anything + // we may warn against this at some point or reject it + } + + if ( isVoid ) { + return [ 'void-block', name, attrs, startedAt, length ]; + } + + if ( isCloser ) { + return [ 'block-closer', name, null, startedAt, length ]; + } + + return [ 'block-opener', name, attrs, startedAt, length ]; +} + +function addFreeform( rawLength ) { + const length = rawLength ? 
rawLength : document.length - offset; + + if ( 0 === length ) { + return; + } + + // why is this not a Frame? it's because the current grammar + // specifies an object that's different. we can update the + // specification and change here if we want to but for now we + // want this parser to be spec-compliant + output.push( { + attrs: {}, + innerHTML: document.substr( offset, length ), + } ); +} + +function addInnerBlock( block, tokenStart, tokenLength, lastOffset ) { + const parent = stack[ stack.length - 1 ]; + parent.block.innerBlocks.push( block ); + parent.block.innerHTML += document.substr( + parent.prevOffset, + tokenStart - parent.prevOffset, + ); + parent.prevOffset = lastOffset ? lastOffset : tokenStart + tokenLength; +} + +function addBlockFromStack( endOffset ) { + const { block, leadingHtmlStart, prevOffset, tokenStart } = stack.pop(); + + if ( endOffset ) { + block.innerHTML += document.substr( prevOffset, endOffset - prevOffset ); + } else { + block.innerHTML += document.substr( prevOffset ); + } + + if ( null !== leadingHtmlStart ) { + output.push( { + attrs: {}, + innerHTML: document.substr( leadingHtmlStart, tokenStart - leadingHtmlStart ), + } ); + } + + output.push( block ); +} diff --git a/packages/block-serialization-default-parser/test/index.js b/packages/block-serialization-default-parser/test/index.js new file mode 100644 index 00000000000000..68a46b39c97c45 --- /dev/null +++ b/packages/block-serialization-default-parser/test/index.js @@ -0,0 +1,27 @@ +/** + * Internal dependencies + */ +import { parse } from '../'; + +describe( 'block-serialization-spec-parser', () => { + test( 'parse() accepts inputs with multiple Reusable blocks', () => { + const result = parse( + '' + ); + + expect( result ).toEqual( [ + { + blockName: 'core/block', + attrs: { ref: 313 }, + innerBlocks: [], + innerHTML: '', + }, + { + blockName: 'core/block', + attrs: { ref: 482 }, + innerBlocks: [], + innerHTML: '', + }, + ] ); + } ); +} ); diff --git 
a/packages/blocks/package.json b/packages/blocks/package.json index 049637270fbaa2..c3018a510f15bf 100644 --- a/packages/blocks/package.json +++ b/packages/blocks/package.json @@ -23,6 +23,7 @@ "@babel/runtime": "^7.0.0", "@wordpress/autop": "file:../autop", "@wordpress/blob": "file:../blob", + "@wordpress/block-serialization-default-parser": "file:../block-serialization-default-parser", "@wordpress/block-serialization-spec-parser": "file:../block-serialization-spec-parser", "@wordpress/data": "file:../data", "@wordpress/deprecated": "file:../deprecated", diff --git a/packages/blocks/src/api/parser.js b/packages/blocks/src/api/parser.js index 16353552810019..6390d3a8852a35 100644 --- a/packages/blocks/src/api/parser.js +++ b/packages/blocks/src/api/parser.js @@ -9,7 +9,7 @@ import { flow, castArray, mapValues, omit, stubFalse } from 'lodash'; */ import { autop } from '@wordpress/autop'; import { applyFilters } from '@wordpress/hooks'; -import { parse as grammarParse } from '@wordpress/block-serialization-spec-parser'; +import { parse as defaultParse } from '@wordpress/block-serialization-default-parser'; /** * Internal dependencies @@ -378,6 +378,6 @@ const createParse = ( parseImplementation ) => * * @return {Array} Block list. */ -export const parseWithGrammar = createParse( grammarParse ); +export const parseWithGrammar = createParse( defaultParse ); export default parseWithGrammar; diff --git a/phpunit/class-parsing-test.php b/phpunit/class-parsing-test.php index 9bf05fcdf34e0b..b854f8e306a2bd 100644 --- a/phpunit/class-parsing-test.php +++ b/phpunit/class-parsing-test.php @@ -52,7 +52,7 @@ function strip_r( $input ) { /** * @dataProvider parsing_test_filenames */ - function test_parser_output( $html_filename, $parsed_json_filename ) { + function test_spec_parser_output( $html_filename, $parsed_json_filename ) { $html_path = self::$fixtures_dir . '/' . $html_filename; $parsed_json_path = self::$fixtures_dir . '/' . 
$parsed_json_filename; @@ -74,4 +74,32 @@ function test_parser_output( $html_filename, $parsed_json_filename ) { "File '$parsed_json_filename' does not match expected value" ); } + + /** + * @dataProvider parsing_test_filenames + */ + function test_default_parser_output( $html_filename, $parsed_json_filename ) { + // include the parser if it was not yet loaded. + require_once dirname( __FILE__ ) . '/../packages/block-serialization-default-parser/parser.php'; + $html_path = self::$fixtures_dir . '/' . $html_filename; + $parsed_json_path = self::$fixtures_dir . '/' . $parsed_json_filename; + + foreach ( array( $html_path, $parsed_json_path ) as $filename ) { + if ( ! file_exists( $filename ) ) { + throw new Exception( "Missing fixture file: '$filename'" ); + } + } + + $html = self::strip_r( file_get_contents( $html_path ) ); + $expected_parsed = json_decode( self::strip_r( file_get_contents( $parsed_json_path ) ), true ); + + $parser = new WP_Block_Parser(); + $result = json_decode( json_encode( $parser->parse( $html ) ), true ); + + $this->assertEquals( + $expected_parsed, + $result, + "File '$parsed_json_filename' does not match expected value" + ); + } } diff --git a/test/integration/full-content/full-content.spec.js b/test/integration/full-content/full-content.spec.js index 0aa2740831a9df..5c169185accaec 100644 --- a/test/integration/full-content/full-content.spec.js +++ b/test/integration/full-content/full-content.spec.js @@ -15,7 +15,7 @@ import { serialize, unstable__bootstrapServerSideBlockDefinitions, // eslint-disable-line camelcase } from '@wordpress/blocks'; -import { parse as grammarParse } from '@wordpress/block-serialization-spec-parser'; +import { parse as grammarParse } from '@wordpress/block-serialization-default-parser'; import { registerCoreBlocks } from '@wordpress/block-library'; const fixturesDir = path.join( __dirname, 'fixtures' ); diff --git a/webpack.config.js b/webpack.config.js index 748a721735c1e3..2a926a952255aa 100644 --- 
a/webpack.config.js +++ b/webpack.config.js @@ -87,6 +87,7 @@ const gutenbergPackages = [ 'autop', 'blob', 'blocks', + 'block-serialization-default-parser', 'block-serialization-spec-parser', 'compose', 'core-data',