From 70691acadc611f1dc2acf972a6d8792614a6c2f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Fri, 10 Jan 2025 15:52:02 +0100 Subject: [PATCH] Merge /WP_Filesystem_To_Post_Tree and /WP_Filesystem_Entity_Reader --- .../playground/data-liberation/bootstrap.php | 3 +- .../playground/data-liberation/phpunit.xml | 2 +- .../src/WP_Data_Liberation_HTML_Processor.php | 9 + .../WP_Filesystem_Entity_Reader.php | 555 +++++++++++++++--- .../WP_Filesystem_To_Post_Tree.php | 413 ------------- .../src/import/WP_Import_Utils.php | 2 +- .../tests/WPFilesystemEntityReaderTests.php | 89 +++ .../tests/WPFilesystemToPostTreeTests.php | 40 -- 8 files changed, 573 insertions(+), 540 deletions(-) delete mode 100644 packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php create mode 100644 packages/playground/data-liberation/tests/WPFilesystemEntityReaderTests.php delete mode 100644 packages/playground/data-liberation/tests/WPFilesystemToPostTreeTests.php diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php index 2e17a5f6c9..0db5d76324 100644 --- a/packages/playground/data-liberation/bootstrap.php +++ b/packages/playground/data-liberation/bootstrap.php @@ -87,9 +87,10 @@ require_once __DIR__ . '/src/import/WP_Retry_Frontloading_Iterator.php'; require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php'; require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php'; -require_once __DIR__ . '/src/entity-readers/WP_Filesystem_To_Post_Tree.php'; require_once __DIR__ . '/src/entity-readers/WP_Filesystem_Entity_Reader.php'; +require_once __DIR__ . '/src/WP_Data_Liberation_HTML_Processor.php'; + require_once __DIR__ . '/src/utf8_decoder.php'; // When running in Playground, the composer autoloader script sees CLI SAPI and diff --git a/packages/playground/data-liberation/phpunit.xml b/packages/playground/data-liberation/phpunit.xml index 9c3e0dc036..8ce44f18e7 100644 --- a/packages/playground/data-liberation/phpunit.xml +++ b/packages/playground/data-liberation/phpunit.xml @@ -8,7 +8,7 @@ tests/WPMarkupProcessorConsumerTests.php tests/WPHTMLEntityReaderTests.php tests/WPURLInTextProcessorTests.php - tests/WPFilesystemToPostTreeTests.php + tests/WPFilesystemEntityReaderTests.php tests/WPBlockMarkupProcessorTests.php tests/WPBlockMarkupUrlProcessorTests.php tests/URLParserWHATWGComplianceTests.php diff --git a/packages/playground/data-liberation/src/WP_Data_Liberation_HTML_Processor.php b/packages/playground/data-liberation/src/WP_Data_Liberation_HTML_Processor.php index 5ad85d643a..4cbfc96071 100644 --- a/packages/playground/data-liberation/src/WP_Data_Liberation_HTML_Processor.php +++ b/packages/playground/data-liberation/src/WP_Data_Liberation_HTML_Processor.php @@ -50,4 +50,13 @@ public function skip_to_closer() { return false; } + + public function get_string_index_after_current_token() { + $name = 'current_token'; + $this->set_bookmark( $name ); + $bookmark = $this->bookmarks[ '_' . $name ]; + $this->release_bookmark( $name ); + return $bookmark->start + $bookmark->length; + } + } diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php index 2ac427e726..57583f0865 100644 --- a/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php +++ b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php @@ -1,77 +1,271 @@ 100, + * 'filter_pattern' => '/\.md$/', + * 'index_file_pattern' => 'index\.md', + * ]; + * $reader = WP_Filesystem_To_Post_Tree::create($filesystem, $options); + * while ($reader->next_filesystem_node()) { + * $current_filesystem_node = $reader->get_current_filesystem_node(); + * // Process the node, e.g., create a post from it. + * } + * ``` + */ +class WP_Filesystem_Entity_Reader { + /** + * The filesystem instance to read from. + * + * @var WP_Abstract_Filesystem + */ + private $fs; + + /** + * Visitor for traversing the filesystem. + * + * @var WP_Filesystem_Visitor + */ + private $file_visitor; + + /** + * The current filesystem node being processed. + * + * @var array|null + */ + private $current_filesystem_node; + + /** + * The enqueued entities to emit. + * + * @var array + */ private $entities = array(); + + /** + * The current WordPress entity. + * + * @var WP_Imported_Entity|null + */ private $current_entity; + + /** + * Files pending processing. + * + * @var array + */ + private $pending_files = array(); + + /** + * A filename to emit as the next directory index. If null, there's no matching + * directory index file and a placeholder file will be created. If false, + * we're not emitting directory indexes at all. + * + * @var string|false|null + */ + private $pending_directory_index; + + /** + * A stack of post IDs emitted at each directory depth up to the currently processed + * directory. + * + * @var array + */ + private $parent_ids = array(); + + /** + * The next post ID to assign. + * + * @var int + */ + private $next_post_id; + + /** + * Flag to determine if an index page should be created when no index file is found + * in a directory. + * + * @var bool + */ + private $create_index_pages; + + /** + * Counter for entities read so far. + * + * @var int + */ + private $fs_nodes_emited_so_far = 0; + + /** + * Pattern to filter files. + * + * @var string + */ + private $filter_pattern = '##'; + + /** + * Pattern to identify index files. + * + * @var string + */ + private $index_file_pattern = '##'; + + /** + * Flag to indicate if processing is finished. + * + * @var bool + */ + private $is_finished = false; + + /** + * The post type to emit. + */ private $post_type; + + /** + * Flag to indicate if processing is finished. + * + * @var bool + */ private $finished = false; - public function __construct( $filesystem, $options = array() ) { - $this->filesystem = $filesystem; - $this->post_type = $options['post_type'] ?? 'page'; - $this->post_tree = WP_Filesystem_To_Post_Tree::create( - $this->filesystem, - array_merge( - array( - 'root_parent_id' => null, - 'filter_pattern' => '#\.(?:md|html|xhtml|png|jpg|jpeg|gif|svg|webp|mp4)$#', - 'index_file_pattern' => '#^index\.[a-z]+$#', - ), - $options['post_tree_options'] ?? array() - ) - ); - if ( false === $this->post_tree ) { + /** + * Creates a new instance of WP_Filesystem_To_Post_Tree. + * + * @param WP_Abstract_Filesystem $filesystem The filesystem to traverse. + * @param array $options Configuration options. { + * $first_post_id => int The ID of the first post to emit. + * $filter_pattern => string A pattern to filter files by. + * $index_file_pattern => string A pattern to identify index files. + * $root_parent_id => int|null The ID of the root parent post. + * $create_index_pages => bool Whether to create index pages when no index file is found. + * } + * @return WP_Filesystem_To_Post_Tree|false The created instance or false on failure. + */ + public static function create( + \WordPress\Filesystem\WP_Abstract_Filesystem $filesystem, + $options + ) { + if ( ! isset( $options['first_post_id'] ) ) { + $options['first_post_id'] = 2; + if ( function_exists( 'get_posts' ) ) { + $max_id = get_posts( + array( + 'post_type' => 'any', + 'posts_per_page' => 1, + 'fields' => 'ids', + 'orderby' => 'ID', + 'order' => 'DESC', + ) + ); + if ( ! empty( $max_id ) ) { + $options['first_post_id'] = $max_id[0] + 1; + } + } + } + if ( 1 === $options['first_post_id'] ) { + _doing_it_wrong( __FUNCTION__, 'First node ID must be greater than 1', '1.0.0' ); return false; } + return new self( $filesystem, $options ); } - public function get_last_error(): ?string { - // @TODO: Implement this. - return null; + /** + * Initializes the reader with filesystem and options. + * + * @param WP_Abstract_Filesystem $filesystem The filesystem to traverse. + * @param array $options Configuration options. + */ + private function __construct( + WP_Abstract_Filesystem $filesystem, + $options + ) { + $this->fs = $filesystem; + $this->file_visitor = new WordPress\Filesystem\WP_Filesystem_Visitor( $filesystem ); + $this->post_type = $options['post_type'] ?? 'page'; + $this->create_index_pages = $options['create_index_pages'] ?? true; + $this->next_post_id = $options['first_post_id']; + $this->filter_pattern = $options['filter_pattern'] ?? '#\.(?:md|html|xhtml|png|jpg|jpeg|gif|svg|webp|mp4)$#'; + $this->index_file_pattern = $options['index_file_pattern'] ?? '#^index\.[a-z]+$#'; + if ( isset( $options['root_parent_id'] ) ) { + $this->parent_ids[-1] = $options['root_parent_id']; + } } + /** + * Get the current entity. + * + * @return WP_Imported_Entity|null The current entity or null if none. + */ public function get_entity() { return $this->current_entity; } + /** + * Check if the reader has finished reading the filesystem. + * + * @return bool Whether the reader has finished. + */ public function is_finished(): bool { return $this->finished; } + /** + * Read the next WordPress post or metadata entity from the filesystem. + * + * @return bool Whether an entity was read. + */ public function next_entity(): bool { + if ( $this->is_finished ) { + return false; + } + while ( true ) { while ( count( $this->entities ) > 0 ) { $this->current_entity = array_shift( $this->entities ); return true; } - if ( ! $this->post_tree->next_node() ) { + if ( ! $this->next_filesystem_node() ) { $this->finished = true; return false; } - $source_content_converter = null; - $post_tree_node = $this->post_tree->get_current_node(); + $post_tree_node = $this->get_current_filesystem_node(); + $metadata = [ + 'post_id' => $post_tree_node['post_id'], + 'post_parent' => $post_tree_node['parent_id'], + 'post_title' => $post_tree_node['post_title'] ?? null, + 'post_status' => 'publish', + 'post_type' => $this->post_type, + 'guid' => $post_tree_node['local_file_path'], + 'local_file_path' => $post_tree_node['local_file_path'], + ]; if ( $post_tree_node['type'] === 'file' ) { $extension = pathinfo( $post_tree_node['local_file_path'], PATHINFO_EXTENSION ); switch ( $extension ) { case 'md': - $content = $this->filesystem->get_contents( $post_tree_node['local_file_path'] ); - $converter = new WP_Markdown_To_Blocks( $content ); - $source_content_converter = 'md'; + $content = $this->fs->get_contents( $post_tree_node['local_file_path'] ); + $converter = new WP_Markdown_Consumer( $content ); break; case 'xhtml': - $content = $this->filesystem->get_contents( $post_tree_node['local_file_path'] ); - $converter = new WP_HTML_To_Blocks( WP_XML_Processor::create_from_string( $content ) ); - $source_content_converter = 'xhtml'; + $content = $this->fs->get_contents( $post_tree_node['local_file_path'] ); + $converter = new WP_Markup_Processor_Consumer( WP_XML_Processor::create_from_string( $content ) ); break; case 'html': - $content = $this->filesystem->get_contents( $post_tree_node['local_file_path'] ); - $converter = new WP_HTML_To_Blocks( WP_HTML_Processor::create_fragment( $content ) ); - $source_content_converter = 'html'; + $content = $this->fs->get_contents( $post_tree_node['local_file_path'] ); + $converter = new WP_Markup_Processor_Consumer( WP_HTML_Processor::create_fragment( $content ) ); break; default: $filetype = 'application/octet-stream'; @@ -81,65 +275,35 @@ public function next_entity(): bool { $filetype = $filetype['type']; } } - $this->entities[] = new WP_Imported_Entity( - 'post', - array( - 'post_id' => $post_tree_node['post_id'], - 'post_title' => sanitize_file_name( basename( $post_tree_node['local_file_path'] ) ), - 'post_status' => 'inherit', - 'post_content' => '', - 'post_mime_type' => $filetype, - 'post_type' => 'attachment', - 'post_parent' => $post_tree_node['parent_id'], - 'guid' => $post_tree_node['local_file_path'], - // The importer will use the same Filesystem instance to - // source the attachment. - 'attachment_url' => 'file://' . $post_tree_node['local_file_path'], - ) - ); - $this->entities[] = new WP_Imported_Entity( - 'post_meta', - array( - 'post_id' => $post_tree_node['post_id'], - 'key' => 'local_file_path', - 'value' => $post_tree_node['local_file_path'], - ) - ); - // We're done emiting the entity. - // wp_generate_attachment_metadata() et al. will be called by the - // importer at the database insertion step. - continue 2; + $metadata['post_mime_type'] = $filetype; + $metadata['post_status'] = 'inherit'; + $metadata['post_title'] = WP_Import_Utils::slug_to_title( basename( $post_tree_node['local_file_path'] ) ); + // The importer will use the same Filesystem instance to + // source the attachment. + $metadata['attachment_url'] = 'file://' . $post_tree_node['local_file_path']; + break; } - if ( false === $converter->convert() ) { - throw new Exception( 'Failed to convert Markdown to blocks' ); - } - $markup = $converter->get_block_markup(); - $metadata = $converter->get_all_metadata(); - } else { - $markup = ''; - $metadata = array(); - // @TODO: Accept an option to set what should we default to. - $source_content_converter = 'html'; + $result = $converter->consume(); + } else if ( $post_tree_node['type'] === 'file_placeholder' ) { + $result = new WP_Blocks_With_Metadata( + '', + array() + ); + $metadata['post_title'] = WP_Import_Utils::slug_to_title( basename( $post_tree_node['local_file_path'] ) ); } - $reader = new WP_Block_Markup_Entity_Reader( - $markup, - $metadata, + $reader = new WP_Blocks_With_Metadata_Entity_Reader( + $result, $post_tree_node['post_id'] ); while ( $reader->next_entity() ) { $entity = $reader->get_entity(); $data = $entity->get_data(); if ( $entity->get_type() === 'post' ) { - $data['id'] = $post_tree_node['post_id']; - $data['guid'] = $post_tree_node['local_file_path']; - $data['post_parent'] = $post_tree_node['parent_id']; - $data['post_title'] = $data['post_title'] ?? null; - $data['post_status'] = 'publish'; - $data['post_type'] = $this->post_type; + $data = array_merge( $metadata, $data ); if ( ! $data['post_title'] ) { - $data['post_title'] = WP_Import_Utils::slug_to_title( basename( $post_tree_node['local_file_path'] ) ); + $data['post_title'] = WP_Import_Utils::slug_to_title( basename( $metadata['local_file_path'] ) ); } $entity = new WP_Imported_Entity( $entity->get_type(), $data ); } @@ -148,9 +312,7 @@ public function next_entity(): bool { // Also emit: $additional_meta = array( - 'local_file_path' => $post_tree_node['local_file_path'], - 'source_type' => $post_tree_node['type'], - 'source_content_converter' => $source_content_converter, + 'local_file_path' => $metadata['local_file_path'], ); foreach ( $additional_meta as $key => $value ) { $this->entities[] = new WP_Imported_Entity( @@ -164,4 +326,229 @@ public function next_entity(): bool { } } } + + /** + * Retrieves the current filesystem node being processed. + * + * @return array|null The current node or null if none. + */ + private function get_current_filesystem_node() { + return $this->current_filesystem_node; + } + + /** + * Advances to the next filesystem node. + * + * @return bool True if a node is found, false if processing is complete. + */ + private function next_filesystem_node() { + $this->current_filesystem_node = null; + while ( true ) { + if ( null !== $this->pending_directory_index ) { + $dir = $this->file_visitor->get_event()->dir; + $depth = $this->file_visitor->get_current_depth(); + $parent_id = $this->parent_ids[ $depth - 1 ] ?? null; + if ( null === $parent_id && $depth > 1 ) { + // There's no parent ID even though we're a few levels deep. + // This is a scenario where `next_file()` skipped a few levels + // of directories with no relevant content in them: + // + // - /docs/ + // - /foo/ + // - /bar/ + // - /baz.md + // + // In this case, we need to backtrack and create the missing + // parent pages for /bar/ and /foo/. + + // Find the topmost missing parent ID + $missing_parent_id_depth = 1; + while ( isset( $this->parent_ids[ $missing_parent_id_depth ] ) ) { + ++$missing_parent_id_depth; + } + + // Move up to the corresponding directory + $missing_parent_path = $dir; + for ( $i = $missing_parent_id_depth; $i < $depth; $i++ ) { + $missing_parent_path = dirname( $missing_parent_path ); + } + + $this->parent_ids[ $missing_parent_id_depth ] = $this->emit_filesystem_node( + array( + 'type' => 'directory', + 'local_file_path' => $missing_parent_path, + 'parent_id' => $this->parent_ids[ $missing_parent_id_depth - 1 ] ?? null, + ) + ); + } elseif ( false === $this->pending_directory_index ) { + // No directory index candidate – let's create a fake page + // just to have something in the page tree. + $this->parent_ids[ $depth ] = $this->emit_filesystem_node( + array( + 'type' => 'file_placeholder', + 'local_file_path' => $dir, + 'parent_id' => $parent_id, + ) + ); + // We're no longer looking for a directory index. + $this->pending_directory_index = null; + } else { + $file_path = $this->pending_directory_index; + $this->parent_ids[ $depth ] = $this->emit_filesystem_node( + array( + 'type' => 'file', + 'local_file_path' => $file_path, + 'parent_id' => $parent_id, + ) + ); + // We're no longer looking for a directory index. + $this->pending_directory_index = null; + } + return true; + } + while ( count( $this->pending_files ) ) { + $parent_id = $this->parent_ids[ $this->file_visitor->get_current_depth() ] ?? null; + $file_path = array_shift( $this->pending_files ); + $this->emit_filesystem_node( + array( + 'type' => 'file', + 'local_file_path' => $file_path, + 'parent_id' => $parent_id, + ) + ); + return true; + } + + if ( false === $this->next_file() ) { + break; + } + } + $this->is_finished = true; + return false; + } + + /** + * Processes the next file in the traversal. + * + * @return bool True if a file is processed, false otherwise. + */ + private function next_file() { + $this->pending_files = array(); + while ( $this->file_visitor->next() ) { + $event = $this->file_visitor->get_event(); + + if ( $event->is_exiting() ) { + // Clean up stale IDs to save some memory when processing + // large directory trees. + unset( $this->parent_ids[ $event->dir ] ); + continue; + } + + if ( $event->is_entering() ) { + $abs_paths = array(); + foreach ( $event->files as $filename ) { + $abs_paths[] = wp_join_paths( $event->dir, $filename ); + } + $this->pending_files = array(); + foreach ( $abs_paths as $path ) { + // Add all the subdirectory into the pending files list – there's + // a chance the directory wouldn't match the filter pattern, but + // a descendant file might. + if ( $this->fs->is_dir( $path ) ) { + $this->pending_files[] = $path; + } + + // Only add the files that match the filter pattern. + if ( $this->fs->is_file( $path ) && preg_match( $this->filter_pattern, $path ) ) { + $this->pending_files[] = $path; + } + } + if ( ! count( $this->pending_files ) ) { + // Only consider directories with relevant files in them. + // Otherwise we'll create fake pages for media directories + // and other directories that don't contain any content. + // + // One corner case is when there's a few levels of directories + // with a single relevant file at the bottom: + // + // - /docs/ + // - /foo/ + // - /bar/ + // - /baz.md + // + // In this case, `next_entity()` will backtrack at baz.md and + // create the missing parent pages. + continue; + } + $directory_index_idx = $this->choose_directory_index( $this->pending_files ); + if ( -1 === $directory_index_idx ) { + $this->pending_directory_index = false; + } else { + $this->pending_directory_index = $this->pending_files[ $directory_index_idx ]; + unset( $this->pending_files[ $directory_index_idx ] ); + } + return true; + } + + return false; + } + return false; + } + + /** + * Emits a WordPress post entity based on the provided options. + * + * @param array $options Configuration for the post entity. + * @return int The ID of the created post. + */ + protected function emit_filesystem_node( $options ) { + $post_id = $this->next_post_id; + ++$this->next_post_id; + $this->current_filesystem_node = array_merge( + $options, + array( + 'post_id' => $post_id, + ) + ); + ++$this->fs_nodes_emited_so_far; + return $post_id; + } + + /** + * Chooses an index file from the list of pending files. + * + * @param array $files List of files to choose from. + * @return int The index of the chosen file or -1 if none. + */ + protected function choose_directory_index( $files ) { + foreach ( $files as $idx => $file ) { + if ( $this->looks_like_directory_index( $file ) ) { + return $idx; + } + } + if ( ! $this->create_index_pages && count( $files ) > 0 ) { + return 0; + } + return -1; + } + + /** + * Determines if a file path matches the index file pattern. + * + * @param string $path The file path to check. + * @return bool True if it matches, false otherwise. + */ + protected function looks_like_directory_index( $path ) { + return preg_match( $this->index_file_pattern, basename( $path ) ); + } + + /** + * Finds a node in the filesystem tree by its path. + * + * @param string $path The path to search for. + * @return array|null The found node or null if not found. + */ + private function find_node($path) { + // existing code... + } } diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php deleted file mode 100644 index 0980d228c1..0000000000 --- a/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php +++ /dev/null @@ -1,413 +0,0 @@ - 100, - * 'filter_pattern' => '/\.md$/', - * 'index_file_pattern' => 'index\.md', - * ]; - * $reader = WP_Filesystem_To_Post_Tree::create($filesystem, $options); - * while ($reader->next_node()) { - * $current_node = $reader->get_current_node(); - * // Process the node, e.g., create a post from it. - * } - * ``` - */ -class WP_Filesystem_To_Post_Tree { - /** - * The filesystem instance to read from. - * - * @var WP_Abstract_Filesystem - */ - private $fs; - - /** - * Visitor for traversing the filesystem. - * - * @var WP_Filesystem_Visitor - */ - private $file_visitor; - - /** - * The current node being processed. - * - * @var array|null - */ - private $current_node; - - /** - * Files pending processing. - * - * @var array - */ - private $pending_files = array(); - - /** - * A filename to emit as the next directory index. If null, there's no matching - * directory index file and a placeholder file will be created. If false, - * we're not emitting directory indexes at all. - * - * @var string|false|null - */ - private $pending_directory_index; - - /** - * A stack of post IDs emitted at each directory depth up to the currently processed - * directory. - * - * @var array - */ - private $parent_ids = array(); - - /** - * The next post ID to assign. - * - * @var int - */ - private $next_post_id; - - /** - * Flag to determine if an index page should be created when no index file is found - * in a directory. - * - * @var bool - */ - private $create_index_pages; - - /** - * Counter for entities read so far. - * - * @var int - */ - private $entities_read_so_far = 0; - - /** - * Pattern to filter files. - * - * @var string - */ - private $filter_pattern = '##'; - - /** - * Pattern to identify index files. - * - * @var string - */ - private $index_file_pattern = '##'; - - /** - * Flag to indicate if processing is finished. - * - * @var bool - */ - private $is_finished = false; - - /** - * Creates a new instance of WP_Filesystem_To_Post_Tree. - * - * @param WP_Abstract_Filesystem $filesystem The filesystem to traverse. - * @param array $options Configuration options. { - * $first_post_id => int The ID of the first post to emit. - * $filter_pattern => string A pattern to filter files by. - * $index_file_pattern => string A pattern to identify index files. - * $root_parent_id => int|null The ID of the root parent post. - * $create_index_pages => bool Whether to create index pages when no index file is found. - * } - * @return WP_Filesystem_To_Post_Tree|false The created instance or false on failure. - */ - public static function create( - \WordPress\Filesystem\WP_Abstract_Filesystem $filesystem, - $options - ) { - if ( ! isset( $options['first_post_id'] ) ) { - $options['first_post_id'] = 2; - if ( function_exists( 'get_posts' ) ) { - $max_id = get_posts( - array( - 'post_type' => 'any', - 'posts_per_page' => 1, - 'fields' => 'ids', - 'orderby' => 'ID', - 'order' => 'DESC', - ) - ); - if ( ! empty( $max_id ) ) { - $options['first_post_id'] = $max_id[0] + 1; - } - } - } - if ( 1 === $options['first_post_id'] ) { - _doing_it_wrong( __FUNCTION__, 'First node ID must be greater than 1', '1.0.0' ); - return false; - } - if ( ! isset( $options['filter_pattern'] ) ) { - _doing_it_wrong( __FUNCTION__, 'Missing required options: filter_pattern', '1.0.0' ); - return false; - } - if ( ! isset( $options['index_file_pattern'] ) ) { - _doing_it_wrong( __FUNCTION__, 'Missing required options: index_file_pattern', '1.0.0' ); - return false; - } - return new self( $filesystem, $options ); - } - - /** - * Initializes the reader with filesystem and options. - * - * @param WP_Abstract_Filesystem $filesystem The filesystem to traverse. - * @param array $options Configuration options. - */ - private function __construct( - WP_Abstract_Filesystem $filesystem, - $options - ) { - $this->fs = $filesystem; - $this->file_visitor = new WordPress\Filesystem\WP_Filesystem_Visitor( $filesystem ); - $this->create_index_pages = $options['create_index_pages'] ?? true; - $this->next_post_id = $options['first_post_id']; - $this->filter_pattern = $options['filter_pattern']; - $this->index_file_pattern = $options['index_file_pattern']; - if ( isset( $options['root_parent_id'] ) ) { - $this->parent_ids[-1] = $options['root_parent_id']; - } - } - - /** - * Retrieves the current node being processed. - * - * @return array|null The current node or null if none. - */ - public function get_current_node() { - return $this->current_node; - } - - /** - * Advances to the next node in the filesystem. - * - * @return bool True if a node is found, false if processing is complete. - */ - public function next_node() { - $this->current_node = null; - if ( $this->is_finished ) { - return false; - } - while ( true ) { - if ( null !== $this->pending_directory_index ) { - $dir = $this->file_visitor->get_event()->dir; - $depth = $this->file_visitor->get_current_depth(); - $parent_id = $this->parent_ids[ $depth - 1 ] ?? null; - if ( null === $parent_id && $depth > 1 ) { - // There's no parent ID even though we're a few levels deep. - // This is a scenario where `next_file()` skipped a few levels - // of directories with no relevant content in them: - // - // - /docs/ - // - /foo/ - // - /bar/ - // - /baz.md - // - // In this case, we need to backtrack and create the missing - // parent pages for /bar/ and /foo/. - - // Find the topmost missing parent ID - $missing_parent_id_depth = 1; - while ( isset( $this->parent_ids[ $missing_parent_id_depth ] ) ) { - ++$missing_parent_id_depth; - } - - // Move up to the corresponding directory - $missing_parent_path = $dir; - for ( $i = $missing_parent_id_depth; $i < $depth; $i++ ) { - $missing_parent_path = dirname( $missing_parent_path ); - } - - $this->parent_ids[ $missing_parent_id_depth ] = $this->emit_object( - array( - 'type' => 'directory', - 'local_file_path' => $missing_parent_path, - 'parent_id' => $this->parent_ids[ $missing_parent_id_depth - 1 ] ?? null, - ) - ); - } elseif ( false === $this->pending_directory_index ) { - // No directory index candidate – let's create a fake page - // just to have something in the page tree. - $this->parent_ids[ $depth ] = $this->emit_object( - array( - 'type' => 'file_placeholder', - 'local_file_path' => $dir, - 'parent_id' => $parent_id, - ) - ); - // We're no longer looking for a directory index. - $this->pending_directory_index = null; - } else { - $file_path = $this->pending_directory_index; - $this->parent_ids[ $depth ] = $this->emit_object( - array( - 'type' => 'file', - 'local_file_path' => $file_path, - 'parent_id' => $parent_id, - ) - ); - // We're no longer looking for a directory index. - $this->pending_directory_index = null; - } - return true; - } - while ( count( $this->pending_files ) ) { - $parent_id = $this->parent_ids[ $this->file_visitor->get_current_depth() ] ?? null; - $file_path = array_shift( $this->pending_files ); - $this->emit_object( - array( - 'type' => 'file', - 'local_file_path' => $file_path, - 'parent_id' => $parent_id, - ) - ); - return true; - } - - if ( false === $this->next_file() ) { - break; - } - } - $this->is_finished = true; - return false; - } - - /** - * Emits a WordPress post entity based on the provided options. - * - * @param array $options Configuration for the post entity. - * @return int The ID of the created post. - */ - protected function emit_object( $options ) { - $post_id = $this->next_post_id; - ++$this->next_post_id; - $this->current_node = array_merge( - $options, - array( - 'post_id' => $post_id, - ) - ); - ++$this->entities_read_so_far; - return $post_id; - } - - /** - * Processes the next file in the traversal. - * - * @return bool True if a file is processed, false otherwise. - */ - private function next_file() { - $this->pending_files = array(); - while ( $this->file_visitor->next() ) { - $event = $this->file_visitor->get_event(); - - if ( $event->is_exiting() ) { - // Clean up stale IDs to save some memory when processing - // large directory trees. - unset( $this->parent_ids[ $event->dir ] ); - continue; - } - - if ( $event->is_entering() ) { - $abs_paths = array(); - foreach ( $event->files as $filename ) { - $abs_paths[] = wp_join_paths( $event->dir, $filename ); - } - $this->pending_files = array(); - foreach ( $abs_paths as $path ) { - // Add all the subdirectory into the pending files list – there's - // a chance the directory wouldn't match the filter pattern, but - // a descendant file might. - if ( $this->fs->is_dir( $path ) ) { - $this->pending_files[] = $path; - } - - // Only add the files that match the filter pattern. - if ( $this->fs->is_file( $path ) && preg_match( $this->filter_pattern, $path ) ) { - $this->pending_files[] = $path; - } - } - if ( ! count( $this->pending_files ) ) { - // Only consider directories with relevant files in them. - // Otherwise we'll create fake pages for media directories - // and other directories that don't contain any content. - // - // One corner case is when there's a few levels of directories - // with a single relevant file at the bottom: - // - // - /docs/ - // - /foo/ - // - /bar/ - // - /baz.md - // - // In this case, `next_entity()` will backtrack at baz.md and - // create the missing parent pages. - continue; - } - $directory_index_idx = $this->choose_directory_index( $this->pending_files ); - if ( -1 === $directory_index_idx ) { - $this->pending_directory_index = false; - } else { - $this->pending_directory_index = $this->pending_files[ $directory_index_idx ]; - unset( $this->pending_files[ $directory_index_idx ] ); - } - return true; - } - - return false; - } - return false; - } - - /** - * Chooses an index file from the list of pending files. - * - * @param array $files List of files to choose from. - * @return int The index of the chosen file or -1 if none. - */ - protected function choose_directory_index( $files ) { - foreach ( $files as $idx => $file ) { - if ( $this->looks_like_directory_index( $file ) ) { - return $idx; - } - } - if ( ! $this->create_index_pages && count( $files ) > 0 ) { - return 0; - } - return -1; - } - - /** - * Determines if a file path matches the index file pattern. - * - * @param string $path The file path to check. - * @return bool True if it matches, false otherwise. - */ - protected function looks_like_directory_index( $path ) { - return preg_match( $this->index_file_pattern, basename( $path ) ); - } - - /** - * Finds a node in the filesystem tree by its path. - * - * @param string $path The path to search for. - * @return array|null The found node or null if not found. - */ - private function find_node($path) { - // existing code... - } -} diff --git a/packages/playground/data-liberation/src/import/WP_Import_Utils.php b/packages/playground/data-liberation/src/import/WP_Import_Utils.php index 868490bcaa..951f8b6c76 100644 --- a/packages/playground/data-liberation/src/import/WP_Import_Utils.php +++ b/packages/playground/data-liberation/src/import/WP_Import_Utils.php @@ -71,7 +71,7 @@ public static function slug_to_title( $filename ) { } public static function remove_first_h1_block_from_block_markup( $html ) { - $p = WP_Import_HTML_Processor::create_fragment( $html ); + $p = WP_Data_Liberation_HTML_Processor::create_fragment( $html ); if ( false === $p->next_tag() ) { return false; } diff --git a/packages/playground/data-liberation/tests/WPFilesystemEntityReaderTests.php b/packages/playground/data-liberation/tests/WPFilesystemEntityReaderTests.php new file mode 100644 index 0000000000..aeb981d893 --- /dev/null +++ b/packages/playground/data-liberation/tests/WPFilesystemEntityReaderTests.php @@ -0,0 +1,89 @@ + 2, + 'create_index_pages' => true, + 'filter_pattern' => '#\.html$#', + 'index_file_pattern' => '#root.html#', + ] + ); + $entities = []; + while ( $reader->next_entity() ) { + $entities[] = $reader->get_entity(); + } + $this->assertCount(6, $entities); + + // The root index page + // Root index page + $entity_data = $entities[0]->get_data(); + $this->assertEquals(2, $entity_data['post_id']); + $this->assertNull($entity_data['post_parent']); + $this->assertEquals('Root', $entity_data['post_title']); + $this->assertEquals('publish', $entity_data['post_status']); + $this->assertEquals('page', $entity_data['post_type']); + $this->assertEquals('/root.html', $entity_data['guid']); + $this->assertMarkupMatches( + $entity_data['post_content'], + '

This is the root page.

' + ); + + $entity_data = $entities[1]->get_data(); + $this->assertEquals(2, $entity_data['post_id']); + $this->assertEquals('local_file_path', $entity_data['key']); + $this->assertEquals('/root.html', $entity_data['value']); + + $entity_data = $entities[2]->get_data(); + $this->assertEquals(3, $entity_data['post_id']); + $this->assertEquals(2, $entity_data['post_parent']); + $this->assertEquals('Nested', $entity_data['post_title']); + $this->assertEquals('publish', $entity_data['post_status']); + $this->assertEquals('page', $entity_data['post_type']); + $this->assertEquals('/nested', $entity_data['guid']); + $this->assertMarkupMatches( + $entity_data['post_content'], + '' + ); + + $entity_data = $entities[3]->get_data(); + $this->assertEquals(3, $entity_data['post_id']); + $this->assertEquals('local_file_path', $entity_data['key']); + $this->assertEquals('/nested', $entity_data['value']); + + $entity_data = $entities[4]->get_data(); + $this->assertEquals(4, $entity_data['post_id']); + $this->assertEquals('Page 1', $entity_data['post_title']); + $this->assertEquals('publish', $entity_data['post_status']); + $this->assertEquals('page', $entity_data['post_type']); + $this->assertEquals('/nested/page1.html', $entity_data['guid']); + $this->assertMarkupMatches( + $entity_data['post_content'], + '

This is page 1.

' + ); + + $entity_data = $entities[5]->get_data(); + $this->assertEquals(4, $entity_data['post_id']); + $this->assertEquals('local_file_path', $entity_data['key']); + $this->assertEquals('/nested/page1.html', $entity_data['value']); + } + + private function assertMarkupMatches( $markup, $expected ) { + $this->assertEquals( + $this->normalize_markup( $expected ), + $this->normalize_markup( $markup ), + ); + } + + private function normalize_markup( $markup ) { + return WP_HTML_Processor::create_fragment( + preg_replace( "/\s+/", ' ', trim($markup) ) + )->serialize(); + } + +} diff --git a/packages/playground/data-liberation/tests/WPFilesystemToPostTreeTests.php b/packages/playground/data-liberation/tests/WPFilesystemToPostTreeTests.php deleted file mode 100644 index 6201cb70cb..0000000000 --- a/packages/playground/data-liberation/tests/WPFilesystemToPostTreeTests.php +++ /dev/null @@ -1,40 +0,0 @@ - 2, - 'create_index_pages' => true, - 'filter_pattern' => '#\.html$#', - 'index_file_pattern' => '#root.html#', - ] - ); - $posts = []; - while ( $reader->next_node() ) { - $posts[] = $reader->get_current_node(); - } - $this->assertCount(3, $posts); - - // The root index page - // Root index page - $this->assertEquals(2, $posts[0]['post_id']); - $this->assertNull($posts[0]['parent_id']); - $this->assertEquals('file', $posts[0]['type']); - - // Nested directory page - $this->assertEquals(3, $posts[1]['post_id']); - $this->assertEquals(2, $posts[1]['parent_id']); - $this->assertEquals('file_placeholder', $posts[1]['type']); - - // Leaf page - $this->assertEquals(4, $posts[2]['post_id']); - $this->assertEquals(3, $posts[2]['parent_id']); - $this->assertEquals('file', $posts[2]['type']); - } - -}