From d09e7660c95d72849b982f3a62fdc41132e16a39 Mon Sep 17 00:00:00 2001 From: Vishnu Gopal <533+vishnugopal@users.noreply.github.com> Date: Wed, 30 Nov 2022 10:10:12 +0530 Subject: [PATCH 1/3] Add wxr-parser-large-file. This adds a more memory-efficient WXR file parser to the WordPress importer. The existing parser loads the entire WXR file into memory whereas this one uses XMLReader to parse the file in chunks. This parser implementation has been extensively tested at WordPress.com, and reuses the existing SimpleXML parser for maximum compatibility. --- .wp-env.json | 7 + src/helpers/class-xml-character-filter.php | 47 +++ src/parsers/class-wxr-parser-large-file.php | 437 ++++++++++++++++++++ src/parsers/class-wxr-parser-simplexml.php | 10 +- src/parsers/class-wxr-parser.php | 4 + src/wordpress-importer.php | 6 + 6 files changed, 510 insertions(+), 1 deletion(-) create mode 100644 .wp-env.json create mode 100644 src/helpers/class-xml-character-filter.php create mode 100644 src/parsers/class-wxr-parser-large-file.php diff --git a/.wp-env.json b/.wp-env.json new file mode 100644 index 0000000..0d73d51 --- /dev/null +++ b/.wp-env.json @@ -0,0 +1,7 @@ +{ + "plugins": ["."], + "config": { + "WP_UPLOAD_MAX_FILESIZE": "128M", + "WP_MEMORY_LIMIT": "256M" + } +} diff --git a/src/helpers/class-xml-character-filter.php b/src/helpers/class-xml-character-filter.php new file mode 100644 index 0000000..2370cc4 --- /dev/null +++ b/src/helpers/class-xml-character-filter.php @@ -0,0 +1,47 @@ +datalen; + $bucket->data = $this->replace_chars( $bucket->data ); + stream_bucket_append( $out, $bucket ); + } + + return PSFS_PASS_ON; + } + + private function replace_chars( $string ) { + return str_replace( $this->chars, ' ', $string ); + } + + public function onCreate() { + for ( $ascii_num = 0; $ascii_num < 32; $ascii_num ++ ) { + if ( $ascii_num !== 9 && $ascii_num !== 10 && $ascii_num !== 13 ) { + $this->chars[] = chr( $ascii_num ); + } + } + $this->chars[] = chr( 127 ); + + return true; + } +} + +stream_filter_register( 'xml_character_filter', 'XML_Character_Filter' ); diff --git a/src/parsers/class-wxr-parser-large-file.php b/src/parsers/class-wxr-parser-large-file.php new file mode 100644 index 0000000..f707f97 --- /dev/null +++ b/src/parsers/class-wxr-parser-large-file.php @@ -0,0 +1,437 @@ +\n"; + var $raw_footer = "\n"; + + var $tiny_header = "\n"; + var $tiny_header_size = 0; + + var $current_post = 0; + var $posts_metadata = null; + var $posts_found = 0; + var $post_fp = 0; + + var $tmp = null; + var $tmp_bytes = 0; + + var $mini_parsed_wxr = null; + + var $invalid_xml = false; + + var $do_compress = false; + var $large_file_size = 524288000; // 500 MB + + public $wxr_parser_class = 'WXR_Parser'; + + /** + * Clean up resources so that they are not orphaned when the object is + * no longer in scope. Specifically open file handles. + */ + function close() { + fclose( $this->tmp ); + } + + function __construct( $file, $override_wxr_parser = '' ) { + /** + * Detect compressed import files. + */ + $is_forced_compressed_file = self::is_file_compressed( $file ); + + $source_uri = sprintf("file://%s", realpath( $file ) ); + + if ( + ( preg_match( '/\.gz$/i', $file ) || $is_forced_compressed_file ) && + in_array( 'compress.zlib', stream_get_wrappers() ) + ) { + // 100 MB of compressed data is quite a lot + $this->large_file_size = 104857600; + $source_uri = sprintf( "compress.zlib://%s", $file ); + } + + // Prepend stream filter to strip out control characters XMLReader doesn't like + $source_uri = XML_CHARACTER_FILTER_PREFIX . $source_uri; + + if ( filesize( $file ) > $this->large_file_size && function_exists( 'gzencode' ) ) { + $this->do_compress = true; + } + + /** + * Check if the WXR_Parser class needs to be overriden for another class. + * + * This is used to plug in the Site_Importer_WXR_Parser to be used with the Large File parser. + */ + if ( $override_wxr_parser && class_exists( $override_wxr_parser ) ) { + $this->wxr_parser_class = $override_wxr_parser; + } + + // Create a file pointer with no filesystem references to act as our simple + // database container. We want no filesystem references, so that when + // the process dies the file is orphaned and space reclaimed by the OS + $tmp = tempnam( sys_get_temp_dir(), "import-" ); + $this->tmp = fopen( $tmp, 'w+' ); + unlink( $tmp ); + + // Create a similar orphaned file descriptor to house the index fata for seeking + // to posts in our data file (above). Exactly 12 bytes per item entry will be used: + // an unsigned 64 bit unsinged int for the offset and a 32 bit unsigned int for + // the length of the data. Therefore seeking to $id * 12 and unpacking the next 12 + // bytes gives us everything we need to pull data from $this->tmp + $tmp = tempnam( sys_get_temp_dir(), "import-" ); + $this->posts_metadata = fopen( $tmp, 'w+' ); + unlink( $tmp ); + + // XMLReader is a stream parser. It does not need to read the entire file, + // and is therefore very memory efficient. It uses the same parsing engine + // that simplexml does (I believe) and so should work precisely the same + $reader = new XMLReader(); + + libxml_use_internal_errors(true); + + $libxml_options = LIBXML_NOBLANKS; + + if ( defined( 'LIBXML_COMPACT' ) ) { + $libxml_options = $libxml_options | LIBXML_COMPACT; + } + + if( defined( 'LIBXML_PARSEHUGE' ) ) { + $libxml_options = $libxml_options | LIBXML_PARSEHUGE; + } + + // Using `false` here is bad practice, but we're limiting it to the open + // step, in which XMLReader (unlike other XML tools) does not attempt to + // load or parse external entities. We go back to best practice during the + // read steps. + $old_disable_entity_loader_value = libxml_disable_entity_loader( false ); + $opened = $reader->open( $source_uri, null, $libxml_options ); + libxml_disable_entity_loader( true ); + + if ( ! $opened ) { + libxml_disable_entity_loader( $old_disable_entity_loader_value ); + return new WP_Error( 'xml_parse_error', __( 'We had trouble opening the import file. Please make sure it\'s valid XML.', 'wordpress-importer') ); + } + + // Be explicit about this default behavior for the read steps + $reader->setParserProperty( XMLReader::SUBST_ENTITIES, false ); + + $writing_to = 0; + $found_channel = false; + $reader->read(); + while( true ) { + switch( $reader->name ) { + case 'channel': + $found_channel = true; + case 'rss': + // rss and rss/channel are both parts of the header or footer + // depending on whether we have an opening tag or not. + $node_name = $reader->name; + switch ( $reader->nodeType ) { + case XMLReader::ELEMENT: + // Trap any attributes on these kinds of elements so that + // we can include them in the header as well + $attrs = array(); + if ( $reader->moveToFirstAttribute() ) { + $attrs[] = sprintf( + '%s="%s"', + preg_replace( '/[^0-9a-z:]/i', '', $reader->name ), + str_replace( array( '"', '\\' ), "", $reader->value ) + ); + while ( $reader->moveToNextAttribute() ) { + $attrs[] = sprintf( + '%s="%s"', + preg_replace( '/[^0-9a-z:]/i', '', $reader->name ), + str_replace( array( '"', '\\' ), "", $reader->value ) + ); + } + } + if ( !empty( $attrs ) ) { + $this->raw_header .= "<$node_name " . implode( " ", $attrs ) . ">\n"; + $this->tiny_header .= "<$node_name " . implode( " ", $attrs ) . ">\n"; + } else { + $this->raw_header .= "<$node_name>\n"; + $this->tiny_header .= "<$node_name>\n"; + } + $attrs = array(); + break; + case XMLReader::END_ELEMENT: + $writing_to = 1; + $this->raw_footer .= "name>\n"; + break; + } + if ( !$reader->read() ) { + break 2; + } + break; + case "item": + // Write rss/channel/item elements into ur pseudo database file pointer + // and update the in-memory index about where the data starts and how + // many bytes long it is so that we know how to read it all back later. + $inner_xml = $reader->readInnerXML(); + if ( $this->do_compress ) { + $bytes = fwrite( $this->tmp, gzencode( "\n" . $inner_xml . "\n", 1 ) ); + } else { + $bytes = fwrite( $this->tmp, "\n" . $inner_xml . "\n" ); + } + $this->posts_found++; + // I do this because it's memory efficient. I can index about 44.5 million posts + // in ram this way using only 512MB + fwrite( $this->posts_metadata, pack( 'QL', $this->tmp_bytes, $bytes ) ); + $this->tmp_bytes += $bytes; + if ( !$reader->next() ) { + break 2; + } + break; + default: + if ( !$found_channel ) { + $this->raw_header .= $reader->readOuterXML() . "\n"; + } else { + if ( $reader->nodeType === XMLReader::ELEMENT ) { + if ( $writing_to === 0 ) { + switch( $reader->name ) { + case 'wp:tag': + case 'wp:author': + case 'wp:wp_author': + case 'wp:term': + case 'wp:category': + $this->raw_header .= $reader->readOuterXML() . "\n"; + break; + default: + $xml = $reader->readOuterXML(); + $this->raw_header .= $xml . "\n"; + $this->tiny_header .= $xml . "\n"; + break; + } + } else { + $this->raw_footer .= $reader->readOuterXML() . "\n"; + } + } + } + if ( !$reader->next() ) { + break 2; + } + break; + } + } + fflush( $this->tmp ); + $reader->close(); + libxml_disable_entity_loader( $old_disable_entity_loader_value ); + + // XMLReader may have come across errors caused by bad characters which don't show up on $reader->open(). + // We follow the example above and die here because the error handling in the WP_Import plugin which + // calls this parser is weird, but we can't change it for fear of breaking things for other users of the plugin. + // Dying also ensures the shutdown process deletes the temp file. + $libxml_errors = libxml_get_errors(); + libxml_clear_errors(); + if ( ! empty ( $libxml_errors ) ) { + return new WP_Error( 'xml_parse_error', __( 'We had trouble reading the import file. Please make sure it\'s valid XML.', 'wordpress-importer' ) ); + } + + $this->init_mini(); + + // If the file isn't a valid import file, $this->mini_parsed_wxr can be a WP_Error + if ( is_wp_error( $this->mini_parsed_wxr ) ) { + return; + } + + $tmp = tempnam( sys_get_temp_dir(), "import-" ); + $this->post_fp = fopen( $tmp, 'w+' ); + unlink( $tmp ); + fwrite( $this->post_fp, $this->tiny_header ); + $this->tiny_header_size = strlen( $this->tiny_header ); + } + + function init_mini() { + // initialize our persistent mini WXR data structure. This is where + // imports will read authors, tags, cats, etc from. + $tmp = tempnam( sys_get_temp_dir(), 'import-mini-' ); + $fp = fopen( $tmp, 'w+' ); + unlink( $tmp ); + fwrite( $fp, $this->raw_header ); + $this->raw_header = ''; + fwrite( $fp, $this->raw_footer ); + fflush( $fp ); + fseek( $fp, 0, SEEK_SET ); + $parser = $this->get_wxr_parser_instance(); + $this->mini_parsed_wxr = $parser->parse( $fp ); + fclose( $fp ); + } + + public function get_wxr_parser_instance() { + $parser_class = $this->wxr_parser_class; + if ( class_exists( $parser_class ) ) { + return new $parser_class(); + } + else { + // This is a precaution and fallback to the default parser if the override class doesn't exist + return new WXR_Parser(); + } + } + + /** + * Check if file is compressed with zlib/gzip. + * + * Inspired by https://stackoverflow.com/a/29268776/153310 + * + * @param string $file_path The file to check for compression + * + * @return bool + */ + public static function is_file_compressed( $file_path ) { + if ( ! is_file( $file_path ) ) { + return false; + } + + $handle = fopen( $file_path, 'r' ); + $header = fread( $handle, 8 ); + $is_compressed = 0 === strpos( $header, "\x1f" . "\x8b" . "\x08" ); + fclose( $handle ); + + return $is_compressed; + } + + // For ArrayAccess + function offsetSet( $offset, $val ) { + $this->mini_parsed_wxr[$offset] = $val; + } + + function offsetExists( $offset ) { + return isset( $this->mini_parsed_wxr[$offset] ); + } + + function offsetUnset( $offset ) { + unset( $this->mini_parsed_wxr[$offset] ); + } + + function offsetGet( $offset ) { + return $this->$offset; + } + + // For Iterator + + /** + * Provide the $post to the foreach( $posts as $post ) loop + * + * It should be noted that there is a real possibility that, if the process + * dies or is killed between making the temp file and unlinking it that we + * will leave orphaned bits of WXR on the filesystem. It's very difficult to + * make PHP clean up after itself when it's been the victim of kill -9 + */ + function current() { + // Create a real filesystem file to write a single rss/channel/item WXR into + + $index_offset = 12 * $this->current_post; + fseek( $this->posts_metadata, $index_offset, SEEK_SET ); + $index = unpack( 'Qo/Ll', fread( $this->posts_metadata, 12 ) ); + + // Hop to the appropriate starting byte in our database file. + fseek( $this->tmp, $index['o'], SEEK_SET ); + ftruncate( $this->post_fp, $this->tiny_header_size ); + fseek( $this->post_fp, $this->tiny_header_size, SEEK_SET ); + + // Compose the WXR from the header, bytes from the database for the item and footer + if ( $this->do_compress ) { + fwrite( $this->post_fp, gzdecode( fread( $this->tmp, $index['l'] ) ) ); + } else { + fwrite( $this->post_fp, fread( $this->tmp, $index['l'] ) ); + } + fwrite( $this->post_fp, $this->raw_footer ); + fflush( $this->post_fp ); + fseek( $this->post_fp, 0, SEEK_SET ); + + // Create a normal WXR_Parser data structure from the file + $parser = $this->get_wxr_parser_instance(); + $parsed = $parser->parse( $this->post_fp ); + + // Clean up + if ( is_wp_error( $parsed ) ) { + return $parsed; + } + + // There is exactly one post in this WXR so we can just return that. + // It's all we really wanted anyway. + return $parsed["posts"][0]; + } + + function key() { + return $this->current_post; + } + + function next() { + $this->current_post++; + } + + function rewind() { + $this->current_post = 0; + } + + function valid() { + $post_number = $this->current_post + 1; + return ( $post_number > 0 && $post_number <= $this->posts_found ); + } + + // Magic Methods + function __get( $key ) { + switch ( $key ) { + case 'posts': + // $this['posts'] returns $this. + // For use in the foreach( $data['posts'] as $post ) loop + return $this; + default: + // Anything else we're passing through to our embedded empty WXR + // that we keep around for just these purposes... eg: $data['authors'] + if ( isset( $this->mini_parsed_wxr[$key] ) ) { + return $this->mini_parsed_wxr[$key]; + } + return null; + } + } + + // For Countable + function count() { + return $this->posts_found; + } +} diff --git a/src/parsers/class-wxr-parser-simplexml.php b/src/parsers/class-wxr-parser-simplexml.php index 00dd110..022a59c 100644 --- a/src/parsers/class-wxr-parser-simplexml.php +++ b/src/parsers/class-wxr-parser-simplexml.php @@ -24,7 +24,15 @@ function parse( $file ) { if ( function_exists( 'libxml_disable_entity_loader' ) && PHP_VERSION_ID < 80000 ) { $old_value = libxml_disable_entity_loader( true ); } - $success = $dom->loadXML( file_get_contents( $file ) ); + + $success = false; + if ( is_resource( $file ) ) { + fseek( $file, 0, SEEK_SET ); + $success = $dom->loadXML( stream_get_contents( $file ) ); + } else { + $success = $dom->loadXML( file_get_contents( $file ) ); + } + if ( ! is_null( $old_value ) ) { libxml_disable_entity_loader( $old_value ); } diff --git a/src/parsers/class-wxr-parser.php b/src/parsers/class-wxr-parser.php index 057bf9e..148b586 100644 --- a/src/parsers/class-wxr-parser.php +++ b/src/parsers/class-wxr-parser.php @@ -11,6 +11,10 @@ */ class WXR_Parser { function parse( $file ) { + // just for testing now! + $parser = new WXR_Parser_Large_File( $file, 'WXR_Parser_SimpleXML' ); + return $parser; + // Attempt to use proper XML parsers first if ( extension_loaded( 'simplexml' ) ) { $parser = new WXR_Parser_SimpleXML; diff --git a/src/wordpress-importer.php b/src/wordpress-importer.php index 5d8824e..c0eeadd 100644 --- a/src/wordpress-importer.php +++ b/src/wordpress-importer.php @@ -36,6 +36,9 @@ /** Functions missing in older WordPress versions. */ require_once dirname( __FILE__ ) . '/compat.php'; +/** XML Character Stream Filter to sanitize XML input. */ +require_once dirname( __FILE__ ) . '/helpers/class-xml-character-filter.php'; + /** WXR_Parser class */ require_once dirname( __FILE__ ) . '/parsers/class-wxr-parser.php'; @@ -48,6 +51,9 @@ /** WXR_Parser_Regex class */ require_once dirname( __FILE__ ) . '/parsers/class-wxr-parser-regex.php'; +/** WXR_Large_File_Parser class */ +require_once dirname( __FILE__ ) . '/parsers/class-wxr-parser-large-file.php'; + /** WP_Import class */ require_once dirname( __FILE__ ) . '/class-wp-import.php'; From 93c7cb9dde432a299ee1af68b5123e873eede580 Mon Sep 17 00:00:00 2001 From: Konstantin Obenland Date: Thu, 1 Dec 2022 20:46:34 -0800 Subject: [PATCH 2/3] Polish up and add some docs --- src/helpers/class-xml-character-filter.php | 61 ++- src/parsers/class-wxr-parser-large-file.php | 507 +++++++++++++------- 2 files changed, 375 insertions(+), 193 deletions(-) diff --git a/src/helpers/class-xml-character-filter.php b/src/helpers/class-xml-character-filter.php index 2370cc4..961ca9f 100644 --- a/src/helpers/class-xml-character-filter.php +++ b/src/helpers/class-xml-character-filter.php @@ -1,26 +1,41 @@ datalen; + while ( $bucket = stream_bucket_make_writeable( $in ) ) { //phpcs:ignore WordPress.CodeAnalysis.AssignmentInCondition + $consumed += $bucket->datalen; $bucket->data = $this->replace_chars( $bucket->data ); stream_bucket_append( $out, $bucket ); } @@ -28,13 +43,14 @@ public function filter( $in, $out, &$consumed, $closing ) { return PSFS_PASS_ON; } - private function replace_chars( $string ) { - return str_replace( $this->chars, ' ', $string ); - } - + /** + * This method is called during instantiation of the filter class object. + * + * @return bool + */ public function onCreate() { - for ( $ascii_num = 0; $ascii_num < 32; $ascii_num ++ ) { - if ( $ascii_num !== 9 && $ascii_num !== 10 && $ascii_num !== 13 ) { + for ( $ascii_num = 0; $ascii_num < 32; $ascii_num++ ) { + if ( 9 !== $ascii_num && 10 !== $ascii_num && 13 !== $ascii_num ) { $this->chars[] = chr( $ascii_num ); } } @@ -42,6 +58,15 @@ public function onCreate() { return true; } -} + /** + * Replace control characters. + * + * @param string $string Data to replace. + * @return string + */ + private function replace_chars( $string ) { + return str_replace( $this->chars, ' ', $string ); + } +} stream_filter_register( 'xml_character_filter', 'XML_Character_Filter' ); diff --git a/src/parsers/class-wxr-parser-large-file.php b/src/parsers/class-wxr-parser-large-file.php index f707f97..894cfd4 100644 --- a/src/parsers/class-wxr-parser-large-file.php +++ b/src/parsers/class-wxr-parser-large-file.php @@ -1,6 +1,15 @@ \n"; - var $raw_footer = "\n"; + /** + * Holds the raw header data. + * + * @var string + */ + public $raw_header = '' . "\n"; - var $tiny_header = "\n"; - var $tiny_header_size = 0; + /** + * Holds the raw footer data. + * + * @var string + */ + public $raw_footer = "\n"; - var $current_post = 0; - var $posts_metadata = null; - var $posts_found = 0; - var $post_fp = 0; + /** + * Keeps track of header elements that are not rss/channel/item. + * + * @var string + */ + public $tiny_header = '' . "\n"; - var $tmp = null; - var $tmp_bytes = 0; + /** + * Size of the tiny header in bytes. + * + * @var int + */ + public $tiny_header_size = 0; - var $mini_parsed_wxr = null; + /** + * Index of the current post. + * + * @var int + */ + public $current_post = 0; + + /** + * The number of posts in the file. + * + * @var int + */ + public $posts_found = 0; + + /** + * A file pointer resource to a temporary file. + * + * @var false|resource|null + */ + public $tmp; - var $invalid_xml = false; + /** + * A file pointer resource to a temporary file. + * + * This file contains the posts, but not the header or footer. + * + * @var false|resource|null + */ + public $post_fp; - var $do_compress = false; - var $large_file_size = 524288000; // 500 MB + /** + * A file pointer resource to a temporary file. + * + * Contains the index data for seeking to posts in our $tmp data file. + * + * @var false|resource|null + */ + public $posts_metadata; + /** + * Used to store the index for the $posts_metadata file. + * + * @var int + */ + public $tmp_bytes = 0; + + /** + * Parsed data or WP_Error on failure. + * + * @var array|WP_Error + */ + public $mini_parsed_wxr; + + /** + * Whether to compress the temporary file. + * + * @var bool + */ + public $do_compress = false; + + /** + * The size that constitutes a large file. + * + * @var int + */ + public $large_file_size = 500 * MB_IN_BYTES; + + /** + * Parser class. + * + * @var string + */ public $wxr_parser_class = 'WXR_Parser'; /** - * Clean up resources so that they are not orphaned when the object is - * no longer in scope. Specifically open file handles. - */ - function close() { + * Clean up resources so that they are not orphaned when the object is + * no longer in scope. Specifically open file handles. + */ + public function close() { fclose( $this->tmp ); } - function __construct( $file, $override_wxr_parser = '' ) { - /** - * Detect compressed import files. - */ - $is_forced_compressed_file = self::is_file_compressed( $file ); + /** + * Constructor. + * + * @param string $file File to import. + * @param string $wxr_parser Optional. The name of the parser class to use. Default 'WXR_Parser'. + */ + public function __construct( $file, $wxr_parser = 'WXR_Parser' ) { + $this->wxr_parser_class = $wxr_parser; - $source_uri = sprintf("file://%s", realpath( $file ) ); + // Detect compressed import files. + $is_forced_compressed_file = self::is_file_compressed( $file ); + $source_uri = sprintf( 'file://%s', realpath( $file ) ); if ( ( preg_match( '/\.gz$/i', $file ) || $is_forced_compressed_file ) && - in_array( 'compress.zlib', stream_get_wrappers() ) + in_array( 'compress.zlib', stream_get_wrappers(), true ) ) { - // 100 MB of compressed data is quite a lot - $this->large_file_size = 104857600; - $source_uri = sprintf( "compress.zlib://%s", $file ); + // 100 MB of compressed data is quite a lot. + $this->large_file_size = 100 * MB_IN_BYTES; + $source_uri = sprintf( 'compress.zlib://%s', $file ); } - // Prepend stream filter to strip out control characters XMLReader doesn't like + // Prepend stream filter to strip out control characters XMLReader doesn't like. $source_uri = XML_CHARACTER_FILTER_PREFIX . $source_uri; if ( filesize( $file ) > $this->large_file_size && function_exists( 'gzencode' ) ) { $this->do_compress = true; } - /** - * Check if the WXR_Parser class needs to be overriden for another class. - * - * This is used to plug in the Site_Importer_WXR_Parser to be used with the Large File parser. + /* + * Create a file pointer with no filesystem references to act as our simple + * database container. We want no filesystem references, so that when + * the process dies the file is orphaned and space reclaimed by the OS. */ - if ( $override_wxr_parser && class_exists( $override_wxr_parser ) ) { - $this->wxr_parser_class = $override_wxr_parser; - } - - // Create a file pointer with no filesystem references to act as our simple - // database container. We want no filesystem references, so that when - // the process dies the file is orphaned and space reclaimed by the OS - $tmp = tempnam( sys_get_temp_dir(), "import-" ); + $tmp = tempnam( sys_get_temp_dir(), 'import-' ); $this->tmp = fopen( $tmp, 'w+' ); unlink( $tmp ); - // Create a similar orphaned file descriptor to house the index fata for seeking - // to posts in our data file (above). Exactly 12 bytes per item entry will be used: - // an unsigned 64 bit unsinged int for the offset and a 32 bit unsigned int for - // the length of the data. Therefore seeking to $id * 12 and unpacking the next 12 - // bytes gives us everything we need to pull data from $this->tmp - $tmp = tempnam( sys_get_temp_dir(), "import-" ); + /* + * Create a similar orphaned file descriptor to house the index data for seeking + * to posts in our data file (above). Exactly 12 bytes per item entry will be used: + * an unsigned 64 bit unsigned int for the offset and a 32 bit unsigned int for + * the length of the data. Therefore, seeking to $id * 12 and unpacking the next 12 + * bytes gives us everything we need to pull data from $this->tmp. + */ + $tmp = tempnam( sys_get_temp_dir(), 'import-' ); $this->posts_metadata = fopen( $tmp, 'w+' ); unlink( $tmp ); - // XMLReader is a stream parser. It does not need to read the entire file, - // and is therefore very memory efficient. It uses the same parsing engine - // that simplexml does (I believe) and so should work precisely the same + /* + * XMLReader is a stream parser. It does not need to read the entire file, + * and is therefore very memory efficient. It uses the same parsing engine + * that simplexml does and so should work precisely the same. + */ $reader = new XMLReader(); - libxml_use_internal_errors(true); + libxml_use_internal_errors( true ); $libxml_options = LIBXML_NOBLANKS; @@ -138,100 +222,111 @@ function __construct( $file, $override_wxr_parser = '' ) { $libxml_options = $libxml_options | LIBXML_COMPACT; } - if( defined( 'LIBXML_PARSEHUGE' ) ) { + if ( defined( 'LIBXML_PARSEHUGE' ) ) { $libxml_options = $libxml_options | LIBXML_PARSEHUGE; } - // Using `false` here is bad practice, but we're limiting it to the open - // step, in which XMLReader (unlike other XML tools) does not attempt to - // load or parse external entities. We go back to best practice during the - // read steps. + /* + * Using `false` here is bad practice, but we're limiting it to the open + * step, in which XMLReader (unlike other XML tools) does not attempt to + * load or parse external entities. We go back to best practice during the read steps. + */ $old_disable_entity_loader_value = libxml_disable_entity_loader( false ); - $opened = $reader->open( $source_uri, null, $libxml_options ); + $opened = $reader->open( $source_uri, null, $libxml_options ); libxml_disable_entity_loader( true ); if ( ! $opened ) { libxml_disable_entity_loader( $old_disable_entity_loader_value ); - return new WP_Error( 'xml_parse_error', __( 'We had trouble opening the import file. Please make sure it\'s valid XML.', 'wordpress-importer') ); + return new WP_Error( 'xml_parse_error', __( 'We had trouble opening the import file. Please make sure it’s valid XML.', 'wordpress-importer' ) ); } - // Be explicit about this default behavior for the read steps + // Be explicit about this default behavior for the read steps. $reader->setParserProperty( XMLReader::SUBST_ENTITIES, false ); - $writing_to = 0; + $writing_to = 0; $found_channel = false; $reader->read(); - while( true ) { - switch( $reader->name ) { + + while ( true ) { + switch ( $reader->name ) { case 'channel': $found_channel = true; + // Fall through to handle channel items. + case 'rss': // rss and rss/channel are both parts of the header or footer // depending on whether we have an opening tag or not. $node_name = $reader->name; - switch ( $reader->nodeType ) { + + switch ( $reader->nodeType ) { //phpcs:ignore WordPress.NamingConventions case XMLReader::ELEMENT: - // Trap any attributes on these kinds of elements so that - // we can include them in the header as well + // Trap any attributes on these kinds of elements so that we can include them in the header as well. $attrs = array(); if ( $reader->moveToFirstAttribute() ) { $attrs[] = sprintf( '%s="%s"', preg_replace( '/[^0-9a-z:]/i', '', $reader->name ), - str_replace( array( '"', '\\' ), "", $reader->value ) + str_replace( array( '"', '\\' ), '', $reader->value ) ); while ( $reader->moveToNextAttribute() ) { $attrs[] = sprintf( '%s="%s"', preg_replace( '/[^0-9a-z:]/i', '', $reader->name ), - str_replace( array( '"', '\\' ), "", $reader->value ) + str_replace( array( '"', '\\' ), '', $reader->value ) ); } } - if ( !empty( $attrs ) ) { - $this->raw_header .= "<$node_name " . implode( " ", $attrs ) . ">\n"; - $this->tiny_header .= "<$node_name " . implode( " ", $attrs ) . ">\n"; + if ( ! empty( $attrs ) ) { + $this->raw_header .= "<$node_name " . implode( ' ', $attrs ) . ">\n"; + $this->tiny_header .= "<$node_name " . implode( ' ', $attrs ) . ">\n"; } else { - $this->raw_header .= "<$node_name>\n"; + $this->raw_header .= "<$node_name>\n"; $this->tiny_header .= "<$node_name>\n"; } - $attrs = array(); break; + case XMLReader::END_ELEMENT: - $writing_to = 1; + $writing_to = 1; $this->raw_footer .= "name>\n"; break; } - if ( !$reader->read() ) { + + if ( ! $reader->read() ) { break 2; } break; - case "item": - // Write rss/channel/item elements into ur pseudo database file pointer - // and update the in-memory index about where the data starts and how - // many bytes long it is so that we know how to read it all back later. + + case 'item': + /* + * Write rss/channel/item elements into our pseudo database file pointer + * and update the in-memory index about where the data starts and how + * many bytes long it is so that we know how to read it all back later. + */ $inner_xml = $reader->readInnerXML(); + if ( $this->do_compress ) { $bytes = fwrite( $this->tmp, gzencode( "\n" . $inner_xml . "\n", 1 ) ); } else { $bytes = fwrite( $this->tmp, "\n" . $inner_xml . "\n" ); } + $this->posts_found++; - // I do this because it's memory efficient. I can index about 44.5 million posts - // in ram this way using only 512MB + + // We do this because it's memory efficient. We can index about 44.5 million posts in ram this way using only 512MB. fwrite( $this->posts_metadata, pack( 'QL', $this->tmp_bytes, $bytes ) ); $this->tmp_bytes += $bytes; - if ( !$reader->next() ) { + + if ( ! $reader->next() ) { break 2; } break; + default: - if ( !$found_channel ) { + if ( ! $found_channel ) { $this->raw_header .= $reader->readOuterXML() . "\n"; - } else { - if ( $reader->nodeType === XMLReader::ELEMENT ) { - if ( $writing_to === 0 ) { - switch( $reader->name ) { + } elseif ( XMLReader::ELEMENT === $reader->nodeType ) { //phpcs:ignore WordPress.NamingConventions + if ( 0 === $writing_to ) { + switch ( $reader->name ) { case 'wp:tag': case 'wp:author': case 'wp:wp_author': @@ -239,18 +334,19 @@ function __construct( $file, $override_wxr_parser = '' ) { case 'wp:category': $this->raw_header .= $reader->readOuterXML() . "\n"; break; + default: - $xml = $reader->readOuterXML(); - $this->raw_header .= $xml . "\n"; + $xml = $reader->readOuterXML(); + $this->raw_header .= $xml . "\n"; $this->tiny_header .= $xml . "\n"; break; - } - } else { - $this->raw_footer .= $reader->readOuterXML() . "\n"; } + } else { + $this->raw_footer .= $reader->readOuterXML() . "\n"; } } - if ( !$reader->next() ) { + + if ( ! $reader->next() ) { break 2; } break; @@ -260,53 +356,65 @@ function __construct( $file, $override_wxr_parser = '' ) { $reader->close(); libxml_disable_entity_loader( $old_disable_entity_loader_value ); - // XMLReader may have come across errors caused by bad characters which don't show up on $reader->open(). - // We follow the example above and die here because the error handling in the WP_Import plugin which - // calls this parser is weird, but we can't change it for fear of breaking things for other users of the plugin. - // Dying also ensures the shutdown process deletes the temp file. + /* + * XMLReader may have come across errors caused by bad characters which don't show up on $reader->open(). + * We follow the example above and die here because the error handling in the WP_Import plugin which + * calls this parser is weird, but we can't change it for fear of breaking things for other users of the plugin. + * Dying also ensures the shutdown process deletes the temp file. + */ $libxml_errors = libxml_get_errors(); libxml_clear_errors(); - if ( ! empty ( $libxml_errors ) ) { + if ( ! empty( $libxml_errors ) ) { return new WP_Error( 'xml_parse_error', __( 'We had trouble reading the import file. Please make sure it\'s valid XML.', 'wordpress-importer' ) ); } $this->init_mini(); - // If the file isn't a valid import file, $this->mini_parsed_wxr can be a WP_Error + // If the file isn't a valid import file, $this->mini_parsed_wxr can be a WP_Error. if ( is_wp_error( $this->mini_parsed_wxr ) ) { return; } - $tmp = tempnam( sys_get_temp_dir(), "import-" ); + $tmp = tempnam( sys_get_temp_dir(), 'import-' ); $this->post_fp = fopen( $tmp, 'w+' ); unlink( $tmp ); fwrite( $this->post_fp, $this->tiny_header ); $this->tiny_header_size = strlen( $this->tiny_header ); } - function init_mini() { - // initialize our persistent mini WXR data structure. This is where - // imports will read authors, tags, cats, etc from. + /** + * Initialize our persistent mini WXR data structure. + * + * This is where imports will read authors, tags, cats, etc from. + */ + public function init_mini() { $tmp = tempnam( sys_get_temp_dir(), 'import-mini-' ); - $fp = fopen( $tmp, 'w+' ); + $fp = fopen( $tmp, 'w+' ); unlink( $tmp ); + fwrite( $fp, $this->raw_header ); $this->raw_header = ''; fwrite( $fp, $this->raw_footer ); fflush( $fp ); fseek( $fp, 0, SEEK_SET ); - $parser = $this->get_wxr_parser_instance(); + + $parser = $this->get_wxr_parser_instance(); $this->mini_parsed_wxr = $parser->parse( $fp ); fclose( $fp ); } + /** + * Get the WXR parser instance. + * + * @return WXR_Parser + */ public function get_wxr_parser_instance() { $parser_class = $this->wxr_parser_class; + if ( class_exists( $parser_class ) ) { return new $parser_class(); - } - else { - // This is a precaution and fallback to the default parser if the override class doesn't exist + } else { + // This is a precaution and fallback to the default parser if the override class doesn't exist. return new WXR_Parser(); } } @@ -314,10 +422,9 @@ public function get_wxr_parser_instance() { /** * Check if file is compressed with zlib/gzip. * - * Inspired by https://stackoverflow.com/a/29268776/153310 - * - * @param string $file_path The file to check for compression + * Inspired by {@see https://stackoverflow.com/a/29268776/153310} * + * @param string $file_path The file to check for compression. * @return bool */ public static function is_file_compressed( $file_path ) { @@ -327,41 +434,68 @@ public static function is_file_compressed( $file_path ) { $handle = fopen( $file_path, 'r' ); $header = fread( $handle, 8 ); - $is_compressed = 0 === strpos( $header, "\x1f" . "\x8b" . "\x08" ); + + $is_compressed = 0 === strpos( $header, "\x1f\x8b\x08" ); fclose( $handle ); return $is_compressed; } - // For ArrayAccess - function offsetSet( $offset, $val ) { - $this->mini_parsed_wxr[$offset] = $val; + // For ArrayAccess. + + /** + * Whether an offset exists. + * + * @param mixed $offset An offset to check for. + * @return bool true on success or false on failure. + */ + public function offsetExists( $offset ) { + return isset( $this->mini_parsed_wxr[ $offset ] ); } - function offsetExists( $offset ) { - return isset( $this->mini_parsed_wxr[$offset] ); + /** + * Offset to retrieve. + * + * @param mixed $offset The offset to retrieve. + * @return mixed Can return all value types. + */ + public function offsetGet( $offset ) { + return $this->$offset; } - function offsetUnset( $offset ) { - unset( $this->mini_parsed_wxr[$offset] ); + /** + * Offset to set. + * + * @param mixed $offset The offset to assign the value to. + * @param mixed $value The value to set. + */ + public function offsetSet( $offset, $value ) { + $this->mini_parsed_wxr[ $offset ] = $value; } - function offsetGet( $offset ) { - return $this->$offset; + /** + * Offset to unset. + * + * @param mixed $offset The offset to unset. + */ + public function offsetUnset( $offset ) { + unset( $this->mini_parsed_wxr[ $offset ] ); } - // For Iterator + // For Iterator. /** - * Provide the $post to the foreach( $posts as $post ) loop - * - * It should be noted that there is a real possibility that, if the process - * dies or is killed between making the temp file and unlinking it that we - * will leave orphaned bits of WXR on the filesystem. It's very difficult to - * make PHP clean up after itself when it's been the victim of kill -9 - */ - function current() { - // Create a real filesystem file to write a single rss/channel/item WXR into + * Provide the $post to the foreach( $posts as $post ) loop. + * + * It should be noted that there is a real possibility that, if the process + * dies or is killed between making the temp file and unlinking it that we + * will leave orphaned bits of WXR on the filesystem. It's very difficult to + * make PHP clean up after itself when it's been the victim of kill -9. + * + * @return array|WP_Error Post array on success, WP_Error on failure. + */ + public function current() { + // Create a real filesystem file to write a single rss/channel/item WXR into. $index_offset = 12 * $this->current_post; fseek( $this->posts_metadata, $index_offset, SEEK_SET ); @@ -372,7 +506,7 @@ function current() { ftruncate( $this->post_fp, $this->tiny_header_size ); fseek( $this->post_fp, $this->tiny_header_size, SEEK_SET ); - // Compose the WXR from the header, bytes from the database for the item and footer + // Compose the WXR from the header, bytes from the database for the item and footer. if ( $this->do_compress ) { fwrite( $this->post_fp, gzdecode( fread( $this->tmp, $index['l'] ) ) ); } else { @@ -382,56 +516,79 @@ function current() { fflush( $this->post_fp ); fseek( $this->post_fp, 0, SEEK_SET ); - // Create a normal WXR_Parser data structure from the file + // Create a normal WXR_Parser data structure from the file. $parser = $this->get_wxr_parser_instance(); $parsed = $parser->parse( $this->post_fp ); - // Clean up + // Clean up. if ( is_wp_error( $parsed ) ) { return $parsed; } - // There is exactly one post in this WXR so we can just return that. - // It's all we really wanted anyway. - return $parsed["posts"][0]; + // There is exactly one post in this WXR, so we can just return that. + return $parsed['posts'][0]; } - function key() { + /** + * Move forward to next post. + */ + public function next() { + $this->current_post++; + } + + /** + * Return the current post. + * + * @return int|null Post index on success, or null on failure. + */ + public function key() { return $this->current_post; } - function next() { - $this->current_post++; + /** + * Checks if current position is valid. + * + * @return bool + */ + public function valid() { + $post_number = $this->current_post + 1; + + return $post_number > 0 && $post_number <= $this->posts_found; } - function rewind() { + /** + * Rewind the Iterator to the first post. + */ + public function rewind() { $this->current_post = 0; } - function valid() { - $post_number = $this->current_post + 1; - return ( $post_number > 0 && $post_number <= $this->posts_found ); - } + /** + * Getter. + * + * @param string|mixed $key The key to check. + * @return $this|mixed|null + */ + public function __get( $key ) { + // For use in the foreach( $data['posts'] as $post ) loop. + if ( 'posts' === $key ) { + return $this; + } - // Magic Methods - function __get( $key ) { - switch ( $key ) { - case 'posts': - // $this['posts'] returns $this. - // For use in the foreach( $data['posts'] as $post ) loop - return $this; - default: - // Anything else we're passing through to our embedded empty WXR - // that we keep around for just these purposes... eg: $data['authors'] - if ( isset( $this->mini_parsed_wxr[$key] ) ) { - return $this->mini_parsed_wxr[$key]; - } - return null; + // Anything else we're passing through to our embedded empty WXR that we keep around for just these purposes... eg: $data['authors']. + if ( isset( $this->mini_parsed_wxr[ $key ] ) ) { + return $this->mini_parsed_wxr[ $key ]; } + + return null; } - // For Countable - function count() { + /** + * Count elements of an object. + * + * @return int<0,max> The custom count as an integer. + */ + public function count() { return $this->posts_found; } } From 45524027b6e1c330c5349cacfb30bfea686cb70e Mon Sep 17 00:00:00 2001 From: Konstantin Obenland Date: Thu, 1 Dec 2022 21:02:28 -0800 Subject: [PATCH 3/3] Pinking shears --- src/parsers/class-wxr-parser.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parsers/class-wxr-parser.php b/src/parsers/class-wxr-parser.php index 148b586..d1b4c61 100644 --- a/src/parsers/class-wxr-parser.php +++ b/src/parsers/class-wxr-parser.php @@ -14,7 +14,7 @@ function parse( $file ) { // just for testing now! $parser = new WXR_Parser_Large_File( $file, 'WXR_Parser_SimpleXML' ); return $parser; - + // Attempt to use proper XML parsers first if ( extension_loaded( 'simplexml' ) ) { $parser = new WXR_Parser_SimpleXML;