diff --git a/.wp-env.json b/.wp-env.json new file mode 100644 index 0000000..0d73d51 --- /dev/null +++ b/.wp-env.json @@ -0,0 +1,7 @@ +{ + "plugins": ["."], + "config": { + "WP_UPLOAD_MAX_FILESIZE": "128M", + "WP_MEMORY_LIMIT": "256M" + } +} diff --git a/src/helpers/class-xml-character-filter.php b/src/helpers/class-xml-character-filter.php new file mode 100644 index 0000000..961ca9f --- /dev/null +++ b/src/helpers/class-xml-character-filter.php @@ -0,0 +1,72 @@ +datalen; + $bucket->data = $this->replace_chars( $bucket->data ); + stream_bucket_append( $out, $bucket ); + } + + return PSFS_PASS_ON; + } + + /** + * This method is called during instantiation of the filter class object. It builds the list of characters to replace: ASCII 0-31 except tab (9), line feed (10) and carriage return (13), plus DEL (127). + * + * @return bool + */ + public function onCreate() { + for ( $ascii_num = 0; $ascii_num < 32; $ascii_num++ ) { + if ( 9 !== $ascii_num && 10 !== $ascii_num && 13 !== $ascii_num ) { + $this->chars[] = chr( $ascii_num ); + } + } + $this->chars[] = chr( 127 ); + + return true; + } + + /** + * Replace every character listed in $this->chars with a single space. + * + * @param string $string Data to replace. + * @return string + */ + private function replace_chars( $string ) { + return str_replace( $this->chars, ' ', $string ); + } +} +stream_filter_register( 'xml_character_filter', 'XML_Character_Filter' ); diff --git a/src/parsers/class-wxr-parser-large-file.php b/src/parsers/class-wxr-parser-large-file.php new file mode 100644 index 0000000..894cfd4 --- /dev/null +++ b/src/parsers/class-wxr-parser-large-file.php @@ -0,0 +1,594 @@ +' . "\n"; + + /** + * Holds the raw footer data. + * + * @var string + */ + public $raw_footer = "\n"; + + /** + * Keeps track of header elements that are not rss/channel/item. + * + * @var string + */ + public $tiny_header = '' . "\n"; + + /** + * Size of the tiny header in bytes. + * + * @var int + */ + public $tiny_header_size = 0; + + /** + * Zero-based index of the current post. + * + * @var int + */ + public $current_post = 0; + + /** + * The number of posts in the file. 
+ * + * @var int + */ + public $posts_found = 0; + + /** + * A file pointer resource to a temporary file. The constructor writes each item's XML here (gzip-encoded when $do_compress is true). + * + * @var false|resource|null + */ + public $tmp; + + /** + * A file pointer resource to a temporary file. + * + * This file is reused by current(): it holds the tiny header, then one item, then the footer. + * + * @var false|resource|null + */ + public $post_fp; + + /** + * A file pointer resource to a temporary file. + * + * Contains the index data for seeking to posts in our $tmp data file. + * + * @var false|resource|null + */ + public $posts_metadata; + + /** + * Running byte offset into the $tmp file; recorded in $posts_metadata as the start of each item. + * + * @var int + */ + public $tmp_bytes = 0; + + /** + * Parsed data or WP_Error on failure. + * + * @var array|WP_Error + */ + public $mini_parsed_wxr; + + /** + * Whether to compress the temporary file. + * + * @var bool + */ + public $do_compress = false; + + /** + * The size that constitutes a large file. + * + * @var int + */ + public $large_file_size = 500 * MB_IN_BYTES; + + /** + * Parser class. + * + * @var string + */ + public $wxr_parser_class = 'WXR_Parser'; + + /** + * Clean up resources so that they are not orphaned when the object is + * no longer in scope. Specifically open file handles. NOTE(review): only $this->tmp is closed here; $posts_metadata and $post_fp are not. + */ + public function close() { + fclose( $this->tmp ); + } + + /** + * Constructor. NOTE(review): the `return new WP_Error(...)` statements in this method have no effect on the caller, because `new` ignores a constructor's return value. + * + * @param string $file File to import. + * @param string $wxr_parser Optional. The name of the parser class to use. Default 'WXR_Parser'. + */ + public function __construct( $file, $wxr_parser = 'WXR_Parser' ) { + $this->wxr_parser_class = $wxr_parser; + + // Detect compressed import files. + $is_forced_compressed_file = self::is_file_compressed( $file ); + $source_uri = sprintf( 'file://%s', realpath( $file ) ); + + if ( + ( preg_match( '/\.gz$/i', $file ) || $is_forced_compressed_file ) && + in_array( 'compress.zlib', stream_get_wrappers(), true ) + ) { + // 100 MB of compressed data is quite a lot. 
+ $this->large_file_size = 100 * MB_IN_BYTES; + $source_uri = sprintf( 'compress.zlib://%s', $file ); + } + + // Prepend stream filter to strip out control characters XMLReader doesn't like. + $source_uri = XML_CHARACTER_FILTER_PREFIX . $source_uri; + + if ( filesize( $file ) > $this->large_file_size && function_exists( 'gzencode' ) ) { + $this->do_compress = true; + } + + /* + * Create a file pointer with no filesystem references to act as our simple + * database container. We want no filesystem references, so that when + * the process dies the file is orphaned and space reclaimed by the OS. + */ + $tmp = tempnam( sys_get_temp_dir(), 'import-' ); + $this->tmp = fopen( $tmp, 'w+' ); + unlink( $tmp ); + + /* + * Create a similar orphaned file descriptor to house the index data for seeking + * to posts in our data file (above). Exactly 12 bytes per item entry will be used: + * a 64 bit unsigned int for the offset and a 32 bit unsigned int for + * the length of the data. Therefore, seeking to $id * 12 and unpacking the next 12 + * bytes gives us everything we need to pull data from $this->tmp. + */ + $tmp = tempnam( sys_get_temp_dir(), 'import-' ); + $this->posts_metadata = fopen( $tmp, 'w+' ); + unlink( $tmp ); + + /* + * XMLReader is a stream parser. It does not need to read the entire file, + * and is therefore very memory efficient. It uses the same parsing engine + * that simplexml does and so should work precisely the same. + */ + $reader = new XMLReader(); + + libxml_use_internal_errors( true ); + + $libxml_options = LIBXML_NOBLANKS; + + if ( defined( 'LIBXML_COMPACT' ) ) { + $libxml_options = $libxml_options | LIBXML_COMPACT; + } + + if ( defined( 'LIBXML_PARSEHUGE' ) ) { + $libxml_options = $libxml_options | LIBXML_PARSEHUGE; + } + + /* + * Using `false` here is bad practice, but we're limiting it to the open + * step, in which XMLReader (unlike other XML tools) does not attempt to + * load or parse external entities. 
We go back to best practice during the read steps. NOTE(review): libxml_disable_entity_loader() is deprecated as of PHP 8.0 and is called here without a version guard. + */ + $old_disable_entity_loader_value = libxml_disable_entity_loader( false ); + $opened = $reader->open( $source_uri, null, $libxml_options ); + libxml_disable_entity_loader( true ); + + if ( ! $opened ) { + libxml_disable_entity_loader( $old_disable_entity_loader_value ); + return new WP_Error( 'xml_parse_error', __( 'We had trouble opening the import file. Please make sure it’s valid XML.', 'wordpress-importer' ) ); + } + + // Be explicit about this default behavior for the read steps. + $reader->setParserProperty( XMLReader::SUBST_ENTITIES, false ); + + $writing_to = 0; + $found_channel = false; + $reader->read(); + + while ( true ) { + switch ( $reader->name ) { + case 'channel': + $found_channel = true; + // Fall through to handle channel items. + + case 'rss': + // rss and rss/channel are both parts of the header or footer + // depending on whether we have an opening tag or not. + $node_name = $reader->name; + + switch ( $reader->nodeType ) { //phpcs:ignore WordPress.NamingConventions + case XMLReader::ELEMENT: + // Trap any attributes on these kinds of elements so that we can include them in the header as well. + $attrs = array(); + if ( $reader->moveToFirstAttribute() ) { + $attrs[] = sprintf( + '%s="%s"', + preg_replace( '/[^0-9a-z:]/i', '', $reader->name ), + str_replace( array( '"', '\\' ), '', $reader->value ) + ); + while ( $reader->moveToNextAttribute() ) { + $attrs[] = sprintf( + '%s="%s"', + preg_replace( '/[^0-9a-z:]/i', '', $reader->name ), + str_replace( array( '"', '\\' ), '', $reader->value ) + ); + } + } + if ( ! empty( $attrs ) ) { + $this->raw_header .= "<$node_name " . implode( ' ', $attrs ) . ">\n"; + $this->tiny_header .= "<$node_name " . implode( ' ', $attrs ) . ">\n"; + } else { + $this->raw_header .= "<$node_name>\n"; + $this->tiny_header .= "<$node_name>\n"; + } + break; + + case XMLReader::END_ELEMENT: + $writing_to = 1; + $this->raw_footer .= "name>\n"; + break; + } + + if ( ! 
$reader->read() ) { + break 2; + } + break; + + case 'item': + /* + * Write rss/channel/item elements into our pseudo database file pointer + * and update the in-memory index about where the data starts and how + * many bytes long it is so that we know how to read it all back later. + */ + $inner_xml = $reader->readInnerXML(); + + if ( $this->do_compress ) { + $bytes = fwrite( $this->tmp, gzencode( "\n" . $inner_xml . "\n", 1 ) ); + } else { + $bytes = fwrite( $this->tmp, "\n" . $inner_xml . "\n" ); + } + + $this->posts_found++; + + // We do this because it's memory efficient. We can index about 44.5 million posts in ram this way using only 512MB. + fwrite( $this->posts_metadata, pack( 'QL', $this->tmp_bytes, $bytes ) ); + $this->tmp_bytes += $bytes; + + if ( ! $reader->next() ) { + break 2; + } + break; + + default: + if ( ! $found_channel ) { + $this->raw_header .= $reader->readOuterXML() . "\n"; + } elseif ( XMLReader::ELEMENT === $reader->nodeType ) { //phpcs:ignore WordPress.NamingConventions + if ( 0 === $writing_to ) { + switch ( $reader->name ) { + case 'wp:tag': + case 'wp:author': + case 'wp:wp_author': + case 'wp:term': + case 'wp:category': + $this->raw_header .= $reader->readOuterXML() . "\n"; + break; + + default: + $xml = $reader->readOuterXML(); + $this->raw_header .= $xml . "\n"; + $this->tiny_header .= $xml . "\n"; + break; + } + } else { + $this->raw_footer .= $reader->readOuterXML() . "\n"; + } + } + + if ( ! $reader->next() ) { + break 2; + } + break; + } + } + fflush( $this->tmp ); + $reader->close(); + libxml_disable_entity_loader( $old_disable_entity_loader_value ); + + /* + * XMLReader may have come across errors caused by bad characters which don't show up on $reader->open(). + * We follow the example above and die here because the error handling in the WP_Import plugin which + * calls this parser is weird, but we can't change it for fear of breaking things for other users of the plugin. 
+ * Dying also ensures the shutdown process deletes the temp file. NOTE(review): the code below returns a WP_Error instead of dying. + */ + $libxml_errors = libxml_get_errors(); + libxml_clear_errors(); + if ( ! empty( $libxml_errors ) ) { + return new WP_Error( 'xml_parse_error', __( 'We had trouble reading the import file. Please make sure it\'s valid XML.', 'wordpress-importer' ) ); + } + + $this->init_mini(); + + // If the file isn't a valid import file, $this->mini_parsed_wxr can be a WP_Error. + if ( is_wp_error( $this->mini_parsed_wxr ) ) { + return; + } + + $tmp = tempnam( sys_get_temp_dir(), 'import-' ); + $this->post_fp = fopen( $tmp, 'w+' ); + unlink( $tmp ); + fwrite( $this->post_fp, $this->tiny_header ); + $this->tiny_header_size = strlen( $this->tiny_header ); + } + + /** + * Initialize our persistent mini WXR data structure. + * + * This is where imports will read authors, tags, cats, etc from. + */ + public function init_mini() { + $tmp = tempnam( sys_get_temp_dir(), 'import-mini-' ); + $fp = fopen( $tmp, 'w+' ); + unlink( $tmp ); + + fwrite( $fp, $this->raw_header ); + $this->raw_header = ''; + fwrite( $fp, $this->raw_footer ); + fflush( $fp ); + fseek( $fp, 0, SEEK_SET ); + + $parser = $this->get_wxr_parser_instance(); + $this->mini_parsed_wxr = $parser->parse( $fp ); + fclose( $fp ); + } + + /** + * Get the WXR parser instance. + * + * @return WXR_Parser + */ + public function get_wxr_parser_instance() { + $parser_class = $this->wxr_parser_class; + + if ( class_exists( $parser_class ) ) { + return new $parser_class(); + } else { + // This is a precaution and fallback to the default parser if the override class doesn't exist. + return new WXR_Parser(); + } + } + + /** + * Check if file is compressed with zlib/gzip. + * + * Inspired by {@see https://stackoverflow.com/a/29268776/153310} + * + * @param string $file_path The file to check for compression. + * @return bool + */ + public static function is_file_compressed( $file_path ) { + if ( ! 
is_file( $file_path ) ) { + return false; + } + + $handle = fopen( $file_path, 'r' ); + $header = fread( $handle, 8 ); + + $is_compressed = 0 === strpos( $header, "\x1f\x8b\x08" ); + fclose( $handle ); + + return $is_compressed; + } + + // For ArrayAccess. + + /** + * Whether an offset exists. + * + * @param mixed $offset An offset to check for. + * @return bool true on success or false on failure. + */ + public function offsetExists( $offset ) { + return isset( $this->mini_parsed_wxr[ $offset ] ); + } + + /** + * Offset to retrieve. Reads the property named $offset on this object, so names that are not declared properties are resolved by __get(). + * + * @param mixed $offset The offset to retrieve. + * @return mixed Can return all value types. + */ + public function offsetGet( $offset ) { + return $this->$offset; + } + + /** + * Offset to set. + * + * @param mixed $offset The offset to assign the value to. + * @param mixed $value The value to set. + */ + public function offsetSet( $offset, $value ) { + $this->mini_parsed_wxr[ $offset ] = $value; + } + + /** + * Offset to unset. + * + * @param mixed $offset The offset to unset. + */ + public function offsetUnset( $offset ) { + unset( $this->mini_parsed_wxr[ $offset ] ); + } + + // For Iterator. + + /** + * Provide the $post to the foreach( $posts as $post ) loop. + * + * It should be noted that there is a real possibility that, if the process + * dies or is killed between making the temp file and unlinking it that we + * will leave orphaned bits of WXR on the filesystem. It's very difficult to + * make PHP clean up after itself when it's been the victim of kill -9. + * + * @return array|WP_Error Post array on success, WP_Error on failure. + */ + public function current() { + // Read the 12-byte index entry (offset and length) for the current post. + + $index_offset = 12 * $this->current_post; + fseek( $this->posts_metadata, $index_offset, SEEK_SET ); + $index = unpack( 'Qo/Ll', fread( $this->posts_metadata, 12 ) ); + + // Hop to the appropriate starting byte in our database file. 
+ fseek( $this->tmp, $index['o'], SEEK_SET ); + ftruncate( $this->post_fp, $this->tiny_header_size ); + fseek( $this->post_fp, $this->tiny_header_size, SEEK_SET ); + + // Compose the WXR from the header, bytes from the database for the item and footer. + if ( $this->do_compress ) { + fwrite( $this->post_fp, gzdecode( fread( $this->tmp, $index['l'] ) ) ); + } else { + fwrite( $this->post_fp, fread( $this->tmp, $index['l'] ) ); + } + fwrite( $this->post_fp, $this->raw_footer ); + fflush( $this->post_fp ); + fseek( $this->post_fp, 0, SEEK_SET ); + + // Create a normal WXR_Parser data structure from the file. + $parser = $this->get_wxr_parser_instance(); + $parsed = $parser->parse( $this->post_fp ); + + // Pass parser failures straight back to the caller. + if ( is_wp_error( $parsed ) ) { + return $parsed; + } + + // There is exactly one post in this WXR, so we can just return that. + return $parsed['posts'][0]; + } + + /** + * Move forward to next post. + */ + public function next() { + $this->current_post++; + } + + /** + * Return the index of the current post. + * + * @return int|null Post index on success, or null on failure. + */ + public function key() { + return $this->current_post; + } + + /** + * Checks if current position is valid. + * + * @return bool + */ + public function valid() { + $post_number = $this->current_post + 1; + + return $post_number > 0 && $post_number <= $this->posts_found; + } + + /** + * Rewind the Iterator to the first post. + */ + public function rewind() { + $this->current_post = 0; + } + + /** + * Getter. + * + * @param string|mixed $key The key to check. + * @return $this|mixed|null + */ + public function __get( $key ) { + // For use in the foreach( $data['posts'] as $post ) loop. + if ( 'posts' === $key ) { + return $this; + } + + // Anything else we're passing through to our embedded empty WXR that we keep around for just these purposes... eg: $data['authors']. 
+ if ( isset( $this->mini_parsed_wxr[ $key ] ) ) { + return $this->mini_parsed_wxr[ $key ]; + } + + return null; + } + + /** + * Count the posts found while indexing the file. + * + * @return int<0,max> The custom count as an integer. + */ + public function count() { + return $this->posts_found; + } +} diff --git a/src/parsers/class-wxr-parser-simplexml.php b/src/parsers/class-wxr-parser-simplexml.php index 00dd110..022a59c 100644 --- a/src/parsers/class-wxr-parser-simplexml.php +++ b/src/parsers/class-wxr-parser-simplexml.php @@ -24,7 +24,15 @@ function parse( $file ) { if ( function_exists( 'libxml_disable_entity_loader' ) && PHP_VERSION_ID < 80000 ) { $old_value = libxml_disable_entity_loader( true ); } - $success = $dom->loadXML( file_get_contents( $file ) ); + + $success = false; + if ( is_resource( $file ) ) { + fseek( $file, 0, SEEK_SET ); + $success = $dom->loadXML( stream_get_contents( $file ) ); + } else { + $success = $dom->loadXML( file_get_contents( $file ) ); + } + if ( ! is_null( $old_value ) ) { libxml_disable_entity_loader( $old_value ); } diff --git a/src/parsers/class-wxr-parser.php b/src/parsers/class-wxr-parser.php index 057bf9e..d1b4c61 100644 --- a/src/parsers/class-wxr-parser.php +++ b/src/parsers/class-wxr-parser.php @@ -11,6 +11,10 @@ */ class WXR_Parser { function parse( $file ) { + // just for testing now! NOTE(review): this unconditional return makes the rest of parse() unreachable. + $parser = new WXR_Parser_Large_File( $file, 'WXR_Parser_SimpleXML' ); + return $parser; + // Attempt to use proper XML parsers first if ( extension_loaded( 'simplexml' ) ) { $parser = new WXR_Parser_SimpleXML; diff --git a/src/wordpress-importer.php b/src/wordpress-importer.php index 5d8824e..c0eeadd 100644 --- a/src/wordpress-importer.php +++ b/src/wordpress-importer.php @@ -36,6 +36,9 @@ /** Functions missing in older WordPress versions. */ require_once dirname( __FILE__ ) . '/compat.php'; +/** XML Character Stream Filter to sanitize XML input. */ +require_once dirname( __FILE__ ) . 
'/helpers/class-xml-character-filter.php'; + /** WXR_Parser class */ require_once dirname( __FILE__ ) . '/parsers/class-wxr-parser.php'; @@ -48,6 +51,9 @@ /** WXR_Parser_Regex class */ require_once dirname( __FILE__ ) . '/parsers/class-wxr-parser-regex.php'; +/** WXR_Parser_Large_File class */ +require_once dirname( __FILE__ ) . '/parsers/class-wxr-parser-large-file.php'; + /** WP_Import class */ require_once dirname( __FILE__ ) . '/class-wp-import.php';