From d09e7660c95d72849b982f3a62fdc41132e16a39 Mon Sep 17 00:00:00 2001
From: Vishnu Gopal <533+vishnugopal@users.noreply.github.com>
Date: Wed, 30 Nov 2022 10:10:12 +0530
Subject: [PATCH 1/3] Add wxr-parser-large-file.

This adds a more memory-efficient WXR file parser to the WordPress
importer. The existing parser loads the entire WXR file into memory,
whereas this one uses XMLReader to parse the file in chunks.

This parser implementation has been extensively tested at WordPress.com,
and reuses the existing SimpleXML parser for maximum compatibility.
---
 .wp-env.json                                |   7 +
 src/helpers/class-xml-character-filter.php  |  47 +++
 src/parsers/class-wxr-parser-large-file.php | 437 ++++++++++++++++++++
 src/parsers/class-wxr-parser-simplexml.php  |  10 +-
 src/parsers/class-wxr-parser.php            |   4 +
 src/wordpress-importer.php                  |   6 +
 6 files changed, 510 insertions(+), 1 deletion(-)
 create mode 100644 .wp-env.json
 create mode 100644 src/helpers/class-xml-character-filter.php
 create mode 100644 src/parsers/class-wxr-parser-large-file.php

diff --git a/.wp-env.json b/.wp-env.json
new file mode 100644
index 0000000..0d73d51
--- /dev/null
+++ b/.wp-env.json
@@ -0,0 +1,7 @@
+{
+  "plugins": ["."],
+  "config": {
+    "WP_UPLOAD_MAX_FILESIZE": "128M",
+    "WP_MEMORY_LIMIT": "256M"
+  }
+}
diff --git a/src/helpers/class-xml-character-filter.php b/src/helpers/class-xml-character-filter.php
new file mode 100644
index 0000000..2370cc4
--- /dev/null
+++ b/src/helpers/class-xml-character-filter.php
@@ -0,0 +1,47 @@
+<?php
+
+/*
+ * String which will be prefixed to stream URLs to add this filter
+ */
+define( 'XML_CHARACTER_FILTER_PREFIX', 'php://filter/read=xml_character_filter/resource=' );
+
+/**
+ * Class XML_Character_Filter 
+ *
+ * Filter for PHP stream
+ *
+ * Remove control characters except newline, tab and return
+ *
+ * Usage: php://filter/read=xml_character_filter/resource=zip://archive.zip#import.xml;
+ */
+class XML_Character_Filter extends php_user_filter {
+
+	private $chars = [];
+
+	public function filter( $in, $out, &$consumed, $closing ) {
+		while ( $bucket = stream_bucket_make_writeable( $in ) ) {
+			$consumed     += $bucket->datalen;
+			$bucket->data = $this->replace_chars( $bucket->data );
+			stream_bucket_append( $out, $bucket );
+		}
+
+		return PSFS_PASS_ON;
+	}
+
+	private function replace_chars( $string ) {
+		return str_replace( $this->chars, ' ', $string );
+	}
+
+	public function onCreate() {
+		for ( $ascii_num = 0; $ascii_num < 32; $ascii_num ++ ) {
+			if ( $ascii_num !== 9 && $ascii_num !== 10 && $ascii_num !== 13 ) {
+				$this->chars[] = chr( $ascii_num );
+			}
+		}
+		$this->chars[] = chr( 127 );
+
+		return true;
+	}
+}
+
+stream_filter_register( 'xml_character_filter', 'XML_Character_Filter' );
diff --git a/src/parsers/class-wxr-parser-large-file.php b/src/parsers/class-wxr-parser-large-file.php
new file mode 100644
index 0000000..f707f97
--- /dev/null
+++ b/src/parsers/class-wxr-parser-large-file.php
@@ -0,0 +1,437 @@
+<?php
+/**
+ * A memory efficient drop in replacement for WXR_Parser
+ *
+ * WXR_Parser_Large_File is a drop-in replacement for WXR_Parser that should
+ * be completely compatible, but massively more memory efficient (although
+ * it is very slightly slower). The end goal of this is to allow importing
+ * very large WordPress export files without the need to split them up into
+ * smaller chunks.
+ *
+ * Example: A WXR that was 229,407,801 bytes, 10,311 posts, 21,083 comments
+ *		WXR_Parser:            7.574581 seconds, 767,295,488 bytes of RAM
+ *		WXR_Parser_Large_File: 9.951963 seconds,   8,388,608 bytes of RAM
+ * See? Memory efficient.
+ *
+ * How it works:
+ *
+ * Step 1: Read everything that isn't an rss/channel/item into a header string
+ * or a footer string. This will be used later in creating smaller WXR files.
+ *
+ * Step 2: All rss/channel/item entries are put into an open, but otherwise
+ * orphaned file handle. On insertion, an offset and byte length is recorded
+ * for the entry. This is essentially an indexed database file. Most of the
+ * extra time parsing is spent here literally just shuffling bytes around.
+ *
+ * Step 3: Parse an itemless WXR once, and store the data in memory (containing all
+ * the authors, cats, etc). Any access on the object that is not for ['posts']
+ * is instead fetched from this itemless in-memory structure (this is why
+ * ArrayAccess and __get were implemented.) ['posts'] returns itself.
+ *
+ * Step 4: Allow the class to be foreached over and counted so that it behaves
+ * like the ['posts'] from the return value of WXR_Parser::parse(). Foreach
+ * uses the current() method to get the current item.
+ *
+ * Step 5: in current(), look up the offset and length of the post data in the
+ * database, seek to the offset, read that number of bytes. Sandwich it
+ * between the header and the footer strings, and write that to a temp file.
+ * It has to be a file because WXR_Parser:parse() requires a file. Finally
+ * parse the file with the original WXR_Parser, and return ['posts'][0] (since
+ * we know there's only one post in the import that we created for this purpose).
+ *
+ * Why would we bother using WXR_Parser when we could be more efficient by
+ * also rewriting what it does? The answer is that this code is as close as
+ * possible to the original while allowing huge files to be streamed in where
+ * the original would die horribly and out of memory.
+ */
+class WXR_Parser_Large_File implements Iterator, Countable, ArrayAccess {
+	var $raw_header = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+	var $raw_footer = "\n";
+
+	var $tiny_header = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+	var $tiny_header_size = 0;
+
+	var $current_post = 0;
+	var $posts_metadata = null;
+	var $posts_found = 0;
+	var $post_fp = 0;
+
+	var $tmp = null;
+	var $tmp_bytes = 0;
+
+	var $mini_parsed_wxr = null;
+
+	var $invalid_xml = false;
+
+	var $do_compress = false;
+	var $large_file_size = 524288000; // 500 MB
+
+	public $wxr_parser_class = 'WXR_Parser';
+
+	/**
+ 	 * Clean up resources so that they are not orphaned when the object is
+ 	 * no longer in scope. Specifically open file handles.
+ 	 */
+	function close() {
+		fclose( $this->tmp );
+	}
+
+	function __construct( $file, $override_wxr_parser = '' ) {
+		/**
+		 * Detect compressed import files.
+		 */
+		$is_forced_compressed_file = self::is_file_compressed( $file );
+
+		$source_uri = sprintf("file://%s", realpath( $file ) );
+
+		if (
+			( preg_match( '/\.gz$/i', $file ) || $is_forced_compressed_file ) &&
+			in_array( 'compress.zlib', stream_get_wrappers() )
+		) {
+			// 100 MB of compressed data is quite a lot
+			$this->large_file_size = 104857600;
+			$source_uri = sprintf( "compress.zlib://%s", $file );
+		}
+
+		// Prepend stream filter to strip out control characters XMLReader doesn't like
+		$source_uri = XML_CHARACTER_FILTER_PREFIX . $source_uri;
+
+		if ( filesize( $file ) > $this->large_file_size && function_exists( 'gzencode' ) ) {
+			$this->do_compress = true;
+		}
+
+		/**
+		 * Check if the WXR_Parser class needs to be overriden for another class.
+		 *
+		 * This is used to plug in the Site_Importer_WXR_Parser to be used with the Large File parser.
+		 */
+		if ( $override_wxr_parser && class_exists( $override_wxr_parser ) ) {
+			$this->wxr_parser_class = $override_wxr_parser;
+		}
+
+		// Create a file pointer with no filesystem references to act as our simple
+		// <item> database container. We want no filesystem references, so that when
+		// the process dies the file is orphaned and space reclaimed by the OS
+		$tmp = tempnam( sys_get_temp_dir(), "import-" );
+		$this->tmp = fopen( $tmp, 'w+' );
+		unlink( $tmp );
+
+		// Create a similar orphaned file descriptor to house the index fata for seeking
+		// to posts in our data file (above). Exactly 12 bytes per item entry will be used:
+		// an unsigned 64 bit unsinged int for the offset and a 32 bit unsigned int for
+		// the length of the data. Therefore seeking to $id * 12 and unpacking the next 12
+		// bytes gives us everything we need to pull data from $this->tmp
+		$tmp = tempnam( sys_get_temp_dir(), "import-" );
+		$this->posts_metadata = fopen( $tmp, 'w+' );
+		unlink( $tmp );
+
+		// XMLReader is a stream parser. It does not need to read the entire file,
+		// and is therefore very memory efficient.  It uses the same parsing engine
+		// that simplexml does (I believe) and so should work precisely the same
+		$reader = new XMLReader();
+
+		libxml_use_internal_errors(true);
+
+		$libxml_options = LIBXML_NOBLANKS;
+
+		if ( defined( 'LIBXML_COMPACT' ) ) {
+			$libxml_options = $libxml_options | LIBXML_COMPACT;
+		}
+
+		if( defined( 'LIBXML_PARSEHUGE' ) ) {
+			$libxml_options = $libxml_options | LIBXML_PARSEHUGE;
+		}
+
+		// Using `false` here is bad practice, but we're limiting it to the open
+		// step, in which XMLReader (unlike other XML tools) does not attempt to
+		// load or parse external entities. We go back to best practice during the
+		// read steps.
+		$old_disable_entity_loader_value = libxml_disable_entity_loader( false );
+		$opened = $reader->open( $source_uri, null, $libxml_options );
+		libxml_disable_entity_loader( true );
+
+		if ( ! $opened ) {
+			libxml_disable_entity_loader( $old_disable_entity_loader_value );
+			return new WP_Error( 'xml_parse_error', __( 'We had trouble opening the import file. Please make sure it\'s valid XML.', 'wordpress-importer') );
+		}
+
+		// Be explicit about this default behavior for the read steps
+		$reader->setParserProperty( XMLReader::SUBST_ENTITIES, false );
+
+		$writing_to = 0;
+		$found_channel = false;
+		$reader->read();
+		while( true ) {
+			switch( $reader->name ) {
+				case 'channel':
+					$found_channel = true;
+				case 'rss':
+					// rss and rss/channel are both parts of the header or footer
+					// depending on whether we have an opening tag or not.
+					$node_name = $reader->name;
+					switch ( $reader->nodeType ) {
+						case XMLReader::ELEMENT:
+							// Trap any attributes on these kinds of elements so that
+							// we can include them in the header as well
+							$attrs = array();
+							if ( $reader->moveToFirstAttribute() ) {
+								$attrs[] = sprintf(
+									'%s="%s"',
+									preg_replace( '/[^0-9a-z:]/i', '', $reader->name ),
+									str_replace( array( '"', '\\' ), "", $reader->value )
+								);
+								while ( $reader->moveToNextAttribute() ) {
+									$attrs[] = sprintf(
+										'%s="%s"',
+										preg_replace( '/[^0-9a-z:]/i', '', $reader->name ),
+										str_replace( array( '"', '\\' ), "", $reader->value )
+									);
+								}
+							}
+							if ( !empty( $attrs ) ) {
+								$this->raw_header .= "<$node_name " . implode( " ", $attrs ) . ">\n";
+								$this->tiny_header .= "<$node_name " . implode( " ", $attrs ) . ">\n";
+							} else {
+								$this->raw_header .= "<$node_name>\n";
+								$this->tiny_header .= "<$node_name>\n";
+							}
+							$attrs = array();
+							break;
+						case XMLReader::END_ELEMENT:
+							$writing_to = 1;
+							$this->raw_footer .= "</$reader->name>\n";
+							break;
+					}
+					if ( !$reader->read() ) {
+						break 2;
+					}
+					break;
+				case "item":
+					// Write rss/channel/item elements into ur pseudo database file pointer
+					// and update the in-memory index about where the data starts and how
+					// many bytes long it is so that we know how to read it all back later.
+					$inner_xml = $reader->readInnerXML();
+					if ( $this->do_compress ) {
+						$bytes = fwrite( $this->tmp, gzencode( "<item>\n" . $inner_xml . "</item>\n", 1 ) );
+					} else {
+						$bytes = fwrite( $this->tmp, "<item>\n" . $inner_xml . "</item>\n" );
+					}
+					$this->posts_found++;
+					// I do this because it's memory efficient. I can index about 44.5 million posts
+					// in ram this way using only 512MB
+					fwrite( $this->posts_metadata, pack( 'QL', $this->tmp_bytes, $bytes ) );
+					$this->tmp_bytes += $bytes;
+					if ( !$reader->next() ) {
+						break 2;
+					}
+					break;
+				default:
+					if ( !$found_channel ) {
+						$this->raw_header .= $reader->readOuterXML() . "\n";
+					} else {
+						if ( $reader->nodeType === XMLReader::ELEMENT ) {
+							if ( $writing_to === 0 ) {
+								switch( $reader->name ) {
+								case 'wp:tag':
+								case 'wp:author':
+								case 'wp:wp_author':
+								case 'wp:term':
+								case 'wp:category':
+									$this->raw_header .= $reader->readOuterXML() . "\n";
+									break;
+								default:
+									$xml = $reader->readOuterXML();
+									$this->raw_header .= $xml . "\n";
+									$this->tiny_header .= $xml . "\n";
+									break;
+								}
+							} else {
+								$this->raw_footer .= $reader->readOuterXML() . "\n";
+							}
+						}
+					}
+					if ( !$reader->next() ) {
+						break 2;
+					}
+					break;
+			}
+		}
+		fflush( $this->tmp );
+		$reader->close();
+		libxml_disable_entity_loader( $old_disable_entity_loader_value );
+
+		// XMLReader may have come across errors caused by bad characters which don't show up on $reader->open().
+		// We follow the example above and die here because the error handling in the WP_Import plugin which
+		// calls this parser is weird, but we can't change it for fear of breaking things for other users of the plugin.
+		// Dying also ensures the shutdown process deletes the temp file.
+		$libxml_errors = libxml_get_errors();
+		libxml_clear_errors();
+		if ( ! empty ( $libxml_errors ) ) {
+			return new WP_Error( 'xml_parse_error', __( 'We had trouble reading the import file. Please make sure it\'s valid XML.', 'wordpress-importer' ) );
+		}
+
+		$this->init_mini();
+
+		// If the file isn't a valid import file, $this->mini_parsed_wxr can be a WP_Error
+		if ( is_wp_error( $this->mini_parsed_wxr ) ) {
+			return;
+		}
+
+		$tmp = tempnam( sys_get_temp_dir(), "import-" );
+		$this->post_fp = fopen( $tmp, 'w+' );
+		unlink( $tmp );
+		fwrite( $this->post_fp, $this->tiny_header );
+		$this->tiny_header_size = strlen( $this->tiny_header );
+	}
+
+	function init_mini() {
+		// initialize our persistent mini WXR data structure. This is where
+		// imports will read authors, tags, cats, etc from.
+		$tmp = tempnam( sys_get_temp_dir(), 'import-mini-' );
+		$fp = fopen( $tmp, 'w+' );
+		unlink( $tmp );
+		fwrite( $fp, $this->raw_header );
+		$this->raw_header = '';
+		fwrite( $fp, $this->raw_footer );
+		fflush( $fp );
+		fseek( $fp, 0, SEEK_SET );
+		$parser = $this->get_wxr_parser_instance();
+		$this->mini_parsed_wxr = $parser->parse( $fp );
+		fclose( $fp );
+	}
+
+	public function get_wxr_parser_instance() {
+		$parser_class = $this->wxr_parser_class;
+		if ( class_exists( $parser_class ) ) {
+			return new $parser_class();
+		}
+		else {
+			// This is a precaution and fallback to the default parser if the override class doesn't exist
+			return new WXR_Parser();
+		}
+	}
+
+	/**
+	 * Check if file is compressed with zlib/gzip.
+	 *
+	 * Inspired by https://stackoverflow.com/a/29268776/153310
+	 *
+	 * @param string $file_path The file to check for compression
+	 *
+	 * @return bool
+	 */
+	public static function is_file_compressed( $file_path ) {
+		if ( ! is_file( $file_path ) ) {
+			return false;
+		}
+
+		$handle = fopen( $file_path, 'r' );
+		$header = fread( $handle, 8 );
+		$is_compressed = 0 === strpos( $header, "\x1f" . "\x8b" . "\x08" );
+		fclose( $handle );
+
+		return $is_compressed;
+	}
+
+	// For ArrayAccess
+	function offsetSet( $offset, $val ) {
+		$this->mini_parsed_wxr[$offset] = $val;
+	}
+
+	function offsetExists( $offset ) {
+		return isset( $this->mini_parsed_wxr[$offset] );
+	}
+
+	function offsetUnset( $offset ) {
+		unset( $this->mini_parsed_wxr[$offset] );
+	}
+
+	function offsetGet( $offset ) {
+		return $this->$offset;
+	}
+
+	// For Iterator
+
+	/**
+ 	 * Provide the $post to the foreach( $posts as $post ) loop
+ 	 *
+ 	 * It should be noted that there is a real possibility that, if the process
+ 	 * dies or is killed between making the temp file and unlinking it that we
+ 	 * will leave orphaned bits of WXR on the filesystem. It's very difficult to
+ 	 * make PHP clean up after itself when it's been the victim of kill -9
+ 	 */
+	function current() {
+		// Create a real filesystem file to write a single rss/channel/item WXR into
+
+		$index_offset = 12 * $this->current_post;
+		fseek( $this->posts_metadata, $index_offset, SEEK_SET );
+		$index = unpack( 'Qo/Ll', fread( $this->posts_metadata, 12 ) );
+
+		// Hop to the appropriate starting byte in our database file.
+		fseek( $this->tmp, $index['o'], SEEK_SET );
+		ftruncate( $this->post_fp, $this->tiny_header_size );
+		fseek( $this->post_fp, $this->tiny_header_size, SEEK_SET );
+
+		// Compose the WXR from the header, bytes from the database for the item and footer
+		if ( $this->do_compress ) {
+			fwrite( $this->post_fp, gzdecode( fread( $this->tmp, $index['l'] ) ) );
+		} else {
+			fwrite( $this->post_fp, fread( $this->tmp, $index['l'] ) );
+		}
+		fwrite( $this->post_fp, $this->raw_footer );
+		fflush( $this->post_fp );
+		fseek( $this->post_fp, 0, SEEK_SET );
+
+		// Create a normal WXR_Parser data structure from the file
+		$parser = $this->get_wxr_parser_instance();
+		$parsed = $parser->parse( $this->post_fp );
+
+		// Clean up
+		if ( is_wp_error( $parsed ) ) {
+			return $parsed;
+		}
+
+		// There is exactly one post in this WXR so we can just return that.
+		// It's all we really wanted anyway.
+		return $parsed["posts"][0];
+	}
+
+	function key() {
+		return $this->current_post;
+	}
+
+	function next() {
+		$this->current_post++;
+	}
+
+	function rewind() {
+		$this->current_post = 0;
+	}
+
+	function valid() {
+		$post_number = $this->current_post + 1;
+		return ( $post_number > 0 && $post_number <= $this->posts_found );
+	}
+
+	// Magic Methods
+	function __get( $key ) {
+		switch ( $key ) {
+			case 'posts':
+				// $this['posts'] returns $this.
+				// For use in the foreach( $data['posts'] as $post ) loop
+				return $this;
+			default:
+				// Anything else we're passing through to our embedded empty WXR
+				// that we keep around for just these purposes... eg: $data['authors']
+				if ( isset( $this->mini_parsed_wxr[$key] ) ) {
+					return $this->mini_parsed_wxr[$key];
+				}
+				return null;
+		}
+	}
+
+	// For Countable
+	function count() {
+		return $this->posts_found;
+	}
+}
diff --git a/src/parsers/class-wxr-parser-simplexml.php b/src/parsers/class-wxr-parser-simplexml.php
index 00dd110..022a59c 100644
--- a/src/parsers/class-wxr-parser-simplexml.php
+++ b/src/parsers/class-wxr-parser-simplexml.php
@@ -24,7 +24,15 @@ function parse( $file ) {
 		if ( function_exists( 'libxml_disable_entity_loader' ) && PHP_VERSION_ID < 80000 ) {
 			$old_value = libxml_disable_entity_loader( true );
 		}
-		$success = $dom->loadXML( file_get_contents( $file ) );
+
+		$success = false;
+		if ( is_resource( $file ) ) {
+			fseek( $file, 0, SEEK_SET );
+			$success = $dom->loadXML( stream_get_contents( $file ) );
+		} else {
+			$success = $dom->loadXML( file_get_contents( $file ) );
+		}
+
 		if ( ! is_null( $old_value ) ) {
 			libxml_disable_entity_loader( $old_value );
 		}
diff --git a/src/parsers/class-wxr-parser.php b/src/parsers/class-wxr-parser.php
index 057bf9e..148b586 100644
--- a/src/parsers/class-wxr-parser.php
+++ b/src/parsers/class-wxr-parser.php
@@ -11,6 +11,10 @@
  */
 class WXR_Parser {
 	function parse( $file ) {
+		// just for testing now!
+		$parser = new WXR_Parser_Large_File( $file, 'WXR_Parser_SimpleXML' );
+		return $parser;
+		
 		// Attempt to use proper XML parsers first
 		if ( extension_loaded( 'simplexml' ) ) {
 			$parser = new WXR_Parser_SimpleXML;
diff --git a/src/wordpress-importer.php b/src/wordpress-importer.php
index 5d8824e..c0eeadd 100644
--- a/src/wordpress-importer.php
+++ b/src/wordpress-importer.php
@@ -36,6 +36,9 @@
 /** Functions missing in older WordPress versions. */
 require_once dirname( __FILE__ ) . '/compat.php';
 
+/** XML Character Stream Filter to sanitize XML input. */
+require_once dirname( __FILE__ ) . '/helpers/class-xml-character-filter.php';
+
 /** WXR_Parser class */
 require_once dirname( __FILE__ ) . '/parsers/class-wxr-parser.php';
 
@@ -48,6 +51,9 @@
 /** WXR_Parser_Regex class */
 require_once dirname( __FILE__ ) . '/parsers/class-wxr-parser-regex.php';
 
+/** WXR_Parser_Large_File class */
+require_once dirname( __FILE__ ) . '/parsers/class-wxr-parser-large-file.php';
+
 /** WP_Import class */
 require_once dirname( __FILE__ ) . '/class-wp-import.php';
 

From 93c7cb9dde432a299ee1af68b5123e873eede580 Mon Sep 17 00:00:00 2001
From: Konstantin Obenland <obenland@gmx.de>
Date: Thu, 1 Dec 2022 20:46:34 -0800
Subject: [PATCH 2/3] Polish up and add some docs

---
 src/helpers/class-xml-character-filter.php  |  61 ++-
 src/parsers/class-wxr-parser-large-file.php | 507 +++++++++++++-------
 2 files changed, 375 insertions(+), 193 deletions(-)

diff --git a/src/helpers/class-xml-character-filter.php b/src/helpers/class-xml-character-filter.php
index 2370cc4..961ca9f 100644
--- a/src/helpers/class-xml-character-filter.php
+++ b/src/helpers/class-xml-character-filter.php
@@ -1,26 +1,41 @@
 <?php
-
-/*
- * String which will be prefixed to stream URLs to add this filter
+/**
+ * XML_Character_Filter file.
+ *
+ * @package WordPress
+ * @subpackage Importer
  */
+
+// String which will be prefixed to stream URLs to add this filter.
 define( 'XML_CHARACTER_FILTER_PREFIX', 'php://filter/read=xml_character_filter/resource=' );
 
 /**
- * Class XML_Character_Filter 
+ * Class XML_Character_Filter.
  *
- * Filter for PHP stream
- *
- * Remove control characters except newline, tab and return
+ * XML Character Stream Filter to sanitize XML input. Removes control characters except newline, tab and return.
  *
  * Usage: php://filter/read=xml_character_filter/resource=zip://archive.zip#import.xml;
  */
 class XML_Character_Filter extends php_user_filter {
+	/**
+	 * List of control characters to remove.
+	 *
+	 * @var array
+	 */
+	private $chars = array();
 
-	private $chars = [];
-
+	/**
+	 * This method is called whenever data is read from or written to the attached stream (such as with fread() or fwrite()).
+	 *
+	 * @param resource $in       A resource pointing to a bucket brigade which contains one or more bucket objects containing data to be filtered.
+	 * @param resource $out      A resource pointing to a second bucket brigade into which the modified buckets should be placed.
+	 * @param int      $consumed Reference to the length of the data that the filter reads in and alters.
+	 * @param bool     $closing  Whether the stream is in the process of closing.
+	 * @return int PSFS_PASS_ON|PSFS_FEED_ME|PSFS_ERR_FATAL.
+	 */
 	public function filter( $in, $out, &$consumed, $closing ) {
-		while ( $bucket = stream_bucket_make_writeable( $in ) ) {
-			$consumed     += $bucket->datalen;
+		while ( $bucket = stream_bucket_make_writeable( $in ) ) { //phpcs:ignore WordPress.CodeAnalysis.AssignmentInCondition
+			$consumed    += $bucket->datalen;
 			$bucket->data = $this->replace_chars( $bucket->data );
 			stream_bucket_append( $out, $bucket );
 		}
@@ -28,13 +43,14 @@ public function filter( $in, $out, &$consumed, $closing ) {
 		return PSFS_PASS_ON;
 	}
 
-	private function replace_chars( $string ) {
-		return str_replace( $this->chars, ' ', $string );
-	}
-
+	/**
+	 * This method is called during instantiation of the filter class object.
+	 *
+	 * @return bool
+	 */
 	public function onCreate() {
-		for ( $ascii_num = 0; $ascii_num < 32; $ascii_num ++ ) {
-			if ( $ascii_num !== 9 && $ascii_num !== 10 && $ascii_num !== 13 ) {
+		for ( $ascii_num = 0; $ascii_num < 32; $ascii_num++ ) {
+			if ( 9 !== $ascii_num && 10 !== $ascii_num && 13 !== $ascii_num ) {
 				$this->chars[] = chr( $ascii_num );
 			}
 		}
@@ -42,6 +58,15 @@ public function onCreate() {
 
 		return true;
 	}
-}
 
+	/**
+	 * Replace control characters.
+	 *
+	 * @param string $string Data to replace.
+	 * @return string
+	 */
+	private function replace_chars( $string ) {
+		return str_replace( $this->chars, ' ', $string );
+	}
+}
 stream_filter_register( 'xml_character_filter', 'XML_Character_Filter' );
diff --git a/src/parsers/class-wxr-parser-large-file.php b/src/parsers/class-wxr-parser-large-file.php
index f707f97..894cfd4 100644
--- a/src/parsers/class-wxr-parser-large-file.php
+++ b/src/parsers/class-wxr-parser-large-file.php
@@ -1,6 +1,15 @@
 <?php
 /**
- * A memory efficient drop in replacement for WXR_Parser
+ * WordPress eXtended RSS file parser implementations.
+ *
+ * @package WordPress
+ * @subpackage Importer
+ */
+
+// phpcs:disable WordPress.WP.AlternativeFunctions
+
+/**
+ * A memory efficient drop in replacement for WXR_Parser.
  *
  * WXR_Parser_Large_File is a drop-in replacement for WXR_Parser that should
  * be completely compatible, but massively more memory efficient (although
@@ -9,13 +18,12 @@
  * smaller chunks.
  *
  * Example: A WXR that was 229,407,801 bytes, 10,311 posts, 21,083 comments
- *		WXR_Parser:            7.574581 seconds, 767,295,488 bytes of RAM
- *		WXR_Parser_Large_File: 9.951963 seconds,   8,388,608 bytes of RAM
- * See? Memory efficient.
+ *      WXR_Parser:            7.574581 seconds, 767,295,488 bytes of RAM
+ *      WXR_Parser_Large_File: 9.951963 seconds,   8,388,608 bytes of RAM
  *
  * How it works:
  *
- * Step 1: Read everything that isn't an rss/channel/item into a header string
+ * Step 1: Read everything that isn't a rss/channel/item into a header string
  * or a footer string. This will be used later in creating smaller WXR files.
  *
  * Step 2: All rss/channel/item entries are put into an open, but otherwise
@@ -23,114 +31,190 @@
  * for the entry. This is essentially an indexed database file. Most of the
  * extra time parsing is spent here literally just shuffling bytes around.
  *
- * Step 3: Parse an itemless WXR once, and store the data in memory (containing all
- * the authors, cats, etc). Any access on the object that is not for ['posts']
- * is instead fetched from this itemless in-memory structure (this is why
+ * Step 3: Parse an item-less WXR once, and store the data in memory (containing all
+ * the authors, categories, etc.). Any access on the object that is not for ['posts']
+ * is instead fetched from this item-less in-memory structure (this is why
  * ArrayAccess and __get were implemented.) ['posts'] returns itself.
  *
- * Step 4: Allow the class to be foreached over and counted so that it behaves
+ * Step 4: Allow the class to be foreach-ed over and counted so that it behaves
  * like the ['posts'] from the return value of WXR_Parser::parse(). Foreach
  * uses the current() method to get the current item.
  *
  * Step 5: in current(), look up the offset and length of the post data in the
  * database, seek to the offset, read that number of bytes. Sandwich it
  * between the header and the footer strings, and write that to a temp file.
- * It has to be a file because WXR_Parser:parse() requires a file. Finally
+ * It has to be a file because WXR_Parser::parse() requires a file. Finally,
  * parse the file with the original WXR_Parser, and return ['posts'][0] (since
  * we know there's only one post in the import that we created for this purpose).
- *
- * Why would we bother using WXR_Parser when we could be more efficient by
- * also rewriting what it does? The answer is that this code is as close as
- * possible to the original while allowing huge files to be streamed in where
- * the original would die horribly and out of memory.
  */
 class WXR_Parser_Large_File implements Iterator, Countable, ArrayAccess {
-	var $raw_header = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
-	var $raw_footer = "\n";
+	/**
+	 * Holds the raw header data.
+	 *
+	 * @var string
+	 */
+	public $raw_header = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
 
-	var $tiny_header = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
-	var $tiny_header_size = 0;
+	/**
+	 * Holds the raw footer data.
+	 *
+	 * @var string
+	 */
+	public $raw_footer = "\n";
 
-	var $current_post = 0;
-	var $posts_metadata = null;
-	var $posts_found = 0;
-	var $post_fp = 0;
+	/**
+	 * Header without the author, tag, term and category elements; prefixed to each single-item document.
+	 *
+	 * @var string
+	 */
+	public $tiny_header = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
 
-	var $tmp = null;
-	var $tmp_bytes = 0;
+	/**
+	 * Size of the tiny header in bytes.
+	 *
+	 * @var int
+	 */
+	public $tiny_header_size = 0;
 
-	var $mini_parsed_wxr = null;
+	/**
+	 * Index of the current post.
+	 *
+	 * @var int
+	 */
+	public $current_post = 0;
+
+	/**
+	 * The number of posts in the file.
+	 *
+	 * @var int
+	 */
+	public $posts_found = 0;
+
+	/**
+	 * A file pointer resource to a temporary file.
+	 *
+	 * @var false|resource|null
+	 */
+	public $tmp;
 
-	var $invalid_xml = false;
+	/**
+	 * A file pointer resource to a temporary file.
+	 *
+	 * This file holds the tiny header, a single item and the footer, rebuilt for each post.
+	 *
+	 * @var false|resource|null
+	 */
+	public $post_fp;
 
-	var $do_compress = false;
-	var $large_file_size = 524288000; // 500 MB
+	/**
+	 * A file pointer resource to a temporary file.
+	 *
+	 * Contains the index data for seeking to posts in our $tmp data file.
+	 *
+	 * @var false|resource|null
+	 */
+	public $posts_metadata;
 
+	/**
+	 * Running byte offset into the $tmp data file, recorded in $posts_metadata for each item.
+	 *
+	 * @var int
+	 */
+	public $tmp_bytes = 0;
+
+	/**
+	 * Parsed data or WP_Error on failure.
+	 *
+	 * @var array|WP_Error
+	 */
+	public $mini_parsed_wxr;
+
+	/**
+	 * Whether to compress the temporary file.
+	 *
+	 * @var bool
+	 */
+	public $do_compress = false;
+
+	/**
+	 * The size that constitutes a large file.
+	 *
+	 * @var int
+	 */
+	public $large_file_size = 500 * MB_IN_BYTES;
+
+	/**
+	 * Parser class.
+	 *
+	 * @var string
+	 */
 	public $wxr_parser_class = 'WXR_Parser';
 
 	/**
- 	 * Clean up resources so that they are not orphaned when the object is
- 	 * no longer in scope. Specifically open file handles.
- 	 */
-	function close() {
+	 * Clean up resources so that they are not orphaned when the object is
+	 * no longer in scope. Specifically open file handles.
+	 */
+	public function close() {
 		fclose( $this->tmp );
 	}
 
-	function __construct( $file, $override_wxr_parser = '' ) {
-		/**
-		 * Detect compressed import files.
-		 */
-		$is_forced_compressed_file = self::is_file_compressed( $file );
+	/**
+	 * Constructor.
+	 *
+	 * @param string $file       File to import.
+	 * @param string $wxr_parser Optional. The name of the parser class to use. Default 'WXR_Parser'.
+	 */
+	public function __construct( $file, $wxr_parser = 'WXR_Parser' ) {
+		$this->wxr_parser_class = $wxr_parser;
 
-		$source_uri = sprintf("file://%s", realpath( $file ) );
+		// Detect compressed import files.
+		$is_forced_compressed_file = self::is_file_compressed( $file );
+		$source_uri                = sprintf( 'file://%s', realpath( $file ) );
 
 		if (
 			( preg_match( '/\.gz$/i', $file ) || $is_forced_compressed_file ) &&
-			in_array( 'compress.zlib', stream_get_wrappers() )
+			in_array( 'compress.zlib', stream_get_wrappers(), true )
 		) {
-			// 100 MB of compressed data is quite a lot
-			$this->large_file_size = 104857600;
-			$source_uri = sprintf( "compress.zlib://%s", $file );
+			// 100 MB of compressed data is quite a lot.
+			$this->large_file_size = 100 * MB_IN_BYTES;
+			$source_uri            = sprintf( 'compress.zlib://%s', $file );
 		}
 
-		// Prepend stream filter to strip out control characters XMLReader doesn't like
+		// Prepend stream filter to strip out control characters XMLReader doesn't like.
 		$source_uri = XML_CHARACTER_FILTER_PREFIX . $source_uri;
 
 		if ( filesize( $file ) > $this->large_file_size && function_exists( 'gzencode' ) ) {
 			$this->do_compress = true;
 		}
 
-		/**
-		 * Check if the WXR_Parser class needs to be overriden for another class.
-		 *
-		 * This is used to plug in the Site_Importer_WXR_Parser to be used with the Large File parser.
+		/*
+		 * Create a file pointer with no filesystem references to act as our simple
+		 * <item> database container. We want no filesystem references, so that when
+		 * the process dies the file is orphaned and space reclaimed by the OS.
 		 */
-		if ( $override_wxr_parser && class_exists( $override_wxr_parser ) ) {
-			$this->wxr_parser_class = $override_wxr_parser;
-		}
-
-		// Create a file pointer with no filesystem references to act as our simple
-		// <item> database container. We want no filesystem references, so that when
-		// the process dies the file is orphaned and space reclaimed by the OS
-		$tmp = tempnam( sys_get_temp_dir(), "import-" );
+		$tmp       = tempnam( sys_get_temp_dir(), 'import-' );
 		$this->tmp = fopen( $tmp, 'w+' );
 		unlink( $tmp );
 
-		// Create a similar orphaned file descriptor to house the index fata for seeking
-		// to posts in our data file (above). Exactly 12 bytes per item entry will be used:
-		// an unsigned 64 bit unsinged int for the offset and a 32 bit unsigned int for
-		// the length of the data. Therefore seeking to $id * 12 and unpacking the next 12
-		// bytes gives us everything we need to pull data from $this->tmp
-		$tmp = tempnam( sys_get_temp_dir(), "import-" );
+		/*
+		 * Create a similar orphaned file descriptor to house the index data for seeking
+		 * to posts in our data file (above). Exactly 12 bytes per item entry will be used:
+		 * an unsigned 64 bit int for the offset and a 32 bit unsigned int for
+		 * the length of the data. Therefore, seeking to $id * 12 and unpacking the next 12
+		 * bytes gives us everything we need to pull data from $this->tmp.
+		 */
+		$tmp                  = tempnam( sys_get_temp_dir(), 'import-' );
 		$this->posts_metadata = fopen( $tmp, 'w+' );
 		unlink( $tmp );
 
-		// XMLReader is a stream parser. It does not need to read the entire file,
-		// and is therefore very memory efficient.  It uses the same parsing engine
-		// that simplexml does (I believe) and so should work precisely the same
+		/*
+		 * XMLReader is a stream parser. It does not need to read the entire file,
+		 * and is therefore very memory efficient.  It uses the same parsing engine
+		 * that simplexml does and so should work precisely the same.
+		 */
 		$reader = new XMLReader();
 
-		libxml_use_internal_errors(true);
+		libxml_use_internal_errors( true );
 
 		$libxml_options = LIBXML_NOBLANKS;
 
@@ -138,100 +222,111 @@ function __construct( $file, $override_wxr_parser = '' ) {
 			$libxml_options = $libxml_options | LIBXML_COMPACT;
 		}
 
-		if( defined( 'LIBXML_PARSEHUGE' ) ) {
+		if ( defined( 'LIBXML_PARSEHUGE' ) ) {
 			$libxml_options = $libxml_options | LIBXML_PARSEHUGE;
 		}
 
-		// Using `false` here is bad practice, but we're limiting it to the open
-		// step, in which XMLReader (unlike other XML tools) does not attempt to
-		// load or parse external entities. We go back to best practice during the
-		// read steps.
+		/*
+		 * Using `false` here is bad practice, but we're limiting it to the open
+		 * step, in which XMLReader (unlike other XML tools) does not attempt to
+		 * load or parse external entities. We go back to best practice during the read steps.
+		 */
 		$old_disable_entity_loader_value = libxml_disable_entity_loader( false );
-		$opened = $reader->open( $source_uri, null, $libxml_options );
+		$opened                          = $reader->open( $source_uri, null, $libxml_options );
 		libxml_disable_entity_loader( true );
 
 		if ( ! $opened ) {
 			libxml_disable_entity_loader( $old_disable_entity_loader_value );
-			return new WP_Error( 'xml_parse_error', __( 'We had trouble opening the import file. Please make sure it\'s valid XML.', 'wordpress-importer') );
+			return new WP_Error( 'xml_parse_error', __( 'We had trouble opening the import file. Please make sure it&#8217;s valid XML.', 'wordpress-importer' ) );
 		}
 
-		// Be explicit about this default behavior for the read steps
+		// Be explicit about this default behavior for the read steps.
 		$reader->setParserProperty( XMLReader::SUBST_ENTITIES, false );
 
-		$writing_to = 0;
+		$writing_to    = 0;
 		$found_channel = false;
 		$reader->read();
-		while( true ) {
-			switch( $reader->name ) {
+
+		while ( true ) {
+			switch ( $reader->name ) {
 				case 'channel':
 					$found_channel = true;
+					// Fall through: channel is handled by the same header/footer logic as rss.
+
 				case 'rss':
 					// rss and rss/channel are both parts of the header or footer
 					// depending on whether we have an opening tag or not.
 					$node_name = $reader->name;
-					switch ( $reader->nodeType ) {
+
+					switch ( $reader->nodeType ) { //phpcs:ignore WordPress.NamingConventions
 						case XMLReader::ELEMENT:
-							// Trap any attributes on these kinds of elements so that
-							// we can include them in the header as well
+							// Trap any attributes on these kinds of elements so that we can include them in the header as well.
 							$attrs = array();
 							if ( $reader->moveToFirstAttribute() ) {
 								$attrs[] = sprintf(
 									'%s="%s"',
 									preg_replace( '/[^0-9a-z:]/i', '', $reader->name ),
-									str_replace( array( '"', '\\' ), "", $reader->value )
+									str_replace( array( '"', '\\' ), '', $reader->value )
 								);
 								while ( $reader->moveToNextAttribute() ) {
 									$attrs[] = sprintf(
 										'%s="%s"',
 										preg_replace( '/[^0-9a-z:]/i', '', $reader->name ),
-										str_replace( array( '"', '\\' ), "", $reader->value )
+										str_replace( array( '"', '\\' ), '', $reader->value )
 									);
 								}
 							}
-							if ( !empty( $attrs ) ) {
-								$this->raw_header .= "<$node_name " . implode( " ", $attrs ) . ">\n";
-								$this->tiny_header .= "<$node_name " . implode( " ", $attrs ) . ">\n";
+							if ( ! empty( $attrs ) ) {
+								$this->raw_header  .= "<$node_name " . implode( ' ', $attrs ) . ">\n";
+								$this->tiny_header .= "<$node_name " . implode( ' ', $attrs ) . ">\n";
 							} else {
-								$this->raw_header .= "<$node_name>\n";
+								$this->raw_header  .= "<$node_name>\n";
 								$this->tiny_header .= "<$node_name>\n";
 							}
-							$attrs = array();
 							break;
+
 						case XMLReader::END_ELEMENT:
-							$writing_to = 1;
+							$writing_to        = 1;
 							$this->raw_footer .= "</$reader->name>\n";
 							break;
 					}
-					if ( !$reader->read() ) {
+
+					if ( ! $reader->read() ) {
 						break 2;
 					}
 					break;
-				case "item":
-					// Write rss/channel/item elements into ur pseudo database file pointer
-					// and update the in-memory index about where the data starts and how
-					// many bytes long it is so that we know how to read it all back later.
+
+				case 'item':
+					/*
+					 * Write rss/channel/item elements into our pseudo database file pointer
+					 * and update the in-memory index about where the data starts and how
+					 * many bytes long it is so that we know how to read it all back later.
+					 */
 					$inner_xml = $reader->readInnerXML();
+
 					if ( $this->do_compress ) {
 						$bytes = fwrite( $this->tmp, gzencode( "<item>\n" . $inner_xml . "</item>\n", 1 ) );
 					} else {
 						$bytes = fwrite( $this->tmp, "<item>\n" . $inner_xml . "</item>\n" );
 					}
+
 					$this->posts_found++;
-					// I do this because it's memory efficient. I can index about 44.5 million posts
-					// in ram this way using only 512MB
+
+					// We do this because it's memory efficient. We can index about 44.5 million posts in ram this way using only 512MB.
 					fwrite( $this->posts_metadata, pack( 'QL', $this->tmp_bytes, $bytes ) );
 					$this->tmp_bytes += $bytes;
-					if ( !$reader->next() ) {
+
+					if ( ! $reader->next() ) {
 						break 2;
 					}
 					break;
+
 				default:
-					if ( !$found_channel ) {
+					if ( ! $found_channel ) {
 						$this->raw_header .= $reader->readOuterXML() . "\n";
-					} else {
-						if ( $reader->nodeType === XMLReader::ELEMENT ) {
-							if ( $writing_to === 0 ) {
-								switch( $reader->name ) {
+					} elseif ( XMLReader::ELEMENT === $reader->nodeType ) { //phpcs:ignore WordPress.NamingConventions
+						if ( 0 === $writing_to ) {
+							switch ( $reader->name ) {
 								case 'wp:tag':
 								case 'wp:author':
 								case 'wp:wp_author':
@@ -239,18 +334,19 @@ function __construct( $file, $override_wxr_parser = '' ) {
 								case 'wp:category':
 									$this->raw_header .= $reader->readOuterXML() . "\n";
 									break;
+
 								default:
-									$xml = $reader->readOuterXML();
-									$this->raw_header .= $xml . "\n";
+									$xml                = $reader->readOuterXML();
+									$this->raw_header  .= $xml . "\n";
 									$this->tiny_header .= $xml . "\n";
 									break;
-								}
-							} else {
-								$this->raw_footer .= $reader->readOuterXML() . "\n";
 							}
+						} else {
+							$this->raw_footer .= $reader->readOuterXML() . "\n";
 						}
 					}
-					if ( !$reader->next() ) {
+
+					if ( ! $reader->next() ) {
 						break 2;
 					}
 					break;
@@ -260,53 +356,65 @@ function __construct( $file, $override_wxr_parser = '' ) {
 		$reader->close();
 		libxml_disable_entity_loader( $old_disable_entity_loader_value );
 
-		// XMLReader may have come across errors caused by bad characters which don't show up on $reader->open().
-		// We follow the example above and die here because the error handling in the WP_Import plugin which
-		// calls this parser is weird, but we can't change it for fear of breaking things for other users of the plugin.
-		// Dying also ensures the shutdown process deletes the temp file.
+		/*
+		 * XMLReader may have come across errors caused by bad characters which don't show up on $reader->open().
+		 * We follow the example above and return a WP_Error here because the error handling in the WP_Import plugin which
+		 * calls this parser is weird, but we can't change it for fear of breaking things for other users of the plugin.
+		 * NOTE(review): this comment previously relied on dying so that shutdown deletes the temp file; confirm cleanup on this return path.
+		 */
 		$libxml_errors = libxml_get_errors();
 		libxml_clear_errors();
-		if ( ! empty ( $libxml_errors ) ) {
+		if ( ! empty( $libxml_errors ) ) {
 			return new WP_Error( 'xml_parse_error', __( 'We had trouble reading the import file. Please make sure it\'s valid XML.', 'wordpress-importer' ) );
 		}
 
 		$this->init_mini();
 
-		// If the file isn't a valid import file, $this->mini_parsed_wxr can be a WP_Error
+		// If the file isn't a valid import file, $this->mini_parsed_wxr can be a WP_Error.
 		if ( is_wp_error( $this->mini_parsed_wxr ) ) {
 			return;
 		}
 
-		$tmp = tempnam( sys_get_temp_dir(), "import-" );
+		$tmp           = tempnam( sys_get_temp_dir(), 'import-' );
 		$this->post_fp = fopen( $tmp, 'w+' );
 		unlink( $tmp );
 		fwrite( $this->post_fp, $this->tiny_header );
 		$this->tiny_header_size = strlen( $this->tiny_header );
 	}
 
-	function init_mini() {
-		// initialize our persistent mini WXR data structure. This is where
-		// imports will read authors, tags, cats, etc from.
+	/**
+	 * Initialize our persistent mini WXR data structure.
+	 *
+	 * This is where imports will read authors, tags, cats, etc from.
+	 */
+	public function init_mini() {
 		$tmp = tempnam( sys_get_temp_dir(), 'import-mini-' );
-		$fp = fopen( $tmp, 'w+' );
+		$fp  = fopen( $tmp, 'w+' );
 		unlink( $tmp );
+
 		fwrite( $fp, $this->raw_header );
 		$this->raw_header = '';
 		fwrite( $fp, $this->raw_footer );
 		fflush( $fp );
 		fseek( $fp, 0, SEEK_SET );
-		$parser = $this->get_wxr_parser_instance();
+
+		$parser                = $this->get_wxr_parser_instance();
 		$this->mini_parsed_wxr = $parser->parse( $fp );
 		fclose( $fp );
 	}
 
+	/**
+	 * Get the WXR parser instance.
+	 *
+	 * @return WXR_Parser
+	 */
 	public function get_wxr_parser_instance() {
 		$parser_class = $this->wxr_parser_class;
+
 		if ( class_exists( $parser_class ) ) {
 			return new $parser_class();
-		}
-		else {
-			// This is a precaution and fallback to the default parser if the override class doesn't exist
+		} else {
+			// This is a precaution and fallback to the default parser if the override class doesn't exist.
 			return new WXR_Parser();
 		}
 	}
@@ -314,10 +422,9 @@ public function get_wxr_parser_instance() {
 	/**
 	 * Check if file is compressed with zlib/gzip.
 	 *
-	 * Inspired by https://stackoverflow.com/a/29268776/153310
-	 *
-	 * @param string $file_path The file to check for compression
+	 * Inspired by {@see https://stackoverflow.com/a/29268776/153310}
 	 *
+	 * @param string $file_path The file to check for compression.
 	 * @return bool
 	 */
 	public static function is_file_compressed( $file_path ) {
@@ -327,41 +434,68 @@ public static function is_file_compressed( $file_path ) {
 
 		$handle = fopen( $file_path, 'r' );
 		$header = fread( $handle, 8 );
-		$is_compressed = 0 === strpos( $header, "\x1f" . "\x8b" . "\x08" );
+
+		$is_compressed = 0 === strpos( $header, "\x1f\x8b\x08" );
 		fclose( $handle );
 
 		return $is_compressed;
 	}
 
-	// For ArrayAccess
-	function offsetSet( $offset, $val ) {
-		$this->mini_parsed_wxr[$offset] = $val;
+	// For ArrayAccess.
+
+	/**
+	 * Whether an offset exists.
+	 *
+	 * @param mixed $offset An offset to check for.
+	 * @return bool true on success or false on failure.
+	 */
+	public function offsetExists( $offset ) {
+		return isset( $this->mini_parsed_wxr[ $offset ] );
 	}
 
-	function offsetExists( $offset ) {
-		return isset( $this->mini_parsed_wxr[$offset] );
+	/**
+	 * Offset to retrieve.
+	 *
+	 * @param mixed $offset The offset to retrieve.
+	 * @return mixed Can return all value types.
+	 */
+	public function offsetGet( $offset ) {
+		return $this->$offset;
 	}
 
-	function offsetUnset( $offset ) {
-		unset( $this->mini_parsed_wxr[$offset] );
+	/**
+	 * Offset to set.
+	 *
+	 * @param mixed $offset The offset to assign the value to.
+	 * @param mixed $value The value to set.
+	 */
+	public function offsetSet( $offset, $value ) {
+		$this->mini_parsed_wxr[ $offset ] = $value;
 	}
 
-	function offsetGet( $offset ) {
-		return $this->$offset;
+	/**
+	 * Offset to unset.
+	 *
+	 * @param mixed $offset The offset to unset.
+	 */
+	public function offsetUnset( $offset ) {
+		unset( $this->mini_parsed_wxr[ $offset ] );
 	}
 
-	// For Iterator
+	// For Iterator.
 
 	/**
- 	 * Provide the $post to the foreach( $posts as $post ) loop
- 	 *
- 	 * It should be noted that there is a real possibility that, if the process
- 	 * dies or is killed between making the temp file and unlinking it that we
- 	 * will leave orphaned bits of WXR on the filesystem. It's very difficult to
- 	 * make PHP clean up after itself when it's been the victim of kill -9
- 	 */
-	function current() {
-		// Create a real filesystem file to write a single rss/channel/item WXR into
+	 * Provide the $post to the foreach( $posts as $post ) loop.
+	 *
+	 * It should be noted that there is a real possibility that, if the process
+	 * dies or is killed between making the temp file and unlinking it that we
+	 * will leave orphaned bits of WXR on the filesystem. It's very difficult to
+	 * make PHP clean up after itself when it's been the victim of kill -9.
+	 *
+	 * @return array|WP_Error Post array on success, WP_Error on failure.
+	 */
+	public function current() {
+		// Create a real filesystem file to write a single rss/channel/item WXR into.
 
 		$index_offset = 12 * $this->current_post;
 		fseek( $this->posts_metadata, $index_offset, SEEK_SET );
@@ -372,7 +506,7 @@ function current() {
 		ftruncate( $this->post_fp, $this->tiny_header_size );
 		fseek( $this->post_fp, $this->tiny_header_size, SEEK_SET );
 
-		// Compose the WXR from the header, bytes from the database for the item and footer
+		// Compose the WXR from the header, bytes from the database for the item and footer.
 		if ( $this->do_compress ) {
 			fwrite( $this->post_fp, gzdecode( fread( $this->tmp, $index['l'] ) ) );
 		} else {
@@ -382,56 +516,79 @@ function current() {
 		fflush( $this->post_fp );
 		fseek( $this->post_fp, 0, SEEK_SET );
 
-		// Create a normal WXR_Parser data structure from the file
+		// Create a normal WXR_Parser data structure from the file.
 		$parser = $this->get_wxr_parser_instance();
 		$parsed = $parser->parse( $this->post_fp );
 
-		// Clean up
+		// Clean up.
 		if ( is_wp_error( $parsed ) ) {
 			return $parsed;
 		}
 
-		// There is exactly one post in this WXR so we can just return that.
-		// It's all we really wanted anyway.
-		return $parsed["posts"][0];
+		// There is exactly one post in this WXR, so we can just return that.
+		return $parsed['posts'][0];
 	}
 
-	function key() {
+	/**
+	 * Move forward to next post.
+	 */
+	public function next() {
+		$this->current_post++;
+	}
+
+	/**
+	 * Return the index of the current post.
+	 *
+	 * @return int|null Post index on success, or null on failure.
+	 */
+	public function key() {
 		return $this->current_post;
 	}
 
-	function next() {
-		$this->current_post++;
+	/**
+	 * Checks if current position is valid.
+	 *
+	 * @return bool
+	 */
+	public function valid() {
+		$post_number = $this->current_post + 1;
+
+		return $post_number > 0 && $post_number <= $this->posts_found;
 	}
 
-	function rewind() {
+	/**
+	 * Rewind the Iterator to the first post.
+	 */
+	public function rewind() {
 		$this->current_post = 0;
 	}
 
-	function valid() {
-		$post_number = $this->current_post + 1;
-		return ( $post_number > 0 && $post_number <= $this->posts_found );
-	}
+	/**
+	 * Getter.
+	 *
+	 * @param string|mixed $key The key to check.
+	 * @return $this|mixed|null
+	 */
+	public function __get( $key ) {
+		// For use in the foreach( $data['posts'] as $post ) loop.
+		if ( 'posts' === $key ) {
+			return $this;
+		}
 
-	// Magic Methods
-	function __get( $key ) {
-		switch ( $key ) {
-			case 'posts':
-				// $this['posts'] returns $this.
-				// For use in the foreach( $data['posts'] as $post ) loop
-				return $this;
-			default:
-				// Anything else we're passing through to our embedded empty WXR
-				// that we keep around for just these purposes... eg: $data['authors']
-				if ( isset( $this->mini_parsed_wxr[$key] ) ) {
-					return $this->mini_parsed_wxr[$key];
-				}
-				return null;
+		// Anything else we're passing through to our embedded empty WXR that we keep around for just these purposes... eg: $data['authors'].
+		if ( isset( $this->mini_parsed_wxr[ $key ] ) ) {
+			return $this->mini_parsed_wxr[ $key ];
 		}
+
+		return null;
 	}
 
-	// For Countable
-	function count() {
+	/**
+	 * Count elements of an object.
+	 *
+	 * @return int<0,max> The custom count as an integer.
+	 */
+	public function count() {
 		return $this->posts_found;
 	}
 }

From 45524027b6e1c330c5349cacfb30bfea686cb70e Mon Sep 17 00:00:00 2001
From: Konstantin Obenland <obenland@gmx.de>
Date: Thu, 1 Dec 2022 21:02:28 -0800
Subject: [PATCH 3/3] Pinking shears

---
 src/parsers/class-wxr-parser.php | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/parsers/class-wxr-parser.php b/src/parsers/class-wxr-parser.php
index 148b586..d1b4c61 100644
--- a/src/parsers/class-wxr-parser.php
+++ b/src/parsers/class-wxr-parser.php
@@ -14,7 +14,7 @@ function parse( $file ) {
 		// just for testing now!
 		$parser = new WXR_Parser_Large_File( $file, 'WXR_Parser_SimpleXML' );
 		return $parser;
-		
+
 		// Attempt to use proper XML parsers first
 		if ( extension_loaded( 'simplexml' ) ) {
 			$parser = new WXR_Parser_SimpleXML;