diff --git a/.gitignore b/.gitignore
index 32858aa..d29018d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,8 @@
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
+
+# Potentially locally generated HTML files (from tracked Markdown input)
+/LICENSE.html
+/README.html
+
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..f5f4b8b
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,195 @@
+Apache License
+==============
+
+_Version 2.0, January 2004_
+_<http://www.apache.org/licenses/>_
+
+### Terms and Conditions for use, reproduction, and distribution
+
+#### 1. Definitions
+
+“License” shall mean the terms and conditions for use, reproduction, and
+distribution as defined by Sections 1 through 9 of this document.
+
+“Licensor” shall mean the copyright owner or entity authorized by the copyright
+owner that is granting the License.
+
+“Legal Entity” shall mean the union of the acting entity and all other entities
+that control, are controlled by, or are under common control with that entity.
+For the purposes of this definition, “control” means **(i)** the power, direct or
+indirect, to cause the direction or management of such entity, whether by
+contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the
+outstanding shares, or **(iii)** beneficial ownership of such entity.
+
+“You” (or “Your”) shall mean an individual or Legal Entity exercising
+permissions granted by this License.
+
+“Source” form shall mean the preferred form for making modifications, including
+but not limited to software source code, documentation source, and configuration
+files.
+
+“Object” form shall mean any form resulting from mechanical transformation or
+translation of a Source form, including but not limited to compiled object code,
+generated documentation, and conversions to other media types.
+
+“Work” shall mean the work of authorship, whether in Source or Object form, made
+available under the License, as indicated by a copyright notice that is included
+in or attached to the work (an example is provided in the Appendix below).
+
+“Derivative Works” shall mean any work, whether in Source or Object form, that
+is based on (or derived from) the Work and for which the editorial revisions,
+annotations, elaborations, or other modifications represent, as a whole, an
+original work of authorship. For the purposes of this License, Derivative Works
+shall not include works that remain separable from, or merely link (or bind by
+name) to the interfaces of, the Work and Derivative Works thereof.
+
+“Contribution” shall mean any work of authorship, including the original version
+of the Work and any modifications or additions to that Work or Derivative Works
+thereof, that is intentionally submitted to Licensor for inclusion in the Work
+by the copyright owner or by an individual or Legal Entity authorized to submit
+on behalf of the copyright owner. For the purposes of this definition,
+“submitted” means any form of electronic, verbal, or written communication sent
+to the Licensor or its representatives, including but not limited to
+communication on electronic mailing lists, source code control systems, and
+issue tracking systems that are managed by, or on behalf of, the Licensor for
+the purpose of discussing and improving the Work, but excluding communication
+that is conspicuously marked or otherwise designated in writing by the copyright
+owner as “Not a Contribution.”
+
+“Contributor” shall mean Licensor and any individual or Legal Entity on behalf
+of whom a Contribution has been received by Licensor and subsequently
+incorporated within the Work.
+
+#### 2. Grant of Copyright License
+
+Subject to the terms and conditions of this License, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+irrevocable copyright license to reproduce, prepare Derivative Works of,
+publicly display, publicly perform, sublicense, and distribute the Work and such
+Derivative Works in Source or Object form.
+
+#### 3. Grant of Patent License
+
+Subject to the terms and conditions of this License, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+irrevocable (except as stated in this section) patent license to make, have
+made, use, offer to sell, sell, import, and otherwise transfer the Work, where
+such license applies only to those patent claims licensable by such Contributor
+that are necessarily infringed by their Contribution(s) alone or by combination
+of their Contribution(s) with the Work to which such Contribution(s) was
+submitted. If You institute patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that the Work or a
+Contribution incorporated within the Work constitutes direct or contributory
+patent infringement, then any patent licenses granted to You under this License
+for that Work shall terminate as of the date such litigation is filed.
+
+#### 4. Redistribution
+
+You may reproduce and distribute copies of the Work or Derivative Works thereof
+in any medium, with or without modifications, and in Source or Object form,
+provided that You meet the following conditions:
+
+* **(a)** You must give any other recipients of the Work or Derivative Works a copy of
+this License; and
+* **(b)** You must cause any modified files to carry prominent notices stating that You
+changed the files; and
+* **(c)** You must retain, in the Source form of any Derivative Works that You distribute,
+all copyright, patent, trademark, and attribution notices from the Source form
+of the Work, excluding those notices that do not pertain to any part of the
+Derivative Works; and
+* **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any
+Derivative Works that You distribute must include a readable copy of the
+attribution notices contained within such NOTICE file, excluding those notices
+that do not pertain to any part of the Derivative Works, in at least one of the
+following places: within a NOTICE text file distributed as part of the
+Derivative Works; within the Source form or documentation, if provided along
+with the Derivative Works; or, within a display generated by the Derivative
+Works, if and wherever such third-party notices normally appear. The contents of
+the NOTICE file are for informational purposes only and do not modify the
+License. You may add Your own attribution notices within Derivative Works that
+You distribute, alongside or as an addendum to the NOTICE text from the Work,
+provided that such additional attribution notices cannot be construed as
+modifying the License.
+
+You may add Your own copyright statement to Your modifications and may provide
+additional or different license terms and conditions for use, reproduction, or
+distribution of Your modifications, or for any such Derivative Works as a whole,
+provided Your use, reproduction, and distribution of the Work otherwise complies
+with the conditions stated in this License.
+
+#### 5. Submission of Contributions
+
+Unless You explicitly state otherwise, any Contribution intentionally submitted
+for inclusion in the Work by You to the Licensor shall be under the terms and
+conditions of this License, without any additional terms or conditions.
+Notwithstanding the above, nothing herein shall supersede or modify the terms of
+any separate license agreement you may have executed with Licensor regarding
+such Contributions.
+
+#### 6. Trademarks
+
+This License does not grant permission to use the trade names, trademarks,
+service marks, or product names of the Licensor, except as required for
+reasonable and customary use in describing the origin of the Work and
+reproducing the content of the NOTICE file.
+
+#### 7. Disclaimer of Warranty
+
+Unless required by applicable law or agreed to in writing, Licensor provides the
+Work (and each Contributor provides its Contributions) on an “AS IS” BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
+including, without limitation, any warranties or conditions of TITLE,
+NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
+solely responsible for determining the appropriateness of using or
+redistributing the Work and assume any risks associated with Your exercise of
+permissions under this License.
+
+#### 8. Limitation of Liability
+
+In no event and under no legal theory, whether in tort (including negligence),
+contract, or otherwise, unless required by applicable law (such as deliberate
+and grossly negligent acts) or agreed to in writing, shall any Contributor be
+liable to You for damages, including any direct, indirect, special, incidental,
+or consequential damages of any character arising as a result of this License or
+out of the use or inability to use the Work (including but not limited to
+damages for loss of goodwill, work stoppage, computer failure or malfunction, or
+any and all other commercial damages or losses), even if such Contributor has
+been advised of the possibility of such damages.
+
+#### 9. Accepting Warranty or Additional Liability
+
+While redistributing the Work or Derivative Works thereof, You may choose to
+offer, and charge a fee for, acceptance of support, warranty, indemnity, or
+other liability obligations and/or rights consistent with this License. However,
+in accepting such obligations, You may act only on Your own behalf and on Your
+sole responsibility, not on behalf of any other Contributor, and only if You
+agree to indemnify, defend, and hold each Contributor harmless for any liability
+incurred by, or claims asserted against, such Contributor by reason of your
+accepting any such warranty or additional liability.
+
+_END OF TERMS AND CONDITIONS_
+
+### APPENDIX: How to apply the Apache License to your work
+
+To apply the Apache License to your work, attach the following boilerplate
+notice, with the fields enclosed by brackets `[]` replaced with your own
+identifying information. (Don't include the brackets!) The text should be
+enclosed in the appropriate comment syntax for the file format. We also
+recommend that a file or class name and description of purpose be included on
+the same “printed page” as the copyright notice for easier identification within
+third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
diff --git a/README.md b/README.md
index 6c5bb64..e9f7bab 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,16 @@
-# zipdoc
-Git textconv program to dump a zip file's text contents to stdout
+# ZipDoc
+A Git `textconv` program to dump a ZIP file's contents as text to stdout.
## Install
-Store Zipdoc.class somewhere in your home directory, for example ~/bin.
+Store ZipDoc.class somewhere in your home directory, for example `~/bin`.
-Define the diff filter in ~/.gitconfig :
+Define the diff filter in `~/.gitconfig`:
```
-git config --global --replace-all diff.zipdoc.textconv "java -cp ~/bin Zipdoc"
+git config --global --replace-all diff.zipdoc.textconv "java -cp ~/bin ZipDoc"
```
-Assign diff attributes to paths in .gitattributes: (also assigning the [rezip filter](https://github.com/costerwi/rezip) for efficient storage)
+Assign diff attributes to paths in `.gitattributes`
+ (also assigning the [rezip filter](https://github.com/costerwi/rezip) for efficient storage):
```
# MS Office
*.docx filter=zip diff=zipdoc
diff --git a/ZipDoc.java b/ZipDoc.java
new file mode 100644
index 0000000..a3f112e
--- /dev/null
+++ b/ZipDoc.java
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2015 Carl Osterwisch
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.xml.sax.InputSource;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.sax.SAXSource;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.stream.StreamResult;
+import javax.xml.transform.TransformerException;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.zip.CRC32;
+import java.util.zip.CheckedOutputStream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+/**
+ * Command-line tool that takes a single argument, the path of the ZIP file to convert,
+ * and writes a more human-readable, textual representation of its content to stdout.
+ * @see <a href="https://github.com/costerwi/zipdoc">https://github.com/costerwi/zipdoc</a>
+ */
+public class ZipDoc {
+ /** Entry point: expects exactly one argument (the ZIP file path); otherwise prints usage to stderr and exits with status 1. */
+ public static void main(final String[] argv) throws IOException, TransformerException {
+
+ if (1 != argv.length) {
+ System.err.printf("Usage: %s infile > text_representation.txt\n", ZipDoc.class.getSimpleName());
+ System.exit(1);
+ }
+
+ transform(argv[0]);
+ }
+
+ /**
+ * Checks whether a file name denotes an XML based file format; currently a case-sensitive {@code .xml} suffix test.
+ * @param fileName the file name (here: a ZIP entry name) to be checked
+ * @return whether the supplied file name is XML based
+ */
+ @SuppressWarnings("WeakerAccess")
+ public static boolean isXml(final String fileName) {
+ // TODO Improve this function with a longer list of extensions, or optimally even by inspecting the MIME-type
+ return fileName.endsWith(".xml");
+ }
+
+ /**
+ * Checks whether a file name denotes a plain-text file format; currently a case-sensitive {@code .txt} suffix test.
+ * @param fileName the file name (here: a ZIP entry name) to be checked
+ * @return whether the supplied file name is text based
+ */
+ @SuppressWarnings("WeakerAccess")
+ public static boolean isPlainText(final String fileName) {
+ // TODO Improve this function with a longer list of extensions, or optimally even by inspecting the MIME-type
+ return fileName.endsWith(".txt");
+ }
+
+ /**
+ * Reads the specified ZIP file and outputs a textual representation of its content to stdout.
+ * @param zipFilePath path of the ZIP file to convert to text; the file is opened here and closed again before returning
+ */
+ @SuppressWarnings("WeakerAccess")
+ public static void transform(final String zipFilePath) throws IOException, TransformerException {
+
+ try (final ZipInputStream zipIn = new ZipInputStream(new FileInputStream(zipFilePath))) {
+ transform(zipIn, System.out);
+ }
+ }
+
+ /**
+ * Reads the specified ZIP document and outputs a textual representation of its content to the specified output stream.
+ * @param zipIn the ZIP document to convert to text; read until no entries remain, but not closed by this method
+ * @param output where the text gets written to; one section per entry, each followed by an empty line
+ */
+ @SuppressWarnings("WeakerAccess")
+ public static void transform(final ZipInputStream zipIn, final PrintStream output)
+ throws IOException, TransformerException
+ {
+ final Transformer serializer = SAXTransformerFactory.newInstance().newTransformer();
+ serializer.setOutputProperty(OutputKeys.INDENT, "yes");
+ serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
+ serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); // implementation-specific key; presumably ignored by transformers that do not know it - TODO confirm
+ final byte[] buffer = new byte[8192];
+ ZipEntry entry;
+ final ByteArrayOutputStream uncompressedOutRaw = new ByteArrayOutputStream();
+ final CRC32 checkSum = new CRC32();
+ final CheckedOutputStream uncompressedOutChecked = new CheckedOutputStream(uncompressedOutRaw, checkSum); // every write updates checkSum
+ while ((entry = zipIn.getNextEntry()) != null) {
+ uncompressedOutRaw.reset(); // the buffer and the CRC are shared across entries,
+ checkSum.reset(); // so both are cleared before each one
+
+ output.println("Sub-file:\t" + entry);
+
+ // Copy the file from zipIn into the uncompressed, check-summed output stream
+ int len;
+ while ((len = zipIn.read(buffer)) > 0) {
+ uncompressedOutChecked.write(buffer, 0, len);
+ }
+ zipIn.closeEntry();
+
+ if (isXml(entry.getName())) {
+ // XML file: pretty-print the data to output. NOTE(review): default XML parser setup is used - confirm DTDs/external entities are acceptable for untrusted archives
+ InputSource in = new InputSource(new ByteArrayInputStream(uncompressedOutRaw.toByteArray()));
+ serializer.transform(new SAXSource(in), new StreamResult(output));
+ } else if (isPlainText(entry.getName())) {
+ // Text file: dump the raw bytes directly to output (no charset conversion)
+ uncompressedOutRaw.writeTo(output);
+ } else {
+ // Unknown file type: report uncompressed size and CRC32
+ output.println("File size:\t" + uncompressedOutRaw.size());
+ output.println("Checksum:\t" + Long.toHexString(checkSum.getValue()));
+ }
+ output.println();
+ }
+ }
+}
diff --git a/Zipdoc.java b/Zipdoc.java
deleted file mode 100644
index 4538ece..0000000
--- a/Zipdoc.java
+++ /dev/null
@@ -1,70 +0,0 @@
-import java.io.*;
-import java.util.zip.*;
-
-import org.xml.sax.InputSource;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.Source;
-import javax.xml.transform.Transformer;
-import javax.xml.transform.sax.SAXSource;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.stream.StreamResult;
-import javax.xml.transform.TransformerException;
-
-public class Zipdoc {
- /**
- * Read specified zip document file and output text to stdout.
- *
- * The program takes aa single argument, the name of the file to convert,
- * and produces resulting text on stdout.
- * {@link https://github.com/costerwi/zipdoc}
- */
- public static void main(String argv[]) throws IOException, TransformerException {
- if (1 != argv.length) {
- System.err.println("Usage: Zipdoc infile >textconv.txt");
- System.exit(1);
- }
- ZipInputStream source_zip = new ZipInputStream(new FileInputStream(argv[0]));
-
- Transformer serializer = SAXTransformerFactory.newInstance().newTransformer();
- serializer.setOutputProperty(OutputKeys.INDENT, "yes");
- serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
- serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
-
- byte[] buffer = new byte[8192];
- ZipEntry entry;
- ByteArrayOutputStream uncomp_bs = new ByteArrayOutputStream();
- CRC32 cksum = new CRC32();
- CheckedOutputStream uncomp_os = new CheckedOutputStream(uncomp_bs, cksum);
- try {
- while ((entry = source_zip.getNextEntry()) != null) {
- uncomp_bs.reset();
- cksum.reset();
-
- System.out.println("Subfile:\t" + entry);
-
- // Copy file from source_zip into uncompressed, checksummed output stream
- int len = 0;
- while ((len = source_zip.read(buffer)) > 0) {
- uncomp_os.write(buffer, 0, len);
- }
- source_zip.closeEntry();
-
- if (entry.getName().endsWith(".xml")) {
- // xml file: pretty-print the data to stdout
- InputSource in = new InputSource(new ByteArrayInputStream(uncomp_bs.toByteArray()));
- serializer.transform(new SAXSource(in), new StreamResult(System.out));
- } else if (entry.getName().endsWith(".txt")) {
- // Text file: dump directly to stdout
- uncomp_bs.writeTo(System.out);
- } else {
- // Unknown file type: report uncompressed size and CRC32
- System.out.println("Filesize:\t" + uncomp_bs.size());
- System.out.println("Checksum:\t" + Long.toHexString(cksum.getValue()));
- }
- System.out.println();
- }
- } finally {
- source_zip.close();
- }
- }
-}