diff --git a/.gitignore b/.gitignore index 32858aa..d29018d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,8 @@ # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* + +# Potentially locally generated HTML files (from tracked Markdown input) +/LICENSE.html +/README.html + diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..f5f4b8b --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,195 @@ +Apache License +============== + +_Version 2.0, January 2004_ +_<http://www.apache.org/licenses/>_ + +### Terms and Conditions for use, reproduction, and distribution + +#### 1. Definitions + +“License” shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +“Licensor” shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +“Legal Entity” shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, “control” means **(i)** the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the +outstanding shares, or **(iii)** beneficial ownership of such entity. + +“You” (or “Your”) shall mean an individual or Legal Entity exercising +permissions granted by this License. + +“Source” form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +“Object” form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. 
+ +“Work” shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +“Derivative Works” shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +“Contribution” shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +“submitted” means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as “Not a Contribution.” + +“Contributor” shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +#### 2. 
Grant of Copyright License + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +#### 3. Grant of Patent License + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +#### 4. 
Redistribution + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +* **(a)** You must give any other recipients of the Work or Derivative Works a copy of +this License; and +* **(b)** You must cause any modified files to carry prominent notices stating that You +changed the files; and +* **(c)** You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +* **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. 
+ +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +#### 5. Submission of Contributions + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +#### 6. Trademarks + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +#### 7. Disclaimer of Warranty + +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an “AS IS” BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +#### 8. 
Limitation of Liability + +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +#### 9. Accepting Warranty or Additional Liability + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. + +_END OF TERMS AND CONDITIONS_ + +### APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets `[]` replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same “printed page” as the copyright notice for easier identification within +third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/README.md b/README.md index 6c5bb64..e9f7bab 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,16 @@ -# zipdoc -Git textconv program to dump a zip file's text contents to stdout +# ZipDoc +A Git `textconv` program to dump a ZIP file's contents as text to stdout. ## Install -Store Zipdoc.class somewhere in your home directory, for example ~/bin. +Store `ZipDoc.class` somewhere in your home directory, for example `~/bin`. -Define the diff filter in ~/.gitconfig : +Define the diff filter in `~/.gitconfig`: ``` -git config --global --replace-all diff.zipdoc.textconv "java -cp ~/bin Zipdoc" +git config --global --replace-all diff.zipdoc.textconv "java -cp ~/bin ZipDoc" ``` -Assign diff attributes to paths in .gitattributes: (also assigning the [rezip filter](https://github.com/costerwi/rezip) for efficient storage) +Assign diff attributes to paths in `.gitattributes` + (also assigning the [rezip filter](https://github.com/costerwi/rezip) for efficient storage): ``` # MS Office *.docx filter=zip diff=zipdoc diff --git a/ZipDoc.java b/ZipDoc.java new file mode 100644 index 0000000..a3f112e --- /dev/null +++ b/ZipDoc.java @@ -0,0 +1,131 @@ +/* + * Copyright 2015 Carl Osterwisch + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * you may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.xml.sax.InputSource; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.sax.SAXSource; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.stream.StreamResult; +import javax.xml.transform.TransformerException; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.zip.CRC32; +import java.util.zip.CheckedOutputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +/** + * The program takes a single argument, the name of the ZIP file to convert, + * and produces a more human-readable, textual representation of its content on stdout. + * See https://github.com/costerwi/zipdoc + */ +public class ZipDoc { + + public static void main(final String[] argv) throws IOException, TransformerException { + + if (1 != argv.length) { + System.err.printf("Usage: %s infile > text_representation.txt\n", ZipDoc.class.getSimpleName()); + System.exit(1); + } + + transform(argv[0]); + } + + /** + * Checks whether a file name denotes an XML based file format (currently: whether it ends with {@code .xml}, case-sensitively). 
+ * @param fileName to be checked + * @return whether the supplied file name is XML based + */ + @SuppressWarnings("WeakerAccess") + public static boolean isXml(final String fileName) { + // TODO Improve this function with a longer list of extensions, or optimally even by inspecting the MIME-type + return fileName.endsWith(".xml"); + } + + /** + * Checks whether a file name denotes a plain-text file format (currently: whether it ends with {@code .txt}, case-sensitively). + * @param fileName the file name to be checked + * @return whether the supplied file name is text based + */ + @SuppressWarnings("WeakerAccess") + public static boolean isPlainText(final String fileName) { + // TODO Improve this function with a longer list of extensions, or optimally even by inspecting the MIME-type + return fileName.endsWith(".txt"); + } + + /** + * Reads the specified ZIP file and outputs a textual representation of its content to stdout. + * @param zipFilePath the path of the ZIP file to convert to text + */ + @SuppressWarnings("WeakerAccess") + public static void transform(final String zipFilePath) throws IOException, TransformerException { + + try (final ZipInputStream zipIn = new ZipInputStream(new FileInputStream(zipFilePath))) { + transform(zipIn, System.out); + } + } + + /** + * Reads the specified ZIP document and outputs a textual representation of its content to the specified output stream. 
+ * @param zipIn the ZIP document to convert to a text + * @param output where the text gets written to + */ + @SuppressWarnings("WeakerAccess") + public static void transform(final ZipInputStream zipIn, final PrintStream output) + throws IOException, TransformerException + { + final Transformer serializer = SAXTransformerFactory.newInstance().newTransformer(); + serializer.setOutputProperty(OutputKeys.INDENT, "yes"); + serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); + final byte[] buffer = new byte[8192]; + ZipEntry entry; + final ByteArrayOutputStream uncompressedOutRaw = new ByteArrayOutputStream(); + final CRC32 checkSum = new CRC32(); + final CheckedOutputStream uncompressedOutChecked = new CheckedOutputStream(uncompressedOutRaw, checkSum); + while ((entry = zipIn.getNextEntry()) != null) { + uncompressedOutRaw.reset(); + checkSum.reset(); + + output.println("Sub-file:\t" + entry); + + // Copy the current entry from zipIn into the uncompressed, check-summed output stream + int len; + while ((len = zipIn.read(buffer)) > 0) { + uncompressedOutChecked.write(buffer, 0, len); + } + zipIn.closeEntry(); + + if (isXml(entry.getName())) { + // XML file: pretty-print the data to output + InputSource in = new InputSource(new ByteArrayInputStream(uncompressedOutRaw.toByteArray())); + serializer.transform(new SAXSource(in), new StreamResult(output)); + } else if (isPlainText(entry.getName())) { + // Text file: dump directly to output + uncompressedOutRaw.writeTo(output); + } else { + // Unknown file type: report uncompressed size and CRC32 + output.println("File size:\t" + uncompressedOutRaw.size()); + output.println("Checksum:\t" + Long.toHexString(checkSum.getValue())); + } + output.println(); + } + } +} diff --git a/Zipdoc.java b/Zipdoc.java deleted file mode 100644 index 4538ece..0000000 --- a/Zipdoc.java +++ /dev/null @@ -1,70 +0,0 @@ -import java.io.*; -import 
java.util.zip.*; - -import org.xml.sax.InputSource; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Source; -import javax.xml.transform.Transformer; -import javax.xml.transform.sax.SAXSource; -import javax.xml.transform.sax.SAXTransformerFactory; -import javax.xml.transform.stream.StreamResult; -import javax.xml.transform.TransformerException; - -public class Zipdoc { - /** - * Read specified zip document file and output text to stdout. - * - * The program takes aa single argument, the name of the file to convert, - * and produces resulting text on stdout. - * {@link https://github.com/costerwi/zipdoc} - */ - public static void main(String argv[]) throws IOException, TransformerException { - if (1 != argv.length) { - System.err.println("Usage: Zipdoc infile >textconv.txt"); - System.exit(1); - } - ZipInputStream source_zip = new ZipInputStream(new FileInputStream(argv[0])); - - Transformer serializer = SAXTransformerFactory.newInstance().newTransformer(); - serializer.setOutputProperty(OutputKeys.INDENT, "yes"); - serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); - - byte[] buffer = new byte[8192]; - ZipEntry entry; - ByteArrayOutputStream uncomp_bs = new ByteArrayOutputStream(); - CRC32 cksum = new CRC32(); - CheckedOutputStream uncomp_os = new CheckedOutputStream(uncomp_bs, cksum); - try { - while ((entry = source_zip.getNextEntry()) != null) { - uncomp_bs.reset(); - cksum.reset(); - - System.out.println("Subfile:\t" + entry); - - // Copy file from source_zip into uncompressed, checksummed output stream - int len = 0; - while ((len = source_zip.read(buffer)) > 0) { - uncomp_os.write(buffer, 0, len); - } - source_zip.closeEntry(); - - if (entry.getName().endsWith(".xml")) { - // xml file: pretty-print the data to stdout - InputSource in = new InputSource(new ByteArrayInputStream(uncomp_bs.toByteArray())); - serializer.transform(new 
SAXSource(in), new StreamResult(System.out)); - } else if (entry.getName().endsWith(".txt")) { - // Text file: dump directly to stdout - uncomp_bs.writeTo(System.out); - } else { - // Unknown file type: report uncompressed size and CRC32 - System.out.println("Filesize:\t" + uncomp_bs.size()); - System.out.println("Checksum:\t" + Long.toHexString(cksum.getValue())); - } - System.out.println(); - } - } finally { - source_zip.close(); - } - } -}