diff --git a/src/main/java/com/widen/tabitha/formats/FormatAdapter.java b/src/main/java/com/widen/tabitha/formats/FormatAdapter.java
new file mode 100644
index 0000000..9c900c5
--- /dev/null
+++ b/src/main/java/com/widen/tabitha/formats/FormatAdapter.java
@@ -0,0 +1,58 @@
+package com.widen.tabitha.formats;
+
+import com.widen.tabitha.reader.ReaderOptions;
+import com.widen.tabitha.reader.RowReader;
+import com.widen.tabitha.writer.RowWriter;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/**
+ * Provides factory methods for creating readers and writers of a particular format.
+ */
+public interface FormatAdapter {
+    /**
+     * Create a row reader for a file at the given path.
+     *
+     * @param path The path of the file to read.
+     * @param options Options to pass to the reader.
+     * @return A new row reader.
+     * @throws IOException if an I/O error occurs.
+     */
+    default RowReader createReader(Path path, ReaderOptions options) throws IOException {
+        return createReader(Files.newInputStream(path), options);
+    }
+
+    /**
+     * Create a row reader for an input stream.
+     *
+     * @param inputStream The input stream to read.
+     * @param options Options to pass to the reader.
+     * @return A new row reader.
+     * @throws IOException if an I/O error occurs.
+     */
+    RowReader createReader(InputStream inputStream, ReaderOptions options) throws IOException;
+
+    /**
+     * Create a row writer that writes to the given path.
+     *
+     * @param path The path to write to.
+     * @return A new row writer.
+     * @throws IOException if an I/O error occurs.
+     */
+    default RowWriter createWriter(Path path) throws IOException {
+        return createWriter(Files.newOutputStream(path));
+    }
+
+    /**
+     * Create a row writer that writes to the given output stream.
+     *
+     * @param outputStream The output stream to write to.
+     * @return A new row writer.
+     * @throws IOException if an I/O error occurs.
+     */
+    RowWriter createWriter(OutputStream outputStream) throws IOException;
+}
diff --git a/src/main/java/com/widen/tabitha/formats/FormatRegistry.java b/src/main/java/com/widen/tabitha/formats/FormatRegistry.java
new file mode 100644
index 0000000..b1de3d3
--- /dev/null
+++ b/src/main/java/com/widen/tabitha/formats/FormatRegistry.java
@@ -0,0 +1,113 @@
+package com.widen.tabitha.formats;
+
+import com.widen.tabitha.formats.delimited.DelimitedFormat;
+import com.widen.tabitha.formats.delimited.DelimitedRowReader;
+import com.widen.tabitha.formats.delimited.DelimitedRowWriter;
+import com.widen.tabitha.formats.excel.WorkbookRowWriter;
+import com.widen.tabitha.formats.excel.XLSRowReader;
+import com.widen.tabitha.formats.excel.XLSXRowReader;
+import com.widen.tabitha.reader.InlineHeaderReader;
+import com.widen.tabitha.reader.ReaderOptions;
+import com.widen.tabitha.reader.RowReader;
+import com.widen.tabitha.writer.RowWriter;
+import io.reactivex.Maybe;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Path;
+
+/**
+ * Manages the adapters for the file formats supported by Tabitha.
+ * <p>
+ * You probably want to use {@link com.widen.tabitha.reader.RowReaders} or {@link com.widen.tabitha.writer.RowWriters}
+ * instead.
+ */
+public class FormatRegistry {
+    /**
+     * Get a format factory for handling the given MIME type.
+     *
+     * @param mimeType The format MIME type.
+     * @return A format adapter, if one could be found.
+     */
+    public static Maybe<FormatAdapter> forMimeType(String mimeType) {
+        switch (mimeType) {
+            case "text/csv":
+            case "text/plain":
+                return Maybe.just(new FormatAdapter() {
+                    @Override
+                    public RowReader createReader(InputStream inputStream, ReaderOptions options) {
+                        return decorateReader(new DelimitedRowReader(inputStream, DelimitedFormat.CSV), options);
+                    }
+
+                    @Override
+                    public RowWriter createWriter(OutputStream outputStream) {
+                        return new DelimitedRowWriter(outputStream, DelimitedFormat.CSV);
+                    }
+                });
+
+            case "text/tab-separated-values":
+                return Maybe.just(new FormatAdapter() {
+                    @Override
+                    public RowReader createReader(InputStream inputStream, ReaderOptions options) {
+                        return decorateReader(new DelimitedRowReader(inputStream, DelimitedFormat.TSV), options);
+                    }
+
+                    @Override
+                    public RowWriter createWriter(OutputStream outputStream) {
+                        return new DelimitedRowWriter(outputStream, DelimitedFormat.TSV);
+                    }
+                });
+
+            case "application/vnd.ms-excel":
+                return Maybe.just(new FormatAdapter() {
+                    @Override
+                    public RowReader createReader(Path path, ReaderOptions options) throws IOException {
+                        return decorateReader(XLSRowReader.open(path, options), options);
+                    }
+
+                    @Override
+                    public RowReader createReader(InputStream inputStream, ReaderOptions options) throws IOException {
+                        return decorateReader(XLSRowReader.open(inputStream, options), options);
+                    }
+
+                    @Override
+                    public RowWriter createWriter(OutputStream outputStream) {
+                        return WorkbookRowWriter.xls(outputStream);
+                    }
+                });
+
+            case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
+            case "application/x-tika-ooxml":
+                return Maybe.just(new FormatAdapter() {
+                    @Override
+                    public RowReader createReader(Path path, ReaderOptions options) throws IOException {
+                        return decorateReader(XLSXRowReader.open(path, options), options);
+                    }
+
+                    @Override
+                    public RowReader createReader(InputStream inputStream, ReaderOptions options) throws IOException {
+                        return decorateReader(XLSXRowReader.open(inputStream, options), options);
+                    }
+
+                    @Override
+                    public RowWriter createWriter(OutputStream outputStream) {
+                        return WorkbookRowWriter.xlsx(outputStream);
+                    }
+                });
+
+            default:
+                return Maybe.empty();
+        }
+    }
+
+    /**
+     * Wrap the reader in an {@link InlineHeaderReader} when the options request inline headers.
+     */
+    private static RowReader decorateReader(RowReader reader, ReaderOptions options) {
+        if (options.isInlineHeaders()) {
+            reader = new InlineHeaderReader(reader);
+        }
+        return reader;
+    }
+}
diff --git a/src/main/java/com/widen/tabitha/reader/Header.java b/src/main/java/com/widen/tabitha/reader/Header.java
index f4bf354..5e4f273 100644
--- a/src/main/java/com/widen/tabitha/reader/Header.java
+++ b/src/main/java/com/widen/tabitha/reader/Header.java
@@ -1,6 +1,12 @@
 package com.widen.tabitha.reader;
 
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
 
 /**
  * Defines an ordered list of named columns.
@@ -182,5 +188,7 @@ public DuplicateColumnException(String column) {
     }
 
     @Override
-    public String toString() { return columnsByIndex.toString(); }
+    public String toString() {
+        return columnsByIndex.toString();
+    }
 }
diff --git a/src/main/java/com/widen/tabitha/reader/InlineHeaderReader.java b/src/main/java/com/widen/tabitha/reader/InlineHeaderReader.java
index 11d38e7..d846635 100644
--- a/src/main/java/com/widen/tabitha/reader/InlineHeaderReader.java
+++ b/src/main/java/com/widen/tabitha/reader/InlineHeaderReader.java
@@ -8,12 +8,12 @@
 /**
  * Decorates another reader, interpreting the first row of each page of data as the header for subsequent rows.
  */
-class InlineHeaderReader implements RowReader {
+public class InlineHeaderReader implements RowReader {
     private final RowReader inner;
     private Header currentHeader;
     private long currentPage = -1;
 
-    InlineHeaderReader(RowReader inner) {
+    public InlineHeaderReader(RowReader inner) {
         this.inner = inner;
     }
 
diff --git a/src/main/java/com/widen/tabitha/reader/RowReaders.java b/src/main/java/com/widen/tabitha/reader/RowReaders.java
index a0eac09..23ca91b 100644
--- a/src/main/java/com/widen/tabitha/reader/RowReaders.java
+++ b/src/main/java/com/widen/tabitha/reader/RowReaders.java
@@ -1,18 +1,13 @@
 package com.widen.tabitha.reader;
 
-import com.widen.tabitha.formats.delimited.DelimitedFormat;
-import com.widen.tabitha.formats.delimited.DelimitedRowReader;
-import com.widen.tabitha.formats.excel.XLSRowReader;
-import com.widen.tabitha.formats.excel.XLSXRowReader;
+import com.widen.tabitha.formats.FormatRegistry;
+import io.reactivex.Maybe;
 import org.apache.tika.Tika;
 
 import java.io.BufferedInputStream;
-import java.io.IOException;
 import java.io.InputStream;
-import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
-import java.util.Optional;
 
 /**
  * Helper factory methods for creating row readers.
@@ -24,7 +19,7 @@ public class RowReaders {
      * @param path The file path of the file to open.
      * @return A row reader if the file is in a supported format.
      */
-    public static Optional open(String path) throws Exception {
+    public static Maybe<RowReader> open(String path) {
         return open(Paths.get(path), null);
     }
 
@@ -34,7 +29,7 @@ public static Optional open(String path) throws Exception {
      * @param path The file path of the file to open.
      * @return A row reader if the file is in a supported format.
      */
-    public static Optional open(Path path) throws Exception {
+    public static Maybe<RowReader> open(Path path) {
         return open(path, null);
     }
 
@@ -45,30 +40,11 @@ public static Optional open(Path path) throws Exception {
      * @param options Options to pass to the reader.
      * @return A row reader if the file is in a supported format.
      */
-    public static Optional open(Path path, ReaderOptions options) throws Exception {
-        if (options == null) {
-            options = new ReaderOptions();
-        }
-
-        String mimeType = tika.detect(path);
-
-        switch (mimeType) {
-            case "text/csv":
-            case "text/plain":
-                return Optional.of(decorate(new DelimitedRowReader(Files.newInputStream(path), DelimitedFormat.CSV), options));
-
-            case "text/tab-separated-values":
-                return Optional.of(decorate(new DelimitedRowReader(Files.newInputStream(path), DelimitedFormat.TSV), options));
-
-            case "application/vnd.ms-excel":
-                return Optional.of(decorate(XLSRowReader.open(path, options), options));
-
-            case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
-            case "application/x-tika-ooxml":
-                return Optional.of(decorate(XLSXRowReader.open(path, options), options));
-        }
-
-        return Optional.empty();
+    public static Maybe<RowReader> open(Path path, ReaderOptions options) {
+        return Maybe
+            .fromCallable(() -> tika.detect(path))
+            .flatMap(FormatRegistry::forMimeType)
+            .map(formatAdapter -> formatAdapter.createReader(path, options != null ? options : new ReaderOptions()));
     }
 
     /**
@@ -77,7 +53,7 @@ public static Optional open(Path path, ReaderOptions options) throws
      * @param inputStream The input stream to read.
      * @return A row reader if the stream is in a supported format.
      */
-    public static Optional open(InputStream inputStream) throws IOException {
+    public static Maybe<RowReader> open(InputStream inputStream) {
         return open(inputStream, null, null);
     }
 
@@ -88,7 +64,7 @@ public static Optional open(InputStream inputStream) throws IOExcepti
      * @param filename The filename associated with the stream, if known.
      * @return A row reader if the stream is in a supported format.
      */
-    public static Optional open(InputStream inputStream, String filename) throws IOException {
+    public static Maybe<RowReader> open(InputStream inputStream, String filename) {
         return open(inputStream, filename, null);
     }
 
@@ -99,7 +75,7 @@ public static Optional open(InputStream inputStream, String filename)
      * @param options Options to pass to the reader.
      * @return A row reader if the stream is in a supported format.
      */
-    public static Optional open(InputStream inputStream, ReaderOptions options) throws IOException {
+    public static Maybe<RowReader> open(InputStream inputStream, ReaderOptions options) {
         return open(inputStream, null, options);
     }
 
@@ -111,44 +87,15 @@ public static Optional open(InputStream inputStream, ReaderOptions op
      * @param options Options to pass to the reader.
      * @return A row reader if the stream is in a supported format.
      */
-    public static Optional open(
-            InputStream inputStream,
-            String filename,
-            ReaderOptions options
-    ) throws IOException {
-        if (options == null) {
-            options = new ReaderOptions();
-        }
-
+    public static Maybe<RowReader> open(InputStream inputStream, String filename, ReaderOptions options) {
         // If our input stream supports marks, Tika will rewind the stream back to the start for us after detecting the
         // format, so ensure our input stream supports it.
-        inputStream = createRewindableInputStream(inputStream);
-        String mimeType = tika.detect(inputStream, filename);
-
-        switch (mimeType) {
-            case "text/csv":
-            case "text/plain":
-                return Optional.of(decorate(new DelimitedRowReader(inputStream, DelimitedFormat.CSV), options));
-
-            case "text/tab-separated-values":
-                return Optional.of(decorate(new DelimitedRowReader(inputStream, DelimitedFormat.TSV), options));
-
-            case "application/vnd.ms-excel":
-                return Optional.of(decorate(XLSRowReader.open(inputStream, options), options));
-
-            case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
-            case "application/x-tika-ooxml":
-                return Optional.of(decorate(XLSXRowReader.open(inputStream, options), options));
-        }
-
-        return Optional.empty();
-    }
+        InputStream rewindableStream = createRewindableInputStream(inputStream);
 
-    private static RowReader decorate(RowReader reader, ReaderOptions options) {
-        if (options.isInlineHeaders()) {
-            reader = new InlineHeaderReader(reader);
-        }
-        return reader;
+        return Maybe
+            .fromCallable(() -> tika.detect(rewindableStream, filename))
+            .flatMap(FormatRegistry::forMimeType)
+            .map(formatAdapter -> formatAdapter.createReader(rewindableStream, options != null ? options : new ReaderOptions()));
     }
 
     private static InputStream createRewindableInputStream(InputStream inputStream) {
diff --git a/src/main/java/com/widen/tabitha/writer/RowWriters.java b/src/main/java/com/widen/tabitha/writer/RowWriters.java
index eb58381..cbe8bb1 100644
--- a/src/main/java/com/widen/tabitha/writer/RowWriters.java
+++ b/src/main/java/com/widen/tabitha/writer/RowWriters.java
@@ -1,13 +1,10 @@
 package com.widen.tabitha.writer;
 
-import com.widen.tabitha.formats.delimited.DelimitedFormat;
-import com.widen.tabitha.formats.delimited.DelimitedRowWriter;
-import com.widen.tabitha.formats.excel.WorkbookRowWriter;
-import org.apache.commons.io.FilenameUtils;
+import com.widen.tabitha.formats.FormatRegistry;
+import io.reactivex.Maybe;
+import org.apache.tika.Tika;
 
-import java.io.IOException;
 import java.io.OutputStream;
-import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 
@@ -21,7 +18,7 @@ public class RowWriters {
      * @param path The path to open.
      * @return A row writer for the given path.
      */
-    public static RowWriter create(String path) throws IOException {
+    public static Maybe<RowWriter> create(String path) {
         return create(Paths.get(path));
     }
 
@@ -31,30 +28,27 @@ public static RowWriter create(String path) throws IOException {
      * @param path The path to open.
      * @return A row writer for the given file.
      */
-    public static RowWriter create(Path path) throws IOException {
-        return create(Files.newOutputStream(path), path.getFileName().toString());
+    public static Maybe<RowWriter> create(Path path) {
+        // Detect the format from the file name only: the output file may not exist yet, so its content cannot be read.
+        return Maybe
+            .fromCallable(() -> tika.detect(path.toString()))
+            .flatMap(FormatRegistry::forMimeType)
+            .map(formatAdapter -> formatAdapter.createWriter(path));
     }
 
     /**
      * Create a new row writer for the given output stream and guess the output format based on a filename.
      *
+     * @param outputStream The output stream to write to.
+     * @param name The name of the output file or format.
      * @return A row writer for the given output stream.
      */
-    public static RowWriter create(OutputStream outputStream, String filename) {
-        String extension = FilenameUtils.getExtension(filename);
-
-        if ("xlsx".equals(extension)) {
-            return WorkbookRowWriter.xlsx(outputStream);
-        }
-
-        if ("xls".equals(extension)) {
-            return WorkbookRowWriter.xls(outputStream);
-        }
-
-        if ("tsv".equals(extension)) {
-            return new DelimitedRowWriter(outputStream, DelimitedFormat.TSV);
-        }
-
-        return new DelimitedRowWriter(outputStream, DelimitedFormat.CSV);
+    public static Maybe<RowWriter> create(OutputStream outputStream, String name) {
+        return FormatRegistry
+            .forMimeType(tika.detect(name))
+            .map(formatAdapter -> formatAdapter.createWriter(outputStream));
     }
+
+    // Apache Tika instance for detecting MIME types.
+    private static final Tika tika = new Tika();
 }
diff --git a/src/test/groovy/com/widen/tabitha/HeaderTest.groovy b/src/test/groovy/com/widen/tabitha/HeaderTest.groovy
index fcb5cc0..6b9130b 100644
--- a/src/test/groovy/com/widen/tabitha/HeaderTest.groovy
+++ b/src/test/groovy/com/widen/tabitha/HeaderTest.groovy
@@ -1,7 +1,7 @@
 package com.widen.tabitha
 
 import com.widen.tabitha.reader.Header
-import spock.lang.*
+import spock.lang.Specification
 
 class HeaderTest extends Specification {
     def "build with duplicate column names"() {
@@ -32,10 +32,10 @@ class HeaderTest extends Specification {
     def "get columns by index"() {
         given:
         def header = new Header.Builder()
-            .add("foo")
-            .add("bar")
-            .add("baz")
-            .build()
+                .add("foo")
+                .add("bar")
+                .add("baz")
+                .build()
 
         expect:
         header.nameOf(0).get() == "foo"
diff --git a/src/test/groovy/com/widen/tabitha/RowReaderTest.groovy b/src/test/groovy/com/widen/tabitha/RowReaderTest.groovy
index 7671888..8d743d8 100644
--- a/src/test/groovy/com/widen/tabitha/RowReaderTest.groovy
+++ b/src/test/groovy/com/widen/tabitha/RowReaderTest.groovy
@@ -2,7 +2,7 @@ package com.widen.tabitha
 
 import com.widen.tabitha.reader.Row
 import com.widen.tabitha.reader.RowReader
-import spock.lang.*
+import spock.lang.Specification
 
 class RowReaderTest extends Specification {
     def "empty reader"() {
diff --git a/src/test/groovy/com/widen/tabitha/RowReadersTest.groovy b/src/test/groovy/com/widen/tabitha/RowReadersTest.groovy
index f0f2d16..31567e7 100644
--- a/src/test/groovy/com/widen/tabitha/RowReadersTest.groovy
+++ b/src/test/groovy/com/widen/tabitha/RowReadersTest.groovy
@@ -1,7 +1,7 @@
 package com.widen.tabitha
 
 import com.widen.tabitha.reader.RowReaders
-import spock.lang.*
+import spock.lang.Specification
 
 class RowReadersTest extends Specification {
     def "open a CSV file"() {
@@ -10,7 +10,7 @@ class RowReadersTest extends Specification {
         def reader = RowReaders.open(file)
 
         expect:
-        reader.isPresent()
+        !reader.isEmpty().blockingGet()
     }
 
     def "open a CSV stream"() {
@@ -19,7 +19,7 @@ class RowReadersTest extends Specification {
         def reader = RowReaders.open(stream)
 
         expect:
-        reader.isPresent()
+        !reader.isEmpty().blockingGet()
     }
 
     def "open an XLS file"() {
@@ -28,7 +28,7 @@ class RowReadersTest extends Specification {
         def reader = RowReaders.open(file)
 
         expect:
-        reader.isPresent()
+        !reader.isEmpty().blockingGet()
     }
 
     def "open an XLS stream"() {
@@ -37,7 +37,7 @@ class RowReadersTest extends Specification {
         def reader = RowReaders.open(stream)
 
         expect:
-        reader.isPresent()
+        !reader.isEmpty().blockingGet()
     }
 
     def "open an XLSX file"() {
@@ -46,7 +46,7 @@ class RowReadersTest extends Specification {
         def reader = RowReaders.open(file)
 
         expect:
-        reader.isPresent()
+        !reader.isEmpty().blockingGet()
     }
 
     def "open an XLSX stream"() {
@@ -55,6 +55,6 @@ class RowReadersTest extends Specification {
         def reader = RowReaders.open(stream)
 
         expect:
-        reader.isPresent()
+        !reader.isEmpty().blockingGet()
     }
 }
diff --git a/src/test/groovy/com/widen/tabitha/VariantTest.groovy b/src/test/groovy/com/widen/tabitha/VariantTest.groovy
index 54a4c93..1db1432 100644
--- a/src/test/groovy/com/widen/tabitha/VariantTest.groovy
+++ b/src/test/groovy/com/widen/tabitha/VariantTest.groovy
@@ -1,6 +1,6 @@
 package com.widen.tabitha
 
-import spock.lang.*
+import spock.lang.Specification
 
 class VariantTest extends Specification {
     def "of factory creates correct variant types"() {
diff --git a/src/test/groovy/com/widen/tabitha/formats/HiddenRowsTest.groovy b/src/test/groovy/com/widen/tabitha/formats/HiddenRowsTest.groovy
index ae61a7b..0725e0d 100644
--- a/src/test/groovy/com/widen/tabitha/formats/HiddenRowsTest.groovy
+++ b/src/test/groovy/com/widen/tabitha/formats/HiddenRowsTest.groovy
@@ -9,9 +9,9 @@ class HiddenRowsTest extends Specification {
     def "Hidden rows are ignored"() {
         setup:
         def reader = RowReaders.open(
-            Helpers.getResourceStream(file),
-            new ReaderOptions().withIncludeHiddenRows(false)
-        ).get()
+                Helpers.getResourceStream(file),
+                new ReaderOptions().withIncludeHiddenRows(false)
+        ).blockingGet()
 
         expect:
         reader.each { row ->
@@ -25,9 +25,9 @@ class HiddenRowsTest extends Specification {
     def "Hidden rows are not ignored"() {
         setup:
         def reader = RowReaders.open(
-            Helpers.getResourceStream(file),
-            new ReaderOptions().withIncludeHiddenRows(true)
-        ).get()
+                Helpers.getResourceStream(file),
+                new ReaderOptions().withIncludeHiddenRows(true)
+        ).blockingGet()
 
         when:
         def foundHidden = false