diff --git a/parquet-cli/README.md b/parquet-cli/README.md index 963e4f171b..92693c00fb 100644 --- a/parquet-cli/README.md +++ b/parquet-cli/README.md @@ -121,6 +121,8 @@ Usage: parquet [options] [command] [command options] Rewrite one or more Parquet files to a new Parquet file size-stats Print size statistics for a Parquet file + geospatial-stats + Print geospatial statistics for a Parquet file Examples: diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java index c39e3b8e5a..37e2aabbb4 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java @@ -50,6 +50,7 @@ import org.apache.parquet.cli.commands.ShowColumnIndexCommand; import org.apache.parquet.cli.commands.ShowDictionaryCommand; import org.apache.parquet.cli.commands.ShowFooterCommand; +import org.apache.parquet.cli.commands.ShowGeospatialStatisticsCommand; import org.apache.parquet.cli.commands.ShowPagesCommand; import org.apache.parquet.cli.commands.ShowSizeStatisticsCommand; import org.apache.parquet.cli.commands.ToAvroCommand; @@ -107,6 +108,7 @@ public class Main extends Configured implements Tool { jc.addCommand("scan", new ScanCommand(console)); jc.addCommand("rewrite", new RewriteCommand(console)); jc.addCommand("size-stats", new ShowSizeStatisticsCommand(console)); + jc.addCommand("geospatial-stats", new ShowGeospatialStatisticsCommand(console)); } @Override diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowGeospatialStatisticsCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowGeospatialStatisticsCommand.java new file mode 100644 index 0000000000..e310d3f4a8 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowGeospatialStatisticsCommand.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.cli.commands; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import java.io.IOException; +import java.util.List; +import org.apache.commons.text.TextStringBuilder; +import org.apache.parquet.cli.BaseCommand; +import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; + +@Parameters(commandDescription = "Print geospatial statistics for a Parquet file") +public class ShowGeospatialStatisticsCommand extends BaseCommand { + + public ShowGeospatialStatisticsCommand(Logger console) { + super(console); + } + + @Parameter(description = "") + List targets; + + @Override + @SuppressWarnings("unchecked") + public int run() throws IOException { + Preconditions.checkArgument(targets != null && !targets.isEmpty(), "A Parquet file is required."); + Preconditions.checkArgument(targets.size() == 1, "Cannot 
process multiple Parquet files."); + + String source = targets.get(0); + try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) { + ParquetMetadata footer = reader.getFooter(); + MessageType schema = footer.getFileMetaData().getSchema(); + + console.info("\nFile path: {}", source); + + List rowGroups = footer.getBlocks(); + for (int index = 0, n = rowGroups.size(); index < n; index++) { + printRowGroupGeospatialStats(console, index, rowGroups.get(index), schema); + console.info(""); + } + } + + return 0; + } + + private void printRowGroupGeospatialStats(Logger console, int index, BlockMetaData rowGroup, MessageType schema) { + int maxColumnWidth = Math.max( + "column".length(), + rowGroup.getColumns().stream() + .map(col -> col.getPath().toString().length()) + .max(Integer::compare) + .orElse(0)); + + console.info(String.format("\nRow group %d\n%s", index, new TextStringBuilder(80).appendPadding(80, '-'))); + + String formatString = String.format("%%-%ds %%-15s %%-40s", maxColumnWidth); + console.info(String.format(formatString, "column", "bounding box", "geospatial types")); + + for (ColumnChunkMetaData column : rowGroup.getColumns()) { + printColumnGeospatialStats(console, column, schema, maxColumnWidth); + } + } + + private void printColumnGeospatialStats( + Logger console, ColumnChunkMetaData column, MessageType schema, int columnWidth) { + GeospatialStatistics stats = column.getGeospatialStatistics(); + + if (stats != null && stats.isValid()) { + String boundingBox = + stats.getBoundingBox() != null ? stats.getBoundingBox().toString() : "-"; + String geospatialTypes = stats.getGeospatialTypes() != null + ? 
stats.getGeospatialTypes().toString() + : "-"; + String formatString = String.format("%%-%ds %%-15s %%-40s", columnWidth); + console.info(String.format(formatString, column.getPath(), boundingBox, geospatialTypes)); + } else { + String formatString = String.format("%%-%ds %%-15s %%-40s", columnWidth); + console.info(String.format(formatString, column.getPath(), "-", "-")); + } + } + + @Override + public List getExamples() { + return Lists.newArrayList("# Show geospatial statistics for a Parquet file", "sample.parquet"); + } +} diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowGeospatialStatisticsCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowGeospatialStatisticsCommandTest.java new file mode 100644 index 0000000000..1a4f3f73ce --- /dev/null +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowGeospatialStatisticsCommandTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.cli.commands; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import org.apache.hadoop.conf.Configuration; +import org.junit.Assert; +import org.junit.Test; + +public class ShowGeospatialStatisticsCommandTest extends ParquetFileTest { + @Test + public void testShowGeospatialStatisticsCommand() throws IOException { + File file = parquetFile(); + ShowGeospatialStatisticsCommand command = new ShowGeospatialStatisticsCommand(createLogger()); + command.targets = Arrays.asList(file.getAbsolutePath()); + command.setConf(new Configuration()); + Assert.assertEquals(0, command.run()); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java index 95f735f75f..72d2dd4e55 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java @@ -26,6 +26,7 @@ import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.column.statistics.SizeStatistics; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; import org.apache.parquet.column.values.bloomfilter.AdaptiveBlockSplitBloomFilter; import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; import org.apache.parquet.column.values.bloomfilter.BloomFilter; @@ -42,6 +43,7 @@ class ColumnValueCollector { private BloomFilter bloomFilter; private Statistics statistics; private SizeStatistics.Builder sizeStatisticsBuilder; + private GeospatialStatistics.Builder geospatialStatisticsBuilder; ColumnValueCollector(ColumnDescriptor path, BloomFilterWriter bloomFilterWriter, ParquetProperties props) { this.path = path; @@ -60,6 +62,9 @@ void resetPageStatistics() { path.getPrimitiveType(), 
path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel()) : SizeStatistics.noopBuilder( path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel()); + this.geospatialStatisticsBuilder = statisticsEnabled + ? GeospatialStatistics.newBuilder(path.getPrimitiveType()) + : GeospatialStatistics.noopBuilder(); } void writeNull(int repetitionLevel, int definitionLevel) { @@ -99,6 +104,7 @@ void write(double value, int repetitionLevel, int definitionLevel) { void write(Binary value, int repetitionLevel, int definitionLevel) { statistics.updateStats(value); sizeStatisticsBuilder.add(repetitionLevel, definitionLevel, value); + geospatialStatisticsBuilder.update(value); bloomFilter.insertHash(bloomFilter.hash(value)); } @@ -199,4 +205,8 @@ Statistics getStatistics() { SizeStatistics getSizeStatistics() { return sizeStatisticsBuilder.build(); } + + GeospatialStatistics getGeospatialStatistics() { + return geospatialStatisticsBuilder.build(); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java index 1b0f3ba4d1..8fc4aa2722 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java @@ -27,6 +27,7 @@ import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.SizeStatistics; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; import org.apache.parquet.column.values.ValuesWriter; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; import org.apache.parquet.io.ParquetEncodingException; @@ -380,6 +381,7 @@ void writePage() { valueCount, collector.getStatistics(), collector.getSizeStatistics(), + collector.getGeospatialStatistics(), repetitionLevelColumn, 
definitionLevelColumn, dataColumn); @@ -403,6 +405,7 @@ abstract void writePage( int valueCount, Statistics statistics, SizeStatistics sizeStatistics, + GeospatialStatistics geospatialStatistics, ValuesWriter repetitionLevels, ValuesWriter definitionLevels, ValuesWriter values) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java index e15f9ecb34..882be23811 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java @@ -26,6 +26,7 @@ import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.SizeStatistics; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; import org.apache.parquet.column.values.ValuesWriter; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; @@ -62,6 +63,7 @@ void writePage( int valueCount, Statistics statistics, SizeStatistics sizeStatistics, + GeospatialStatistics geospatialStatistics, ValuesWriter repetitionLevels, ValuesWriter definitionLevels, ValuesWriter values) @@ -72,6 +74,7 @@ void writePage( rowCount, statistics, sizeStatistics, + geospatialStatistics, repetitionLevels.getEncoding(), definitionLevels.getEncoding(), values.getEncoding()); diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java index b66749e093..e7af6aaadf 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java @@ -26,6 +26,7 @@ import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.SizeStatistics; import 
org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; import org.apache.parquet.column.values.ValuesWriter; import org.apache.parquet.column.values.bitpacking.DevNullValuesWriter; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; @@ -88,6 +89,7 @@ void writePage( int valueCount, Statistics statistics, SizeStatistics sizeStatistics, + GeospatialStatistics geospatialStatistics, ValuesWriter repetitionLevels, ValuesWriter definitionLevels, ValuesWriter values) @@ -105,6 +107,7 @@ void writePage( encoding, bytes, statistics, - sizeStatistics); + sizeStatistics, + geospatialStatistics); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/page/PageWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/page/PageWriter.java index 97d5ca68c1..1d82db8c32 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/page/PageWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/page/PageWriter.java @@ -23,6 +23,7 @@ import org.apache.parquet.column.Encoding; import org.apache.parquet.column.statistics.SizeStatistics; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; /** * a writer for all the pages of a given column chunk @@ -86,6 +87,7 @@ void writePage( * @param valuesEncoding values encoding * @throws IOException */ + @Deprecated default void writePage( BytesInput bytesInput, int valueCount, @@ -99,6 +101,33 @@ default void writePage( throw new UnsupportedOperationException("writePage with SizeStatistics is not implemented"); } + /** + * writes a single page + * @param bytesInput the bytes for the page + * @param valueCount the number of values in that page + * @param rowCount the number of rows in that page + * @param statistics the statistics for that page + * @param sizeStatistics the size statistics for that page + * @param 
geospatialStatistics the geospatial statistics for that page + * @param rlEncoding repetition level encoding + * @param dlEncoding definition level encoding + * @param valuesEncoding values encoding + * @throws IOException + */ + default void writePage( + BytesInput bytesInput, + int valueCount, + int rowCount, + Statistics statistics, + SizeStatistics sizeStatistics, + GeospatialStatistics geospatialStatistics, + Encoding rlEncoding, + Encoding dlEncoding, + Encoding valuesEncoding) + throws IOException { + throw new UnsupportedOperationException("writePage with GeospatialStatistics is not implemented"); + } + /** * writes a single page in the new format * @@ -136,6 +165,7 @@ void writePageV2( * @param sizeStatistics optional size stats for this page * @throws IOException if there is an exception while writing page data */ + @Deprecated default void writePageV2( int rowCount, int nullCount, @@ -150,6 +180,34 @@ default void writePageV2( throw new UnsupportedOperationException("writePageV2 with SizeStatistics is not implemented"); } + /** + * writes a single page in the new format + * @param rowCount the number of rows in this page + * @param nullCount the number of null values (out of valueCount) + * @param valueCount the number of values in that page (there could be multiple values per row for repeated fields) + * @param repetitionLevels the repetition levels encoded in RLE without any size header + * @param definitionLevels the definition levels encoded in RLE without any size header + * @param dataEncoding the encoding for the data + * @param data the data encoded with dataEncoding + * @param statistics optional stats for this page + * @param sizeStatistics optional size stats for this page + * @throws IOException if there is an exception while writing page data + */ + default void writePageV2( + int rowCount, + int nullCount, + int valueCount, + BytesInput repetitionLevels, + BytesInput definitionLevels, + Encoding dataEncoding, + BytesInput data, + Statistics 
statistics, + SizeStatistics sizeStatistics, + GeospatialStatistics geospatialStatistics) + throws IOException { + throw new UnsupportedOperationException("writePageV2 with GeospatialStatistics is not implemented"); + } + /** * @return the current size used in the memory buffer for that column chunk */ diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/geospatial/BoundingBox.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/geospatial/BoundingBox.java new file mode 100644 index 0000000000..2984ea20d6 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/geospatial/BoundingBox.java @@ -0,0 +1,345 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.column.statistics.geospatial; + +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Envelope; +import org.locationtech.jts.geom.Geometry; + +public class BoundingBox { + + private double xMin = Double.POSITIVE_INFINITY; + private double xMax = Double.NEGATIVE_INFINITY; + private double yMin = Double.POSITIVE_INFINITY; + private double yMax = Double.NEGATIVE_INFINITY; + private double zMin = Double.POSITIVE_INFINITY; + private double zMax = Double.NEGATIVE_INFINITY; + private double mMin = Double.POSITIVE_INFINITY; + private double mMax = Double.NEGATIVE_INFINITY; + private boolean valid = true; + + public BoundingBox() {} + + public BoundingBox( + double xMin, double xMax, double yMin, double yMax, double zMin, double zMax, double mMin, double mMax) { + this.xMin = xMin; + this.xMax = xMax; + this.yMin = yMin; + this.yMax = yMax; + this.zMin = zMin; + this.zMax = zMax; + this.mMin = mMin; + this.mMax = mMax; + + // Update the validity + valid = isXYValid(); + } + + private void resetBBox() { + xMin = Double.POSITIVE_INFINITY; + xMax = Double.NEGATIVE_INFINITY; + yMin = Double.POSITIVE_INFINITY; + yMax = Double.NEGATIVE_INFINITY; + zMin = Double.POSITIVE_INFINITY; + zMax = Double.NEGATIVE_INFINITY; + mMin = Double.POSITIVE_INFINITY; + mMax = Double.NEGATIVE_INFINITY; + } + + public double getXMin() { + return xMin; + } + + public double getXMax() { + return xMax; + } + + public double getYMin() { + return yMin; + } + + public double getYMax() { + return yMax; + } + + public double getZMin() { + return zMin; + } + + public double getZMax() { + return zMax; + } + + public double getMMin() { + return mMin; + } + + public double getMMax() { + return mMax; + } + + /** + * Checks if the bounding box is valid. + * A bounding box is considered valid if none of the X / Y dimensions contain NaN. + * + * @return true if the bounding box is valid, false otherwise. 
+ */ + public boolean isValid() { + return valid; + } + + /** + * Checks if the X and Y dimensions of the bounding box are valid. + * The X and Y dimensions are considered valid if none of the bounds contain NaN. + * + * @return true if the X and Y dimensions are valid, false otherwise. + */ + public boolean isXYValid() { + return isXValid() && isYValid(); + } + + /** + * Checks if the X dimension of the bounding box is valid. + * The X dimension is considered valid if neither bound contains NaN. + * + * @return true if the X dimension is valid, false otherwise. + */ + public boolean isXValid() { + return !(Double.isNaN(xMin) || Double.isNaN(xMax)); + } + + /** + * Checks if the Y dimension of the bounding box is valid. + * The Y dimension is considered valid if neither bound contains NaN. + * + * @return true if the Y dimension is valid, false otherwise. + */ + public boolean isYValid() { + return !(Double.isNaN(yMin) || Double.isNaN(yMax)); + } + + /** + * Checks if the Z dimension of the bounding box is valid. + * The Z dimension is considered valid if none of the bounds contain NaN. + * + * @return true if the Z dimension is valid, false otherwise. + */ + public boolean isZValid() { + return !(Double.isNaN(zMin) || Double.isNaN(zMax)); + } + + /** + * Checks if the M dimension of the bounding box is valid. + * The M dimension is considered valid if none of the bounds contain NaN. + * + * @return true if the M dimension is valid, false otherwise. + */ + public boolean isMValid() { + return !(Double.isNaN(mMin) || Double.isNaN(mMax)); + } + + /** + * Checks if the bounding box is empty in the X / Y dimension. + * + * @return true if the bounding box is empty, false otherwise. + */ + public boolean isXYEmpty() { + return isXEmpty() || isYEmpty(); + } + + /** + * Checks if the bounding box is empty in the X dimension. + * + * @return true if the X dimension is empty, false otherwise. 
+ */ + public boolean isXEmpty() { + return Double.isInfinite(xMin) && Double.isInfinite(xMax); + } + + /** + * Checks if the bounding box is empty in the Y dimension. + * + * @return true if the Y dimension is empty, false otherwise. + */ + public boolean isYEmpty() { + return Double.isInfinite(yMin) && Double.isInfinite(yMax); + } + + /** + * Checks if the bounding box is empty in the Z dimension. + * + * @return true if the Z dimension is empty, false otherwise. + */ + public boolean isZEmpty() { + return Double.isInfinite(zMin) && Double.isInfinite(zMax); + } + + /** + * Checks if the bounding box is empty in the M dimension. + * + * @return true if the M dimension is empty, false otherwise. + */ + public boolean isMEmpty() { + return Double.isInfinite(mMin) && Double.isInfinite(mMax); + } + + /** + * Expands this bounding box to include the bounds of another box. + * After merging, this bounding box will contain both its original extent + * and the extent of the other bounding box. + * + * @param other the other BoundingBox whose bounds will be merged into this one + */ + public void merge(BoundingBox other) { + if (!valid) { + return; + } + + // If other is null or invalid, mark this as invalid + if (other == null || !other.valid) { + valid = false; + resetBBox(); + return; + } + + this.xMin = Math.min(this.xMin, other.xMin); + this.xMax = Math.max(this.xMax, other.xMax); + this.yMin = Math.min(this.yMin, other.yMin); + this.yMax = Math.max(this.yMax, other.yMax); + this.zMin = Math.min(this.zMin, other.zMin); + this.zMax = Math.max(this.zMax, other.zMax); + this.mMin = Math.min(this.mMin, other.mMin); + this.mMax = Math.max(this.mMax, other.mMax); + + // Update the validity of this bounding box based on the other bounding box + valid = isXYValid(); + } + + /** + * Extends this bounding box to include the spatial extent of the provided geometry. 
+ * The bounding box coordinates (min/max values for x, y, z, m) will be adjusted + * to encompass both the current bounds and the geometry's bounds. + * + * @param geometry The geometry whose coordinates will be used to update this bounding box. + * If null or empty, the method returns without making any changes. + */ + public void update(Geometry geometry) { + if (!valid) { + return; + } + + if (geometry == null || geometry.isEmpty()) { + return; + } + + Envelope envelope = geometry.getEnvelopeInternal(); + updateBounds(envelope.getMinX(), envelope.getMaxX(), envelope.getMinY(), envelope.getMaxY()); + + for (Coordinate coord : geometry.getCoordinates()) { + if (!Double.isNaN(coord.getZ())) { + zMin = Math.min(zMin, coord.getZ()); + zMax = Math.max(zMax, coord.getZ()); + } + if (!Double.isNaN(coord.getM())) { + mMin = Math.min(mMin, coord.getM()); + mMax = Math.max(mMax, coord.getM()); + } + } + + // Update the validity of this bounding box after including the geometry's bounds + valid = isXYValid(); + } + + /** + * Updates the X and Y bounds of this bounding box with the given coordinates. + * Updates are conditional: + * - X bounds are only updated if both minX and maxX are not NaN + * - Y bounds are only updated if both minY and maxY are not NaN + * + * This allows partial updates while preserving valid dimensions. + */ + private void updateBounds(double minX, double maxX, double minY, double maxY) { + if (!Double.isNaN(minX) && !Double.isNaN(maxX)) { + xMin = Math.min(xMin, minX); + xMax = Math.max(xMax, maxX); + } + + if (!Double.isNaN(minY) && !Double.isNaN(maxY)) { + yMin = Math.min(yMin, minY); + yMax = Math.max(yMax, maxY); + } + } + + /** + * Aborts the bounding box: marks it invalid and resets all bounds to their initial values. + */ + public void abort() { + valid = false; + resetBBox(); + } + + /** + * Resets the bounding box to its initial state. + */ + public void reset() { + resetBBox(); + valid = true; + } + + /** + * Creates a copy of the current bounding box. 
+ * + * @return a new BoundingBox instance with the same values as this one. + */ + public BoundingBox copy() { + return new BoundingBox( + this.xMin, this.xMax, + this.yMin, this.yMax, + this.zMin, this.zMax, + this.mMin, this.mMax); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("BoundingBox{xMin=") + .append(xMin) + .append(", xMax=") + .append(xMax) + .append(", yMin=") + .append(yMin) + .append(", yMax=") + .append(yMax) + .append(", zMin=") + .append(zMin) + .append(", zMax=") + .append(zMax) + .append(", mMin=") + .append(mMin) + .append(", mMax=") + .append(mMax); + + // Only include the valid flag when it's false + if (!valid) { + sb.append(", valid=false"); + } + + sb.append('}'); + return sb.toString(); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/geospatial/GeospatialStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/geospatial/GeospatialStatistics.java new file mode 100644 index 0000000000..c2e0529626 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/geospatial/GeospatialStatistics.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.column.statistics.geospatial; + +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.io.ParseException; +import org.locationtech.jts.io.WKBReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A structure for capturing the geospatial statistics of the values written: + * a bounding box and the geospatial types. + */ +public class GeospatialStatistics { + private static final Logger LOG = LoggerFactory.getLogger(GeospatialStatistics.class); + + private BoundingBox boundingBox; + private GeospatialTypes geospatialTypes; + + /** + * Builder to create a GeospatialStatistics. + */ + public static class Builder { + private BoundingBox boundingBox; + private GeospatialTypes geospatialTypes; + private final WKBReader reader = new WKBReader(); + + /** + * Create a builder to create a GeospatialStatistics. + */ + public Builder() { + this.boundingBox = new BoundingBox(); + this.geospatialTypes = new GeospatialTypes(); + } + + public void update(Binary value) { + if (value == null) { + return; + } + try { + Geometry geom = reader.read(value.getBytes()); + update(geom); + } catch (ParseException e) { + LOG.warn("Failed to parse WKB geometry, omit it from stats", e); + } + } + + private void update(Geometry geom) { + boundingBox.update(geom); + geospatialTypes.update(geom); + } + + public void abort() { + boundingBox.abort(); + geospatialTypes.abort(); + } + + /** + * Build a GeospatialStatistics from the builder. + * + * @return a new GeospatialStatistics object + */ + public GeospatialStatistics build() { + return new GeospatialStatistics(boundingBox, geospatialTypes); + } + } + + /** + * Create a new GeospatialStatistics builder for the given primitive type. 
+ * + * @param type the primitive type; a collecting builder is returned only for the Geometry logical type + * @return a new GeospatialStatistics builder, or a no-op builder for any other type + */ + public static GeospatialStatistics.Builder newBuilder(PrimitiveType type) { + LogicalTypeAnnotation logicalTypeAnnotation = type.getLogicalTypeAnnotation(); + if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.GeometryLogicalTypeAnnotation) { + return new GeospatialStatistics.Builder(); + } else if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) { + // For Geography type, we have not implemented the algorithm yet. + return noopBuilder(); + } else { + return noopBuilder(); + } + } + + /** + * Constructs a GeospatialStatistics object with the specified bounding box and geospatial types. + * + * @param boundingBox the bounding box for the geospatial data, or null if not applicable, note that + * - The bounding box (bbox) is omitted only if there are no X or Y values. + * - The Z and/or M statistics are omitted only if there are no Z and/or M values, respectively. + * @param geospatialTypes the geospatial types + */ + public GeospatialStatistics(BoundingBox boundingBox, GeospatialTypes geospatialTypes) { + this.boundingBox = boundingBox; + this.geospatialTypes = geospatialTypes; + } + + /** + * Constructs a GeospatialStatistics object with a new bounding box and new geospatial types. + */ + public GeospatialStatistics() { + this(new BoundingBox(), new GeospatialTypes()); + } + + /** + * Constructs a GeospatialStatistics object with a new bounding box and new geospatial types. + * + * @param crs the coordinate reference system (currently not stored or used) + */ + public GeospatialStatistics(String crs) { + this.boundingBox = new BoundingBox(); + this.geospatialTypes = new GeospatialTypes(); + } + + /** Returns the bounding box. */ + public BoundingBox getBoundingBox() { + return boundingBox; + } + + /** Returns the geospatial types. */ + public GeospatialTypes getGeospatialTypes() { + return geospatialTypes; + } + + /** + * @return whether the bounding box or the geospatial types hold valid values. 
+ */ + public boolean isValid() { + return (boundingBox != null && boundingBox.isValid()) || (geospatialTypes != null && geospatialTypes.isValid()); + } + + public void merge(GeospatialStatistics other) { + if (boundingBox != null) { + boundingBox.merge(other.boundingBox); + } + if (geospatialTypes != null) { + geospatialTypes.merge(other.geospatialTypes); + } + } + + private void abort() { + if (boundingBox != null) { + boundingBox.abort(); + } + if (geospatialTypes != null) { + geospatialTypes.abort(); + } + } + + // Copy the statistics + public GeospatialStatistics copy() { + return new GeospatialStatistics( + boundingBox != null ? boundingBox.copy() : null, + geospatialTypes != null ? geospatialTypes.copy() : null); + } + + @Override + public String toString() { + return "GeospatialStatistics{" + "boundingBox=" + boundingBox + ", geospatialTypes=" + geospatialTypes + '}'; + } + + /** + * Creates a no-op geospatial statistics builder that collects no data. + * Used when geospatial statistics collection is disabled. + */ + private static class NoopBuilder extends Builder { + private NoopBuilder() {} + + @Override + public GeospatialStatistics build() { + return new GeospatialStatistics(null, null); + } + + @Override + public void update(Binary value) { + // do nothing + } + + @Override + public void abort() { + // do nothing + } + } + + /** + * Creates a builder that doesn't collect any statistics. 
+ */ + public static Builder noopBuilder() { + return new NoopBuilder(); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/geospatial/GeospatialTypes.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/geospatial/GeospatialTypes.java new file mode 100644 index 0000000000..4b6947e32e --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/geospatial/GeospatialTypes.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.column.statistics.geospatial; + +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; + +public class GeospatialTypes { + + private static final int UNKNOWN_TYPE_ID = -1; + private Set types = new HashSet<>(); + private boolean valid = true; + + public GeospatialTypes(Set types) { + this.types = types; + this.valid = true; + } + + public GeospatialTypes(Set types, boolean valid) { + this.types = types; + this.valid = valid; + } + + public GeospatialTypes() {} + + public Set getTypes() { + return types; + } + + public void update(Geometry geometry) { + if (!valid) { + return; + } + + if (geometry == null || geometry.isEmpty()) { + return; + } + + int code = getGeometryTypeCode(geometry); + if (code != UNKNOWN_TYPE_ID) { + types.add(code); + } else { + valid = false; + types.clear(); + } + } + + public void merge(GeospatialTypes other) { + if (!valid) { + return; + } + + if (other == null || !other.valid) { + valid = false; + types.clear(); + return; + } + types.addAll(other.types); + } + + public void reset() { + types.clear(); + valid = true; + } + + public void abort() { + valid = false; + types.clear(); + } + + public boolean isValid() { + return valid; + } + + public GeospatialTypes copy() { + return new GeospatialTypes(new HashSet<>(types), valid); + } + + /** + * Extracts the base geometry type code from a full type code. + * For example: 1001 (XYZ Point) -> 1 (Point) + * + * @param typeId the full geometry type code + * @return the base type code (1-7) + */ + private int getBaseTypeCode(int typeId) { + return typeId % 1000; + } + + /** + * Extracts the dimension prefix from a full type code. 
+ * For example: 1001 (XYZ Point) -> 1000 (XYZ) + * + * @param typeId the full geometry type code + * @return the dimension prefix (0, 1000, 2000, or 3000) + */ + private int getDimensionPrefix(int typeId) { + return (typeId / 1000) * 1000; + } + + @Override + public String toString() { + return "GeospatialTypes{" + "types=" + + types.stream().map(this::typeIdToString).collect(Collectors.toSet()) + '}'; + } + + private int getGeometryTypeId(Geometry geometry) { + switch (geometry.getGeometryType()) { + case Geometry.TYPENAME_POINT: + return 1; + case Geometry.TYPENAME_LINESTRING: + return 2; + case Geometry.TYPENAME_POLYGON: + return 3; + case Geometry.TYPENAME_MULTIPOINT: + return 4; + case Geometry.TYPENAME_MULTILINESTRING: + return 5; + case Geometry.TYPENAME_MULTIPOLYGON: + return 6; + case Geometry.TYPENAME_GEOMETRYCOLLECTION: + return 7; + default: + return UNKNOWN_TYPE_ID; + } + } + + /** + * Geospatial type codes: + * + * | Type | XY | XYZ | XYM | XYZM | + * | :----------------- | :--- | :--- | :--- | :--: | + * | Point | 0001 | 1001 | 2001 | 3001 | + * | LineString | 0002 | 1002 | 2002 | 3002 | + * | Polygon | 0003 | 1003 | 2003 | 3003 | + * | MultiPoint | 0004 | 1004 | 2004 | 3004 | + * | MultiLineString | 0005 | 1005 | 2005 | 3005 | + * | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + * | GeometryCollection | 0007 | 1007 | 2007 | 3007 | + * + * See https://github.com/apache/parquet-format/blob/master/Geospatial.md#geospatial-types + */ + private int getGeometryTypeCode(Geometry geometry) { + int typeId = getGeometryTypeId(geometry); + if (typeId == UNKNOWN_TYPE_ID) { + return UNKNOWN_TYPE_ID; + } + Coordinate[] coordinates = geometry.getCoordinates(); + boolean hasZ = false; + boolean hasM = false; + if (coordinates.length > 0) { + Coordinate firstCoord = coordinates[0]; + hasZ = !Double.isNaN(firstCoord.getZ()); + hasM = !Double.isNaN(firstCoord.getM()); + } + if (hasZ) { + typeId += 1000; + } + if (hasM) { + typeId += 2000; + } + return typeId; + } + + 
private String typeIdToString(int typeId) { + String typeString; + switch (typeId % 1000) { + case 1: + typeString = Geometry.TYPENAME_POINT; + break; + case 2: + typeString = Geometry.TYPENAME_LINESTRING; + break; + case 3: + typeString = Geometry.TYPENAME_POLYGON; + break; + case 4: + typeString = Geometry.TYPENAME_MULTIPOINT; + break; + case 5: + typeString = Geometry.TYPENAME_MULTILINESTRING; + break; + case 6: + typeString = Geometry.TYPENAME_MULTIPOLYGON; + break; + case 7: + typeString = Geometry.TYPENAME_GEOMETRYCOLLECTION; + break; + default: + return "Unknown"; + } + if (typeId >= 3000) { + typeString += " (XYZM)"; + } else if (typeId >= 2000) { + typeString += " (XYM)"; + } else if (typeId >= 1000) { + typeString += " (XYZ)"; + } else { + typeString += " (XY)"; + } + return typeString; + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/column/page/mem/MemPageWriter.java b/parquet-column/src/test/java/org/apache/parquet/column/page/mem/MemPageWriter.java index 4826987227..1594c119fd 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/page/mem/MemPageWriter.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/page/mem/MemPageWriter.java @@ -32,6 +32,7 @@ import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.SizeStatistics; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; import org.apache.parquet.io.ParquetEncodingException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -89,6 +90,7 @@ public void writePage( int rowCount, Statistics statistics, SizeStatistics sizeStatistics, + GeospatialStatistics geospatialStatistics, Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) @@ -135,7 +137,8 @@ public void writePageV2( Encoding dataEncoding, BytesInput data, Statistics statistics, - SizeStatistics sizeStatistics) + SizeStatistics sizeStatistics, + 
GeospatialStatistics geospatialStatistics) throws IOException { writePageV2( rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics); diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/geospatial/TestBoundingBox.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/geospatial/TestBoundingBox.java new file mode 100644 index 0000000000..3b310ed15e --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/geospatial/TestBoundingBox.java @@ -0,0 +1,684 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.column.statistics.geospatial; + +import org.junit.Assert; +import org.junit.Test; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.CoordinateXYZM; +import org.locationtech.jts.geom.GeometryFactory; +import org.locationtech.jts.geom.LineString; +import org.locationtech.jts.geom.Point; + +public class TestBoundingBox { + + @Test + public void testUpdate() { + GeometryFactory geometryFactory = new GeometryFactory(); + BoundingBox boundingBox = new BoundingBox(); + + // Create a 2D point + Point point2D = geometryFactory.createPoint(new Coordinate(10, 20)); + boundingBox.update(point2D); + + Assert.assertTrue(boundingBox.isValid()); + Assert.assertEquals(10.0, boundingBox.getXMin(), 0.0); + Assert.assertEquals(10.0, boundingBox.getXMax(), 0.0); + Assert.assertEquals(20.0, boundingBox.getYMin(), 0.0); + Assert.assertEquals(20.0, boundingBox.getYMax(), 0.0); + } + + @Test + public void testEmptyGeometry() { + GeometryFactory geometryFactory = new GeometryFactory(); + BoundingBox boundingBox = new BoundingBox(); + + // Create an empty point + Point emptyPoint = geometryFactory.createPoint(); + boundingBox.update(emptyPoint); + + // Empty geometry should retain the initial state + Assert.assertTrue(boundingBox.isValid()); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getXMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getXMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getYMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getYMax(), 0.0); + + // Test that after adding a non-empty geometry, values are updated correctly + Point point = geometryFactory.createPoint(new Coordinate(10, 20)); + boundingBox.update(point); + Assert.assertTrue(boundingBox.isValid()); + Assert.assertEquals(10.0, boundingBox.getXMin(), 0.0); + Assert.assertEquals(10.0, boundingBox.getXMax(), 0.0); + Assert.assertEquals(20.0, boundingBox.getYMin(), 0.0); 
+ Assert.assertEquals(20.0, boundingBox.getYMax(), 0.0); + + // Update with another empty geometry, should not change the bounds + boundingBox.update(emptyPoint); + Assert.assertTrue(boundingBox.isValid()); + Assert.assertEquals(10.0, boundingBox.getXMin(), 0.0); + Assert.assertEquals(10.0, boundingBox.getXMax(), 0.0); + Assert.assertEquals(20.0, boundingBox.getYMin(), 0.0); + Assert.assertEquals(20.0, boundingBox.getYMax(), 0.0); + } + + @Test + public void testNaNCoordinates() { + GeometryFactory geometryFactory = new GeometryFactory(); + BoundingBox boundingBox = new BoundingBox(); + + // Create a point with NaN coordinates + Point nanPoint = geometryFactory.createPoint(new Coordinate(Double.NaN, Double.NaN)); + boundingBox.update(nanPoint); + + // All values should be NaN after updating with all-NaN coordinates + Assert.assertTrue(boundingBox.isValid()); + Assert.assertTrue(boundingBox.isXYEmpty()); + + // Reset the bounding box for the next test + boundingBox = new BoundingBox(); + + // Create a mixed point with a valid coordinate and a NaN coordinate + Point mixedPoint = geometryFactory.createPoint(new Coordinate(15.0, Double.NaN)); + boundingBox.update(mixedPoint); + + // The valid X coordinate should be used, Y should remain at initial values + Assert.assertTrue(boundingBox.isValid()); + Assert.assertTrue(boundingBox.isXYEmpty()); + } + + @Test + public void testNaNZAndMValues() { + GeometryFactory geometryFactory = new GeometryFactory(); + BoundingBox boundingBox = new BoundingBox(); + + // Create a point with NaN Z value only + Coordinate coord = new Coordinate(10, 20); + coord.setZ(Double.NaN); // Only set Z, not M + Point nanZPoint = geometryFactory.createPoint(coord); + boundingBox.update(nanZPoint); + + // X and Y should be updated, but Z should remain NaN + Assert.assertTrue(boundingBox.isValid()); + Assert.assertEquals(10.0, boundingBox.getXMin(), 0.0); + Assert.assertEquals(10.0, boundingBox.getXMax(), 0.0); + Assert.assertEquals(20.0, 
boundingBox.getYMin(), 0.0); + Assert.assertEquals(20.0, boundingBox.getYMax(), 0.0); + + // Add a point with valid Z value + Coordinate coord2 = new Coordinate(15, 25, 30); // Using constructor with Z + Point validZPoint = geometryFactory.createPoint(coord2); + boundingBox.update(validZPoint); + + // X, Y, and Z values should now be updated + Assert.assertTrue(boundingBox.isValid()); + Assert.assertEquals(10.0, boundingBox.getXMin(), 0.0); + Assert.assertEquals(15.0, boundingBox.getXMax(), 0.0); + Assert.assertEquals(20.0, boundingBox.getYMin(), 0.0); + Assert.assertEquals(25.0, boundingBox.getYMax(), 0.0); + Assert.assertEquals(30.0, boundingBox.getZMin(), 0.0); + Assert.assertEquals(30.0, boundingBox.getZMax(), 0.0); + + // Reset the bounding box for M value tests + boundingBox.reset(); + + // Create a point with NaN M value + CoordinateXYZM coordNanM = new CoordinateXYZM(10, 20, 30, Double.NaN); + Point nanMPoint = geometryFactory.createPoint(coordNanM); + boundingBox.update(nanMPoint); + + // X, Y, Z should be updated, but M should remain at initial values + Assert.assertTrue(boundingBox.isValid()); + Assert.assertEquals(10.0, boundingBox.getXMin(), 0.0); + Assert.assertEquals(10.0, boundingBox.getXMax(), 0.0); + Assert.assertEquals(20.0, boundingBox.getYMin(), 0.0); + Assert.assertEquals(20.0, boundingBox.getYMax(), 0.0); + Assert.assertEquals(30.0, boundingBox.getZMin(), 0.0); + Assert.assertEquals(30.0, boundingBox.getZMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getMMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getMMax(), 0.0); + + // Add a point with valid M value + CoordinateXYZM coordValidM = new CoordinateXYZM(15, 25, 35, 40); + Point validMPoint = geometryFactory.createPoint(coordValidM); + boundingBox.update(validMPoint); + + // All values including M should now be updated + Assert.assertTrue(boundingBox.isValid()); + Assert.assertEquals(10.0, boundingBox.getXMin(), 0.0); + 
Assert.assertEquals(15.0, boundingBox.getXMax(), 0.0); + Assert.assertEquals(20.0, boundingBox.getYMin(), 0.0); + Assert.assertEquals(25.0, boundingBox.getYMax(), 0.0); + Assert.assertEquals(30.0, boundingBox.getZMin(), 0.0); + Assert.assertEquals(35.0, boundingBox.getZMax(), 0.0); + Assert.assertEquals(40.0, boundingBox.getMMin(), 0.0); + Assert.assertEquals(40.0, boundingBox.getMMax(), 0.0); + } + + @Test + public void testAbort() { + GeometryFactory geometryFactory = new GeometryFactory(); + BoundingBox boundingBox = new BoundingBox(); + + // Create a valid point + Point validPoint = geometryFactory.createPoint(new Coordinate(10, 20)); + boundingBox.update(validPoint); + + // Check initial values + Assert.assertTrue(boundingBox.isValid()); + Assert.assertEquals(10.0, boundingBox.getXMin(), 0.0); + Assert.assertEquals(10.0, boundingBox.getXMax(), 0.0); + Assert.assertEquals(20.0, boundingBox.getYMin(), 0.0); + Assert.assertEquals(20.0, boundingBox.getYMax(), 0.0); + + // Abort the update + boundingBox.abort(); + + // Check that values are reset to initial state + Assert.assertFalse(boundingBox.isValid()); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getXMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getXMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getYMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getYMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getZMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getZMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getMMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getMMax(), 0.0); + } + + @Test + public void testEmptyBoundingBox() { + BoundingBox boundingBox = new BoundingBox(); + + // Assert all initial values are Infinity + Assert.assertTrue(boundingBox.isValid()); + Assert.assertEquals(Double.POSITIVE_INFINITY, 
boundingBox.getXMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getXMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getYMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getYMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getZMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getZMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getMMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getMMax(), 0.0); + } + + @Test + public void testMergeBoundingBoxes() { + BoundingBox boundingBox1 = new BoundingBox(0, 10, 0, 20, Double.NaN, Double.NaN, Double.NaN, Double.NaN); + BoundingBox boundingBox2 = new BoundingBox(5, 15, 10, 30, Double.NaN, Double.NaN, Double.NaN, Double.NaN); + + boundingBox1.merge(boundingBox2); + + Assert.assertTrue(boundingBox1.isValid()); + Assert.assertEquals(0.0, boundingBox1.getXMin(), 0.0); + Assert.assertEquals(15.0, boundingBox1.getXMax(), 0.0); + Assert.assertEquals(0.0, boundingBox1.getYMin(), 0.0); + Assert.assertEquals(30.0, boundingBox1.getYMax(), 0.0); + Assert.assertTrue(Double.isNaN(boundingBox1.getZMin())); + Assert.assertTrue(Double.isNaN(boundingBox1.getZMax())); + Assert.assertTrue(Double.isNaN(boundingBox1.getMMin())); + Assert.assertTrue(Double.isNaN(boundingBox1.getMMax())); + } + + @Test + public void testMergeWithEmptyBoundingBox() { + BoundingBox boundingBox1 = new BoundingBox(0, 10, 0, 20, Double.NaN, Double.NaN, Double.NaN, Double.NaN); + BoundingBox emptyBoundingBox = new BoundingBox(); + + boundingBox1.merge(emptyBoundingBox); + + Assert.assertTrue(boundingBox1.isValid()); + Assert.assertEquals(0.0, boundingBox1.getXMin(), 0.0); + Assert.assertEquals(10.0, boundingBox1.getXMax(), 0.0); + Assert.assertEquals(0.0, boundingBox1.getYMin(), 0.0); + Assert.assertEquals(20.0, boundingBox1.getYMax(), 0.0); + Assert.assertTrue(Double.isNaN(boundingBox1.getZMin())); + 
Assert.assertTrue(Double.isNaN(boundingBox1.getZMax())); + Assert.assertTrue(Double.isNaN(boundingBox1.getMMin())); + Assert.assertTrue(Double.isNaN(boundingBox1.getMMax())); + } + + @Test + public void testUpdateWithNullGeometry() { + BoundingBox boundingBox = new BoundingBox(); + boundingBox.update(null); + + // Check that the bounding box remains in its initial state + Assert.assertTrue(boundingBox.isValid()); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getXMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getXMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getYMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getYMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getZMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getZMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, boundingBox.getMMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, boundingBox.getMMax(), 0.0); + } + + @Test + public void testMergeWithNaNValues() { + // Test merging with NaN values in different dimensions + BoundingBox box1 = new BoundingBox(0, 10, 0, 10, 0, 10, 0, 10); + BoundingBox box2 = new BoundingBox(5, 15, Double.NaN, Double.NaN, 5, 15, Double.NaN, Double.NaN); + + box1.merge(box2); + + // Check that box1 is invalid after the merge + Assert.assertFalse("Box1 should be invalid after the merge", box1.isValid()); + } + + @Test + public void testUpdateWithAllNaNCoordinatesAfterValid() { + GeometryFactory gf = new GeometryFactory(); + BoundingBox box = new BoundingBox(); + + // First add a valid point + box.update(gf.createPoint(new Coordinate(10, 20))); + Assert.assertTrue(box.isValid()); + Assert.assertEquals(10.0, box.getXMin(), 0.0); + Assert.assertEquals(10.0, box.getXMax(), 0.0); + Assert.assertEquals(20.0, box.getYMin(), 0.0); + Assert.assertEquals(20.0, box.getYMax(), 0.0); + + // Then update with all NaN coordinates - should not 
change valid values + Point nanPoint = gf.createPoint(new Coordinate(Double.NaN, Double.NaN)); + box.update(nanPoint); + + Assert.assertFalse("Box should be empty after the merge", box.isXYEmpty()); + Assert.assertTrue("Box should be valid after the merge", box.isValid()); + } + + @Test + public void testUpdate3DPoint() { + GeometryFactory gf = new GeometryFactory(); + BoundingBox box = new BoundingBox(); + + // Create a 3D point + Coordinate coord = new Coordinate(10, 20, 30); + Point point3D = gf.createPoint(coord); + box.update(point3D); + + Assert.assertTrue(box.isValid()); + Assert.assertEquals(10.0, box.getXMin(), 0.0); + Assert.assertEquals(10.0, box.getXMax(), 0.0); + Assert.assertEquals(20.0, box.getYMin(), 0.0); + Assert.assertEquals(20.0, box.getYMax(), 0.0); + Assert.assertEquals(30.0, box.getZMin(), 0.0); + Assert.assertEquals(30.0, box.getZMax(), 0.0); + + // Add another 3D point with different Z + box.update(gf.createPoint(new Coordinate(15, 25, 10))); + + Assert.assertTrue(box.isValid()); + Assert.assertEquals(10.0, box.getXMin(), 0.0); + Assert.assertEquals(15.0, box.getXMax(), 0.0); + Assert.assertEquals(20.0, box.getYMin(), 0.0); + Assert.assertEquals(25.0, box.getYMax(), 0.0); + Assert.assertEquals(10.0, box.getZMin(), 0.0); + Assert.assertEquals(30.0, box.getZMax(), 0.0); + } + + @Test + public void testUpdateWithMeasureValue() { + GeometryFactory gf = new GeometryFactory(); + BoundingBox box = new BoundingBox(); + + // Create a point with M value using CoordinateXYZM instead of setM + CoordinateXYZM coord = new CoordinateXYZM(10, 20, Double.NaN, 5.0); + Point pointWithM = gf.createPoint(coord); + box.update(pointWithM); + + Assert.assertTrue(box.isValid()); + Assert.assertEquals(10.0, box.getXMin(), 0.0); + Assert.assertEquals(10.0, box.getXMax(), 0.0); + Assert.assertEquals(20.0, box.getYMin(), 0.0); + Assert.assertEquals(20.0, box.getYMax(), 0.0); + Assert.assertEquals(5.0, box.getMMin(), 0.0); + Assert.assertEquals(5.0, box.getMMax(), 0.0); 
+ + // Add another point with different M value + CoordinateXYZM coord2 = new CoordinateXYZM(15, 25, Double.NaN, 10.0); + box.update(gf.createPoint(coord2)); + + Assert.assertTrue(box.isValid()); + Assert.assertEquals(10.0, box.getXMin(), 0.0); + Assert.assertEquals(15.0, box.getXMax(), 0.0); + Assert.assertEquals(20.0, box.getYMin(), 0.0); + Assert.assertEquals(25.0, box.getYMax(), 0.0); + Assert.assertEquals(5.0, box.getMMin(), 0.0); + Assert.assertEquals(10.0, box.getMMax(), 0.0); + } + + @Test + public void testResetAfterUpdate() { + GeometryFactory gf = new GeometryFactory(); + BoundingBox box = new BoundingBox(); + + // Update with a valid point + box.update(gf.createPoint(new Coordinate(10, 20))); + Assert.assertTrue(box.isValid()); + Assert.assertEquals(10.0, box.getXMin(), 0.0); + + // Reset the box + box.reset(); + + // All values should be reset to their initial state + Assert.assertTrue(box.isValid()); + Assert.assertEquals(Double.POSITIVE_INFINITY, box.getXMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, box.getXMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, box.getYMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, box.getYMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, box.getZMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, box.getZMax(), 0.0); + Assert.assertEquals(Double.POSITIVE_INFINITY, box.getMMin(), 0.0); + Assert.assertEquals(Double.NEGATIVE_INFINITY, box.getMMax(), 0.0); + + // Update after reset should work correctly + box.update(gf.createPoint(new Coordinate(30, 40))); + Assert.assertTrue(box.isValid()); + Assert.assertEquals(30.0, box.getXMin(), 0.0); + Assert.assertEquals(30.0, box.getXMax(), 0.0); + Assert.assertEquals(40.0, box.getYMin(), 0.0); + Assert.assertEquals(40.0, box.getYMax(), 0.0); + } + + @Test + public void testCopy() { + // Create and populate a bounding box + BoundingBox original = new BoundingBox(1, 2, 3, 4, 5, 6, 7, 8); + + // Create copy + BoundingBox copy 
= original.copy(); + + // Verify all values are copied correctly + Assert.assertTrue(original.isValid()); + Assert.assertEquals(original.getXMin(), copy.getXMin(), 0.0); + Assert.assertEquals(original.getXMax(), copy.getXMax(), 0.0); + Assert.assertEquals(original.getYMin(), copy.getYMin(), 0.0); + Assert.assertEquals(original.getYMax(), copy.getYMax(), 0.0); + Assert.assertEquals(original.getZMin(), copy.getZMin(), 0.0); + Assert.assertEquals(original.getZMax(), copy.getZMax(), 0.0); + Assert.assertEquals(original.getMMin(), copy.getMMin(), 0.0); + Assert.assertEquals(original.getMMax(), copy.getMMax(), 0.0); + + // Modify the copy and verify original is unchanged + copy.reset(); + Assert.assertTrue(original.isValid()); + Assert.assertEquals(1.0, original.getXMin(), 0.0); + } + + @Test + public void testMergeWithAllNaNBox() { + // Box with valid values + BoundingBox box1 = new BoundingBox(1, 2, 3, 4, 5, 6, 7, 8); + + // Empty box with all NaN values + BoundingBox box2 = new BoundingBox(); + + // Merge should keep existing values + box1.merge(box2); + + Assert.assertTrue(box1.isValid()); + Assert.assertEquals(1.0, box1.getXMin(), 0.0); + Assert.assertEquals(2.0, box1.getXMax(), 0.0); + Assert.assertEquals(3.0, box1.getYMin(), 0.0); + Assert.assertEquals(4.0, box1.getYMax(), 0.0); + Assert.assertEquals(5.0, box1.getZMin(), 0.0); + Assert.assertEquals(6.0, box1.getZMax(), 0.0); + Assert.assertEquals(7.0, box1.getMMin(), 0.0); + Assert.assertEquals(8.0, box1.getMMax(), 0.0); + + // Test the reverse - NaN box merging with valid box + BoundingBox box3 = new BoundingBox(); + box3.merge(box1); + + Assert.assertTrue(box1.isValid()); + Assert.assertEquals(1.0, box3.getXMin(), 0.0); + Assert.assertEquals(2.0, box3.getXMax(), 0.0); + Assert.assertEquals(3.0, box3.getYMin(), 0.0); + Assert.assertEquals(4.0, box3.getYMax(), 0.0); + } + + @Test + public void testLineStringWithNaNCoordinates() { + GeometryFactory gf = new GeometryFactory(); + BoundingBox box = new BoundingBox(); 
+ + // Create a LineString with NaN coordinates in the middle + Coordinate[] coords = + new Coordinate[] {new Coordinate(0, 1), new Coordinate(Double.NaN, Double.NaN), new Coordinate(2, 3)}; + + box.update(gf.createLineString(coords)); + + // The bounding box should include the valid coordinates and ignore NaN + Assert.assertTrue(box.isValid()); + Assert.assertEquals(0.0, box.getXMin(), 0.0); + Assert.assertEquals(2.0, box.getXMax(), 0.0); + Assert.assertEquals(1.0, box.getYMin(), 0.0); + Assert.assertEquals(3.0, box.getYMax(), 0.0); + + // Test with only one valid coordinate + BoundingBox box2 = new BoundingBox(); + Coordinate[] coords2 = new Coordinate[] { + new Coordinate(5, 6), new Coordinate(Double.NaN, Double.NaN), new Coordinate(Double.NaN, Double.NaN) + }; + + box2.update(gf.createLineString(coords2)); + + Assert.assertTrue(box2.isValid()); + Assert.assertEquals(5.0, box2.getXMin(), 0.0); + Assert.assertEquals(5.0, box2.getXMax(), 0.0); + Assert.assertEquals(6.0, box2.getYMin(), 0.0); + Assert.assertEquals(6.0, box2.getYMax(), 0.0); + + // Test with all NaN coordinates + BoundingBox box3 = new BoundingBox(); + Coordinate[] coords3 = + new Coordinate[] {new Coordinate(Double.NaN, Double.NaN), new Coordinate(Double.NaN, Double.NaN)}; + + box3.update(gf.createLineString(coords3)); + + // The bounding box should remain empty + Assert.assertTrue(box3.isValid()); + Assert.assertTrue(box3.isXYEmpty()); + } + + @Test + public void testLineStringWithPartialNaNCoordinates() { + GeometryFactory gf = new GeometryFactory(); + BoundingBox box = new BoundingBox(); + + // Create a LineString with partial NaN coordinate in the middle + // where only the Y value is NaN: "LINESTRING (0 1, 1 nan, 2 3)" + Coordinate[] coords = + new Coordinate[] {new Coordinate(0, 1), new Coordinate(1, Double.NaN), new Coordinate(2, 3)}; + + box.update(gf.createLineString(coords)); + + // The bounding box should include all valid coordinates + Assert.assertTrue(box.isValid()); + 
Assert.assertEquals(0.0, box.getXMin(), 0.0); + Assert.assertEquals(2.0, box.getXMax(), 0.0); + Assert.assertEquals(1.0, box.getYMin(), 0.0); + Assert.assertEquals(3.0, box.getYMax(), 0.0); + + // Test with mixed NaN values in different components + BoundingBox box2 = new BoundingBox(); + Coordinate[] coords2 = + new Coordinate[] {new Coordinate(Double.NaN, 5), new Coordinate(6, Double.NaN), new Coordinate(7, 8)}; + + box2.update(gf.createLineString(coords2)); + Assert.assertTrue(box2.isValid()); + Assert.assertTrue(box2.isXYEmpty()); + } + + /** + * Tests the end-to-end case for updating and merging bounding boxes with mixed valid and NaN coordinates. + * + * Scenario - Parquet file with multiple row groups: + * file-level bbox: [1, 9, 100, 900] + * + * Row group 1: [1, 2, 100, 100] + * - POINT (1, 100) + * - POINT (2, NaN) + * + * Row group 2: [3, 3, 300, 300] + * - POINT (3, 300) + * - POINT (NaN, NaN) + * + * Row group 3: no valid bbox + * - POINT (5, NaN) + * - POINT (6, NaN) + * + * Row group 4: [7, 8, 700, 800] + * - POINT (7, 700) + * - POINT (8, 800) + * + * Row group 5: no valid bbox + * - POINT (NaN, NaN) + * - POINT (NaN, NaN) + * + * Row group 6: [9, 9, 900, 900] + * - POINT (9, 900) + * - LINESTRING EMPTY + * + * The test verifies that: + * 1. Individual row group bounding boxes correctly handle NaN coordinates + * 2. The merge operation correctly combines valid bounding boxes and ignores invalid ones + * 3. The resulting file-level bounding box correctly represents the overall spatial extent [1, 8, 100, 800] + * 4. 
The merge operation is commutative - the order of merging does not affect the result + */ + @Test + public void testMergingRowGroupBoundingBoxes() { + GeometryFactory gf = new GeometryFactory(); + + // File-level bounding box (to be computed by merging row group boxes) + BoundingBox fileBBox = new BoundingBox(); + + // Row Group 1: [1, 2, 100, 100] + BoundingBox rowGroup1 = new BoundingBox(); + rowGroup1.update(gf.createPoint(new Coordinate(1, 100))); + // Point with NaN Y-coordinate + rowGroup1.update(gf.createPoint(new Coordinate(2, Double.NaN))); + + // Verify Row Group 1 + Assert.assertTrue(rowGroup1.isValid()); + Assert.assertEquals(1.0, rowGroup1.getXMin(), 0.0); + Assert.assertEquals(2.0, rowGroup1.getXMax(), 0.0); + Assert.assertEquals(100.0, rowGroup1.getYMin(), 0.0); + Assert.assertEquals(100.0, rowGroup1.getYMax(), 0.0); + Assert.assertTrue(rowGroup1.isValid()); + + // Row Group 2: [3, 3, 300, 300] + BoundingBox rowGroup2 = new BoundingBox(); + rowGroup2.update(gf.createPoint(new Coordinate(3, 300))); + // Point with all NaN coordinates + Coordinate nanCoord = new Coordinate(Double.NaN, Double.NaN); + rowGroup2.update(gf.createPoint(nanCoord)); + + // Verify Row Group 2 + Assert.assertTrue(rowGroup2.isValid()); + Assert.assertEquals(3.0, rowGroup2.getXMin(), 0.0); + Assert.assertEquals(3.0, rowGroup2.getXMax(), 0.0); + Assert.assertEquals(300.0, rowGroup2.getYMin(), 0.0); + Assert.assertEquals(300.0, rowGroup2.getYMax(), 0.0); + Assert.assertTrue(rowGroup2.isValid()); + + // Row Group 3: No defined bbox due to NaN Y-coordinates + BoundingBox rowGroup3 = new BoundingBox(); + rowGroup3.update(gf.createPoint(new Coordinate(5, Double.NaN))); + rowGroup3.update(gf.createPoint(new Coordinate(6, Double.NaN))); + + // Verify Row Group 3 + Assert.assertTrue(rowGroup3.isXYEmpty()); + + // Row Group 4: [7, 8, 700, 800] + BoundingBox rowGroup4 = new BoundingBox(); + rowGroup4.update(gf.createPoint(new Coordinate(7, 700))); + rowGroup4.update(gf.createPoint(new 
Coordinate(8, 800))); + + // Verify Row Group 4 + Assert.assertTrue(rowGroup4.isValid()); + Assert.assertEquals(7.0, rowGroup4.getXMin(), 0.0); + Assert.assertEquals(8.0, rowGroup4.getXMax(), 0.0); + Assert.assertEquals(700.0, rowGroup4.getYMin(), 0.0); + Assert.assertEquals(800.0, rowGroup4.getYMax(), 0.0); + Assert.assertTrue(rowGroup4.isValid()); + + // Row Group 5: No defined bbox due to all NaN coordinates + BoundingBox rowGroup5 = new BoundingBox(); + rowGroup5.update(gf.createPoint(nanCoord)); + rowGroup5.update(gf.createPoint(nanCoord)); + + // Verify Row Group 5 + Assert.assertTrue(rowGroup5.isXYEmpty()); + + // Row Group 6: Test mixing an empty geometry with a valid point [9, 9, 900, 900] + BoundingBox rowGroup6 = new BoundingBox(); + // Create an empty LineString + LineString emptyLineString = gf.createLineString(new Coordinate[0]); + // Create a valid point + Coordinate pointCoord = new Coordinate(9, 900); + Point validPoint = gf.createPoint(pointCoord); + + // Update the bounding box with both geometries + rowGroup6.update(emptyLineString); // This should be a no-op + rowGroup6.update(validPoint); // This should set the bounds + + // Verify Row Group 6 + Assert.assertTrue(rowGroup6.isValid()); + Assert.assertEquals(9.0, rowGroup6.getXMin(), 0.0); + Assert.assertEquals(9.0, rowGroup6.getXMax(), 0.0); + Assert.assertEquals(900.0, rowGroup6.getYMin(), 0.0); + Assert.assertEquals(900.0, rowGroup6.getYMax(), 0.0); + + // Merge row group boxes into file-level box + fileBBox.merge(rowGroup1); + fileBBox.merge(rowGroup2); + fileBBox.merge(rowGroup3); + fileBBox.merge(rowGroup4); + fileBBox.merge(rowGroup5); + fileBBox.merge(rowGroup6); + + // Verify file-level bounding box + // Note: Now includes point (9, 900) from rowGroup6 + Assert.assertTrue(fileBBox.isValid()); + Assert.assertEquals(1.0, fileBBox.getXMin(), 0.0); + Assert.assertEquals(9.0, fileBBox.getXMax(), 0.0); + Assert.assertEquals(100.0, fileBBox.getYMin(), 0.0); + Assert.assertEquals(900.0, 
fileBBox.getYMax(), 0.0); + Assert.assertTrue(fileBBox.isValid()); + + // Test merging in reverse order to ensure commutativity + BoundingBox reverseMergeBox = new BoundingBox(); + reverseMergeBox.merge(rowGroup6); + reverseMergeBox.merge(rowGroup5); + reverseMergeBox.merge(rowGroup4); + reverseMergeBox.merge(rowGroup3); + reverseMergeBox.merge(rowGroup2); + reverseMergeBox.merge(rowGroup1); + + Assert.assertTrue(reverseMergeBox.isValid()); + Assert.assertEquals(1.0, reverseMergeBox.getXMin(), 0.0); + Assert.assertEquals(9.0, reverseMergeBox.getXMax(), 0.0); + Assert.assertEquals(100.0, reverseMergeBox.getYMin(), 0.0); + Assert.assertEquals(900.0, reverseMergeBox.getYMax(), 0.0); + Assert.assertTrue(reverseMergeBox.isValid()); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/geospatial/TestGeospatialStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/geospatial/TestGeospatialStatistics.java new file mode 100644 index 0000000000..7c91ffbecb --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/geospatial/TestGeospatialStatistics.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.column.statistics.geospatial; + +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; +import org.junit.Assert; +import org.junit.Test; +import org.locationtech.jts.io.ParseException; +import org.locationtech.jts.io.WKBWriter; +import org.locationtech.jts.io.WKTReader; + +public class TestGeospatialStatistics { + + @Test + public void testAddGeospatialData() throws ParseException { + PrimitiveType type = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geometryType(null)) + .named("a"); + GeospatialStatistics.Builder builder = GeospatialStatistics.newBuilder(type); + WKTReader wktReader = new WKTReader(); + WKBWriter wkbWriter = new WKBWriter(); + // Convert Geometry to WKB and update the builder + builder.update(Binary.fromConstantByteArray(wkbWriter.write(wktReader.read("POINT (1 1)")))); + builder.update(Binary.fromConstantByteArray(wkbWriter.write(wktReader.read("POINT (2 2)")))); + GeospatialStatistics statistics = builder.build(); + Assert.assertTrue(statistics.isValid()); + Assert.assertNotNull(statistics.getBoundingBox()); + Assert.assertNotNull(statistics.getGeospatialTypes()); + } + + @Test + public void testMergeGeospatialStatistics() throws ParseException { + PrimitiveType type = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geometryType(null)) + .named("a"); + + WKTReader wktReader = new WKTReader(); + WKBWriter wkbWriter = new WKBWriter(); + + GeospatialStatistics.Builder builder1 = GeospatialStatistics.newBuilder(type); + builder1.update(Binary.fromConstantByteArray(wkbWriter.write(wktReader.read("POINT (1 1)")))); + GeospatialStatistics statistics1 = builder1.build(); + + GeospatialStatistics.Builder builder2 = GeospatialStatistics.newBuilder(type); + 
builder2.update(Binary.fromConstantByteArray(wkbWriter.write(wktReader.read("POINT (2 2)")))); + GeospatialStatistics statistics2 = builder2.build(); + + statistics1.merge(statistics2); + Assert.assertTrue(statistics1.isValid()); + Assert.assertNotNull(statistics1.getBoundingBox()); + Assert.assertNotNull(statistics1.getGeospatialTypes()); + } + + @Test + public void testMergeNullGeospatialStatistics() { + // Create a valid stats object + PrimitiveType type = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geometryType(null)) + .named("a"); + + WKTReader wktReader = new WKTReader(); + WKBWriter wkbWriter = new WKBWriter(); + + GeospatialStatistics.Builder validBuilder = GeospatialStatistics.newBuilder(type); + try { + validBuilder.update(Binary.fromConstantByteArray(wkbWriter.write(wktReader.read("POINT (1 1)")))); + } catch (ParseException e) { + Assert.fail("Failed to parse valid WKT: " + e.getMessage()); + } + GeospatialStatistics validStats = validBuilder.build(); + Assert.assertTrue(validStats.isValid()); + + // Create stats with null components + GeospatialStatistics nullStats = new GeospatialStatistics(null, null); + Assert.assertFalse(nullStats.isValid()); + + // Test merging valid with null + GeospatialStatistics validCopy = validStats.copy(); + validCopy.merge(nullStats); + Assert.assertFalse(validCopy.isValid()); + Assert.assertNotNull(validCopy.getBoundingBox()); + Assert.assertNotNull(validCopy.getGeospatialTypes()); + + // Test merging null with valid + nullStats = new GeospatialStatistics(null, null); + nullStats.merge(validStats); + Assert.assertFalse(nullStats.isValid()); + Assert.assertNull(nullStats.getBoundingBox()); + Assert.assertNull(nullStats.getGeospatialTypes()); + + // Create stats with null bounding box only + GeospatialStatistics nullBboxStats = new GeospatialStatistics(null, new GeospatialTypes()); + Assert.assertTrue(nullBboxStats.isValid()); + + // Test merging valid with null bounding box + 
validCopy = validStats.copy(); + validCopy.merge(nullBboxStats); + Assert.assertTrue(validCopy.isValid()); + Assert.assertNotNull(validCopy.getBoundingBox()); + Assert.assertNotNull(validCopy.getGeospatialTypes()); + } + + @Test + public void testCopyGeospatialStatistics() throws ParseException { + PrimitiveType type = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geometryType(null)) + .named("a"); + GeospatialStatistics.Builder builder = GeospatialStatistics.newBuilder(type); + builder.update(Binary.fromConstantByteArray(new WKBWriter().write(new WKTReader().read("POINT (1 1)")))); + GeospatialStatistics statistics = builder.build(); // built from a WKB-encoded point, matching the other tests + GeospatialStatistics copy = statistics.copy(); + Assert.assertTrue(copy.isValid()); + Assert.assertNotNull(copy.getBoundingBox()); + Assert.assertNotNull(copy.getGeospatialTypes()); + } + + @Test + public void testInvalidGeometryMakesStatisticsInvalid() { + PrimitiveType type = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geometryType(null)) + .named("a"); + GeospatialStatistics.Builder builder = GeospatialStatistics.newBuilder(type); + + // First add a valid geometry + WKTReader wktReader = new WKTReader(); + WKBWriter wkbWriter = new WKBWriter(); + try { + builder.update(Binary.fromConstantByteArray(wkbWriter.write(wktReader.read("POINT (1 1)")))); + } catch (ParseException e) { + Assert.fail("Failed to parse valid WKT: " + e.getMessage()); + } + + // Valid at this point + GeospatialStatistics validStats = builder.build(); + Assert.assertTrue(validStats.isValid()); + + // Now add invalid data - corrupt WKB bytes + byte[] invalidBytes = new byte[] {0x01, 0x02, 0x03}; // Invalid WKB format + builder.update(Binary.fromConstantByteArray(invalidBytes)); + + // The invalid data is omitted from the stats, so the rebuilt stats are still valid + GeospatialStatistics invalidStats = builder.build(); + Assert.assertTrue(invalidStats.isValid()); + } + + @Test + public void testNoopBuilder() { + GeospatialStatistics.Builder builder = GeospatialStatistics.noopBuilder(); +
GeospatialStatistics statistics = builder.build(); + Assert.assertFalse(statistics.isValid()); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/geospatial/TestGeospatialTypes.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/geospatial/TestGeospatialTypes.java new file mode 100644 index 0000000000..832aac06ee --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/geospatial/TestGeospatialTypes.java @@ -0,0 +1,559 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.column.statistics.geospatial; + +import java.util.HashSet; +import java.util.Set; +import org.junit.Assert; +import org.junit.Test; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.CoordinateXYZM; +import org.locationtech.jts.geom.GeometryCollection; +import org.locationtech.jts.geom.GeometryFactory; +import org.locationtech.jts.geom.LineString; +import org.locationtech.jts.geom.LinearRing; +import org.locationtech.jts.geom.MultiLineString; +import org.locationtech.jts.geom.MultiPoint; +import org.locationtech.jts.geom.MultiPolygon; +import org.locationtech.jts.geom.Point; +import org.locationtech.jts.geom.Polygon; + +public class TestGeospatialTypes { + + @Test + public void testUpdateWithDifferentGeometryTypes() { + GeometryFactory gf = new GeometryFactory(); + GeospatialTypes geospatialTypes = new GeospatialTypes(); + + // Test with Point (type code 1) + Point point = gf.createPoint(new Coordinate(1, 1)); + geospatialTypes.update(point); + Assert.assertTrue(geospatialTypes.getTypes().contains(1)); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + + // Test with LineString (type code 2) + Coordinate[] lineCoords = new Coordinate[] {new Coordinate(1, 1), new Coordinate(2, 2)}; + LineString line = gf.createLineString(lineCoords); + geospatialTypes.update(line); + Assert.assertTrue(geospatialTypes.getTypes().contains(1)); + Assert.assertTrue(geospatialTypes.getTypes().contains(2)); + Assert.assertEquals(2, geospatialTypes.getTypes().size()); + + // Test with Polygon (type code 3) + Coordinate[] polygonCoords = new Coordinate[] { + new Coordinate(0, 0), new Coordinate(1, 0), + new Coordinate(1, 1), new Coordinate(0, 1), + new Coordinate(0, 0) + }; + LinearRing shell = gf.createLinearRing(polygonCoords); + Polygon polygon = gf.createPolygon(shell); + geospatialTypes.update(polygon); + Assert.assertTrue(geospatialTypes.getTypes().contains(1)); + 
Assert.assertTrue(geospatialTypes.getTypes().contains(2)); + Assert.assertTrue(geospatialTypes.getTypes().contains(3)); + Assert.assertEquals(3, geospatialTypes.getTypes().size()); + } + + @Test + public void testUpdateWithComplexGeometries() { + GeometryFactory gf = new GeometryFactory(); + GeospatialTypes geospatialTypes = new GeospatialTypes(); + + // MultiPoint (type code 4) + Point[] points = new Point[] {gf.createPoint(new Coordinate(1, 1)), gf.createPoint(new Coordinate(2, 2))}; + MultiPoint multiPoint = gf.createMultiPoint(points); + geospatialTypes.update(multiPoint); + Assert.assertTrue(geospatialTypes.getTypes().contains(4)); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + + // MultiLineString (type code 5) + LineString[] lines = new LineString[] { + gf.createLineString(new Coordinate[] {new Coordinate(1, 1), new Coordinate(2, 2)}), + gf.createLineString(new Coordinate[] {new Coordinate(3, 3), new Coordinate(4, 4)}) + }; + MultiLineString multiLine = gf.createMultiLineString(lines); + geospatialTypes.update(multiLine); + Assert.assertTrue(geospatialTypes.getTypes().contains(4)); + Assert.assertTrue(geospatialTypes.getTypes().contains(5)); + Assert.assertEquals(2, geospatialTypes.getTypes().size()); + + // MultiPolygon (type code 6) + Polygon[] polygons = new Polygon[] { + gf.createPolygon(gf.createLinearRing(new Coordinate[] { + new Coordinate(0, 0), new Coordinate(1, 0), + new Coordinate(1, 1), new Coordinate(0, 1), + new Coordinate(0, 0) + })) + }; + MultiPolygon multiPolygon = gf.createMultiPolygon(polygons); + geospatialTypes.update(multiPolygon); + Assert.assertTrue(geospatialTypes.getTypes().contains(4)); + Assert.assertTrue(geospatialTypes.getTypes().contains(5)); + Assert.assertTrue(geospatialTypes.getTypes().contains(6)); + Assert.assertEquals(3, geospatialTypes.getTypes().size()); + + // GeometryCollection (type code 7) + GeometryCollection collection = gf.createGeometryCollection( + new org.locationtech.jts.geom.Geometry[] 
{multiPoint, multiLine, multiPolygon}); + geospatialTypes.update(collection); + Assert.assertTrue(geospatialTypes.getTypes().contains(4)); + Assert.assertTrue(geospatialTypes.getTypes().contains(5)); + Assert.assertTrue(geospatialTypes.getTypes().contains(6)); + Assert.assertTrue(geospatialTypes.getTypes().contains(7)); + Assert.assertEquals(4, geospatialTypes.getTypes().size()); + } + + @Test + public void testUpdateWithZCoordinates() { + GeometryFactory gf = new GeometryFactory(); + GeospatialTypes geospatialTypes = new GeospatialTypes(); + + // Create a 3D point (XYZ) - should be type code 1001 + Point pointXYZ = gf.createPoint(new Coordinate(1, 1, 1)); + geospatialTypes.update(pointXYZ); + Assert.assertTrue(geospatialTypes.getTypes().contains(1001)); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + } + + @Test + public void testUpdateWithMCoordinates() { + GeometryFactory gf = new GeometryFactory(); + GeospatialTypes geospatialTypes = new GeospatialTypes(); + + // Create a point with measure (XYM) - should be type code 2001 + CoordinateXYZM coord = new CoordinateXYZM(1, 1, Double.NaN, 10); + Point pointXYM = gf.createPoint(coord); + geospatialTypes.update(pointXYM); + Assert.assertTrue(geospatialTypes.getTypes().contains(2001)); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + } + + @Test + public void testUpdateWithZMCoordinates() { + GeometryFactory gf = new GeometryFactory(); + GeospatialTypes geospatialTypes = new GeospatialTypes(); + + // Create a 4D point (XYZM) - should be type code 3001 + CoordinateXYZM coord = new CoordinateXYZM(1, 1, 1, 10); + Point pointXYZM = gf.createPoint(coord); + geospatialTypes.update(pointXYZM); + Assert.assertTrue(geospatialTypes.getTypes().contains(3001)); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + } + + @Test + public void testMergeGeospatialTypes() { + GeometryFactory gf = new GeometryFactory(); + + // Create first set of types + GeospatialTypes types1 = new 
GeospatialTypes(); + types1.update(gf.createPoint(new Coordinate(1, 1))); // Point (1) + + // Create second set of types + GeospatialTypes types2 = new GeospatialTypes(); + Coordinate[] lineCoords = new Coordinate[] {new Coordinate(1, 1), new Coordinate(2, 2)}; + types2.update(gf.createLineString(lineCoords)); // LineString (2) + + // Merge types2 into types1 + types1.merge(types2); + + // Check merged result + Assert.assertTrue(types1.getTypes().contains(1)); // Point + Assert.assertTrue(types1.getTypes().contains(2)); // LineString + Assert.assertEquals(2, types1.getTypes().size()); + + // Create third set of types with Z dimension + GeospatialTypes types3 = new GeospatialTypes(); + types3.update(gf.createPoint(new Coordinate(1, 1, 1))); // Point XYZ (1001) + + // Merge types3 into types1 + types1.merge(types3); + + // Check merged result + Assert.assertTrue(types1.getTypes().contains(1)); // Point XY + Assert.assertTrue(types1.getTypes().contains(2)); // LineString XY + Assert.assertTrue(types1.getTypes().contains(1001)); // Point XYZ + Assert.assertEquals(3, types1.getTypes().size()); + } + + @Test + public void testMergeWithEmptyGeospatialTypes() { + GeometryFactory gf = new GeometryFactory(); + + // Create set with types + GeospatialTypes types1 = new GeospatialTypes(); + types1.update(gf.createPoint(new Coordinate(1, 1))); // Type 1 + Assert.assertEquals(1, types1.getTypes().size()); + + // Create empty set + GeospatialTypes emptyTypes = new GeospatialTypes(); + Assert.assertEquals(0, emptyTypes.getTypes().size()); + + // Merge empty into non-empty + types1.merge(emptyTypes); + Assert.assertEquals(1, types1.getTypes().size()); + Assert.assertTrue(types1.getTypes().contains(1)); + + // Merge non-empty into empty + emptyTypes.merge(types1); + Assert.assertEquals(1, emptyTypes.getTypes().size()); + Assert.assertTrue(emptyTypes.getTypes().contains(1)); + } + + @Test + public void testUpdateWithNullOrEmptyGeometry() { + GeospatialTypes geospatialTypes = new 
GeospatialTypes(); + + // Update with null geometry + geospatialTypes.update(null); + Assert.assertEquals(0, geospatialTypes.getTypes().size()); + + // Update with empty point + GeometryFactory gf = new GeometryFactory(); + Point emptyPoint = gf.createPoint((Coordinate) null); + geospatialTypes.update(emptyPoint); + Assert.assertEquals(0, geospatialTypes.getTypes().size()); + + // Update with empty linestring + LineString emptyLine = gf.createLineString((Coordinate[]) null); + geospatialTypes.update(emptyLine); + Assert.assertEquals(0, geospatialTypes.getTypes().size()); + } + + @Test + public void testReset() { + GeometryFactory gf = new GeometryFactory(); + GeospatialTypes geospatialTypes = new GeospatialTypes(); + + // Add some types + geospatialTypes.update(gf.createPoint(new Coordinate(1, 1))); + geospatialTypes.update(gf.createLineString(new Coordinate[] {new Coordinate(1, 1), new Coordinate(2, 2)})); + Assert.assertEquals(2, geospatialTypes.getTypes().size()); + + // Reset the types + geospatialTypes.reset(); + Assert.assertEquals(0, geospatialTypes.getTypes().size()); + + // Add new types after reset + geospatialTypes.update(gf.createPoint(new Coordinate(3, 3, 3))); // XYZ point (1001) + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + Assert.assertTrue(geospatialTypes.getTypes().contains(1001)); + } + + @Test + public void testAbort() { + GeometryFactory gf = new GeometryFactory(); + GeospatialTypes geospatialTypes = new GeospatialTypes(); + + // Add some types + geospatialTypes.update(gf.createPoint(new Coordinate(1, 1))); + Assert.assertTrue(geospatialTypes.isValid()); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + + // Abort the set + geospatialTypes.abort(); + Assert.assertFalse(geospatialTypes.isValid()); + Assert.assertEquals(0, geospatialTypes.getTypes().size()); + + // Update after abort shouldn't add anything + geospatialTypes.update(gf.createPoint(new Coordinate(2, 2))); + Assert.assertEquals(0, 
geospatialTypes.getTypes().size()); + Assert.assertFalse(geospatialTypes.isValid()); + } + + @Test + public void testCopy() { + GeometryFactory gf = new GeometryFactory(); + GeospatialTypes original = new GeospatialTypes(); + + // Add some types + original.update(gf.createPoint(new Coordinate(1, 1))); + original.update(gf.createLineString(new Coordinate[] {new Coordinate(1, 1), new Coordinate(2, 2)})); + + // Create a copy + GeospatialTypes copy = original.copy(); + + // Verify the copy has the same types + Assert.assertEquals(original.getTypes().size(), copy.getTypes().size()); + for (Integer typeId : original.getTypes()) { + Assert.assertTrue(copy.getTypes().contains(typeId)); + } + + // Modify copy and verify it doesn't affect the original + copy.update(gf.createPoint(new Coordinate(3, 3, 3))); // Add XYZ point (1001) + Assert.assertEquals(2, original.getTypes().size()); + Assert.assertEquals(3, copy.getTypes().size()); + Assert.assertTrue(copy.getTypes().contains(1001)); + Assert.assertFalse(original.getTypes().contains(1001)); + } + + @Test + public void testMergeWithNullGeospatialTypes() { + GeometryFactory gf = new GeometryFactory(); + GeospatialTypes types = new GeospatialTypes(); + + // Add a type + types.update(gf.createPoint(new Coordinate(1, 1))); + Assert.assertEquals(1, types.getTypes().size()); + Assert.assertTrue(types.isValid()); + + // Merge with null + types.merge(null); + Assert.assertEquals(0, types.getTypes().size()); + Assert.assertFalse(types.isValid()); + } + + @Test + public void testMergeWithInvalidGeospatialTypes() { + GeometryFactory gf = new GeometryFactory(); + + // Create valid types + GeospatialTypes validTypes = new GeospatialTypes(); + validTypes.update(gf.createPoint(new Coordinate(1, 1))); + Assert.assertTrue(validTypes.isValid()); + Assert.assertEquals(1, validTypes.getTypes().size()); + + // Create invalid types + GeospatialTypes invalidTypes = new GeospatialTypes(); + invalidTypes.abort(); // Mark as invalid + 
Assert.assertFalse(invalidTypes.isValid()); + + // Merge invalid into valid + validTypes.merge(invalidTypes); + Assert.assertFalse(validTypes.isValid()); + Assert.assertEquals(0, validTypes.getTypes().size()); + + // Create new valid types + GeospatialTypes newValidTypes = new GeospatialTypes(); + newValidTypes.update(gf.createPoint(new Coordinate(2, 2))); + + // Merge valid into invalid + invalidTypes.merge(newValidTypes); + Assert.assertFalse(invalidTypes.isValid()); + Assert.assertEquals(0, invalidTypes.getTypes().size()); + } + + @Test + public void testConstructorWithTypes() { + // Create a set of geometry type codes (typed, so no raw-type/unchecked warnings) + Set<Integer> typeSet = new HashSet<>(); + typeSet.add(1); // Point XY + typeSet.add(1001); // Point XYZ + typeSet.add(2); // LineString XY + + // Create GeospatialTypes with the set + GeospatialTypes types = new GeospatialTypes(typeSet); + + // Verify types were properly set + Assert.assertEquals(3, types.getTypes().size()); + Assert.assertTrue(types.getTypes().contains(1)); + Assert.assertTrue(types.getTypes().contains(1001)); + Assert.assertTrue(types.getTypes().contains(2)); + Assert.assertTrue(types.isValid()); + } + + @Test + public void testUpdateWithMixedDimensionGeometries() { + GeometryFactory gf = new GeometryFactory(); + GeospatialTypes types = new GeospatialTypes(); + + // Add Point XY + types.update(gf.createPoint(new Coordinate(1, 1))); + Assert.assertTrue(types.getTypes().contains(1)); + + // Add Point XYZ + types.update(gf.createPoint(new Coordinate(2, 2, 2))); + Assert.assertTrue(types.getTypes().contains(1)); + Assert.assertTrue(types.getTypes().contains(1001)); + Assert.assertEquals(2, types.getTypes().size()); + + // Add Point XYM + CoordinateXYZM coordXYM = new CoordinateXYZM(3, 3, Double.NaN, 10); + types.update(gf.createPoint(coordXYM)); + Assert.assertTrue(types.getTypes().contains(1)); + Assert.assertTrue(types.getTypes().contains(1001)); + Assert.assertTrue(types.getTypes().contains(2001)); + Assert.assertEquals(3, types.getTypes().size()); +
+ // Add Point XYZM + CoordinateXYZM coordXYZM = new CoordinateXYZM(4, 4, 4, 10); + types.update(gf.createPoint(coordXYZM)); + Assert.assertTrue(types.getTypes().contains(1)); + Assert.assertTrue(types.getTypes().contains(1001)); + Assert.assertTrue(types.getTypes().contains(2001)); + Assert.assertTrue(types.getTypes().contains(3001)); + Assert.assertEquals(4, types.getTypes().size()); + } + + @Test + public void testRowGroupTypeMerging() { + GeometryFactory gf = new GeometryFactory(); + + // File level geospatial types (to be computed by merging row groups) + GeospatialTypes fileTypes = new GeospatialTypes(); + + // Row Group 1: Points XY and XYZ + GeospatialTypes rowGroup1 = new GeospatialTypes(); + rowGroup1.update(gf.createPoint(new Coordinate(1, 1))); // Point XY (1) + rowGroup1.update(gf.createPoint(new Coordinate(2, 2, 2))); // Point XYZ (1001) + Assert.assertEquals(2, rowGroup1.getTypes().size()); + Assert.assertTrue(rowGroup1.getTypes().contains(1)); + Assert.assertTrue(rowGroup1.getTypes().contains(1001)); + + // Row Group 2: LineStrings XY + GeospatialTypes rowGroup2 = new GeospatialTypes(); + LineString lineXY = gf.createLineString(new Coordinate[] {new Coordinate(1, 1), new Coordinate(2, 2)}); + rowGroup2.update(lineXY); // LineString XY (2) + Assert.assertEquals(1, rowGroup2.getTypes().size()); + Assert.assertTrue(rowGroup2.getTypes().contains(2)); + + // Row Group 3: Invalid types (aborted) + GeospatialTypes rowGroup3 = new GeospatialTypes(); + rowGroup3.abort(); + Assert.assertFalse(rowGroup3.isValid()); + + // Merge row groups into file-level types + fileTypes.merge(rowGroup1); + fileTypes.merge(rowGroup2); + fileTypes.merge(rowGroup3); // This should invalidate fileTypes + + // Verify file level types after merge + Assert.assertFalse(fileTypes.isValid()); + Assert.assertEquals(0, fileTypes.getTypes().size()); + + // Test with different merge order - aborted row group merged before a valid one + fileTypes = new GeospatialTypes(); + fileTypes.merge(rowGroup1); +
fileTypes.merge(rowGroup3); // This should invalidate fileTypes immediately + fileTypes.merge(rowGroup2); // This shouldn't change anything since fileTypes is already invalid + + // Verify file level types after second merge sequence + Assert.assertFalse(fileTypes.isValid()); + Assert.assertEquals(0, fileTypes.getTypes().size()); + + // Test without the invalid row group + fileTypes = new GeospatialTypes(); + fileTypes.merge(rowGroup1); + fileTypes.merge(rowGroup2); + + // Verify file level types - should have 3 types: Point XY, Point XYZ, LineString XY + Assert.assertTrue(fileTypes.isValid()); + Assert.assertEquals(3, fileTypes.getTypes().size()); + Assert.assertTrue(fileTypes.getTypes().contains(1)); + Assert.assertTrue(fileTypes.getTypes().contains(1001)); + Assert.assertTrue(fileTypes.getTypes().contains(2)); + } + + @Test + public void testGeometryTypeCodeAssignment() { + GeometryFactory gf = new GeometryFactory(); + GeospatialTypes geospatialTypes = new GeospatialTypes(); + + // Test Point (type code 1) + Point point = gf.createPoint(new Coordinate(1, 1)); + geospatialTypes.update(point); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + Assert.assertTrue(geospatialTypes.getTypes().contains(1)); + + geospatialTypes.reset(); + + // Test LineString (type code 2) + LineString line = gf.createLineString(new Coordinate[] {new Coordinate(1, 1), new Coordinate(2, 2)}); + geospatialTypes.update(line); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + Assert.assertTrue(geospatialTypes.getTypes().contains(2)); + + geospatialTypes.reset(); + + // Test Polygon (type code 3) + LinearRing shell = gf.createLinearRing(new Coordinate[] { + new Coordinate(0, 0), new Coordinate(1, 0), + new Coordinate(1, 1), new Coordinate(0, 1), + new Coordinate(0, 0) + }); + Polygon polygon = gf.createPolygon(shell); + geospatialTypes.update(polygon); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + 
Assert.assertTrue(geospatialTypes.getTypes().contains(3)); + + geospatialTypes.reset(); + + // Test MultiPoint (type code 4) + MultiPoint multiPoint = gf.createMultiPoint( + new Point[] {gf.createPoint(new Coordinate(1, 1)), gf.createPoint(new Coordinate(2, 2))}); + geospatialTypes.update(multiPoint); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + Assert.assertTrue(geospatialTypes.getTypes().contains(4)); + + geospatialTypes.reset(); + + // Test MultiLineString (type code 5) + MultiLineString multiLine = gf.createMultiLineString(new LineString[] { + gf.createLineString(new Coordinate[] {new Coordinate(1, 1), new Coordinate(2, 2)}), + gf.createLineString(new Coordinate[] {new Coordinate(3, 3), new Coordinate(4, 4)}) + }); + geospatialTypes.update(multiLine); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + Assert.assertTrue(geospatialTypes.getTypes().contains(5)); + + geospatialTypes.reset(); + + // Test MultiPolygon (type code 6) + MultiPolygon multiPolygon = gf.createMultiPolygon(new Polygon[] {gf.createPolygon(shell)}); + geospatialTypes.update(multiPolygon); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + Assert.assertTrue(geospatialTypes.getTypes().contains(6)); + + geospatialTypes.reset(); + + // Test GeometryCollection (type code 7) + GeometryCollection collection = + gf.createGeometryCollection(new org.locationtech.jts.geom.Geometry[] {point, line}); + geospatialTypes.update(collection); + Assert.assertEquals(1, geospatialTypes.getTypes().size()); + Assert.assertTrue(geospatialTypes.getTypes().contains(7)); + } + + @Test + public void testGeometryTypeDimensionCodes() { + GeometryFactory gf = new GeometryFactory(); + + // Test XY (standard 2D, no prefix = 0) + GeospatialTypes types2D = new GeospatialTypes(); + types2D.update(gf.createPoint(new Coordinate(1, 1))); + Assert.assertTrue(types2D.getTypes().contains(1)); // Point XY + + // Test XYZ (Z dimension, prefix = 1000) + GeospatialTypes types3D = new 
GeospatialTypes(); + types3D.update(gf.createPoint(new Coordinate(1, 1, 1))); + Assert.assertTrue(types3D.getTypes().contains(1001)); // Point XYZ + + // Test XYM (M dimension, prefix = 2000) + GeospatialTypes typesXYM = new GeospatialTypes(); + CoordinateXYZM coordXYM = new CoordinateXYZM(1, 1, Double.NaN, 10); + typesXYM.update(gf.createPoint(coordXYM)); + Assert.assertTrue(typesXYM.getTypes().contains(2001)); // Point XYM + + // Test XYZM (Z and M dimensions, prefix = 3000) + GeospatialTypes typesXYZM = new GeospatialTypes(); + CoordinateXYZM coordXYZM = new CoordinateXYZM(1, 1, 1, 10); + typesXYZM.update(gf.createPoint(coordXYZM)); + Assert.assertTrue(typesXYZM.getTypes().contains(3001)); // Point XYZM + } +} diff --git a/parquet-hadoop/pom.xml b/parquet-hadoop/pom.xml index adfebfbd05..687310d9e2 100644 --- a/parquet-hadoop/pom.xml +++ b/parquet-hadoop/pom.xml @@ -135,6 +135,12 @@ jar compile + + org.locationtech.jts + jts-core + ${jts.version} + test + io.airlift aircompressor diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 15fcd14a73..d20ac7faeb 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -50,6 +50,7 @@ import org.apache.parquet.column.EncodingStats; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.column.statistics.BinaryStatistics; +import org.apache.parquet.column.statistics.geospatial.GeospatialTypes; import org.apache.parquet.column.values.bloomfilter.BloomFilter; import org.apache.parquet.crypto.AesCipher; import org.apache.parquet.crypto.AesGcmEncryptor; @@ -65,6 +66,7 @@ import org.apache.parquet.format.BloomFilterHash; import org.apache.parquet.format.BloomFilterHeader; import 
org.apache.parquet.format.BoundaryOrder; +import org.apache.parquet.format.BoundingBox; import org.apache.parquet.format.ColumnChunk; import org.apache.parquet.format.ColumnCryptoMetaData; import org.apache.parquet.format.ColumnIndex; @@ -83,6 +85,7 @@ import org.apache.parquet.format.FileMetaData; import org.apache.parquet.format.GeographyType; import org.apache.parquet.format.GeometryType; +import org.apache.parquet.format.GeospatialStatistics; import org.apache.parquet.format.IntType; import org.apache.parquet.format.KeyValue; import org.apache.parquet.format.LogicalType; @@ -608,6 +611,12 @@ private void addRowGroup( metaData.setSize_statistics(toParquetSizeStatistics(columnMetaData.getSizeStatistics())); } + if (columnMetaData.getGeospatialStatistics() != null + && columnMetaData.getGeospatialStatistics().isValid()) { + metaData.setGeospatial_statistics( + toParquetGeospatialStatistics(columnMetaData.getGeospatialStatistics())); + } + if (!encryptMetaData) { columnChunk.setMeta_data(metaData); } else { @@ -795,6 +804,36 @@ public static Statistics toParquetStatistics( return formatStats; } + private static BoundingBox toParquetBoundingBox(org.apache.parquet.column.statistics.geospatial.BoundingBox bbox) { + // Check if any of the required bounding box is valid. + if (!bbox.isXYValid() || bbox.isXYEmpty()) { + // According to the Thrift-generated class, these fields are marked as required and must be set explicitly. + // If any of them is NaN, it indicates the bounding box is invalid or uninitialized, + // so we return null to avoid creating a malformed BoundingBox object that would later fail serialization + // or validation. 
+ return null; + } + + // Now we can safely create the BoundingBox object + BoundingBox formatBbox = new BoundingBox(); + formatBbox.setXmin(bbox.getXMin()); + formatBbox.setXmax(bbox.getXMax()); + formatBbox.setYmin(bbox.getYMin()); + formatBbox.setYmax(bbox.getYMax()); + + if (bbox.isZValid() && !bbox.isZEmpty()) { + formatBbox.setZmin(bbox.getZMin()); + formatBbox.setZmax(bbox.getZMax()); + } + + if (bbox.isMValid() && !bbox.isMEmpty()) { + formatBbox.setMmin(bbox.getMMin()); + formatBbox.setMmax(bbox.getMMax()); + } + + return formatBbox; + } + private static boolean withinLimit(org.apache.parquet.column.statistics.Statistics stats, int truncateLength) { if (stats.isSmallerThan(MAX_STATS_SIZE)) { return true; @@ -900,6 +939,86 @@ public org.apache.parquet.column.statistics.Statistics fromParquetStatistics( return fromParquetStatisticsInternal(createdBy, statistics, type, expectedOrder); } + /** + * Converts in-memory geospatial statistics to their Thrift representation. + * + * @param geospatialStatistics the in-memory statistics; may be null + * @return the Thrift statistics, or null when there is nothing to write + */ + GeospatialStatistics toParquetGeospatialStatistics( + org.apache.parquet.column.statistics.geospatial.GeospatialStatistics geospatialStatistics) { + if (geospatialStatistics == null) { + return null; + } + + GeospatialStatistics formatStats = new GeospatialStatistics(); + boolean hasStats = false; + + if (geospatialStatistics.getBoundingBox() != null + && geospatialStatistics.getBoundingBox().isValid() + && !geospatialStatistics.getBoundingBox().isXYEmpty()) { + // toParquetBoundingBox returns null when it rejects the X/Y range; never store a null + // bbox or count it as a written statistic. + BoundingBox formatBbox = toParquetBoundingBox(geospatialStatistics.getBoundingBox()); + if (formatBbox != null) { + formatStats.setBbox(formatBbox); + hasStats = true; + } + } + + if (geospatialStatistics.getGeospatialTypes() != null + && geospatialStatistics.getGeospatialTypes().isValid()) { + List<Integer> geometryTypes = + new ArrayList<>(geospatialStatistics.getGeospatialTypes().getTypes()); + if (!geometryTypes.isEmpty()) { + Collections.sort(geometryTypes); + formatStats.setGeospatial_types(geometryTypes); + hasStats = true; + } + } + + if (!hasStats) { + return null; + } + + return formatStats; + } + + /** + * Rebuilds in-memory geospatial statistics from Thrift; unset bounding-box coordinates become NaN. + * + * @param formatGeomStats the Thrift statistics; may be null + * @param type the column type; not consulted by the conversion + * @return the converted statistics, or null when {@code formatGeomStats} is null + */ + static org.apache.parquet.column.statistics.geospatial.GeospatialStatistics
fromParquetStatistics( + GeospatialStatistics formatGeomStats, PrimitiveType type) { + if (formatGeomStats == null) { + return null; + } + org.apache.parquet.column.statistics.geospatial.BoundingBox bbox = null; + if (formatGeomStats.isSetBbox()) { + BoundingBox formatBbox = formatGeomStats.getBbox(); + bbox = new org.apache.parquet.column.statistics.geospatial.BoundingBox( + formatBbox.isSetXmin() ? formatBbox.getXmin() : Double.NaN, + formatBbox.isSetXmax() ? formatBbox.getXmax() : Double.NaN, + formatBbox.isSetYmin() ? formatBbox.getYmin() : Double.NaN, + formatBbox.isSetYmax() ? formatBbox.getYmax() : Double.NaN, + formatBbox.isSetZmin() ? formatBbox.getZmin() : Double.NaN, + formatBbox.isSetZmax() ? formatBbox.getZmax() : Double.NaN, + formatBbox.isSetMmin() ? formatBbox.getMmin() : Double.NaN, + formatBbox.isSetMmax() ? formatBbox.getMmax() : Double.NaN); + } + GeospatialTypes geospatialTypes = null; + if (formatGeomStats.isSetGeospatial_types()) { + geospatialTypes = new GeospatialTypes(new HashSet<>(formatGeomStats.getGeospatial_types())); + } + + // The statistics are rebuilt the same way for every logical type, so `type` is not + // inspected here. + return new org.apache.parquet.column.statistics.geospatial.GeospatialStatistics(bbox, geospatialTypes); + } + /** * Sort order for page and column statistics.
Types are associated with sort * orders (e.g., UTF8 columns should use UNSIGNED) and column stats are @@ -1064,6 +1172,12 @@ public Optional visit( LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { return of(SortOrder.SIGNED); } + + @Override + public Optional visit( + LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) { + return of(SortOrder.UNKNOWN); + } }) .orElse(defaultSortOrder(primitive.getPrimitiveTypeName())); } @@ -1653,7 +1767,8 @@ public ColumnChunkMetaData buildColumnChunkMetaData( metaData.num_values, metaData.total_compressed_size, metaData.total_uncompressed_size, - fromParquetSizeStatistics(metaData.size_statistics, type)); + fromParquetSizeStatistics(metaData.size_statistics, type), + fromParquetStatistics(metaData.geospatial_statistics, type)); } public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException { diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java index 566ab76cc5..d9e6ea0990 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java @@ -40,6 +40,7 @@ import org.apache.parquet.column.page.PageWriter; import org.apache.parquet.column.statistics.SizeStatistics; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; import org.apache.parquet.column.values.bloomfilter.BloomFilter; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore; import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter; @@ -92,6 +93,7 @@ private static final class ColumnChunkPageWriter implements PageWriter, BloomFil private OffsetIndexBuilder offsetIndexBuilder; private Statistics totalStatistics; private final 
SizeStatistics totalSizeStatistics; + private final GeospatialStatistics totalGeospatialStatistics; private final ByteBufferReleaser releaser; private final CRC32 crc; @@ -126,6 +128,8 @@ private ColumnChunkPageWriter( this.totalSizeStatistics = SizeStatistics.newBuilder( path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel()) .build(); + this.totalGeospatialStatistics = + GeospatialStatistics.newBuilder(path.getPrimitiveType()).build(); this.pageWriteChecksumEnabled = pageWriteChecksumEnabled; this.crc = pageWriteChecksumEnabled ? new CRC32() : null; this.headerBlockEncryptor = headerBlockEncryptor; @@ -175,7 +179,7 @@ public void writePage( Encoding dlEncoding, Encoding valuesEncoding) throws IOException { - writePage(bytes, valueCount, rowCount, statistics, null, rlEncoding, dlEncoding, valuesEncoding); + writePage(bytes, valueCount, rowCount, statistics, null, null, rlEncoding, dlEncoding, valuesEncoding); } @Override @@ -185,6 +189,7 @@ public void writePage( int rowCount, Statistics statistics, SizeStatistics sizeStatistics, + GeospatialStatistics geospatialStatistics, Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) @@ -241,7 +246,7 @@ public void writePage( this.totalValueCount += valueCount; this.pageCount += 1; - mergeColumnStatistics(statistics, sizeStatistics); + mergeColumnStatistics(statistics, sizeStatistics, geospatialStatistics); offsetIndexBuilder.add( toIntWithCheck(tempOutputStream.size() + compressedSize), rowCount, @@ -275,7 +280,8 @@ public void writePageV2( dataEncoding, data, statistics, - /*size_statistics=*/ null); + /*size_statistics=*/ null, + /*geospatial_statistics=*/ null); } @Override @@ -288,7 +294,8 @@ public void writePageV2( Encoding dataEncoding, BytesInput data, Statistics statistics, - SizeStatistics sizeStatistics) + SizeStatistics sizeStatistics, + GeospatialStatistics geospatialStatistics) throws IOException { pageOrdinal++; @@ -357,7 +364,7 @@ public void writePageV2( 
this.totalValueCount += valueCount; this.pageCount += 1; - mergeColumnStatistics(statistics, sizeStatistics); + mergeColumnStatistics(statistics, sizeStatistics, geospatialStatistics); offsetIndexBuilder.add( toIntWithCheck((long) tempOutputStream.size() + compressedSize), rowCount, @@ -378,13 +385,16 @@ private int toIntWithCheck(long size) { return (int) size; } - private void mergeColumnStatistics(Statistics statistics, SizeStatistics sizeStatistics) { + private void mergeColumnStatistics( + Statistics statistics, SizeStatistics sizeStatistics, GeospatialStatistics geospatialStatistics) { totalSizeStatistics.mergeStatistics(sizeStatistics); if (!totalSizeStatistics.isValid()) { // Set page size statistics to null to clear state in the ColumnIndexBuilder. sizeStatistics = null; } + totalGeospatialStatistics.merge(geospatialStatistics); + if (totalStatistics != null && totalStatistics.isEmpty()) { return; } @@ -422,6 +432,7 @@ public void writeToFileWriter(ParquetFileWriter writer) throws IOException { compressedLength, totalStatistics, totalSizeStatistics, + totalGeospatialStatistics, columnIndexBuilder, offsetIndexBuilder, bloomFilter, @@ -439,6 +450,7 @@ public void writeToFileWriter(ParquetFileWriter writer) throws IOException { compressedLength, totalStatistics, totalSizeStatistics, + totalGeospatialStatistics, columnIndexBuilder, offsetIndexBuilder, bloomFilter, diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index 02efcc8b52..4d17a1d6e4 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -57,6 +57,7 @@ import org.apache.parquet.column.page.DictionaryPage; import org.apache.parquet.column.statistics.SizeStatistics; import org.apache.parquet.column.statistics.Statistics; +import 
org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; import org.apache.parquet.column.values.bloomfilter.BloomFilter; import org.apache.parquet.crypto.AesCipher; import org.apache.parquet.crypto.ColumnEncryptionProperties; @@ -158,6 +159,7 @@ public static enum Mode { private long compressedLength; private Statistics currentStatistics; // accumulated in writePage(s) private SizeStatistics currentSizeStatistics; // accumulated in writePage(s) + private GeospatialStatistics currentGeospatialStatistics; // accumulated in writePage(s) private ColumnIndexBuilder columnIndexBuilder; private OffsetIndexBuilder offsetIndexBuilder; @@ -625,6 +627,8 @@ public void startColumn(ColumnDescriptor descriptor, long valueCount, Compressio descriptor.getMaxRepetitionLevel(), descriptor.getMaxDefinitionLevel()) .build(); + currentGeospatialStatistics = + GeospatialStatistics.newBuilder(descriptor.getPrimitiveType()).build(); columnIndexBuilder = ColumnIndexBuilder.getBuilder(currentChunkType, columnIndexTruncateLength); offsetIndexBuilder = OffsetIndexBuilder.getBuilder(); @@ -1400,6 +1404,7 @@ void writeColumnChunk( long compressedTotalPageSize, Statistics totalStats, SizeStatistics totalSizeStats, + GeospatialStatistics totalGeospatialStats, ColumnIndexBuilder columnIndexBuilder, OffsetIndexBuilder offsetIndexBuilder, BloomFilter bloomFilter, @@ -1417,6 +1422,7 @@ void writeColumnChunk( compressedTotalPageSize, totalStats, totalSizeStats, + totalGeospatialStats, columnIndexBuilder, offsetIndexBuilder, bloomFilter, @@ -1439,6 +1445,7 @@ void writeColumnChunk( long compressedTotalPageSize, Statistics totalStats, SizeStatistics totalSizeStats, + GeospatialStatistics totalGeospatialStats, ColumnIndexBuilder columnIndexBuilder, OffsetIndexBuilder offsetIndexBuilder, BloomFilter bloomFilter, @@ -1496,6 +1503,7 @@ void writeColumnChunk( currentEncodings.addAll(dataEncodings); currentStatistics = totalStats; currentSizeStatistics = totalSizeStats; + 
currentGeospatialStatistics = totalGeospatialStats; this.columnIndexBuilder = columnIndexBuilder; this.offsetIndexBuilder = offsetIndexBuilder; @@ -1542,7 +1550,8 @@ public void endColumn() throws IOException { currentChunkValueCount, compressedLength, uncompressedLength, - currentSizeStatistics)); + currentSizeStatistics, + currentGeospatialStatistics)); this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength); this.uncompressedLength = 0; this.compressedLength = 0; diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java index 14a949b0e0..4ba52dec2c 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java @@ -31,6 +31,7 @@ import org.apache.parquet.column.statistics.BooleanStatistics; import org.apache.parquet.column.statistics.SizeStatistics; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; import org.apache.parquet.crypto.AesCipher; import org.apache.parquet.crypto.InternalColumnDecryptionSetup; import org.apache.parquet.crypto.InternalFileDecryptor; @@ -145,6 +146,35 @@ public static ColumnChunkMetaData get( totalUncompressedSize); } + public static ColumnChunkMetaData get( + ColumnPath path, + PrimitiveType type, + CompressionCodecName codec, + EncodingStats encodingStats, + Set encodings, + Statistics statistics, + long firstDataPage, + long dictionaryPageOffset, + long valueCount, + long totalSize, + long totalUncompressedSize, + SizeStatistics sizeStatistics) { + return get( + path, + type, + codec, + encodingStats, + encodings, + statistics, + firstDataPage, + dictionaryPageOffset, + valueCount, + totalSize, + totalUncompressedSize, + sizeStatistics, + null); + } + 
public static ColumnChunkMetaData get( ColumnPath path, PrimitiveType type, @@ -169,6 +199,7 @@ public static ColumnChunkMetaData get( valueCount, totalSize, totalUncompressedSize, + null, null); } @@ -199,7 +230,8 @@ public static ColumnChunkMetaData get( long valueCount, long totalSize, long totalUncompressedSize, - SizeStatistics sizeStatistics) { + SizeStatistics sizeStatistics, + GeospatialStatistics geospatialStats) { // to save space we store those always positive longs in ints when they fit. if (positiveLongFitsInAnInt(firstDataPage) @@ -219,7 +251,8 @@ && positiveLongFitsInAnInt(totalUncompressedSize)) { valueCount, totalSize, totalUncompressedSize, - sizeStatistics); + sizeStatistics, + geospatialStats); } else { return new LongColumnChunkMetaData( path, @@ -233,7 +266,8 @@ && positiveLongFitsInAnInt(totalUncompressedSize)) { valueCount, totalSize, totalUncompressedSize, - sizeStatistics); + sizeStatistics, + geospatialStats); } } @@ -395,6 +429,12 @@ public SizeStatistics getSizeStatistics() { throw new UnsupportedOperationException("SizeStatistics is not implemented"); } + /** @return the geospatial stats for this column */ + @JsonIgnore + public GeospatialStatistics getGeospatialStatistics() { + throw new UnsupportedOperationException("GeospatialStatistics is not implemented"); + } + /** * Method should be considered private * @@ -515,6 +555,7 @@ class IntColumnChunkMetaData extends ColumnChunkMetaData { private final int totalUncompressedSize; private final Statistics statistics; private final SizeStatistics sizeStatistics; + private final GeospatialStatistics geospatialStatistics; /** * @param path column identifier @@ -528,6 +569,7 @@ class IntColumnChunkMetaData extends ColumnChunkMetaData { * @param totalSize * @param totalUncompressedSize * @param sizeStatistics + * @param geospatialStatistics */ IntColumnChunkMetaData( ColumnPath path, @@ -541,7 +583,8 @@ class IntColumnChunkMetaData extends ColumnChunkMetaData { long valueCount, long totalSize, 
long totalUncompressedSize, - SizeStatistics sizeStatistics) { + SizeStatistics sizeStatistics, + GeospatialStatistics geospatialStatistics) { super(encodingStats, ColumnChunkProperties.get(path, type, codec, encodings)); this.firstDataPage = positiveLongToInt(firstDataPage); this.dictionaryPageOffset = positiveLongToInt(dictionaryPageOffset); @@ -550,6 +593,7 @@ class IntColumnChunkMetaData extends ColumnChunkMetaData { this.totalUncompressedSize = positiveLongToInt(totalUncompressedSize); this.statistics = statistics; this.sizeStatistics = sizeStatistics; + this.geospatialStatistics = geospatialStatistics; } /** @@ -624,6 +668,11 @@ public Statistics getStatistics() { public SizeStatistics getSizeStatistics() { return sizeStatistics; } + + @Override + public GeospatialStatistics getGeospatialStatistics() { + return geospatialStatistics; + } } class LongColumnChunkMetaData extends ColumnChunkMetaData { @@ -635,6 +684,7 @@ class LongColumnChunkMetaData extends ColumnChunkMetaData { private final long totalUncompressedSize; private final Statistics statistics; private final SizeStatistics sizeStatistics; + private final GeospatialStatistics geospatialStatistics; /** * @param path column identifier @@ -648,6 +698,7 @@ class LongColumnChunkMetaData extends ColumnChunkMetaData { * @param totalSize * @param totalUncompressedSize * @param sizeStatistics + * @param geospatialStatistics */ LongColumnChunkMetaData( ColumnPath path, @@ -661,7 +712,8 @@ class LongColumnChunkMetaData extends ColumnChunkMetaData { long valueCount, long totalSize, long totalUncompressedSize, - SizeStatistics sizeStatistics) { + SizeStatistics sizeStatistics, + GeospatialStatistics geospatialStatistics) { super(encodingStats, ColumnChunkProperties.get(path, type, codec, encodings)); this.firstDataPageOffset = firstDataPageOffset; this.dictionaryPageOffset = dictionaryPageOffset; @@ -670,6 +722,7 @@ class LongColumnChunkMetaData extends ColumnChunkMetaData { this.totalUncompressedSize = 
totalUncompressedSize; this.statistics = statistics; this.sizeStatistics = sizeStatistics; + this.geospatialStatistics = geospatialStatistics; } /** @@ -721,6 +774,11 @@ public Statistics getStatistics() { public SizeStatistics getSizeStatistics() { return sizeStatistics; } + + @Override + public GeospatialStatistics getGeospatialStatistics() { + return geospatialStatistics; + } } class EncryptedColumnChunkMetaData extends ColumnChunkMetaData { @@ -850,4 +908,11 @@ public SizeStatistics getSizeStatistics() { public boolean isEncrypted() { return true; } + + // NOTE(review): this dereferences shadowColumnChunkMetaData directly - confirm it is guaranteed to be + // populated before this accessor can be called (e.g. if sibling accessors decrypt lazily, do the same here). + @Override + public GeospatialStatistics getGeospatialStatistics() { + return shadowColumnChunkMetaData.getGeospatialStatistics(); + } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/rewrite/ParquetRewriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/rewrite/ParquetRewriter.java index 10c84731f0..cd82cf4a8b 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/rewrite/ParquetRewriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/rewrite/ParquetRewriter.java @@ -508,7 +508,8 @@ private void processBlock( chunk.getValueCount(), chunk.getTotalSize(), chunk.getTotalUncompressedSize(), - chunk.getSizeStatistics()); + chunk.getSizeStatistics(), + chunk.getGeospatialStatistics()); } ColumnDescriptor descriptorOriginal = outSchema.getColumns().get(outColumnIdx); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 82c70bed95..2529f06ada 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -89,6 +89,7 @@ import org.apache.parquet.column.statistics.LongStatistics; import org.apache.parquet.column.statistics.SizeStatistics; import
org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.statistics.geospatial.GeospatialTypes; import org.apache.parquet.crypto.DecryptionPropertiesFactory; import org.apache.parquet.crypto.EncryptionPropertiesFactory; import org.apache.parquet.crypto.FileDecryptionProperties; @@ -96,6 +97,7 @@ import org.apache.parquet.example.Paper; import org.apache.parquet.example.data.Group; import org.apache.parquet.example.data.simple.SimpleGroup; +import org.apache.parquet.format.BoundingBox; import org.apache.parquet.format.ColumnChunk; import org.apache.parquet.format.ColumnMetaData; import org.apache.parquet.format.ConvertedType; @@ -104,6 +106,7 @@ import org.apache.parquet.format.FileMetaData; import org.apache.parquet.format.GeographyType; import org.apache.parquet.format.GeometryType; +import org.apache.parquet.format.GeospatialStatistics; import org.apache.parquet.format.LogicalType; import org.apache.parquet.format.MapType; import org.apache.parquet.format.PageHeader; @@ -1801,4 +1804,161 @@ public void testGeographyLogicalTypeWithAlgorithmButNoCrs() { EdgeInterpolationAlgorithm.SPHERICAL, geographyAnnotation.getAlgorithm()); } + + @Test + public void testGeospatialStatisticsConversion() { + // Create a ParquetMetadataConverter + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + + // Create a valid BoundingBox with all fields set + org.apache.parquet.column.statistics.geospatial.BoundingBox bbox = + new org.apache.parquet.column.statistics.geospatial.BoundingBox( + 1.0, 2.0, // xmin, xmax + 3.0, 4.0, // ymin, ymax + 5.0, 6.0, // zmin, zmax + 7.0, 8.0 // mmin, mmax + ); + + // Create GeospatialTypes with some example type values + Set types = new HashSet<>(Arrays.asList(1, 2, 3)); + GeospatialTypes geospatialTypes = new GeospatialTypes(types); + + // Create GeospatialStatistics with the bbox and types + org.apache.parquet.column.statistics.geospatial.GeospatialStatistics origStats = + new 
org.apache.parquet.column.statistics.geospatial.GeospatialStatistics(bbox, geospatialTypes); + + // Convert to Thrift format + GeospatialStatistics thriftStats = converter.toParquetGeospatialStatistics(origStats); + + // Verify conversion to Thrift + assertNotNull("Thrift GeospatialStatistics should not be null", thriftStats); + assertTrue("BoundingBox should be set", thriftStats.isSetBbox()); + assertTrue("Geospatial types should be set", thriftStats.isSetGeospatial_types()); + + // Check BoundingBox values + BoundingBox thriftBbox = thriftStats.getBbox(); + assertEquals(1.0, thriftBbox.getXmin(), 0.0001); + assertEquals(2.0, thriftBbox.getXmax(), 0.0001); + assertEquals(3.0, thriftBbox.getYmin(), 0.0001); + assertEquals(4.0, thriftBbox.getYmax(), 0.0001); + assertEquals(5.0, thriftBbox.getZmin(), 0.0001); + assertEquals(6.0, thriftBbox.getZmax(), 0.0001); + assertEquals(7.0, thriftBbox.getMmin(), 0.0001); + assertEquals(8.0, thriftBbox.getMmax(), 0.0001); + + // Check geospatial types + List thriftTypes = thriftStats.getGeospatial_types(); + assertEquals(3, thriftTypes.size()); + assertTrue(thriftTypes.contains(1)); + assertTrue(thriftTypes.contains(2)); + assertTrue(thriftTypes.contains(3)); + + // Create primitive geometry type for conversion back + LogicalTypeAnnotation geometryAnnotation = LogicalTypeAnnotation.geometryType("EPSG:4326"); + PrimitiveType geometryType = + Types.required(PrimitiveTypeName.BINARY).as(geometryAnnotation).named("geometry"); + + // Convert back from Thrift format + org.apache.parquet.column.statistics.geospatial.GeospatialStatistics convertedStats = + ParquetMetadataConverter.fromParquetStatistics(thriftStats, geometryType); + + // Verify conversion from Thrift + assertNotNull("Converted GeospatialStatistics should not be null", convertedStats); + assertNotNull("BoundingBox should not be null", convertedStats.getBoundingBox()); + assertNotNull("GeospatialTypes should not be null", convertedStats.getGeospatialTypes()); + + // Check 
BoundingBox values + org.apache.parquet.column.statistics.geospatial.BoundingBox convertedBbox = convertedStats.getBoundingBox(); + assertEquals(1.0, convertedBbox.getXMin(), 0.0001); + assertEquals(2.0, convertedBbox.getXMax(), 0.0001); + assertEquals(3.0, convertedBbox.getYMin(), 0.0001); + assertEquals(4.0, convertedBbox.getYMax(), 0.0001); + assertEquals(5.0, convertedBbox.getZMin(), 0.0001); + assertEquals(6.0, convertedBbox.getZMax(), 0.0001); + assertEquals(7.0, convertedBbox.getMMin(), 0.0001); + assertEquals(8.0, convertedBbox.getMMax(), 0.0001); + + // Check geospatial types + Set convertedTypes = convertedStats.getGeospatialTypes().getTypes(); + assertEquals(3, convertedTypes.size()); + assertTrue(convertedTypes.contains(1)); + assertTrue(convertedTypes.contains(2)); + assertTrue(convertedTypes.contains(3)); + } + + @Test + public void testGeospatialStatisticsWithNullBoundingBox() { + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + + // Create GeospatialStatistics with null bbox but valid types + Set types = new HashSet<>(Arrays.asList(1, 2, 3)); + GeospatialTypes geospatialTypes = new GeospatialTypes(types); + org.apache.parquet.column.statistics.geospatial.GeospatialStatistics origStats = + new org.apache.parquet.column.statistics.geospatial.GeospatialStatistics(null, geospatialTypes); + + // Convert to Thrift format + GeospatialStatistics thriftStats = converter.toParquetGeospatialStatistics(origStats); + + // Verify conversion to Thrift + assertNotNull("Thrift GeospatialStatistics should not be null", thriftStats); + assertFalse("BoundingBox should not be set", thriftStats.isSetBbox()); + assertTrue("Geospatial types should be set", thriftStats.isSetGeospatial_types()); + + // Create primitive geometry type for conversion back + LogicalTypeAnnotation geometryAnnotation = LogicalTypeAnnotation.geometryType("EPSG:4326"); + PrimitiveType geometryType = + 
Types.required(PrimitiveTypeName.BINARY).as(geometryAnnotation).named("geometry"); + + // Convert back from Thrift format + org.apache.parquet.column.statistics.geospatial.GeospatialStatistics convertedStats = + ParquetMetadataConverter.fromParquetStatistics(thriftStats, geometryType); + + // Verify conversion from Thrift + assertNotNull("Converted GeospatialStatistics should not be null", convertedStats); + assertNull("BoundingBox should be null", convertedStats.getBoundingBox()); + assertNotNull("GeospatialTypes should not be null", convertedStats.getGeospatialTypes()); + } + + @Test + public void testInvalidBoundingBox() { + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + + // Create an invalid BoundingBox with NaN values + org.apache.parquet.column.statistics.geospatial.BoundingBox invalidBbox = + new org.apache.parquet.column.statistics.geospatial.BoundingBox( + Double.NaN, + 2.0, // xmin is NaN (invalid) + 3.0, + 4.0, + 5.0, + 6.0, + 7.0, + 8.0); + + org.apache.parquet.column.statistics.geospatial.GeospatialStatistics origStats = + new org.apache.parquet.column.statistics.geospatial.GeospatialStatistics(invalidBbox, null); + + // Convert to Thrift format - should return null for invalid bbox + GeospatialStatistics thriftStats = converter.toParquetGeospatialStatistics(origStats); + assertNull("Should return null for invalid BoundingBox", thriftStats); + } + + @Test + public void testEdgeInterpolationAlgorithmConversion() { + // Test conversion from Parquet to Thrift enum + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo = EdgeInterpolationAlgorithm.SPHERICAL; + org.apache.parquet.format.EdgeInterpolationAlgorithm thriftAlgo = + ParquetMetadataConverter.fromParquetEdgeInterpolationAlgorithm(parquetAlgo); + + // convert the Thrift enum to the column schema enum + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm expected = + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.SPHERICAL; + 
org.apache.parquet.column.schema.EdgeInterpolationAlgorithm actual = + ParquetMetadataConverter.toParquetEdgeInterpolationAlgorithm(thriftAlgo); + assertEquals(expected, actual); + + // Test with null + assertNull(ParquetMetadataConverter.fromParquetEdgeInterpolationAlgorithm(null)); + assertNull(ParquetMetadataConverter.toParquetEdgeInterpolationAlgorithm(null)); + } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java index 7079d499c1..2b037b5261 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java @@ -306,6 +306,7 @@ public void testColumnOrderV1() throws IOException { eq(fakeData.size()), eq(fakeStats), any(), + any(), same(ColumnIndexBuilder.getNoOpBuilder()), // Deprecated writePage -> no column index same(OffsetIndexBuilder.getNoOpBuilder()), // Deprecated writePage -> no offset index any(), diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestGeometryTypeRoundTrip.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestGeometryTypeRoundTrip.java new file mode 100644 index 0000000000..5e6e4c3682 --- /dev/null +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestGeometryTypeRoundTrip.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.statistics; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.geographyType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.geometryType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.Preconditions; +import org.apache.parquet.column.statistics.geospatial.GeospatialStatistics; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.GroupFactory; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.io.LocalInputFile; +import org.apache.parquet.io.LocalOutputFile; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.GeometryFactory; +import org.locationtech.jts.io.WKBWriter; + +public class TestGeometryTypeRoundTrip { + + 
/** Per-test scratch directory; JUnit removes it (and everything written into it) after each test. */
@Rule
public TemporaryFolder temp = new TemporaryFolder();

/**
 * Reserves a unique path inside the temporary folder without leaving a file behind.
 *
 * <p>The file is created to obtain a unique name and then deleted immediately, so the returned
 * path does not exist when the writer opens it.
 *
 * @return a path inside {@link #temp} that currently does not exist
 * @throws IOException if the placeholder file cannot be created
 */
private Path newTempPath() throws IOException {
  File file = temp.newFile();
  Preconditions.checkArgument(file.delete(), "Could not remove temp file");
  return file.toPath();
}

/**
 * Encodes a 2D point as Well-Known Binary (WKB) and wraps it as a Parquet {@link Binary}.
 *
 * @param x first coordinate (longitude under OGC:CRS84 axis order)
 * @param y second coordinate (latitude under OGC:CRS84 axis order)
 * @return the WKB bytes of the point
 */
private static Binary wkbPoint(double x, double y) {
  GeometryFactory geomFactory = new GeometryFactory();
  // WKBWriter converts JTS Geometry objects to Well-Known Binary (WKB).
  WKBWriter wkbWriter = new WKBWriter();
  return Binary.fromConstantByteArray(wkbWriter.write(geomFactory.createPoint(new Coordinate(x, y))));
}

/**
 * Writes one record per value into a fresh Parquet file, with dictionary encoding disabled.
 *
 * @param schema message type of the file; must contain {@code columnName} as a binary field
 * @param columnName name of the field each value is appended to
 * @param values raw column values, written in order (one record each)
 * @return path of the file that was written
 * @throws IOException if the file cannot be created or written
 */
private Path writeValues(MessageType schema, String columnName, Binary... values) throws IOException {
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);
  GroupFactory factory = new SimpleGroupFactory(schema);
  Path path = newTempPath();

  // Typed as ParquetWriter<Group> (not the raw type) so write(...) is a checked call.
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(new LocalOutputFile(path))
      .withConf(conf)
      .withDictionaryEncoding(false)
      .build()) {
    for (Binary value : values) {
      writer.write(factory.newGroup().append(columnName, value));
    }
  }
  return path;
}

/**
 * Opens the file, checks the record count, footer and first column chunk, and returns the
 * geospatial statistics recorded for the first column of the first row group.
 *
 * @param path Parquet file to inspect
 * @param expectedRecords number of records the file must contain
 * @return the column chunk's geospatial statistics; may be {@code null} when none were recorded
 * @throws IOException if the file cannot be opened or read
 */
private static GeospatialStatistics readFirstColumnStatistics(Path path, long expectedRecords)
    throws IOException {
  try (ParquetFileReader reader = ParquetFileReader.open(new LocalInputFile(path))) {
    Assert.assertEquals(expectedRecords, reader.getRecordCount());

    ParquetMetadata footer = reader.getFooter();
    Assert.assertNotNull(footer);

    ColumnChunkMetaData columnChunkMetaData =
        reader.getRowGroups().get(0).getColumns().get(0);
    Assert.assertNotNull(columnChunkMetaData);

    return columnChunkMetaData.getGeospatialStatistics();
  }
}

/** Two WKB points in a geometry column must yield a bounding box spanning exactly those points. */
@Test
public void testBasicReadWriteGeometryValue() throws Exception {
  // A message type that represents a message with a geometry column.
  // NOTE(review): the null CRS argument presumably selects the default CRS - confirm.
  MessageType schema = Types.buildMessage()
      .required(BINARY)
      .as(geometryType(null))
      .named("geometry")
      .named("msg");

  // OGC:CRS84 (WGS 84): Uses the order longitude, latitude
  Path path = writeValues(schema, "geometry", wkbPoint(1.0, 1.0), wkbPoint(2.0, 2.0));

  GeospatialStatistics geospatialStatistics = readFirstColumnStatistics(path, 2);
  Assert.assertNotNull(geospatialStatistics);

  Assert.assertEquals(1.0, geospatialStatistics.getBoundingBox().getXMin(), 0.0);
  Assert.assertEquals(2.0, geospatialStatistics.getBoundingBox().getXMax(), 0.0);
  Assert.assertEquals(1.0, geospatialStatistics.getBoundingBox().getYMin(), 0.0);
  Assert.assertEquals(2.0, geospatialStatistics.getBoundingBox().getYMax(), 0.0);
}

/** Two WKB points in a geography column round-trip, but no geospatial statistics are expected. */
@Test
public void testBasicReadWriteGeographyValue() throws Exception {
  // A message type that represents a message with a geography column.
  MessageType schema = Types.buildMessage()
      .required(BINARY)
      .as(geographyType())
      .named("geography")
      .named("msg");

  // OGC:CRS84 (WGS 84): Uses the order longitude, latitude
  Path path = writeValues(schema, "geography", wkbPoint(1.0, 1.0), wkbPoint(2.0, 2.0));

  // Unlike the geometry case, this test expects the column chunk to carry no statistics.
  GeospatialStatistics geospatialStatistics = readFirstColumnStatistics(path, 2);
  Assert.assertNull(geospatialStatistics);
}

/**
 * A corrupt WKB value between two valid points must not prevent the file from being written,
 * and the resulting geospatial statistics must still be present and valid.
 */
@Test
public void testInvalidGeometryPresented() throws Exception {
  // Create schema with geometry type
  MessageType schema = Types.buildMessage()
      .required(BINARY)
      .as(geometryType(null))
      .named("geometry")
      .named("msg");

  // Write file with mixed valid/invalid geometries
  Path path = writeValues(
      schema,
      "geometry",
      // Valid point
      wkbPoint(1.0, 1.0),
      // Invalid "geometry" - corrupt WKB data
      Binary.fromConstantByteArray(new byte[] {0x01, 0x02, 0x03, 0x04}),
      // Another valid point
      wkbPoint(2.0, 2.0));

  // The key verification - when invalid geometry data is present,
  // geospatial statistics should omit the invalid data
  GeospatialStatistics geospatialStatistics = readFirstColumnStatistics(path, 3);
  Assert.assertNotNull("Geospatial statistics should omit the corrupt geometry", geospatialStatistics);

  // further check fields in the GeospatialStatistics
  Assert.assertTrue("Geospatial statistics should be valid", geospatialStatistics.isValid());
  Assert.assertNotNull("Bounding box should not be null", geospatialStatistics.getBoundingBox());
  Assert.assertNotNull("Geospatial types should not be null", geospatialStatistics.getGeospatialTypes());
  Assert.assertTrue(
      "Geospatial types should be valid",
      geospatialStatistics.getGeospatialTypes().isValid());
}
} // end of class TestGeometryTypeRoundTrip