diff --git a/src/main/java/nl/sidnlabs/entrada/engine/AbstractQueryEngine.java b/src/main/java/nl/sidnlabs/entrada/engine/AbstractQueryEngine.java index 48d93940..5239df44 100644 --- a/src/main/java/nl/sidnlabs/entrada/engine/AbstractQueryEngine.java +++ b/src/main/java/nl/sidnlabs/entrada/engine/AbstractQueryEngine.java @@ -37,6 +37,9 @@ public abstract class AbstractQueryEngine implements QueryEngine { @Value("${entrada.database.table.icmp}") protected String tableIcmp; + @Value("${entrada.parquet.compression}") + protected String parquetCompression; + protected JdbcTemplate jdbcTemplate; protected FileManager fileManager; private String scriptPrefix; @@ -143,6 +146,7 @@ private Map createValueMap(TablePartition p) { values.put("MONTH", p.getMonth()); values.put("DAY", p.getDay()); values.put("SERVER", p.getServer()); + values.put("PARQUET_COMPRESSION", parquetCompression); return values; } diff --git a/src/main/java/nl/sidnlabs/entrada/initialize/AbstractInitializer.java b/src/main/java/nl/sidnlabs/entrada/initialize/AbstractInitializer.java index 8e363a92..0838a6e6 100644 --- a/src/main/java/nl/sidnlabs/entrada/initialize/AbstractInitializer.java +++ b/src/main/java/nl/sidnlabs/entrada/initialize/AbstractInitializer.java @@ -52,6 +52,9 @@ public abstract class AbstractInitializer implements Initializer { @Value("${aws.encryption}") protected boolean encrypt; + @Value("${entrada.parquet.compression}") + protected String parquetCompression; + private QueryEngine queryEngine; private String scriptPrefix; @@ -147,6 +150,7 @@ private Map dnsParameters() { parameters.put("TABLE_NAME", tableDns); parameters.put("TABLE_LOC", FileUtil.appendPath(output, tableDns)); parameters.put("ENCRYPTED", encrypt); + parameters.put("PARQUET_COMPRESSION", parquetCompression); return parameters; } @@ -157,6 +161,7 @@ private Map icmpParameters() { parameters.put("TABLE_NAME", tableIcmp); parameters.put("TABLE_LOC", FileUtil.appendPath(output, tableIcmp)); parameters.put("ENCRYPTED", encrypt); + parameters.put("PARQUET_COMPRESSION", parquetCompression); return parameters; } diff --git a/src/main/resources/application-dev.properties b/src/main/resources/application-dev.properties index 6be69c3b..1f1dfe09 100644 --- a/src/main/resources/application-dev.properties +++ b/src/main/resources/application-dev.properties @@ -172,6 +172,8 @@ entrada.parquet.filesize.max=128 entrada.parquet.rowgroup.size=128 # max rows to use for each column chunk page in parquet file entrada.parquet.page-row.limit=20000 +# parquet compression format, use GZIP to save on S3 storage/athena scan costs for example +entrada.parquet.compression=SNAPPY # seconds cached dns questions (without responses) timeout # required to match dns requests spanning multiple pcap files entrada.cache.timeout=2 diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index f63848a2..ff834174 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -169,6 +169,8 @@ entrada.parquet.filesize.max=128 entrada.parquet.rowgroup.size=128 # max rows to use for each column chunk page in parquet file entrada.parquet.page-row.limit=20000 +# parquet compression format, use GZIP to save on S3 storage/athena scan costs for example +entrada.parquet.compression=SNAPPY # seconds cached dns questions (without responses) timeout # required to match dns requests spanning multiple pcap files entrada.cache.timeout=2 diff --git a/src/main/resources/sql/athena/partition-compaction-dns.sql b/src/main/resources/sql/athena/partition-compaction-dns.sql index 8aa4635c..b751e80e 100644 --- a/src/main/resources/sql/athena/partition-compaction-dns.sql +++ b/src/main/resources/sql/athena/partition-compaction-dns.sql @@ -2,7 +2,7 @@ CREATE TABLE ${DATABASE_NAME}.tmp_compaction WITH ( external_location = '${TABLE_LOC}', format = 'Parquet', - parquet_compression = 'SNAPPY') + parquet_compression = '${PARQUET_COMPRESSION}') AS SELECT id, time, diff --git a/src/main/resources/sql/athena/partition-compaction-icmp.sql b/src/main/resources/sql/athena/partition-compaction-icmp.sql index bbe93e8a..27c5d95e 100644 --- a/src/main/resources/sql/athena/partition-compaction-icmp.sql +++ b/src/main/resources/sql/athena/partition-compaction-icmp.sql @@ -2,7 +2,7 @@ CREATE TABLE ${DATABASE_NAME}.tmp_compaction WITH ( external_location = '${TABLE_LOC}', format = 'Parquet', - parquet_compression = 'SNAPPY') + parquet_compression = '${PARQUET_COMPRESSION}') AS SELECT icmp_type, icmp_code, diff --git a/src/main/resources/sql/athena/partition-purge-dns.sql b/src/main/resources/sql/athena/partition-purge-dns.sql index 2c9c2161..334e1849 100644 --- a/src/main/resources/sql/athena/partition-purge-dns.sql +++ b/src/main/resources/sql/athena/partition-purge-dns.sql @@ -2,7 +2,7 @@ CREATE TABLE ${DATABASE_NAME}.tmp_compaction WITH ( external_location = '${TABLE_LOC}', format = 'Parquet', - parquet_compression = 'SNAPPY') + parquet_compression = '${PARQUET_COMPRESSION}') AS SELECT id, time, diff --git a/src/main/resources/sql/athena/partition-purge-icmp.sql b/src/main/resources/sql/athena/partition-purge-icmp.sql index d9b7eef6..44f23442 100644 --- a/src/main/resources/sql/athena/partition-purge-icmp.sql +++ b/src/main/resources/sql/athena/partition-purge-icmp.sql @@ -2,7 +2,7 @@ CREATE TABLE ${DATABASE_NAME}.tmp_compaction WITH ( external_location = '${TABLE_LOC}', format = 'Parquet', - parquet_compression = 'SNAPPY') + parquet_compression = '${PARQUET_COMPRESSION}') AS SELECT icmp_type, icmp_code,