Use key-value search instead of offset for parread

qqndrew · qqndrew · commit ff9b1ff2f162 · 2023-09-26T22:41:17.000-05:00
Allow writing nulls to CSV
diff --git a/IO/src/main/java/org/ohnlp/backbone/io/jdbc/JDBCExtract.java b/IO/src/main/java/org/ohnlp/backbone/io/jdbc/JDBCExtract.java
@@ -1,11 +1,15 @@
 package org.ohnlp.backbone.io.jdbc;
 
 import com.mchange.v2.c3p0.ComboPooledDataSource;
+import org.apache.beam.sdk.coders.Coder;
+import org.apache.beam.sdk.coders.CollectionCoder;
 import org.apache.beam.sdk.coders.RowCoder;
+import org.apache.beam.sdk.coders.StringUtf8Coder;
 import org.apache.beam.sdk.io.jdbc.JdbcIO;
 import org.apache.beam.sdk.io.jdbc.SchemaUtilProxy;
 import org.apache.beam.sdk.schemas.Schema;
-import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.*;
+import org.apache.beam.sdk.values.KV;
 import org.apache.beam.sdk.values.PBegin;
 import org.apache.beam.sdk.values.PCollection;
 import org.apache.beam.sdk.values.Row;
@@ -14,8 +18,10 @@
 import org.ohnlp.backbone.api.components.ExtractToOne;
 import org.ohnlp.backbone.api.exceptions.ComponentInitializationException;
 
+import java.beans.PropertyVetoException;
 import java.sql.*;
 import java.util.*;
+import java.util.concurrent.ThreadLocalRandom;
 
 /**
  * Performs data extraction using a JDBC connector
@@ -62,7 +68,8 @@ public class JDBCExtract extends ExtractToOne {
     private int batchSize = 1000;
     @ConfigurationProperty(
             path = "identifier_col",
-            desc = "An ID column returned as part of the query that can be used to identify and partition records.",
+            desc = "An ID column returned as part of the query that can be used to identify and partition records, " +
+                    "multiple columns can be entered in column-delimited order",
             required = false
     )
     private String identifierCol = null;
@@ -81,6 +88,8 @@ public class JDBCExtract extends ExtractToOne {
     private String viewName;
     private String orderedQuery;
     private Schema schema;
+    private String keyValueQuery;
+    private Schema keyValueSchema;
 
     /**
      * Initializes a Beam JdbcIO Provider
@@ -125,39 +134,46 @@ public void init() throws ComponentInitializationException {
             // We will first preflight with a query that counts the number of records so that we can get number
             // of batches
             String runId = UUID.randomUUID().toString().replaceAll("-", "_");
-            //noinspection SqlResolve
-            String countQuery = "SELECT COUNT(*) FROM (" + query + ") bckbone_preflight_query_" + runId;
             this.viewName = "backbone_jdbcextract_" + runId;
-            // Find appropriate columns to order by so that pagination results are consistent
-            this.orderByCols = findPaginationOrderingColumns(this.query);
-            // Get record count so that we know how many batches are going to be needed
-            try (Connection conn = initializationDS.getConnection()) {
-                ResultSet rs = conn.createStatement().executeQuery(countQuery);
-                rs.next();
-                int resultCount = rs.getInt(1);
-                this.numBatches = Math.round(Math.ceil((double) resultCount / this.batchSize));
-            }
-            // Normally I would say use Strings.join for the below, but this was causing cross-jvm issues
-            // so we use the more portable stringbuilder instead...
-            StringBuilder sB = new StringBuilder();
-            boolean flag = false;
-            for (String s : this.orderByCols) {
-                if (flag) {
-                    sB.append(", ");
+            if (this.identifierCol == null) {
+                // No identifier column provided so we can only do a full-form sort.
+                // TODO find a better solution for this
+                //noinspection SqlResolve
+                String countQuery = "SELECT COUNT(*) FROM (" + query + ") bckbone_preflight_query_" + runId;
+                // Find appropriate columns to order by so that pagination results are consistent
+                this.orderByCols = findPaginationOrderingColumns(this.query);
+                // Get record count so that we know how many batches are going to be needed
+                try (Connection conn = initializationDS.getConnection()) {
+                    ResultSet rs = conn.createStatement().executeQuery(countQuery);
+                    rs.next();
+                    int resultCount = rs.getInt(1);
+                    this.numBatches = Math.round(Math.ceil((double) resultCount / this.batchSize));
                 }
-                sB.append(s);
-                flag = true;
-            }
-            this.orderedQuery = "SELECT * FROM (" + this.query + ") " + this.viewName
-                    + " ORDER BY " + sB.toString() + " ";
-            // Now we have to add the offset/fetch in the dialect local format..
-            // Specifically, postgres and MySQL are special in that they do not conform to the
-            // SQL:2011 standard syntax
-            if (driver.equals("org.postgresql.Driver") || driver.equals("com.mysql.jdbc.Driver")
-                    || driver.equals("com.mysql.cj.jdbc.Driver") || driver.equals("org.sqlite.JDBC")) {
-                this.orderedQuery += "LIMIT " + batchSize + " OFFSET ?";
-            } else { // This is the SQL:2011 standard definition of an offset...fetch syntax
-                this.orderedQuery += "OFFSET ? ROWS FETCH NEXT " + batchSize + " ROWS ONLY";
+                // Normally I would say use Strings.join for the below, but this was causing cross-jvm issues
+                // so we use the more portable stringbuilder instead...
+                StringBuilder sB = new StringBuilder();
+                boolean flag = false;
+                for (String s : this.orderByCols) {
+                    if (flag) {
+                        sB.append(", ");
+                    }
+                    sB.append(s);
+                    flag = true;
+                }
+                this.orderedQuery = "SELECT * FROM (" + this.query + ") " + this.viewName
+                        + " ORDER BY " + sB.toString() + " ";
+                // Now we have to add the offset/fetch in the dialect local format..
+                // Specifically, postgres and MySQL are special in that they do not conform to the
+                // SQL:2011 standard syntax
+                if (driver.equals("org.postgresql.Driver") || driver.equals("com.mysql.jdbc.Driver")
+                        || driver.equals("com.mysql.cj.jdbc.Driver") || driver.equals("org.sqlite.JDBC")) {
+                    this.orderedQuery += "LIMIT " + batchSize + " OFFSET ?";
+                } else { // This is the SQL:2011 standard definition of an offset...fetch syntax
+                    this.orderedQuery += "OFFSET ? ROWS FETCH NEXT " + batchSize + " ROWS ONLY";
+                }
+            } else {
+                this.keyValueQuery = "SELECT DISTINCT " + identifierCol + " FROM (" + query + ") " + viewName;
+                this.keyValueSchema = getIdentifierColumnsSchema();
             }
         } catch (Throwable t) {
             throw new ComponentInitializationException(t);
@@ -184,24 +200,90 @@ public Schema calculateOutputSchema() {
 
     @Override
     public PCollection<Row> begin(PBegin input) {
-        List<Integer> offsets = new ArrayList<>();
-        for (int i = 0; i < numBatches; i++) {
-            offsets.add(i * batchSize); // Create a sequence of batches at the appropriate offset
+        if (this.identifierCol == null) {
+            List<Integer> offsets = new ArrayList<>();
+            for (int i = 0; i < numBatches; i++) {
+                offsets.add(i * batchSize); // Create a sequence of batches at the appropriate offset
+            }
+            return input.apply(
+                    "Read from JDBC",
+                    JdbcIO.<Row>read()
+                    .withDataSourceConfiguration(datasourceConfig)
+                    .withQuery("SELECT * FROM (" + this.query + ") " + this.viewName)
+                    .withRowMapper(this.driver.equals("org.sqlite.JDBC") ?
+                            new SchemaUtilProxy.SQLiteBeamRowMapperProxy(schema) :
+                            new SchemaUtilProxy.BeamRowMapperProxy(schema))
+                    .withCoder(RowCoder.of(schema))
+                    .withOutputParallelization(false)
+            ).apply("JDBC Break Fusion", Repartition.of()).setRowSchema(schema);
+        } else {
+            StringBuilder queryByKey = new StringBuilder("SELECT * FROM (" + this.query + ") " + this.viewName + " WHERE ");
+            boolean appendAnd = false;
+            for (String identifierCol : this.identifierCol.split(",")) {
+                if (appendAnd) {
+                    queryByKey.append("AND ");
+                } else {
+                    appendAnd = true;
+                }
+                queryByKey.append(identifierCol).append(" = ? ");
+            }
+            JdbcIO.RowMapper<Row> rowmapper = this.driver.equals("org.sqlite.JDBC") ?
+                    new SchemaUtilProxy.SQLiteBeamRowMapperProxy(keyValueSchema) :
+                    new SchemaUtilProxy.BeamRowMapperProxy(keyValueSchema);
+            String[] cols = this.identifierCol.split(",");
+            return input.apply("JDBC Init", Create.of(keyValueQuery))
+                    .apply("JDBC Preflight for Query Keys", ParDo.of(
+                            new DoFn<String, Row>() {
+                                private ComboPooledDataSource ds;
+
+                                @Setup
+                                public void init() throws PropertyVetoException {
+                                    this.ds = new ComboPooledDataSource(); // Set separate
+                                    ds.setDriverClass(driver);
+                                    ds.setJdbcUrl(url);
+                                    ds.setUser(user);
+                                    ds.setPassword(password);
+                                    ds.setMaxIdleTime(idleTimeout);
+                                }
+
+                                @ProcessElement
+                                public void process(ProcessContext pc) throws Exception {
+                                    try (Connection conn = ds.getConnection()) {
+                                        ResultSet rs = conn.createStatement().executeQuery(pc.element());
+                                        while (rs.next()) {
+                                            pc.output(rowmapper.mapRow(rs));
+                                        }
+
+                                    } catch (SQLException e) {
+                                        throw new RuntimeException(e);
+                                    }
+                                }
+                            }
+                    )).setRowSchema(this.keyValueSchema)
+                    .apply("JDBC Break Fusion", Repartition.of()) // Break fusion here due to large fanout/preflight being on single thread
+                    .apply("JDBC Read", JdbcIO.<Row, Row>readAll()
+                            .withDataSourceConfiguration(datasourceConfig)
+                            .withQuery(queryByKey.toString())
+                            .withRowMapper(this.driver.equals("org.sqlite.JDBC") ?
+                                    new SchemaUtilProxy.SQLiteBeamRowMapperProxy(schema) :
+                                    new SchemaUtilProxy.BeamRowMapperProxy(schema))
+                            .withParameterSetter((JdbcIO.PreparedStatementSetter<Row>) (element, preparedStatement) -> {
+                                for (int i = 0; i < cols.length; i++) {
+                                    preparedStatement.setObject(i + 1, element.getValue(cols[i]));
+                                }
+                            })
+                            .withCoder(RowCoder.of(schema))
+                            .withOutputParallelization(false));
+        }
+    }
+
+    private Schema getIdentifierColumnsSchema() throws ComponentInitializationException {
+        try (Connection conn = this.initializationDS.getConnection()) {
+            ResultSetMetaData queryMeta = conn.prepareStatement("SELECT " + this.initializationDS + " FROM (" + this.query + ") " + this.viewName).getMetaData();
+            return SchemaUtilProxy.toBeamSchema(this.driver, queryMeta);
+        } catch (SQLException e) {
+            throw new ComponentInitializationException(e);
         }
-        return input.apply("JDBC Preflight", Create.of(offsets)) // First create partitions # = to num batches
-                .apply("JDBC Read", // Now actually do the read, the readall function will execute one query per input partition
-                        JdbcIO.<Integer, Row>readAll()
-                                .withDataSourceConfiguration(datasourceConfig)
-                                .withQuery(this.orderedQuery)
-                                .withRowMapper(this.driver.equals("org.sqlite.JDBC") ?
-                                        new SchemaUtilProxy.SQLiteBeamRowMapperProxy(schema) :
-                                        new SchemaUtilProxy.BeamRowMapperProxy(schema))
-                                .withParameterSetter((JdbcIO.PreparedStatementSetter<Integer>) (element, preparedStatement) -> {
-                                    preparedStatement.setInt(1, element); // Replace
-                                })
-                                .withCoder(RowCoder.of(schema))
-                                .withOutputParallelization(false)
-                );
     }
 
     private String[] findPaginationOrderingColumns(String query) throws ComponentInitializationException {
@@ -243,4 +325,32 @@ private String[] findPaginationOrderingColumns(String query) throws ComponentIni
         }
 
     }
+
+    private static class Repartition<T> extends PTransform<PCollection<T>, PCollection<T>> {
+
+        private Repartition() {}
+
+        public static <T> Repartition<T> of() {
+            return new Repartition<>();
+        }
+
+        @Override
+        public PCollection<T> expand(PCollection<T> input) {
+            return input
+                    .apply(ParDo.of(new DoFn<T, KV<Integer, T>>() {
+                        @ProcessElement
+                        public void process(ProcessContext pc) {
+                            pc.output(KV.of(ThreadLocalRandom.current().nextInt(), pc.element()));
+                        }
+                    }))
+                    .apply(GroupByKey.<Integer, T>create())
+                    .apply(ParDo.of(new DoFn<KV<Integer, Iterable<T>>, T>() {
+                        @ProcessElement
+                        public void process(ProcessContext pc) {
+                            for (T element : pc.element().getValue()) {
+                                pc.output(element);
+                            }                        }
+                    }));
+        }
+    }
 }
diff --git a/IO/src/main/java/org/ohnlp/backbone/io/local/encodings/RowValueToCSVEncoding.java b/IO/src/main/java/org/ohnlp/backbone/io/local/encodings/RowValueToCSVEncoding.java
@@ -16,6 +16,7 @@ public String toTextWithFields(Row input, List<String> fields) {
 
     @Override
     public String toTextAllFields(Row input) {
-        return input.getValues().stream().map(o -> StringEscapeUtils.escapeCsv(o.toString())).collect(Collectors.joining(","));
+        return input.getValues().stream().map(o -> o == null ? "" : StringEscapeUtils.escapeCsv(o.toString())).collect(Collectors.joining(","));
+
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@ public String toTextWithFields(Row input, List<String> fields) {`
`16`	`16`
`17`	`17`	`@Override`
`18`	`18`	`public String toTextAllFields(Row input) {`
`19`		`- return input.getValues().stream().map(o -> StringEscapeUtils.escapeCsv(o.toString())).collect(Collectors.joining(","));`
	`19`	`+ return input.getValues().stream().map(o -> o == null ? "" : StringEscapeUtils.escapeCsv(o.toString())).collect(Collectors.joining(","));`
	`20`	`+`
`20`	`21`	`}`
`21`	`22`	`}`