add Kotlin DataFrame and DuckDB implementation, upgrade tablesaw

uwemaurer · uwemaurer · commit 567ce0d83b37 · 2023-05-30T11:31:39.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,5 @@ bin
 .project
 .settings
 .gradle
+.idea/
+build/
diff --git a/build.gradle b/build.gradle
@@ -1,8 +1,7 @@
 buildscript {
-  ext.kotlin_version = '1.3.61'
+  ext.kotlin_version = '1.8.21'
   repositories {
     mavenCentral()
-    jcenter()
   }
 
   dependencies {
@@ -22,7 +21,9 @@ repositories {
 }
 
 dependencies {
-  implementation 'tech.tablesaw:tablesaw-core:0.37.3'
+  implementation 'tech.tablesaw:tablesaw-core:0.43.1'
+  // needed for tablesaw
+  implementation 'com.google.guava:guava:31.1-jre'
 
   implementation 'joinery:joinery-dataframe:1.9'
   // For the CSV import joinery needs this dependency too:
@@ -35,5 +36,8 @@ dependencies {
 
   implementation "de.mpicbg.scicomp:krangl:0.11"
   implementation "org.jetbrains.kotlin:kotlin-stdlib-jdk8:$kotlin_version"
+
+  implementation "org.jetbrains.kotlinx:dataframe:0.10.0"
+  implementation 'org.duckdb:duckdb_jdbc:0.8.0'
 }
 
diff --git a/src/main/java/test_dataframes/TestDuckDb.kt b/src/main/java/test_dataframes/TestDuckDb.kt
@@ -0,0 +1,53 @@
+package test_dataframes
+
+import com.google.common.base.Stopwatch
+import tech.tablesaw.api.Table
+import java.sql.DriverManager
+
+
+/**
+ * Test duckdb to do some basic dataframe manipulations.
+ *
+ * See https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437
+ * for more information.
+ */
+fun main() {
+    // `use` closes the connection and statement even when a query throws.
+    DriverManager.getConnection("jdbc:duckdb:").use { conn ->
+        conn.createStatement().use { stmt ->
+            // Query the CSV file directly by its path and print it as a table.
+            stmt.executeQuery("SELECT * FROM 'urb_cpop1_1_Data.csv'").use { rs ->
+                println(Table.read().db(rs).print())
+            }
+            val watch = Stopwatch.createStarted()
+            stmt.execute(
+                """
+         CREATE TEMP TABLE t1 AS (
+             WITH cities AS (
+                SELECT CITIES || ':' || INDIC_UR as key,
+                CAST(Value AS INTEGER) as Value,
+                * EXCLUDE (CITIES, INDIC_UR, Value)
+                FROM 'urb_cpop1_1_Data.csv' WHERE Value != ':'),
+             pivot_table AS (
+                 PIVOT cities
+                 ON TIME
+                 USING AVG(Value)
+                 GROUP BY key
+             )
+             SELECT *, ("2016"::REAL / "2010"::REAL - 1.0 ) * 100.0 as growth
+             FROM pivot_table
+             WHERE suffix(key, 'January, total')
+             ORDER BY growth DESC
+         )
+     """
+            )
+            stmt.executeQuery("SELECT * FROM t1").use { rs -> println(Table.read().db(rs).print()) }
+            val result = stmt.executeQuery("SELECT key FROM t1").use { r ->
+                mutableListOf<String>().apply { while (r.next()) this += r.getString("key") }
+            }
+            CheckResult.checkResult(result)
+            println("Total time: $watch")
+        }
+    }
+}
diff --git a/src/main/java/test_dataframes/TestKotlinDataFrame.kt b/src/main/java/test_dataframes/TestKotlinDataFrame.kt
@@ -0,0 +1,39 @@
+package test_dataframes
+
+import com.google.common.base.Stopwatch
+import org.jetbrains.kotlinx.dataframe.DataFrame
+import org.jetbrains.kotlinx.dataframe.io.*
+import org.jetbrains.kotlinx.dataframe.api.*
+
+/**
+ * Exercises the Kotlin DataFrame API with a few basic dataframe manipulations.
+ *
+ * See https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437
+ * for more information.
+ */
+fun main() {
+    val raw = DataFrame.read("urb_cpop1_1_Data.csv", delimiter = ',')
+    raw.print()
+    val watch = Stopwatch.createStarted()
+    val key by column<String>()
+    // Drop rows whose Value is the ":" placeholder, add the combined key, then convert Value to Int.
+    val withoutMissing = raw.filter { "Value"<String>() != ":" }
+    val keyed = withoutMissing.add(key) { "CITIES"<String>() + ":" + "INDIC_UR"<String>() }
+    val numeric = keyed.convert { "Value"<String>() }.toInt()
+
+    // Group by key and pivot on TIME, taking the mean of Value.
+    val pivoted = numeric.groupBy(key).pivot("TIME", inward = false).mean { "Value"<Int>() }
+    pivoted.print()
+
+    val januaryTotals = pivoted.filter { key().endsWith("January, total") }.sortByDesc("2017")
+    januaryTotals.print()
+
+    // Percentage growth from 2010 to 2016, computed only where both years have a value.
+    val bothYears = januaryTotals.filter { "2010"<Double?>() != null && "2016"<Double?>() != null }
+    val highestGrowthTable =
+        bothYears.add("growth") { ("2016"<Double>() / "2010"<Double>() - 1.0) * 100.0 }.sortByDesc("growth")
+    highestGrowthTable.print()
+
+    CheckResult.checkResult(highestGrowthTable[{ key }].toList())
+    println("Total time: $watch")
+}
diff --git a/src/main/java/test_dataframes/TestTablesaw.java b/src/main/java/test_dataframes/TestTablesaw.java
@@ -18,7 +18,7 @@
  * for more information.
  */
 public class TestTablesaw {
-    public static void main(String[] args) throws Exception {
+    public static void main(String[] args) {
         // This automatically makes the ":" values missing
         Table data = Table.read().csv(
             CsvReadOptions.builder("urb_cpop1_1_Data.csv").missingValueIndicator(":").build());