Skip to content

Commit 567ce0d

Browse files
committed
add Kotlin DataFrame and DuckDB implementation, upgrade tablesaw
1 parent 5ac2967 commit 567ce0d

File tree

5 files changed

+102
-4
lines changed

5 files changed

+102
-4
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,5 @@ bin
44
.project
55
.settings
66
.gradle
7+
.idea/
8+
build/

build.gradle

+7-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
buildscript {
2-
ext.kotlin_version = '1.3.61'
2+
ext.kotlin_version = '1.8.21'
33
repositories {
44
mavenCentral()
5-
jcenter()
65
}
76

87
dependencies {
@@ -22,7 +21,9 @@ repositories {
2221
}
2322

2423
dependencies {
25-
implementation 'tech.tablesaw:tablesaw-core:0.37.3'
24+
implementation 'tech.tablesaw:tablesaw-core:0.43.1'
25+
// needed for tablesaw
26+
implementation 'com.google.guava:guava:31.1-jre'
2627

2728
implementation 'joinery:joinery-dataframe:1.9'
2829
// For the CSV import joinery needs this dependency too:
@@ -35,5 +36,8 @@ dependencies {
3536

3637
implementation "de.mpicbg.scicomp:krangl:0.11"
3738
implementation "org.jetbrains.kotlin:kotlin-stdlib-jdk8:$kotlin_version"
39+
40+
implementation "org.jetbrains.kotlinx:dataframe:0.10.0"
41+
implementation 'org.duckdb:duckdb_jdbc:0.8.0'
3842
}
3943

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package test_dataframes
2+
3+
import com.google.common.base.Stopwatch
4+
import tech.tablesaw.api.Table
5+
import java.sql.DriverManager
6+
7+
8+
/**
 * Exercises DuckDB (through its JDBC driver) on some basic dataframe manipulations.
 *
 * The program:
 *  1. opens a DuckDB connection without a database path and prints the raw contents of
 *     `urb_cpop1_1_Data.csv` as a Tablesaw table;
 *  2. starts a stopwatch and materialises a temp table `t1` that builds a `key` column from
 *     `CITIES` and `INDIC_UR`, drops rows whose `Value` is the ":" placeholder, pivots the
 *     averaged values on `TIME`, and adds the percentage `growth` between the "2010" and "2016"
 *     columns for keys matched by `suffix(key, 'January, total')`, ordered by growth descending;
 *  3. prints `t1`, collects its `key` column and hands it to [CheckResult.checkResult];
 *  4. prints the elapsed time.
 *
 * Every JDBC resource (connection, statement, result sets) is scoped with `use` so it is closed
 * even when a query or the result check throws.
 *
 * See https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437
 * for more information.
 */
fun main() {
    DriverManager.getConnection("jdbc:duckdb:").use { conn ->
        conn.createStatement().use { stmt ->
            // Show the unprocessed input first; this happens before the timer is started.
            stmt.executeQuery("SELECT * FROM 'urb_cpop1_1_Data.csv'").use { rs ->
                println(Table.read().db(rs).print())
            }

            val watch = Stopwatch.createStarted()
            stmt.execute(
                """
                CREATE TEMP TABLE t1 AS (
                    WITH cities AS (
                        SELECT CITIES || ':' || INDIC_UR as key,
                               CAST(Value AS INTEGER) as Value,
                               * EXCLUDE (CITIES, INDIC_UR, Value)
                        FROM 'urb_cpop1_1_Data.csv' WHERE Value != ':'),
                    pivot_table AS (
                        PIVOT cities
                        ON TIME
                        USING AVG(Value)
                        GROUP BY key
                    )
                    SELECT *, ("2016"::REAL / "2010"::REAL - 1.0 ) * 100.0 as growth
                    FROM pivot_table
                    WHERE suffix(key, 'January, total')
                    ORDER BY growth DESC
                )
                """
            )

            // Print the computed table.
            stmt.executeQuery("SELECT * FROM t1").use { rs ->
                println(Table.read().db(rs).print())
            }

            // Collect the keys in table order (t1 was created with ORDER BY growth DESC).
            val result = stmt.executeQuery("SELECT key FROM t1").use { rs ->
                val keys = mutableListOf<String>()
                while (rs.next()) {
                    keys += rs.getString("key")
                }
                keys
            }
            CheckResult.checkResult(result)
            println("Total time: $watch")
        }
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package test_dataframes
2+
3+
import com.google.common.base.Stopwatch
4+
import org.jetbrains.kotlinx.dataframe.DataFrame
5+
import org.jetbrains.kotlinx.dataframe.io.*
6+
import org.jetbrains.kotlinx.dataframe.api.*
7+
8+
/**
 * Exercises the Kotlin DataFrame API on some basic dataframe manipulations.
 *
 * Reads `urb_cpop1_1_Data.csv`, builds a `key` column from `CITIES` and `INDIC_UR`, pivots the
 * mean of `Value` on `TIME`, keeps the keys ending in "January, total", computes the percentage
 * growth between the "2010" and "2016" columns and passes the keys, ordered by growth
 * descending, to [CheckResult.checkResult].
 *
 * See https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437
 * for more information.
 */
fun main() {
    val df = DataFrame.read("urb_cpop1_1_Data.csv", delimiter = ',')
    df.print()
    val watch = Stopwatch.createStarted()

    // The delegated property name doubles as the column name, so it must stay `key`.
    val key by column<String>()

    // Drop the rows whose Value is the ":" placeholder, add the combined key column,
    // then convert Value to Int.
    val withoutMissing = df.filter { "Value"<String>() != ":" }
    val keyed = withoutMissing.add(key) { "CITIES"<String>() + ":" + "INDIC_UR"<String>() }
    val filtered = keyed.convert { "Value"<String>() }.toInt()

    // One row per key, with the mean Value pivoted on TIME.
    val pivoted = filtered.groupBy(key).pivot("TIME", inward = false).mean { "Value"<Int>() }
    pivoted.print()

    // Keep only the "January, total" keys, sorted by the "2017" column descending.
    val januaryTotals = pivoted
        .filter { key().endsWith("January, total") }
        .sortByDesc("2017")
    januaryTotals.print()

    // Growth between 2010 and 2016, computed only where both values are present.
    val withBothYears = januaryTotals.filter { "2010"<Double?>() != null && "2016"<Double?>() != null }
    val highestGrowthTable = withBothYears
        .add("growth") { ("2016"<Double>() / "2010"<Double>() - 1.0) * 100.0 }
        .sortByDesc("growth")
    highestGrowthTable.print()

    CheckResult.checkResult(highestGrowthTable[{ key }].toList())
    println("Total time: $watch")
}

src/main/java/test_dataframes/TestTablesaw.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
* for more information.
1919
*/
2020
public class TestTablesaw {
21-
public static void main(String[] args) throws Exception {
21+
public static void main(String[] args) {
2222
// This automatically makes the ":" values missing
2323
Table data = Table.read().csv(
2424
CsvReadOptions.builder("urb_cpop1_1_Data.csv").missingValueIndicator(":").build());

0 commit comments

Comments
 (0)