2 changes: 2 additions & 0 deletions core/api/core.api
@@ -4462,7 +4462,9 @@ public final class org/jetbrains/kotlinx/dataframe/api/TakeKt {
}

public final class org/jetbrains/kotlinx/dataframe/api/ToDataFrameKt {
public static final fun toDataFrame (Ljava/util/List;Ljava/util/List;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun toDataFrame (Ljava/util/Map;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun toDataFrame$default (Ljava/util/List;Ljava/util/List;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun toDataFrameAnyColumn (Ljava/lang/Iterable;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun toDataFrameColumnPathAnyNullable (Ljava/lang/Iterable;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun toDataFrameColumnPathAnyNullable (Ljava/util/Map;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
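For reference, the two added entries are the JVM descriptors of the new list-of-lists overload introduced by this PR (receiver list, the new header list, the containsColumns flag, plus the synthetic $default bridge for its default value). In Kotlin terms they correspond to the declaration added later in this diff:

public fun <T> List<List<T>>.toDataFrame(header: List<String>?, containsColumns: Boolean = false): AnyFrame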
@@ -2525,4 +2525,4 @@ public fun <T, C> Convert<T, List<List<C>>>.toDataFrames(containsColumns: Boolean
* @return A new [DataColumn] with the values converted to [DataFrame].
*/
public fun <T> DataColumn<List<List<T>>>.toDataFrames(containsColumns: Boolean = false): DataColumn<AnyFrame> =
map { it.toDataFrame(containsColumns) }
map { it.toDataFrame(containsColumns = containsColumns) }
@@ -256,3 +256,49 @@ public fun Map<ColumnPath, Iterable<Any?>>.toDataFrame(): AnyFrame =
}.toDataFrameFromPairs<Unit>()

// endregion

/**
* Converts a list of lists into a [DataFrame].
*
* By default, each inner list is treated as a row. If [header] is `null`, the first inner list supplies the column names, and the remaining lists are treated as data.
*
* With [containsColumns] = `true`, each inner list is interpreted as a column.
* If [header] is `null`, the first element of each list is used as its column name, and the remaining elements as its values.
*
* @param T The type of elements contained in the nested lists.
* @param header Explicit column names. When provided, no names are extracted from the lists, and all values are treated as data;
* when `null`, names are taken from the lists as described above.
* @param containsColumns If `true`, treats each nested list as a column (empty inner lists are skipped).
* Otherwise, each nested list is a row.
* Defaults to `false`.
* @return A [DataFrame] containing the data from the nested list structure.
* Returns an empty [DataFrame] if the input is empty.
*/
@Refine
@Interpretable("ValuesListsToDataFrame")
public fun <T> List<List<T>>.toDataFrame(header: List<String>?, containsColumns: Boolean = false): AnyFrame =
when {
containsColumns -> {
mapIndexedNotNull { index, list ->
if (list.isEmpty()) return@mapIndexedNotNull null
val name = header?.get(index) ?: list[0].toString()
val values = if (header == null) list.drop(1) else list
createColumnGuessingType(name, values)
}.toDataFrame()
}

isEmpty() -> DataFrame.Empty

else -> {
val data = if (header == null) drop(1) else this
(header ?: get(0).map { it.toString() }).mapIndexed { colIndex, name ->
val values = data.map { row ->
if (row.size <= colIndex) {
null
} else {
row[colIndex]
}
}
createColumnGuessingType(name, values)
}.toDataFrame()
}
}
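A minimal usage sketch of the new overload, assuming the dataframe api package is on the classpath; the value lists below are illustrative only, not taken from the PR:

import org.jetbrains.kotlinx.dataframe.api.toDataFrame

// Row-major input: with header = null, the first inner list supplies the column names.
val rows = listOf(
    listOf("name", "age"),
    listOf("Alice", "23"),
    listOf("Bob", "42"),
)
val byRows = rows.toDataFrame(header = null) // columns "name" and "age", 2 rows

// Column-major input with an explicit header: every element is treated as data.
val cols = listOf(
    listOf("Alice", "Bob"),
    listOf("23", "42"),
)
val byCols = cols.toDataFrame(header = listOf("name", "age"), containsColumns = true)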
49 changes: 7 additions & 42 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/common.kt
@@ -4,10 +4,10 @@ import org.apache.commons.io.input.BOMInputStream
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
import org.jetbrains.kotlinx.dataframe.util.IS_URL
import org.jetbrains.kotlinx.dataframe.util.IS_URL_IMPORT
import org.jetbrains.kotlinx.dataframe.util.IS_URL_REPLACE
import org.jetbrains.kotlinx.dataframe.util.LISTS_TO_DATAFRAME_MIGRATION
import java.io.File
import java.io.InputStream
import java.net.HttpURLConnection
@@ -45,48 +45,13 @@ public fun catchHttpResponse(url: URL, body: (InputStream) -> AnyFrame): AnyFrame
}
}

/**
* Converts a list of lists into a [DataFrame].
*
* By default, treats the first inner list as a header (column names), and the remaining lists as rows.
* If [containsColumns] is `true`, interprets each inner list as a column,
* where the first element is used as the column name, and the remaining elements as values.
*
* @param T The type of elements contained in the nested lists.
* @param containsColumns If `true`, treats each nested list as a column with its first element as the column name.
* Otherwise, the first list is treated as the header.
* Defaults to `false`.
* @return A [DataFrame] containing the data from the nested list structure.
* Returns an empty [DataFrame] if the input is empty or invalid.
*/
@Deprecated(
LISTS_TO_DATAFRAME_MIGRATION,
ReplaceWith("this.toDataFrame(header = null, containsColumns)", "org.jetbrains.kotlinx.dataframe.api.toDataFrame"),
level = DeprecationLevel.WARNING,
)
public fun <T> List<List<T>>.toDataFrame(containsColumns: Boolean = false): AnyFrame =
when {
containsColumns -> {
mapNotNull {
if (it.isEmpty()) return@mapNotNull null
val name = it[0].toString()
val values = it.drop(1)
createColumnGuessingType(name, values)
}.toDataFrame()
}

isEmpty() -> DataFrame.Empty

else -> {
val header = get(0).map { it.toString() }
val data = drop(1)
header.mapIndexed { colIndex, name ->
val values = data.map { row ->
if (row.size <= colIndex) {
null
} else {
row[colIndex]
}
}
createColumnGuessingType(name, values)
}.toDataFrame()
}
}
toDataFrame(header = null, containsColumns)
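For existing callers, the migration suggested by the ReplaceWith above looks roughly like this (nestedLists is a placeholder name with placeholder data):

import org.jetbrains.kotlinx.dataframe.api.toDataFrame

val nestedLists: List<List<String>> = listOf(listOf("a", "b"), listOf("1", "2")) // placeholder data

// Before: the deprecated overload from org.jetbrains.kotlinx.dataframe.io
// val df = nestedLists.toDataFrame(containsColumns = true)

// After: the api-package overload with the explicit header parameter
val df = nestedLists.toDataFrame(header = null, containsColumns = true)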

@Deprecated(
message = IS_URL,
@@ -257,6 +257,9 @@ internal const val GET_ROWS_RANGE_REPLACE = "df().getRows(indices)"
internal const val GET_ROW_OR_NULL_REPLACE = "df().getRowOrNull(index)"
internal const val COPY_REPLACE = "columns().toDataFrame().cast()"

internal const val LISTS_TO_DATAFRAME_MIGRATION =
"Function moved from io to api package, and a new `header` parameter is introduced. $MESSAGE_1_1"

// endregion

// region keep across releases
@@ -700,4 +700,92 @@ class CreateDataFrameTests {
val df = list.toDataFrame(maxDepth = 2)
df["map"].type() shouldBe typeOf<Map<String, Int>>()
}

@Test
fun `parsing row-major lines into structured dataframe`() {
// Data that embeds its own header row like this is probably rare, hence the optional `header` parameter.
val lines = buildList {
addAll(listOf("stamp", "header", "data"))
repeat(33) { row ->
add("stamp $row")
add("header $row")
add("data $row")
}
}

val df = lines.chunked(3).toDataFrame(header = null)

df.columnNames() shouldBe listOf("stamp", "header", "data")
df.columnTypes() shouldBe listOf(typeOf<String>(), typeOf<String>(), typeOf<String>())
df.rowsCount() shouldBe 33
df[0].values() shouldBe listOf("stamp 0", "header 0", "data 0")
}

@Test
fun `parsing srt lines into structured dataframe`() {
// *.srt subtitle file format
val lines = buildList {
repeat(33) { row ->
add("stamp $row")
add("header $row")
add("data $row")
add("\n")
}
}

val df = lines.chunked(4).map { it.dropLast(1) }.toDataFrame(header = listOf("stamp", "header", "data"))

df.columnNames() shouldBe listOf("stamp", "header", "data")
df.columnTypes() shouldBe listOf(typeOf<String>(), typeOf<String>(), typeOf<String>())
df.rowsCount() shouldBe 33
df[0].values() shouldBe listOf("stamp 0", "header 0", "data 0")

// An alternative approach; the dropLast-based variant above reads better.
lines.chunked(4)
.toDataFrame(header = listOf("stamp", "header", "data", "whitespace"))
.remove("whitespace") shouldBe df
}

@Test
fun `parsing column-major lines into structured dataframe`() {
val lines = buildList {
repeat(4) { col ->
repeat(5) { row ->
add("data$col $row")
}
add("\n")
}
}

val header = List(4) { "col $it" }
val df = lines
.chunked(6)
.map { it.dropLast(1) }
.toDataFrame(header = header, containsColumns = true)
df.columnNames() shouldBe header
df.columnTypes() shouldBe List(4) { typeOf<String>() }
df["col 0"].values() shouldBe listOf("data0 0", "data0 1", "data0 2", "data0 3", "data0 4")
}

@Test
fun `parsing column-major lines with header into structured dataframe`() {
val lines = buildList {
repeat(4) { col ->
add("col $col")
repeat(5) { row ->
add("data$col $row")
}
add("\n")
}
}

val header = List(4) { "col $it" }
val df = lines
.chunked(7)
.map { it.dropLast(1) }
.toDataFrame(header = null, containsColumns = true)
df.columnNames() shouldBe header
df.columnTypes() shouldBe List(4) { typeOf<String>() }
df["col 0"].values() shouldBe listOf("data0 0", "data0 1", "data0 2", "data0 3", "data0 4")
}
}
@@ -425,4 +425,22 @@ class Create : TestBase() {
val df = files.toDataFrame(columnName = "data")
// SampleEnd
}

@Test
@TransformDataFrameExpressions
fun toDataFrameLists() {
// SampleStart
val lines = """
1
00:00:05,000 --> 00:00:07,500
This is the first subtitle.

2
00:00:08,000 --> 00:00:10,250
This is the second subtitle.
""".trimIndent().lines()

lines.chunked(4) { it.take(3) }.toDataFrame(header = listOf("n", "timestamp", "text"))
// SampleEnd
}
}