Automated commit of generated code

Kotlin · Nov 25, 2024 · 114c570 · 114c570
1 parent 2f3f364
commit 114c570
Show file tree

Hide file tree

Showing 16 changed files with 546 additions and 239 deletions.
diff --git a/...d-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/annotations/ImportDataSchema.kt b/...d-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/annotations/ImportDataSchema.kt
@@ -47,6 +47,7 @@ public enum class DataSchemaVisibility {
     EXPLICIT_PUBLIC,
 }
 
+// TODO add more options
 public annotation class CsvOptions(public val delimiter: Char)
 
 /**

diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt
@@ -24,10 +24,9 @@ import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
 import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
 import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME
 import org.jetbrains.kotlinx.dataframe.dataTypes.IMG
-import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException
-import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
 import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
 import org.jetbrains.kotlinx.dataframe.impl.api.convertRowColumnImpl
+import org.jetbrains.kotlinx.dataframe.impl.api.convertToDoubleImpl
 import org.jetbrains.kotlinx.dataframe.impl.api.convertToTypeImpl
 import org.jetbrains.kotlinx.dataframe.impl.api.defaultTimeZone
 import org.jetbrains.kotlinx.dataframe.impl.api.toLocalDate
@@ -36,14 +35,12 @@ import org.jetbrains.kotlinx.dataframe.impl.api.toLocalTime
 import org.jetbrains.kotlinx.dataframe.impl.api.withRowCellImpl
 import org.jetbrains.kotlinx.dataframe.impl.headPlusArray
 import org.jetbrains.kotlinx.dataframe.io.toDataFrame
-import org.jetbrains.kotlinx.dataframe.path
 import java.math.BigDecimal
 import java.net.URL
 import java.util.Locale
 import kotlin.reflect.KProperty
 import kotlin.reflect.KType
 import kotlin.reflect.full.isSubtypeOf
-import kotlin.reflect.full.withNullability
 import kotlin.reflect.typeOf
 
 @Interpretable("Convert0")
@@ -129,15 +126,29 @@ public inline fun <T, C, reified R> Convert<T, C>.perRowCol(
 
 public inline fun <reified C> AnyCol.convertTo(): DataColumn<C> = convertTo(typeOf<C>()) as DataColumn<C>
 
-public fun AnyCol.convertTo(newType: KType): AnyCol {
-    val isTypesAreCorrect = this.type().withNullability(true).isSubtypeOf(typeOf<String?>()) &&
-        newType.withNullability(true) == typeOf<Double?>()
+@Suppress("UNCHECKED_CAST")
+public fun AnyCol.convertTo(newType: KType): AnyCol =
+    when {
+        type().isSubtypeOf(typeOf<String?>()) ->
+            (this as DataColumn<String?>).convertTo(newType)
 
-    if (isTypesAreCorrect) {
-        return (this as DataColumn<String?>).convertToDouble().setNullable(newType.isMarkedNullable)
+        else -> convertToTypeImpl(newType, null)
+    }
+
+public inline fun <reified C> DataColumn<String?>.convertTo(parserOptions: ParserOptions? = null): DataColumn<C> =
+    convertTo(typeOf<C>(), parserOptions) as DataColumn<C>
+
+public fun DataColumn<String?>.convertTo(newType: KType, parserOptions: ParserOptions? = null): AnyCol =
+    when {
+        newType.isSubtypeOf(typeOf<Double?>()) ->
+            convertToDoubleImpl(
+                locale = parserOptions?.locale,
+                nullStrings = parserOptions?.nullStrings,
+                useFastDoubleParser = parserOptions?.useFastDoubleParser,
+            ).setNullable(newType.isMarkedNullable)
+
+        else -> convertToTypeImpl(newType, parserOptions)
     }
-    return convertToTypeImpl(newType)
-}
 
 @JvmName("convertToLocalDateTimeFromT")
 public fun <T : Any> DataColumn<T>.convertToLocalDateTime(): DataColumn<LocalDateTime> = convertTo()
@@ -185,78 +196,72 @@ public fun <T : Any> DataColumn<T>.convertToDouble(): DataColumn<Double> = conve
 public fun <T : Any> DataColumn<T?>.convertToDouble(): DataColumn<Double?> = convertTo()
 
 /** Parses a String column to Double considering locale (number format).
- * If [locale] parameter is defined, it's number format is used for parsing.
- * If [locale] parameter is null, the current system locale is used.
- * If the column cannot be parsed, then the POSIX format is used. */
+ *
+ * If any of the parameters is `null`, the global default (in [DataFrame.parser][org.jetbrains.kotlinx.dataframe.DataFrame.Companion.parser]) is used.
+ *
+ * @param locale If defined, its number format is used for parsing.
+ *   The default in [DataFrame.parser][org.jetbrains.kotlinx.dataframe.DataFrame.Companion.parser] is the system locale.
+ *   If the column cannot be parsed, the POSIX format is used. */
 @JvmName("convertToDoubleFromString")
 public fun DataColumn<String>.convertToDouble(locale: Locale? = null): DataColumn<Double> =
-    convertToDouble(locale = locale, useFastDoubleParser = false)
+    convertToDouble(locale = locale, nullStrings = null, useFastDoubleParser = null)
 
 /**
  * Parses a String column to Double considering locale (number format).
- * If [locale] parameter is defined, it's number format is used for parsing.
- * If [locale] parameter is null, the current system locale is used.
- * If the column cannot be parsed, then the POSIX format is used.
- * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
+ *
+ * If any of the parameters is `null`, the global default (in [DataFrame.parser][org.jetbrains.kotlinx.dataframe.DataFrame.Companion.parser]) is used.
+ *
+ * @param locale If defined, its number format is used for parsing.
+ *   The default in [DataFrame.parser][org.jetbrains.kotlinx.dataframe.DataFrame.Companion.parser] is the system locale.
+ *   If the column cannot be parsed, the POSIX format is used.
+ * @param nullStrings a set of strings that should be treated as `null` values.
+ *   The default in [DataFrame.parser][DataFrame.Companion.parser] is `["null", "NULL", "NA", "N/A"]`.
+ * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser.
+ *   The default in [DataFrame.parser][DataFrame.Companion.parser] is `false` for now.
  */
 @JvmName("convertToDoubleFromString")
 public fun DataColumn<String>.convertToDouble(
     locale: Locale? = null,
-    useFastDoubleParser: Boolean,
-): DataColumn<Double> = this.castToNullable().convertToDouble(locale, useFastDoubleParser).castToNotNullable()
+    nullStrings: Set<String>?,
+    useFastDoubleParser: Boolean?,
+): DataColumn<Double> =
+    this.castToNullable().convertToDouble(locale, nullStrings, useFastDoubleParser).castToNotNullable()
 
 /** Parses a String column to Double considering locale (number format).
- * If [locale] parameter is defined, it's number format is used for parsing.
- * If [locale] parameter is null, the current system locale is used.
- * If the column cannot be parsed, then the POSIX format is used. */
+ *
+ * If any of the parameters is `null`, the global default (in [DataFrame.parser][org.jetbrains.kotlinx.dataframe.DataFrame.Companion.parser]) is used.
+ *
+ * @param locale If defined, its number format is used for parsing.
+ *   The default in [DataFrame.parser][org.jetbrains.kotlinx.dataframe.DataFrame.Companion.parser] is the system locale.
+ *   If the column cannot be parsed, the POSIX format is used. */
 @JvmName("convertToDoubleFromStringNullable")
 public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColumn<Double?> =
-    convertToDouble(locale = locale, useFastDoubleParser = false)
+    convertToDouble(locale = locale, nullStrings = null, useFastDoubleParser = null)
 
 /**
  * Parses a String column to Double considering locale (number format).
- * If [locale] parameter is defined, it's number format is used for parsing.
- * If [locale] parameter is null, the current system locale is used.
- * If the column cannot be parsed, then the POSIX format is used.
- * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
+ *
+ * If any of the parameters is `null`, the global default (in [DataFrame.parser][org.jetbrains.kotlinx.dataframe.DataFrame.Companion.parser]) is used.
+ *
+ * @param locale If defined, its number format is used for parsing.
+ *   The default in [DataFrame.parser][org.jetbrains.kotlinx.dataframe.DataFrame.Companion.parser] is the system locale.
+ *   If the column cannot be parsed, the POSIX format is used.
+ * @param nullStrings a set of strings that should be treated as `null` values.
+ *   The default in [DataFrame.parser][DataFrame.Companion.parser] is `["null", "NULL", "NA", "N/A"]`.
+ * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser.
+ *   The default in [DataFrame.parser][DataFrame.Companion.parser] is `false` for now.
  */
 @JvmName("convertToDoubleFromStringNullable")
 public fun DataColumn<String?>.convertToDouble(
     locale: Locale? = null,
-    useFastDoubleParser: Boolean,
-): DataColumn<Double?> {
-    fun applyParser(parser: (String) -> Double?): DataColumn<Double?> {
-        var currentRow = 0
-        try {
-            return mapIndexed { row, value ->
-                currentRow = row
-                value?.let {
-                    parser(value.trim()) ?: throw TypeConversionException(
-                        value = value,
-                        from = typeOf<String>(),
-                        to = typeOf<Double>(),
-                        column = path,
-                    )
-                }
-            }
-        } catch (e: TypeConversionException) {
-            throw CellConversionException(e.value, e.from, e.to, path, currentRow, e)
-        }
-    }
-
-    return if (locale != null) {
-        val explicitParser = Parsers.getDoubleParser(locale, useFastDoubleParser)
-        applyParser(explicitParser)
-    } else {
-        try {
-            val defaultParser = Parsers.getDoubleParser(useFastDoubleParser = useFastDoubleParser)
-            applyParser(defaultParser)
-        } catch (e: TypeConversionException) {
-            val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8"), useFastDoubleParser)
-            applyParser(posixParser)
-        }
-    }
-}
+    nullStrings: Set<String>?,
+    useFastDoubleParser: Boolean?,
+): DataColumn<Double?> =
+    convertToDoubleImpl(
+        locale = locale,
+        nullStrings = nullStrings,
+        useFastDoubleParser = useFastDoubleParser,
+    )
 
 @JvmName("convertToFloatFromT")
 public fun <T : Any> DataColumn<T>.convertToFloat(): DataColumn<Float> = convertTo()

diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt
@@ -10,14 +10,25 @@ import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
 import org.jetbrains.kotlinx.dataframe.impl.api.StringParser
 import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl
 import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl
+import org.jetbrains.kotlinx.dataframe.io.readCSV
 import org.jetbrains.kotlinx.dataframe.typeClass
 import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS
 import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS_COPY
 import java.time.format.DateTimeFormatter
 import java.util.Locale
 import kotlin.reflect.KProperty
+import kotlin.reflect.KType
 
-public val DataFrame.Companion.parser: GlobalParserOptions get() = Parsers
+/**
+ * ### Global Parser Options
+ *
+ * These options are used to configure how [DataColumns][DataColumn] of type [String] or [String?][String]
+ * should be parsed.
+ * You can always pass a [ParserOptions] object to functions that perform parsing, like [tryParse], [parse],
+ * or even [DataFrame.readCSV][DataFrame.Companion.readCSV] to override these options.
+ */
+public val DataFrame.Companion.parser: GlobalParserOptions
+    get() = Parsers
 
 public fun <T> DataFrame<T>.parse(options: ParserOptions? = null, columns: ColumnsSelector<T, Any?>): DataFrame<T> =
     parseImpl(options, columns)
@@ -37,14 +48,33 @@ public interface GlobalParserOptions {
 
     public fun addNullString(str: String)
 
+    /** This function can be called to skip some types. Parsing will be attempted for all other types. */
+    public fun addSkipType(type: KType)
+
+    /** Whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. */
+    public var useFastDoubleParser: Boolean
+
     public fun resetToDefault()
 
     public var locale: Locale
+
+    public val nulls: Set<String>
+
+    public val skipTypes: Set<KType>
 }
 
 /**
  * ### Options for parsing [String]`?` columns
  *
+ * These options are used to configure how [DataColumn]s of type [String] or [String?][String] should be parsed.
+ * They can be passed to [tryParse] and [parse] functions.
+ *
+ * You can also use the [DataFrame.parser][DataFrame.Companion.parser] property to access and modify
+ * the global parser configuration.
+ *
+ * If any of the arguments in [ParserOptions] are `null` (or [ParserOptions] itself is `null`),
+ * the global configuration will be queried.
+ *
  * @param locale locale to use for parsing dates and numbers, defaults to the System default locale.
  *   If specified instead of [dateTimeFormatter], it will be used in combination with [dateTimePattern]
  *   to create a [DateTimeFormatter]. Just providing [locale] will not allow you to parse
@@ -55,16 +85,19 @@ public interface GlobalParserOptions {
  * @param dateTimePattern a pattern to use for parsing dates. If specified instead of [dateTimeFormatter],
  *   it will be used to create a [DateTimeFormatter].
  * @param nullStrings a set of strings that should be treated as `null` values. By default, it's
- *   ["null", "NULL", "NA", "N/A"].
+ *   `["null", "NULL", "NA", "N/A"]`.
+ * @param skipTypes a set of types that should be skipped during parsing. Parsing will be attempted for all other types.
+ *   By default, it's an empty set. To skip all types except a specified one, use [convertTo] instead.
  * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
  */
-public data class ParserOptions(
-    val locale: Locale? = null,
+public class ParserOptions(
+    public val locale: Locale? = null,
     // TODO, migrate to kotlinx.datetime.format.DateTimeFormat? https://github.com/Kotlin/dataframe/issues/876
-    val dateTimeFormatter: DateTimeFormatter? = null,
-    val dateTimePattern: String? = null,
-    val nullStrings: Set<String>? = null,
-    val useFastDoubleParser: Boolean = false,
+    public val dateTimeFormatter: DateTimeFormatter? = null,
+    public val dateTimePattern: String? = null,
+    public val nullStrings: Set<String>? = null,
+    public val skipTypes: Set<KType>? = null,
+    public val useFastDoubleParser: Boolean? = null,
 ) {
 
     /** For binary compatibility. */
@@ -82,7 +115,8 @@ public data class ParserOptions(
         dateTimeFormatter = dateTimeFormatter,
         dateTimePattern = dateTimePattern,
         nullStrings = nullStrings,
-        useFastDoubleParser = false,
+        skipTypes = null,
+        useFastDoubleParser = null,
     )
 
     /** For binary compatibility. */
@@ -101,6 +135,7 @@ public data class ParserOptions(
             dateTimeFormatter = dateTimeFormatter,
             dateTimePattern = dateTimePattern,
             nullStrings = nullStrings,
+            skipTypes = skipTypes,
             useFastDoubleParser = useFastDoubleParser,
         )
 
@@ -111,6 +146,52 @@ public data class ParserOptions(
             dateTimePattern != null -> DateTimeFormatter.ofPattern(dateTimePattern)
             else -> null
         }
+
+    public fun copy(
+        locale: Locale? = this.locale,
+        dateTimeFormatter: DateTimeFormatter? = this.dateTimeFormatter,
+        dateTimePattern: String? = this.dateTimePattern,
+        nullStrings: Set<String>? = this.nullStrings,
+        skipTypes: Set<KType>? = this.skipTypes,
+        useFastDoubleParser: Boolean? = this.useFastDoubleParser,
+    ): ParserOptions =
+        ParserOptions(
+            locale = locale,
+            dateTimeFormatter = dateTimeFormatter,
+            dateTimePattern = dateTimePattern,
+            nullStrings = nullStrings,
+            skipTypes = skipTypes,
+            useFastDoubleParser = useFastDoubleParser,
+        )
+
+    override fun equals(other: Any?): Boolean {
+        if (this === other) return true
+        if (javaClass != other?.javaClass) return false
+
+        other as ParserOptions
+
+        if (useFastDoubleParser != other.useFastDoubleParser) return false
+        if (locale != other.locale) return false
+        if (dateTimeFormatter != other.dateTimeFormatter) return false
+        if (dateTimePattern != other.dateTimePattern) return false
+        if (nullStrings != other.nullStrings) return false
+        if (skipTypes != other.skipTypes) return false
+
+        return true
+    }
+
+    override fun hashCode(): Int {
+        var result = useFastDoubleParser?.hashCode() ?: 0
+        result = 31 * result + (locale?.hashCode() ?: 0)
+        result = 31 * result + (dateTimeFormatter?.hashCode() ?: 0)
+        result = 31 * result + (dateTimePattern?.hashCode() ?: 0)
+        result = 31 * result + (nullStrings?.hashCode() ?: 0)
+        result = 31 * result + (skipTypes?.hashCode() ?: 0)
+        return result
+    }
+
+    override fun toString(): String =
+        "ParserOptions(locale=$locale, dateTimeFormatter=$dateTimeFormatter, dateTimePattern=$dateTimePattern, nullStrings=$nullStrings, skipTypes=$skipTypes, useFastDoubleParser=$useFastDoubleParser)"
 }
 
 /** Tries to parse a column of strings into a column of a different type.

diff --git a/...rated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/ColumnNameGenerator.kt b/...rated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/ColumnNameGenerator.kt
@@ -2,13 +2,13 @@ package org.jetbrains.kotlinx.dataframe.impl
 
 import org.jetbrains.kotlinx.dataframe.AnyFrame
 
-internal class ColumnNameGenerator(columnNames: List<String> = emptyList()) {
+public class ColumnNameGenerator(columnNames: List<String> = emptyList()) {
 
     private val usedNames = columnNames.toMutableSet()
 
     private val colNames = columnNames.toMutableList()
 
-    fun addUnique(preferredName: String): String {
+    public fun addUnique(preferredName: String): String {
         var name = preferredName
         var k = 1
         while (usedNames.contains(name)) {
@@ -19,17 +19,17 @@ internal class ColumnNameGenerator(columnNames: List<String> = emptyList()) {
         return name
     }
 
-    fun addIfAbsent(name: String) {
+    public fun addIfAbsent(name: String) {
         if (!usedNames.contains(name)) {
             usedNames.add(name)
             colNames.add(name)
         }
     }
 
-    val names: List<String>
+    public val names: List<String>
         get() = colNames
 
-    fun contains(name: String) = usedNames.contains(name)
+    public operator fun contains(name: String): Boolean = usedNames.contains(name)
 }
 
 internal fun AnyFrame.nameGenerator() = ColumnNameGenerator(columnNames())
-Original file line number
+Diff line change
@@ Expand Up / @@ -47,6 +47,7 @@ public enum class DataSchemaVisibility { @@
         EXPLICIT_PUBLIC,
     }
+    // TODO add more options
     public annotation class CsvOptions(public val delimiter: Char)
     /**
@@ Expand Down @@