Skip to content

Commit

Permalink
Merge pull request #935 from Kotlin/fast-double-parser
Browse files Browse the repository at this point in the history
Fast double parser
  • Loading branch information
Jolanrensen authored Oct 31, 2024
2 parents 6aab5fa + 0d6083b commit 903f58b
Show file tree
Hide file tree
Showing 11 changed files with 655 additions and 80 deletions.
23 changes: 21 additions & 2 deletions core/api/core.api
Original file line number Diff line number Diff line change
Expand Up @@ -3817,9 +3817,13 @@ public final class org/jetbrains/kotlinx/dataframe/api/ConvertKt {
public static final fun convertToByteFromT (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToDouble (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToDoubleFromString (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToDoubleFromString (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;Z)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun convertToDoubleFromString$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun convertToDoubleFromString$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToDoubleFromStringNullable (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToDoubleFromStringNullable (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;Z)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun convertToDoubleFromStringNullable$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun convertToDoubleFromStringNullable$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToDoubleFromT (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToFloat (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun convertToFloatFromT (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
Expand Down Expand Up @@ -6482,19 +6486,25 @@ public final class org/jetbrains/kotlinx/dataframe/api/ParseKt {

public final class org/jetbrains/kotlinx/dataframe/api/ParserOptions {
public fun <init> ()V
public fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)V
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)V
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ILkotlin/jvm/internal/DefaultConstructorMarker;)V
public fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Z)V
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ZILkotlin/jvm/internal/DefaultConstructorMarker;)V
public final fun component1 ()Ljava/util/Locale;
public final fun component2 ()Ljava/time/format/DateTimeFormatter;
public final fun component3 ()Ljava/lang/String;
public final fun component4 ()Ljava/util/Set;
public final fun copy (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
public final fun component5 ()Z
public final synthetic fun copy (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
public final fun copy (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Z)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
public fun equals (Ljava/lang/Object;)Z
public final fun getDateTimeFormatter ()Ljava/time/format/DateTimeFormatter;
public final fun getDateTimePattern ()Ljava/lang/String;
public final fun getLocale ()Ljava/util/Locale;
public final fun getNullStrings ()Ljava/util/Set;
public final fun getUseFastDoubleParser ()Z
public fun hashCode ()I
public fun toString ()Ljava/lang/String;
}
Expand Down Expand Up @@ -10198,6 +10208,15 @@ public final class org/jetbrains/kotlinx/dataframe/impl/columns/UtilsKt {
public static final fun asAnyFrameColumn (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/columns/FrameColumn;
}

public final class org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser {
public fun <init> (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)V
public final fun parseOrNull (Ljava/lang/CharSequence;)Ljava/lang/Double;
public final fun parseOrNull ([BIILjava/nio/charset/Charset;)Ljava/lang/Double;
public final fun parseOrNull ([CII)Ljava/lang/Double;
public static synthetic fun parseOrNull$default (Lorg/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser;[BIILjava/nio/charset/Charset;ILjava/lang/Object;)Ljava/lang/Double;
public static synthetic fun parseOrNull$default (Lorg/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser;[CIIILjava/lang/Object;)Ljava/lang/Double;
}

public final class org/jetbrains/kotlinx/dataframe/impl/schema/DataFrameSchemaImpl : org/jetbrains/kotlinx/dataframe/schema/DataFrameSchema {
public fun <init> (Ljava/util/Map;)V
public fun compare (Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema;)Lorg/jetbrains/kotlinx/dataframe/schema/CompareResult;
Expand Down
1 change: 1 addition & 0 deletions core/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ dependencies {
implementation(libs.commonsIo)
implementation(libs.serialization.core)
implementation(libs.serialization.json)
implementation(libs.fastDoubleParser)

implementation(libs.fuel)

Expand Down
43 changes: 33 additions & 10 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME
import org.jetbrains.kotlinx.dataframe.dataTypes.IMG
import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources
import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
Expand Down Expand Up @@ -185,21 +186,43 @@ public fun <T : Any> DataColumn<T>.convertToDouble(): DataColumn<Double> = conve
public fun <T : Any> DataColumn<T?>.convertToDouble(): DataColumn<Double?> = convertTo()

/**
* Parse String column to Double considering locale (number format).
* Parses a String column to Double considering locale (number format).
* If [locale] parameter is defined, its number format is used for parsing.
* If [locale] parameter is null, the current system locale is used. If column can not be parsed, then POSIX format is used.
* If [locale] parameter is null, the current system locale is used.
* If the column cannot be parsed, then the POSIX format is used.
*/
@ExcludeFromSources
private interface DataColumnStringConvertToDoubleDoc

/** @include [DataColumnStringConvertToDoubleDoc] */
@JvmName("convertToDoubleFromString")
public fun DataColumn<String>.convertToDouble(locale: Locale? = null): DataColumn<Double> =
this.castToNullable().convertToDouble(locale).castToNotNullable()
convertToDouble(locale = locale, useFastDoubleParser = false)

/**
* Parse String column to Double considering locale (number format).
* If [locale] parameter is defined, its number format is used for parsing.
* If [locale] parameter is null, the current system locale is used. If column can not be parsed, then POSIX format is used.
* @include [DataColumnStringConvertToDoubleDoc]
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
*/
@JvmName("convertToDoubleFromString")
public fun DataColumn<String>.convertToDouble(
    locale: Locale? = null,
    useFastDoubleParser: Boolean,
): DataColumn<Double> {
    // Widen to a nullable column so the String? overload performs the actual parsing,
    // then narrow the result back to a non-null column.
    val nullableSource = this.castToNullable()
    val parsed = nullableSource.convertToDouble(locale, useFastDoubleParser)
    return parsed.castToNotNullable()
}

/** @include [DataColumnStringConvertToDoubleDoc] */
@JvmName("convertToDoubleFromStringNullable")
public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColumn<Double?> =
convertToDouble(locale = locale, useFastDoubleParser = false)

/**
* @include [DataColumnStringConvertToDoubleDoc]
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
*/
@JvmName("convertToDoubleFromStringNullable")
public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColumn<Double?> {
public fun DataColumn<String?>.convertToDouble(
locale: Locale? = null,
useFastDoubleParser: Boolean,
): DataColumn<Double?> {
fun applyParser(parser: (String) -> Double?): DataColumn<Double?> {
var currentRow = 0
try {
Expand All @@ -220,14 +243,14 @@ public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColu
}

return if (locale != null) {
val explicitParser = Parsers.getDoubleParser(locale)
val explicitParser = Parsers.getDoubleParser(locale, useFastDoubleParser)
applyParser(explicitParser)
} else {
try {
val defaultParser = Parsers.getDoubleParser()
val defaultParser = Parsers.getDoubleParser(useFastDoubleParser = useFastDoubleParser)
applyParser(defaultParser)
} catch (e: TypeConversionException) {
val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8"))
val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8"), useFastDoubleParser)
applyParser(posixParser)
}
}
Expand Down
57 changes: 57 additions & 0 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import org.jetbrains.kotlinx.dataframe.impl.api.StringParser
import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl
import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl
import org.jetbrains.kotlinx.dataframe.typeClass
import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS
import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS_COPY
import java.time.format.DateTimeFormatter
import java.util.Locale
import kotlin.reflect.KProperty
Expand Down Expand Up @@ -40,13 +42,68 @@ public interface GlobalParserOptions {
public var locale: Locale
}

/**
* ### Options for parsing [String]`?` columns
*
* @param locale locale to use for parsing dates and numbers, defaults to the System default locale.
* If specified instead of [dateTimeFormatter], it will be used in combination with [dateTimePattern]
* to create a [DateTimeFormatter]. Just providing [locale] will not allow you to parse
* locale-specific dates!
* @param dateTimeFormatter a [DateTimeFormatter] to use for parsing dates, if not specified, it will be created
* from [dateTimePattern] and [locale]. If neither [dateTimeFormatter] nor [dateTimePattern] are specified,
* [DateTimeFormatter.ISO_LOCAL_DATE_TIME] will be used.
* @param dateTimePattern a pattern to use for parsing dates. If specified instead of [dateTimeFormatter],
* it will be used to create a [DateTimeFormatter].
* @param nullStrings a set of strings that should be treated as `null` values. By default, it's
* ["null", "NULL", "NA", "N/A"].
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
*/
public data class ParserOptions(
val locale: Locale? = null,
// TODO, migrate to kotlinx.datetime.format.DateTimeFormat? https://github.com/Kotlin/dataframe/issues/876
val dateTimeFormatter: DateTimeFormatter? = null,
val dateTimePattern: String? = null,
val nullStrings: Set<String>? = null,
val useFastDoubleParser: Boolean = false,
) {

/** For binary compatibility. */
@Deprecated(
message = PARSER_OPTIONS,
level = DeprecationLevel.HIDDEN,
)
public constructor(
locale: Locale? = null,
dateTimeFormatter: DateTimeFormatter? = null,
dateTimePattern: String? = null,
nullStrings: Set<String>? = null,
) : this(
locale = locale,
dateTimeFormatter = dateTimeFormatter,
dateTimePattern = dateTimePattern,
nullStrings = nullStrings,
useFastDoubleParser = false,
)

/** For binary compatibility. */
@Deprecated(
message = PARSER_OPTIONS_COPY,
level = DeprecationLevel.HIDDEN,
)
public fun copy(
locale: Locale? = this.locale,
dateTimeFormatter: DateTimeFormatter? = this.dateTimeFormatter,
dateTimePattern: String? = this.dateTimePattern,
nullStrings: Set<String>? = this.nullStrings,
): ParserOptions =
ParserOptions(
locale = locale,
dateTimeFormatter = dateTimeFormatter,
dateTimePattern = dateTimePattern,
nullStrings = nullStrings,
useFastDoubleParser = useFastDoubleParser,
)

internal fun getDateTimeFormatter(): DateTimeFormatter? =
when {
dateTimeFormatter != null -> dateTimeFormatter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,31 @@ import kotlin.annotation.AnnotationTarget.VALUE_PARAMETER
* {@include [Indent]}
*
*/
@ExcludeFromSources
internal interface LineBreak

// Doc-only spacing markers: the KDoc of each interface below is nothing but a run of
// non-breaking spaces (1, 2, 4, 8, 12 and 16 respectively), so the KDoc text itself is the payload.
// NOTE(review): presumably these are pulled into other KDocs via `{@include [...]}` and
// @ExcludeFromSources keeps them out of the published sources - confirm against the doc preprocessor.
/** &nbsp; */
@ExcludeFromSources
internal interface QuarterIndent

/** &nbsp;&nbsp; */
@ExcludeFromSources
internal interface HalfIndent

/** &nbsp;&nbsp;&nbsp;&nbsp; */
@ExcludeFromSources
internal interface Indent

/** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; */
@ExcludeFromSources
internal interface DoubleIndent

/** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; */
@ExcludeFromSources
internal interface TripleIndent

/** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; */
@ExcludeFromSources
internal interface QuadrupleIndent

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,13 @@ import org.jetbrains.kotlinx.dataframe.hasNulls
import org.jetbrains.kotlinx.dataframe.impl.canParse
import org.jetbrains.kotlinx.dataframe.impl.catchSilent
import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType
import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser
import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse
import org.jetbrains.kotlinx.dataframe.io.isURL
import org.jetbrains.kotlinx.dataframe.io.readJsonStr
import org.jetbrains.kotlinx.dataframe.values
import java.math.BigDecimal
import java.net.URL
import java.text.NumberFormat
import java.text.ParsePosition
import java.time.format.DateTimeFormatter
import java.time.format.DateTimeFormatterBuilder
Expand Down Expand Up @@ -275,29 +275,6 @@ internal object Parsers : GlobalParserOptions {
null
}

/**
 * Parses this string as a [Double] using [format].
 *
 * The tokens `NaN`, `Inf`, `-Inf`, `Infinity` and `-Infinity` are recognised case-insensitively
 * before [format] is consulted. Any other input is handed to [format] and is accepted only when
 * the whole string was consumed.
 *
 * @param format number format used for everything that is not one of the special tokens.
 * @return the parsed value, or `null` when the string is not fully parseable as a number.
 */
private fun String.parseDouble(format: NumberFormat): Double? =
    // Case-fold with Locale.ROOT rather than the default locale: under e.g. a Turkish default
    // locale "inf" upper-cases to "İNF" (dotted capital I) and would no longer match the tokens.
    when (uppercase(Locale.ROOT)) {
        "NAN" -> Double.NaN

        "INF", "INFINITY" -> Double.POSITIVE_INFINITY

        "-INF", "-INFINITY" -> Double.NEGATIVE_INFINITY

        else -> {
            val parsePosition = ParsePosition(0)
            val parsed: Double? = format.parse(this, parsePosition)?.toDouble()
            // NumberFormat stops at the first character it cannot consume; a partial parse
            // (e.g. "1.5x") must be rejected rather than silently truncated.
            if (parsePosition.index == length) parsed else null
        }
    }

inline fun <reified T : Any> stringParser(
catch: Boolean = false,
coveredBy: Set<KType> = emptySet(),
Expand All @@ -317,11 +294,15 @@ internal object Parsers : GlobalParserOptions {
): StringParserWithFormat<T> = StringParserWithFormat(typeOf<T>(), coveredBy, body)

private val parserToDoubleWithOptions = stringParserWithOptions { options ->
val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault())
val parser = { it: String -> it.parseDouble(numberFormat) }
val fastDoubleParser = FastDoubleParser(options ?: ParserOptions())
val parser = { it: String -> fastDoubleParser.parseOrNull(it) }
parser
}

// Single shared parser built once from options whose locale is the "C.UTF-8" language tag;
// used by the "Double, with POSIX format" entry of the parser list.
private val posixDoubleParser: FastDoubleParser =
    ParserOptions(locale = Locale.forLanguageTag("C.UTF-8"))
        .let { posixOptions -> FastDoubleParser(posixOptions) }

internal val parsersOrder = listOf(
// Int
stringParser<Int> { it.toIntOrNull() },
Expand Down Expand Up @@ -384,7 +365,7 @@ internal object Parsers : GlobalParserOptions {
// Double, with explicit number format or taken from current locale
parserToDoubleWithOptions,
// Double, with POSIX format
stringParser<Double> { it.parseDouble(NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))) },
stringParser<Double> { posixDoubleParser.parseOrNull(it) },
// Boolean
stringParser<Boolean> { it.toBooleanOrNull() },
// BigDecimal
Expand Down Expand Up @@ -449,9 +430,9 @@ internal object Parsers : GlobalParserOptions {
return parser.applyOptions(options)
}

internal fun getDoubleParser(locale: Locale? = null): (String) -> Double? {
internal fun getDoubleParser(locale: Locale? = null, useFastDoubleParser: Boolean): (String) -> Double? {
val options = if (locale != null) {
ParserOptions(locale = locale)
ParserOptions(locale = locale, useFastDoubleParser = useFastDoubleParser)
} else {
null
}
Expand Down
Loading

0 comments on commit 903f58b

Please sign in to comment.