diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index aa3aacd82f..7196695704 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -184,22 +184,47 @@ public fun DataColumn.convertToDouble(): DataColumn = conve public fun DataColumn.convertToDouble(): DataColumn = convertTo() +/** Parses a String column to Double considering locale (number format). + * If [locale] parameter is defined, it's number format is used for parsing. + * If [locale] parameter is null, the current system locale is used. + * If the column cannot be parsed, then the POSIX format is used. */ +@JvmName("convertToDoubleFromString") +public fun DataColumn.convertToDouble(locale: Locale? = null): DataColumn = + convertToDouble(locale = locale, useFastDoubleParser = false) + /** - * Parse String column to Double considering locale (number format). + * Parses a String column to Double considering locale (number format). * If [locale] parameter is defined, it's number format is used for parsing. - * If [locale] parameter is null, the current system locale is used. If column can not be parsed, then POSIX format is used. + * If [locale] parameter is null, the current system locale is used. + * If the column cannot be parsed, then the POSIX format is used. + * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. */ @JvmName("convertToDoubleFromString") -public fun DataColumn.convertToDouble(locale: Locale? = null): DataColumn = - this.castToNullable().convertToDouble(locale).castToNotNullable() +public fun DataColumn.convertToDouble( + locale: Locale? = null, + useFastDoubleParser: Boolean, +): DataColumn = this.castToNullable().convertToDouble(locale, useFastDoubleParser).castToNotNullable() + +/** Parses a String column to Double considering locale (number format). + * If [locale] parameter is defined, it's number format is used for parsing. + * If [locale] parameter is null, the current system locale is used. + * If the column cannot be parsed, then the POSIX format is used. */ +@JvmName("convertToDoubleFromStringNullable") +public fun DataColumn.convertToDouble(locale: Locale? = null): DataColumn = + convertToDouble(locale = locale, useFastDoubleParser = false) /** - * Parse String column to Double considering locale (number format). + * Parses a String column to Double considering locale (number format). * If [locale] parameter is defined, it's number format is used for parsing. - * If [locale] parameter is null, the current system locale is used. If column can not be parsed, then POSIX format is used. + * If [locale] parameter is null, the current system locale is used. + * If the column cannot be parsed, then the POSIX format is used. + * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. */ @JvmName("convertToDoubleFromStringNullable") -public fun DataColumn.convertToDouble(locale: Locale? = null): DataColumn { +public fun DataColumn.convertToDouble( + locale: Locale? = null, + useFastDoubleParser: Boolean, +): DataColumn { fun applyParser(parser: (String) -> Double?): DataColumn { var currentRow = 0 try { @@ -220,14 +245,14 @@ public fun DataColumn.convertToDouble(locale: Locale? = null): DataColu } return if (locale != null) { - val explicitParser = Parsers.getDoubleParser(locale) + val explicitParser = Parsers.getDoubleParser(locale, useFastDoubleParser) applyParser(explicitParser) } else { try { - val defaultParser = Parsers.getDoubleParser() + val defaultParser = Parsers.getDoubleParser(useFastDoubleParser = useFastDoubleParser) applyParser(defaultParser) } catch (e: TypeConversionException) { - val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8")) + val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8"), useFastDoubleParser) applyParser(posixParser) } } diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index b216c7d1c9..f2a065faa3 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -11,6 +11,8 @@ import org.jetbrains.kotlinx.dataframe.impl.api.StringParser import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl import org.jetbrains.kotlinx.dataframe.typeClass +import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS +import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS_COPY import java.time.format.DateTimeFormatter import java.util.Locale import kotlin.reflect.KProperty @@ -40,13 +42,68 @@ public interface GlobalParserOptions { public var locale: Locale } +/** + * ### Options for parsing [String]`?` columns + * + * @param locale locale to use for parsing dates and numbers, defaults to the System default locale. + * If specified instead of [dateTimeFormatter], it will be used in combination with [dateTimePattern] + * to create a [DateTimeFormatter]. Just providing [locale] will not allow you to parse + * locale-specific dates! + * @param dateTimeFormatter a [DateTimeFormatter] to use for parsing dates, if not specified, it will be created + * from [dateTimePattern] and [locale]. If neither [dateTimeFormatter] nor [dateTimePattern] are specified, + * [DateTimeFormatter.ISO_LOCAL_DATE_TIME] will be used. + * @param dateTimePattern a pattern to use for parsing dates. If specified instead of [dateTimeFormatter], + * it will be used to create a [DateTimeFormatter]. + * @param nullStrings a set of strings that should be treated as `null` values. By default, it's + * ["null", "NULL", "NA", "N/A"]. + * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. + */ public data class ParserOptions( val locale: Locale? = null, // TODO, migrate to kotlinx.datetime.format.DateTimeFormat? https://github.com/Kotlin/dataframe/issues/876 val dateTimeFormatter: DateTimeFormatter? = null, val dateTimePattern: String? = null, val nullStrings: Set? = null, + val useFastDoubleParser: Boolean = false, ) { + + /** For binary compatibility. */ + @Deprecated( + message = PARSER_OPTIONS, + level = DeprecationLevel.HIDDEN, + ) + public constructor( + locale: Locale? = null, + dateTimeFormatter: DateTimeFormatter? = null, + dateTimePattern: String? = null, + nullStrings: Set? = null, + ) : this( + locale = locale, + dateTimeFormatter = dateTimeFormatter, + dateTimePattern = dateTimePattern, + nullStrings = nullStrings, + useFastDoubleParser = false, + ) + + /** For binary compatibility. */ + @Deprecated( + message = PARSER_OPTIONS_COPY, + level = DeprecationLevel.HIDDEN, + ) + public fun copy( + locale: Locale? = this.locale, + dateTimeFormatter: DateTimeFormatter? = this.dateTimeFormatter, + dateTimePattern: String? = this.dateTimePattern, + nullStrings: Set? = this.nullStrings, + ): ParserOptions = + ParserOptions( + locale = locale, + dateTimeFormatter = dateTimeFormatter, + dateTimePattern = dateTimePattern, + nullStrings = nullStrings, + useFastDoubleParser = useFastDoubleParser, + ) + internal fun getDateTimeFormatter(): DateTimeFormatter? = when { dateTimeFormatter != null -> dateTimeFormatter diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/utils.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/utils.kt index 7d9563405b..abd1d9b500 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/utils.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/utils.kt @@ -14,31 +14,6 @@ import kotlin.annotation.AnnotationTarget.TYPE import kotlin.annotation.AnnotationTarget.TYPEALIAS import kotlin.annotation.AnnotationTarget.VALUE_PARAMETER -/** - * - *      - * - */ -internal interface LineBreak - -/**   */ -internal interface QuarterIndent - -/**    */ -internal interface HalfIndent - -/**      */ -internal interface Indent - -/**          */ -internal interface DoubleIndent - -/**              */ -internal interface TripleIndent - -/**                  */ -internal interface QuadrupleIndent - /** * Any `Documentable` annotated with this annotation will be excluded from the generated sources by * the documentation processor. diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index be68050557..99b4ec7caf 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -34,13 +34,13 @@ import org.jetbrains.kotlinx.dataframe.hasNulls import org.jetbrains.kotlinx.dataframe.impl.canParse import org.jetbrains.kotlinx.dataframe.impl.catchSilent import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType +import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse import org.jetbrains.kotlinx.dataframe.io.isURL import org.jetbrains.kotlinx.dataframe.io.readJsonStr import org.jetbrains.kotlinx.dataframe.values import java.math.BigDecimal import java.net.URL -import java.text.NumberFormat import java.text.ParsePosition import java.time.format.DateTimeFormatter import java.time.format.DateTimeFormatterBuilder @@ -275,29 +275,6 @@ internal object Parsers : GlobalParserOptions { null } - private fun String.parseDouble(format: NumberFormat) = - when (uppercase(Locale.getDefault())) { - "NAN" -> Double.NaN - - "INF" -> Double.POSITIVE_INFINITY - - "-INF" -> Double.NEGATIVE_INFINITY - - "INFINITY" -> Double.POSITIVE_INFINITY - - "-INFINITY" -> Double.NEGATIVE_INFINITY - - else -> { - val parsePosition = ParsePosition(0) - val result: Double? = format.parse(this, parsePosition)?.toDouble() - if (parsePosition.index != this.length) { - null - } else { - result - } - } - } - inline fun stringParser( catch: Boolean = false, coveredBy: Set = emptySet(), @@ -317,11 +294,15 @@ internal object Parsers : GlobalParserOptions { ): StringParserWithFormat = StringParserWithFormat(typeOf(), coveredBy, body) private val parserToDoubleWithOptions = stringParserWithOptions { options -> - val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault()) - val parser = { it: String -> it.parseDouble(numberFormat) } + val fastDoubleParser = FastDoubleParser(options ?: ParserOptions()) + val parser = { it: String -> fastDoubleParser.parseOrNull(it) } parser } + private val posixDoubleParser = FastDoubleParser( + ParserOptions(locale = Locale.forLanguageTag("C.UTF-8")), + ) + internal val parsersOrder = listOf( // Int stringParser { it.toIntOrNull() }, @@ -384,7 +365,7 @@ internal object Parsers : GlobalParserOptions { // Double, with explicit number format or taken from current locale parserToDoubleWithOptions, // Double, with POSIX format - stringParser { it.parseDouble(NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))) }, + stringParser { posixDoubleParser.parseOrNull(it) }, // Boolean stringParser { it.toBooleanOrNull() }, // BigDecimal @@ -449,9 +430,9 @@ internal object Parsers : GlobalParserOptions { return parser.applyOptions(options) } - internal fun getDoubleParser(locale: Locale? = null): (String) -> Double? { + internal fun getDoubleParser(locale: Locale? = null, useFastDoubleParser: Boolean): (String) -> Double? { val options = if (locale != null) { - ParserOptions(locale = locale) + ParserOptions(locale = locale, useFastDoubleParser = useFastDoubleParser) } else { null } diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt new file mode 100644 index 0000000000..54b584336b --- /dev/null +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt @@ -0,0 +1,263 @@ +package org.jetbrains.kotlinx.dataframe.impl.io + +import ch.randelshofer.fastdoubleparser.ConfigurableDoubleParser +import ch.randelshofer.fastdoubleparser.NumberFormatSymbols +import io.github.oshai.kotlinlogging.KotlinLogging +import org.jetbrains.kotlinx.dataframe.api.ParserOptions +import java.nio.charset.Charset +import java.text.DecimalFormatSymbols +import java.text.NumberFormat +import java.text.ParsePosition +import java.util.Locale + +private val logger = KotlinLogging.logger {} + +// (lowercase) strings that are recognized to represent infinity and NaN in doubles in all locales +private val INFINITIES = arrayOf("∞", "inf", "infinity", "infty") +private val PLUS_INFINITIES = INFINITIES.map { "+$it" } +private val MINUS_INFINITIES = INFINITIES.map { "-$it" } +private val NANS = arrayOf("nan", "na", "n/a") + +/** + * Parses a [String]/[CharSequence], [CharArray], or [ByteArray] into a [Double]. + * + * If [ParserOptions.useFastDoubleParser] is enabled, it will try to parse the input with an _EXPERIMENTAL_ + * fast double parser, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser). + * If not, or if it fails, it will use [NumberFormat] to parse the input. + * + * Public, so it can be used in other modules. + * + * @param parserOptions can be supplied to configure the parser. + * We'll only use [ParserOptions.locale] and [ParserOptions.useFastDoubleParser]. + */ +@Suppress("ktlint:standard:comment-wrapping") +public class FastDoubleParser(private val parserOptions: ParserOptions) { + + private val supportedFastCharsets = setOf(Charsets.UTF_8, Charsets.ISO_8859_1, Charsets.US_ASCII) + + private val locale = parserOptions.locale ?: Locale.getDefault() + private val fallbackLocale = Locale.ROOT + + private val localDecimalFormatSymbols = DecimalFormatSymbols.getInstance(locale) + private val fallbackDecimalFormatSymbols = DecimalFormatSymbols.getInstance(fallbackLocale) + + private val parser = ConfigurableDoubleParser(/* symbols = */ setupNumberFormatSymbols(), /* ignoreCase = */ true) + + // Fix for Java 8 RTL languages minus sign not being recognized + private val minusSignIsFormatSymbol = + Character.getType(localDecimalFormatSymbols.minusSign) == Character.FORMAT.toInt() + + /** + * Sets up the [NumberFormatSymbols] for the [ConfigurableDoubleParser] based on + * [localDecimalFormatSymbols] with fallbacks from [fallbackDecimalFormatSymbols]. + * + * Fallback characters/strings are only added if they're not clashing with local characters/strings. + */ + private fun setupNumberFormatSymbols(): NumberFormatSymbols { + // collect all chars and strings that are locale-specific such that we can check whether + // fallback chars and strings are safe to add + val localChars = with(localDecimalFormatSymbols) { + buildSet { + add(decimalSeparator.lowercaseChar()) + add(groupingSeparator.lowercaseChar()) + add(minusSign.lowercaseChar()) + add('+') + add(zeroDigit.lowercaseChar()) + } + } + val localStrings = with(localDecimalFormatSymbols) { + buildSet { + add(exponentSeparator.lowercase()) + add(infinity.lowercase()) + add(naN.lowercase()) + } + } + + /** + * Builds a set with the specified char from [localDecimalFormatSymbols] and + * its fallback char from [fallbackDecimalFormatSymbols] if it's safe to do so. + * [additionals] will be added to the set too, when they're safe to add. + */ + fun ((DecimalFormatSymbols) -> Char).fromLocalWithFallBack(vararg additionals: Char): Set = + buildSet { + val getChar = this@fromLocalWithFallBack + val char = getChar(localDecimalFormatSymbols).lowercaseChar() + add(char) + + // add fallback char if it's safe to do so + val fallbackChar = getChar(fallbackDecimalFormatSymbols).lowercaseChar() + if (fallbackChar !in localChars && !localStrings.any { fallbackChar in it }) { + add(fallbackChar) + } + + // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead. + if (char.isWhitespace()) add(' ') + + // add additional chars if needed + for (additional in additionals) { + val lowercase = additional.lowercaseChar() + if (lowercase !in localChars && !localStrings.any { lowercase in it }) { + add(lowercase) + } + } + } + + /** + * Builds a set with the specified string from [localDecimalFormatSymbols] and + * its fallback string from [fallbackDecimalFormatSymbols] if it's safe to do so. + * [additionals] will be added to the set too, when they're safe to add. + */ + fun ((DecimalFormatSymbols) -> String).fromLocalWithFallBack(vararg additionals: String): Set = + buildSet { + val getString = this@fromLocalWithFallBack + val string = getString(localDecimalFormatSymbols).lowercase() + add(string) + + // add fallback string if it's safe to do so + val fallbackString = getString(fallbackDecimalFormatSymbols).lowercase() + if (!fallbackString.any { it in localChars } && fallbackString !in localStrings) { + add(fallbackString) + } + + // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead. + if (string.isBlank()) add(" ") + + // add additional strings if needed + for (additional in additionals) { + val lowercase = additional.lowercase() + if (!lowercase.any { it in localChars } && lowercase !in localStrings) { + add(lowercase) + } + } + } + + return NumberFormatSymbols.fromDecimalFormatSymbols(localDecimalFormatSymbols) + .withPlusSign(setOf('+')) + .withDecimalSeparator(DecimalFormatSymbols::getDecimalSeparator.fromLocalWithFallBack()) + .withGroupingSeparator(DecimalFormatSymbols::getGroupingSeparator.fromLocalWithFallBack()) + .withExponentSeparator(DecimalFormatSymbols::getExponentSeparator.fromLocalWithFallBack()) + .withMinusSign(DecimalFormatSymbols::getMinusSign.fromLocalWithFallBack()) + .withInfinity(DecimalFormatSymbols::getInfinity.fromLocalWithFallBack(*INFINITIES)) + .withNaN(DecimalFormatSymbols::getNaN.fromLocalWithFallBack(*NANS)) + } + + /** Fallback method for parsing doubles. */ + private fun String.parseToDoubleOrNullFallback(): Double? = + when (lowercase()) { + in INFINITIES, in PLUS_INFINITIES -> Double.POSITIVE_INFINITY + + in MINUS_INFINITIES -> Double.NEGATIVE_INFINITY + + in NANS -> Double.NaN + + else -> { + // not thread safe; must be created here + val numberFormat = NumberFormat.getInstance(locale) + val parsePosition = ParsePosition(0) + val result = numberFormat.parse(this, parsePosition)?.toDouble() + if (parsePosition.index != this.length || parsePosition.errorIndex != -1) { + null + } else { + result + } + } + }.also { + if (it == null) { + logger.debug { "Could not parse '$this' as Double with NumberFormat with locale '$locale'." } + } + } + + /** + * Parses a double value from a substring of the specified byte array. + * + * It uses the [fast double parser][ConfigurableDoubleParser] if [ParserOptions.useFastDoubleParser] is enabled, + * else, or if that fails, it uses [parseToDoubleOrNullFallback]. + */ + public fun parseOrNull( + ba: ByteArray, + offset: Int = 0, + length: Int = ba.size, + charset: Charset = Charsets.UTF_8, + ): Double? { + if (parserOptions.useFastDoubleParser && charset in supportedFastCharsets) { + try { + // Fixes RTL minus sign not being recognized + if (minusSignIsFormatSymbol && ba.toString(charset).startsWith(localDecimalFormatSymbols.minusSign)) { + val localMinusSize = localDecimalFormatSymbols.minusSign.toString().toByteArray(charset).size + val fallbackMinusSize = fallbackDecimalFormatSymbols.minusSign.toString().toByteArray(charset).size + val newOffset = (localMinusSize - fallbackMinusSize).coerceAtLeast(0) + val newBa = ba.copyOf() + fallbackDecimalFormatSymbols.minusSign.toString().toByteArray(charset) + .copyInto(destination = newBa, destinationOffset = newOffset) + + return parser.parseDouble(newBa, newOffset, length - newOffset) + } + return parser.parseDouble(ba, offset, length) + } catch (e: Exception) { + logger.debug(e) { + "Failed to parse '${ + ba.toString(charset) + }' from a ByteArray to Double with FastDoubleParser with locale '$locale'." + } + } + } + return String(bytes = ba, offset = offset, length = length, charset = charset) + .parseToDoubleOrNullFallback() + } + + /** + * Parses a double value from the specified [CharSequence]. + * + * It uses the [fast double parser][ConfigurableDoubleParser] if [ParserOptions.useFastDoubleParser] is enabled, + * else, or if that fails, it uses [parseToDoubleOrNullFallback]. + */ + public fun parseOrNull(cs: CharSequence): Double? { + if (parserOptions.useFastDoubleParser) { + try { + // Fixes RTL minus sign not being recognized + if (minusSignIsFormatSymbol && cs.startsWith(localDecimalFormatSymbols.minusSign)) { + val newCs = cs.toString().replaceFirst( + localDecimalFormatSymbols.minusSign, + fallbackDecimalFormatSymbols.minusSign, + ) + return parser.parseDouble(newCs) + } + + return parser.parseDouble(cs) + } catch (e: Exception) { + logger.debug(e) { + "Failed to parse '$cs' from a CharSequence to Double with FastDoubleParser with locale '$locale'." + } + } + } + + return cs.toString().parseToDoubleOrNullFallback() + } + + /** + * Parses a double value from the specified [CharArray]. + * + * It uses the [fast double parser][ConfigurableDoubleParser] if [ParserOptions.useFastDoubleParser] is enabled, + * else, or if that fails, it uses [parseToDoubleOrNullFallback]. + */ + public fun parseOrNull(ca: CharArray, offset: Int = 0, length: Int = ca.size): Double? { + if (parserOptions.useFastDoubleParser) { + try { + // Fixes RTL minus sign not being recognized. + if (minusSignIsFormatSymbol && ca.firstOrNull() == localDecimalFormatSymbols.minusSign) { + val newCa = ca.copyOf() + newCa[0] = fallbackDecimalFormatSymbols.minusSign + return parser.parseDouble(newCa, offset, length) + } + return parser.parseDouble(ca, offset, length) + } catch (e: Exception) { + logger.debug(e) { + "Failed to parse '${ + ca.joinToString("") + }' as from a CharArray to Double with FastDoubleParser with locale '$locale'." + } + } + } + return String(chars = ca, offset = offset, length = length).parseToDoubleOrNullFallback() + } +} diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/util/deprecationMessages.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/util/deprecationMessages.kt index 5dda568ac3..b25e7fb604 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/util/deprecationMessages.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/util/deprecationMessages.kt @@ -39,6 +39,11 @@ internal const val CREATE_COLUMN = "This function is just here for binary compat internal const val GUESS_COLUMN_TYPE = "This function is just here for binary compatibility. $MESSAGE_0_16" public const val DF_READ_EXCEL: String = "This function is just here for binary compatibility. $MESSAGE_0_16" + +internal const val PARSER_OPTIONS = "This constructor is only here for binary compatibility. $MESSAGE_0_16" + +internal const val PARSER_OPTIONS_COPY = "This function is only here for binary compatibility. $MESSAGE_0_16" + // endregion // region WARNING in 0.16, ERROR in 0.17 diff --git a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/FastDoubleParserTests.kt b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/FastDoubleParserTests.kt new file mode 100644 index 0000000000..57dbdc6380 --- /dev/null +++ b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/FastDoubleParserTests.kt @@ -0,0 +1,205 @@ +package org.jetbrains.kotlinx.dataframe.io + +import io.kotest.matchers.collections.shouldContainInOrder +import org.jetbrains.kotlinx.dataframe.api.ParserOptions +import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser +import org.junit.After +import org.junit.Before +import org.junit.Test +import java.text.NumberFormat +import java.util.Locale + +private const val LOG_LEVEL = "org.slf4j.simpleLogger.defaultLogLevel" + +class FastDoubleParserTests { + + private var loggerBefore: String? = null + + @Before + fun setLogger() { + loggerBefore = System.getProperty(LOG_LEVEL) + System.setProperty(LOG_LEVEL, "debug") + } + + @After + fun restoreLogger() { + if (loggerBefore != null) { + System.setProperty(LOG_LEVEL, loggerBefore) + } + } + + @Test + fun `can fast parse doubles`() { + val parser = FastDoubleParser(ParserOptions(locale = Locale.ROOT, useFastDoubleParser = true)) + + val numbers = listOf( + "+12.45", + "-13.35", + "100123.35", + "-204,235.23", + "1.234e3", + "3e-04", // failed with old double parser + "nAn", + "-N/a", + "inf", + "-InfinIty", + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + 3e-04, + Double.NaN, + -Double.NaN, + Double.POSITIVE_INFINITY, + Double.NEGATIVE_INFINITY, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `can fast parse german locale`() { + val parser = FastDoubleParser(ParserOptions(locale = Locale.GERMANY, useFastDoubleParser = true)) + + val numbers = listOf( + "12,45", + "-13,35", + "100.123,35", + "-204.235,23", + "1,234e3", + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `can fast parse french locale`() { + val parser = FastDoubleParser(ParserOptions(locale = Locale.FRANCE, useFastDoubleParser = true)) + + val numbers = listOf( + "12,45", + "-13,35", + "100 123,35", + "-204 235,23", + "1,234e3", + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `can fast parse estonian locale`() { + val parser = FastDoubleParser( + ParserOptions(locale = Locale.forLanguageTag("et-EE"), useFastDoubleParser = true), + ) + + val numbers = listOf( + "12,45", + "−13,35", // note the different minus sign '−' vs '-' + "100 123,35", + "−204 235,23", // note the different minus sign '−' vs '-' + "1,234e3", + "-345,122", // check forgiving behavior with 'ordinary' minus sign + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + -345.122, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `fast parse any locale`() { + val locales = Locale.getAvailableLocales() + val doubles = listOf( + 12.45, + -12.45, + 100_123.35, + -204_235.23, + 1.234e3, + -345.122, + 0.0, + Double.POSITIVE_INFINITY, + Double.NEGATIVE_INFINITY, + Double.NaN, + ) + + for (locale in locales) { + val parser = FastDoubleParser(ParserOptions(locale = locale, useFastDoubleParser = true)) + val formatter = NumberFormat.getInstance(locale) + for (double in doubles) { + val formatted = formatter.format(double) + val parsedByNumberFormatter = formatter.parse(formatted)?.toDouble() + + val parsedString = parser.parseOrNull(formatted) + assert(double == parsedString || double.isNaN() && parsedString?.isNaN() == true) { + "Failed to parse $formatted with locale $locale. Expected $double, got $parsedString. NumberFormat parsed it like: $parsedByNumberFormatter" + } + + val parsedCharArray = parser.parseOrNull(formatted.toCharArray()) + assert(double == parsedCharArray || double.isNaN() && parsedCharArray?.isNaN() == true) { + "Failed to parse $formatted with locale $locale. Expected $double, got $parsedCharArray. NumberFormat parsed it like: $parsedByNumberFormatter" + } + + val parsedByteArray = parser.parseOrNull(formatted.toByteArray()) + assert(double == parsedByteArray || double.isNaN() && parsedByteArray?.isNaN() == true) { + "Failed to parse $formatted with locale $locale. Expected $double, got $parsedByteArray. NumberFormat parsed it like: $parsedByNumberFormatter" + } + } + } + } +} diff --git a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index fab887befc..8d2595ab06 100644 --- a/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/generated-sources/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -10,6 +10,7 @@ import kotlinx.datetime.toKotlinLocalDateTime import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.api.cast import org.jetbrains.kotlinx.dataframe.api.columnOf import org.jetbrains.kotlinx.dataframe.api.convertTo @@ -134,6 +135,17 @@ class ParserTests { } @Test + fun `custom nullStrings`() { + val col by columnOf("1", "2", "null", "3", "NA", "nothing", "4.0", "5.0") + + val parsed = col.tryParse( + ParserOptions(nullStrings = setOf("null", "NA", "nothing")), + ) + parsed.type() shouldBe typeOf() + parsed.toList() shouldBe listOf(1, 2, null, 3, null, null, 4.0, 5.0) + } + + @Test // This does not yet use fastDoubleParser! fun `converting String to Double in different locales`() { val currentLocale = Locale.getDefault() try { @@ -153,68 +165,68 @@ class ParserTests { Locale.setDefault(Locale.forLanguageTag("C.UTF-8")) - columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertTo() shouldBe columnOf(12.345, 67.89) + columnComma.convertTo() shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) - columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertTo() shouldBe columnOf(12.345, 67.89) + columnComma.convertTo() shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) - columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) - columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67890.0) - columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89)) - columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67.89) + columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } - columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89)) + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } Locale.setDefault(Locale.forLanguageTag("en-US")) - columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertTo() shouldBe columnOf(12.345, 67.89) + columnComma.convertTo() shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) - columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertTo() shouldBe columnOf(12.345, 67.89) + columnComma.convertTo() shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) - columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) - columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67890.0) - columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89)) - columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67.89) + columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } - columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89)) + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } Locale.setDefault(Locale.forLanguageTag("ru-RU")) - columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertTo() shouldBe columnOf(12.345, 67.89) + columnComma.convertTo() shouldBe columnOf(12.345, 67.89) + columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) - columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertTo() shouldBe columnOf(12.345, 67.89) + columnComma.convertTo() shouldBe columnOf(12.345, 67.89) + columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) - columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) - columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) - columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67890.0) - columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89)) - columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0)) - columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0)) + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67.89) + columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) + columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } - columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89)) + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } } finally { Locale.setDefault(currentLocale)