apache · mihailom-db · May 21, 2025 · May 21, 2025 · May 26, 2025 · May 26, 2025
diff --git a/...alyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/...alyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala
@@ -21,6 +21,7 @@ import java.util.Comparator
 import java.util.concurrent.atomic.{AtomicInteger, AtomicReference}
 
 import scala.collection.mutable
+import scala.jdk.CollectionConverters.MapHasAsScala
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion, UnresolvedException}
@@ -1109,8 +1110,10 @@ case class MapZipWith(left: Expression, right: Expression, function: Expression)
    */
   @transient private lazy val getKeysWithValueIndexes:
       (ArrayData, ArrayData) => mutable.Iterable[(Any, Array[Option[Int]])] = {
-    if (TypeUtils.typeWithProperEquals(keyType)) {
-      getKeysWithIndexesFast
+    if (TypeUtils.typeWithProperEquals(keyType) && SQLConf.get.mapZipWithUsesJavaCollections) {
+      getKeysWithIndexesFastAsJava
+    } else if (TypeUtils.typeWithProperEquals(keyType)) {
+      getKeysWithIndexesFastUsingScala
     } else {
       getKeysWithIndexesBruteForce
     }
@@ -1122,7 +1125,7 @@ case class MapZipWith(left: Expression, right: Expression, function: Expression)
     }
   }
 
-  private def getKeysWithIndexesFast(keys1: ArrayData, keys2: ArrayData) = {
+  private def getKeysWithIndexesFastUsingScala(keys1: ArrayData, keys2: ArrayData) = {
     val hashMap = new mutable.LinkedHashMap[Any, Array[Option[Int]]]
     for ((z, array) <- Array((0, keys1), (1, keys2))) {
       var i = 0
@@ -1144,6 +1147,31 @@ case class MapZipWith(left: Expression, right: Expression, function: Expression)
     hashMap
   }
 
+  /**
+   * Collects the distinct keys of both key arrays, in first-seen order, together with the
+   * index of each key's first occurrence on either side.
+   *
+   * The lookup table is a `java.util.LinkedHashMap`, so keys are compared with Java
+   * `equals`/`hashCode` rather than Scala `==`/`##`.
+   *
+   * @param keys1 key array of the first map
+   * @param keys2 key array of the second map
+   * @return a Scala view over the Java map; each value is a two-slot array where slot 0 is the
+   *         first index of the key in `keys1` and slot 1 the first index in `keys2`, or `None`
+   *         if the key does not occur on that side
+   */
+  private def getKeysWithIndexesFastAsJava(
+      keys1: ArrayData,
+      keys2: ArrayData
+  ): mutable.Map[Any, Array[Option[Int]]] = {
+    val hashMap = new java.util.LinkedHashMap[Any, Array[Option[Int]]]
+    for ((z, array) <- Array((0, keys1), (1, keys2))) {
+      var i = 0
+      while (i < array.numElements()) {
+        val key = array.get(i, keyType)
+        // Single lookup: fetch the slot pair for this key, creating an empty pair on first sight.
+        val indexes = hashMap.computeIfAbsent(key, _ => Array[Option[Int]](None, None))
+        // Only the first occurrence of a key on each side is recorded.
+        if (indexes(z).isEmpty) {
+          indexes(z) = Some(i)
+        }
+        i += 1
+      }
+    }
+    // Return a wrapper view instead of copying into a Scala LinkedHashMap: this avoids an extra
+    // pass over all entries and avoids re-hashing the keys with Scala equality semantics.
+    hashMap.asScala
+  }
+
   private def getKeysWithIndexesBruteForce(keys1: ArrayData, keys2: ArrayData) = {
     val arrayBuffer = new mutable.ArrayBuffer[(Any, Array[Option[Int]])]
     for ((z, array) <- Array((0, keys1), (1, keys2))) {

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1095,6 +1095,14 @@ object SQLConf {
       .stringConf
       .createOptional
 
+  // Selects which collection implementation `map_zip_with` uses; enabled by default.
+  val MAP_ZIP_WITH_USES_JAVA_COLLECTIONS =
+    buildConf("spark.sql.mapZipWithUsesJavaCollections")
+      .doc(
+        "When true, the `map_zip_with` function uses Java collections " +
+          "instead of Scala collections. " +
+          "This is useful for avoiding NaN equality issues.")
+      .version("4.1.0")
+      .booleanConf
+      .createWithDefault(true)
+
   val SUBEXPRESSION_ELIMINATION_ENABLED =
     buildConf("spark.sql.subexpressionElimination.enabled")
       .internal()
@@ -6367,6 +6375,9 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
    */
   def hintErrorHandler: HintErrorHandler = HintErrorLogger
 
+  /** Current value of [[SQLConf.MAP_ZIP_WITH_USES_JAVA_COLLECTIONS]]. */
+  def mapZipWithUsesJavaCollections: Boolean = getConf(MAP_ZIP_WITH_USES_JAVA_COLLECTIONS)
+
   def subexpressionEliminationEnabled: Boolean =
     getConf(SUBEXPRESSION_ELIMINATION_ENABLED)