Skip to content

Commit 72b116b

Browse files
committed
#788 Add test suites for SparkCobolProcessor.
1 parent 53eb3e5 commit 72b116b

File tree

4 files changed

+141
-6
lines changed

4 files changed

+141
-6
lines changed
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
/*
2+
* Copyright 2018 ABSA Group Limited
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package za.co.absa.cobrix.cobol.processor
18+
19+
/**
  * A serializable version of [[RawRecordProcessor]] for distributed processing in Spark.
  *
  * Usage patterns:
  *  - For standalone JVM applications: use `CobolProcessor` with `RawRecordProcessor`.
  *  - For Spark applications: use `SparkCobolProcessor` with `SerializableRawRecordProcessor`.
  *
  * This trait extends `Serializable` since Spark serializes processing code and ships it
  * across the network to worker nodes, requiring all captured components (including the
  * record processor) to be serializable.
  */
trait SerializableRawRecordProcessor extends RawRecordProcessor with Serializable
Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@
1414
* limitations under the License.
1515
*/
1616

17-
package za.co.absa.cobrix.spark.cobol.builder
17+
package za.co.absa.cobrix.spark.cobol
1818

1919
import org.apache.hadoop.fs.Path
2020
import org.apache.spark.rdd.RDD
2121
import org.apache.spark.sql.SparkSession
2222
import org.slf4j.LoggerFactory
23-
import za.co.absa.cobrix.cobol.processor.{CobolProcessor, RawRecordProcessor}
23+
import za.co.absa.cobrix.cobol.processor.{CobolProcessor, SerializableRawRecordProcessor}
2424
import za.co.absa.cobrix.spark.cobol.source.SerializableConfiguration
2525
import za.co.absa.cobrix.spark.cobol.source.streaming.FileStreamer
2626

@@ -47,7 +47,7 @@ object SparkCobolProcessor {
4747
class SparkCobolProcessorBuilder(implicit spark: SparkSession) {
4848
private val caseInsensitiveOptions = new mutable.HashMap[String, String]()
4949
private var copybookContentsOpt: Option[String] = None
50-
private var rawRecordProcessorOpt: Option[RawRecordProcessor] = None
50+
private var rawRecordProcessorOpt: Option[SerializableRawRecordProcessor] = None
5151
private var numberOfThreads: Int = 1
5252

5353
def build(): SparkCobolProcessor = {
@@ -82,7 +82,7 @@ object SparkCobolProcessor {
8282
this
8383
}
8484

85-
def withRecordProcessor(processor: RawRecordProcessor): SparkCobolProcessorBuilder = {
85+
def withRecordProcessor(processor: SerializableRawRecordProcessor): SparkCobolProcessorBuilder = {
8686
rawRecordProcessorOpt = Option(processor)
8787
this
8888
}
@@ -130,7 +130,7 @@ object SparkCobolProcessor {
130130
outputPath: String,
131131
copybookContents: String,
132132
cobolProcessor: CobolProcessor,
133-
rawRecordProcessor: RawRecordProcessor,
133+
rawRecordProcessor: SerializableRawRecordProcessor,
134134
sconf: SerializableConfiguration,
135135
numberOfThreads: Int
136136
)(implicit spark: SparkSession): RDD[Long] = {
@@ -145,7 +145,7 @@ object SparkCobolProcessor {
145145
outputPath: String,
146146
copybookContents: String,
147147
cobolProcessor: CobolProcessor,
148-
rawRecordProcessor: RawRecordProcessor,
148+
rawRecordProcessor: SerializableRawRecordProcessor,
149149
sconf: SerializableConfiguration,
150150
numberOfThreads: Int
151151
): Long = {
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Copyright 2018 ABSA Group Limited
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package za.co.absa.cobrix.spark.cobol
18+
19+
import org.apache.hadoop.fs.Path
20+
import org.scalatest.wordspec.AnyWordSpec
21+
import za.co.absa.cobrix.cobol.parser.Copybook
22+
import za.co.absa.cobrix.cobol.processor.SerializableRawRecordProcessor
23+
import za.co.absa.cobrix.spark.cobol.source.base.SparkTestBase
24+
import za.co.absa.cobrix.spark.cobol.source.fixtures.{BinaryFileFixture, TextComparisonFixture}
25+
26+
class SparkCobolProcessorSuite extends AnyWordSpec with SparkTestBase with BinaryFileFixture with TextComparisonFixture {
  // Minimal copybook: each record consists of a single 1-byte field.
  private val copybook =
    """ 01 RECORD.
      | 05 T PIC X.
      |""".stripMargin

  // A processor that decrements every byte of each record by one,
  // making the expected output trivially derivable from the input.
  private val rawRecordProcessor = new SerializableRawRecordProcessor {
    override def processRecord(copybook: Copybook, options: Map[String, String], record: Array[Byte], offset: Long): Array[Byte] =
      record.map(b => (b - 1).toByte)
  }

  "SparkCobolProcessor" should {
    "fail to create when a copybook is not specified" in {
      val ex = intercept[IllegalArgumentException] {
        SparkCobolProcessor.builder.build()
      }

      assert(ex.getMessage.contains("Copybook contents must be provided."))
    }

    "fail to create when a record processor is not provided" in {
      val builder = SparkCobolProcessor.builder
        .withCopybookContents(copybook)

      val ex = intercept[IllegalArgumentException](builder.build())

      assert(ex.getMessage.contains("A RawRecordProcessor must be provided."))
    }

    "fail to create when the number of threads is less than 0" in {
      val ex = intercept[IllegalArgumentException] {
        SparkCobolProcessor.builder
          .withCopybookContents(copybook)
          .withRecordProcessor(rawRecordProcessor)
          .withMultithreaded(0)
          .build()
      }

      assert(ex.getMessage.contains("Number of threads must be at least 1."))
    }

    "create a processor that processes files via an RDD" in {
      withTempDirectory("spark_cobol_processor") { tempDir =>
        val rawBytes = Array(0xF1, 0xF2, 0xF3, 0xF4).map(_.toByte)

        val inputPath = new Path(tempDir, "input.dat").toString
        val outputPath = new Path(tempDir, "output").toString
        // The processor writes each input file under the output dir with the same name.
        val outputFile = new Path(outputPath, "input.dat").toString

        writeBinaryFile(inputPath, rawBytes)

        val processor = SparkCobolProcessor.builder
          .withCopybookContents(copybook)
          .withRecordProcessor(rawRecordProcessor)
          .build()

        processor.process(Seq(inputPath), outputPath)

        // Every output byte should be the corresponding input byte minus one.
        val expected = Array(0xF0, 0xF1, 0xF2, 0xF3).map(_.toByte)
        val actual = readBinaryFile(outputFile)

        assert(actual.length == rawBytes.length)
        assert(actual.sameElements(expected))
      }
    }
  }
}

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/fixtures/BinaryFileFixture.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,16 @@ trait BinaryFileFixture {
113113
tempFile
114114
}
115115

116+
/**
  * Writes the given bytes to a file at the specified path, overwriting any existing file.
  *
  * @param filePath path of the file to write
  * @param content  the bytes to write
  */
def writeBinaryFile(filePath: String, content: Array[Byte]): Unit = {
  val ostream = new DataOutputStream(new FileOutputStream(filePath))
  try {
    ostream.write(content)
  } finally {
    // Close in 'finally' so the file handle is not leaked if the write fails.
    ostream.close()
  }
}
121+
122+
/**
  * Reads the entire contents of a file as a byte array.
  *
  * @param filePath path of the file to read
  * @return the file's contents as bytes
  */
def readBinaryFile(filePath: String): Array[Byte] = {
  FileUtils.readFileToByteArray(new File(filePath))
}
125+
116126
private def hex2bytes(hex: String): Array[Byte] = {
117127
val compactStr = hex.replaceAll("\\s", "")
118128
compactStr.sliding(2, 2).toArray.map(Integer.parseInt(_, 16).toByte)

0 commit comments

Comments (0)