
Commit df51328

Feature/hudi scd2 writer (#297)
SCD2 HUDI writer
1 parent 0db8d47 commit df51328

71 files changed (+871, -472 lines)


.sonarcloud.properties

Lines changed: 2 additions & 0 deletions
@@ -12,3 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
+sonar.cpd.exclusions=**/TestDeltaCDCToSCD2Writer.scala,**/TestHudiCDCToSCD2Writer.scala

README.md

Lines changed: 29 additions & 2 deletions
@@ -46,6 +46,7 @@ The data ingestion pipeline of Hyperdrive consists of four components: readers,
 - `KafkaStreamWriter` - writes to a Kafka topic.
 - `DeltaCDCToSnapshotWriter` - writes the DataFrame in Delta format. It expects CDC events, performs merge logic and creates the latest snapshot table.
 - `DeltaCDCToSCD2Writer` - writes the DataFrame in Delta format. It expects CDC events, performs merge logic and creates an SCD2 table.
+- `HudiCDCToSCD2Writer` - writes the DataFrame in Hudi format. It expects CDC events, performs merge logic and creates an SCD2 table.
 
 ### Custom components
 Custom components can be implemented using the [Component Archetype](component-archetype) following the API defined in the package `za.co.absa.hyperdrive.ingestor.api`
@@ -345,7 +346,7 @@ Any additional properties for the `DataStreamWriter` can be added with the prefi
 
 **Example**
 
-- `component.writer=za.co.absa.hyperdrive.compatibility.impl.writer.delta.snapshot.DeltaCDCToSnapshotWriter`
+- `component.writer=za.co.absa.hyperdrive.compatibility.impl.writer.cdc.delta.snapshot.DeltaCDCToSnapshotWriter`
 - `writer.deltacdctosnapshot.destination.directory=/tmp/destination`
 - `writer.deltacdctosnapshot.key.column=key`
 - `writer.deltacdctosnapshot.operation.column=ENTTYP`
@@ -370,7 +371,7 @@ Any additional properties for the `DataStreamWriter` can be added with the prefi
 Any additional properties for the `DataStreamWriter` can be added with the prefix `writer.deltacdctoscd2.options`, e.g. `writer.deltacdctoscd2.options.key=value`
 
 **Example**
-- `component.writer=za.co.absa.hyperdrive.compatibility.impl.writer.delta.scd2.DeltaCDCToSCD2Writer`
+- `component.writer=za.co.absa.hyperdrive.compatibility.impl.writer.cdc.delta.scd2.DeltaCDCToSCD2Writer`
 - `writer.deltacdctoscd2.destination.directory=/tmp/destination`
 - `writer.deltacdctoscd2.key.column=key`
 - `writer.deltacdctoscd2.timestamp.column=TIMSTAMP`
@@ -379,6 +380,32 @@ Any additional properties for the `DataStreamWriter` can be added with the prefi
 - `writer.deltacdctoscd2.precombineColumns=ENTTYP`
 - `writer.deltacdctoscd2.precombineColumns.customOrder.ENTTYP=PT,FI,RR,UB,UP,DL,FD`
 
+##### HudiCDCToSCD2Writer
+| Property Name | Required | Description |
+|:---|:---:|:---|
+| `writer.hudicdctoscd2.destination.directory` | Yes | Destination path of the sink. Equivalent to the Spark property `path` of the `DataStreamWriter`. |
+| `writer.hudicdctoscd2.partition.columns` | No | Comma-separated list of columns to partition by. |
+| `writer.hudicdctoscd2.key.column` | Yes | A column with a unique entity identifier. |
+| `writer.hudicdctoscd2.timestamp.column` | Yes | A column with the event timestamp. |
+| `writer.hudicdctoscd2.operation.column` | Yes | A column whose value marks the operation of a record. |
+| `writer.hudicdctoscd2.operation.deleted.values` | Yes | Values of the operation column that mark a record for deletion. |
+| `writer.hudicdctoscd2.precombineColumns` | Yes | When two records have the same key and timestamp, the record with the largest values in the precombine columns is kept. The columns are evaluated in the provided order. |
+| `writer.hudicdctoscd2.precombineColumns.customOrder` | No | A custom ascending order for a precombine column's values. |
+| `writer.common.trigger.type` | No | See [Common writer properties](#common-writer-properties) |
+| `writer.common.trigger.processing.time` | No | See [Common writer properties](#common-writer-properties) |
+
+Any additional properties for the `DataStreamWriter` can be added with the prefix `writer.hudicdctoscd2.options`, e.g. `writer.hudicdctoscd2.options.key=value`
+
+**Example**
+- `component.writer=za.co.absa.hyperdrive.compatibility.impl.writer.cdc.hudi.scd2.HudiCDCToSCD2Writer`
+- `writer.hudicdctoscd2.destination.directory=/tmp/destination`
+- `writer.hudicdctoscd2.key.column=key`
+- `writer.hudicdctoscd2.timestamp.column=TIMSTAMP`
+- `writer.hudicdctoscd2.operation.column=ENTTYP`
+- `writer.hudicdctoscd2.operation.deleted.values=DL,FD`
+- `writer.hudicdctoscd2.precombineColumns=ENTTYP`
+- `writer.hudicdctoscd2.precombineColumns.customOrder.ENTTYP=PT,FI,RR,UB,UP,DL,FD`
+
 #### Common writer properties
 
 | Property Name | Required | Description |
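
The `precombineColumns` and `customOrder` properties documented above act as a tie-breaker: when two CDC events share the same key and timestamp, the event whose value appears later in the custom order wins. Below is a minimal standalone sketch of that behaviour, mirroring the `array_position`/window trick used in `CDCUtil` further down in this commit. The sample rows, session setup and column names (`key`, `TIMSTAMP`, `ENTTYP`) follow the example configuration and are illustrative only, not code from this commit.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().master("local[*]").appName("precombine-demo").getOrCreate()
import spark.implicits._

// writer.hudicdctoscd2.precombineColumns.customOrder.ENTTYP=PT,FI,RR,UB,UP,DL,FD
val customOrder = Seq("PT", "FI", "RR", "UB", "UP", "DL", "FD")

// Two events for the same key and timestamp: an update (UP) and a delete (DL).
val batch = Seq(
  ("key1", "2023-01-01 10:00:00", "UP"),
  ("key1", "2023-01-01 10:00:00", "DL")
).toDF("key", "TIMSTAMP", "ENTTYP")

// Map ENTTYP to its position in the custom order and keep, per (key, TIMSTAMP),
// the row with the highest position, i.e. the value furthest down the list.
val ranked = batch
  .withColumn("_custom_order", lit(customOrder.toArray))
  .withColumn("_sort", expr("array_position(_custom_order, ENTTYP)"))
val window = Window.partitionBy("key", "TIMSTAMP").orderBy(col("_sort").desc)

ranked
  .withColumn("_rn", row_number().over(window))
  .where("_rn = 1")
  .drop("_custom_order", "_sort", "_rn")
  .show() // only the DL event remains
```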

compatibility_spark-3/pom.xml

Lines changed: 5 additions & 0 deletions
@@ -60,5 +60,10 @@
       <groupId>io.delta</groupId>
       <artifactId>delta-core_${scala.compat.version}</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hudi</groupId>
+      <artifactId>hudi-spark3.2-bundle_${scala.compat.version}</artifactId>
+      <scope>provided</scope>
+    </dependency>
   </dependencies>
 </project>
Lines changed: 5 additions & 5 deletions
@@ -13,13 +13,13 @@
  * limitations under the License.
  */
 
-package za.co.absa.hyperdrive.compatibility.impl.writer.delta.scd2
+package za.co.absa.hyperdrive.compatibility.impl.writer.cdc
 
 import za.co.absa.hyperdrive.ingestor.api.writer.StreamWriterCommonAttributes
 import za.co.absa.hyperdrive.ingestor.api.{HasComponentAttributes, PropertyMetadata}
 
-trait DeltaCDCToSCD2WriterAttributes extends HasComponentAttributes {
-  private val rootFactoryConfKey = "writer.deltacdctoscd2"
+trait CDCToSCD2WriterAttributes extends HasComponentAttributes {
+  val rootFactoryConfKey: String
   val KEY_DESTINATION_DIRECTORY = s"$rootFactoryConfKey.destination.directory"
   val KEY_EXTRA_CONFS_ROOT = s"$rootFactoryConfKey.options"
   val KEY_PARTITION_COLUMNS = s"$rootFactoryConfKey.partition.columns"
@@ -30,9 +30,9 @@ trait DeltaCDCToSCD2WriterAttributes extends HasComponentAttributes {
   val KEY_PRECOMBINE_COLUMNS = s"$rootFactoryConfKey.precombineColumns"
   val KEY_PRECOMBINE_COLUMNS_CUSTOM_ORDER = s"$rootFactoryConfKey.precombineColumns.customOrder"
 
-  override def getName: String = "Delta Stream Writer"
+  override def getName: String
 
-  override def getDescription: String = "This writer saves ingested data in Delta format on a filesystem (e.g. HDFS)"
+  override def getDescription: String
 
   override def getProperties: Map[String, PropertyMetadata] = Map(
     KEY_DESTINATION_DIRECTORY -> PropertyMetadata("Destination directory", Some("A path to a directory"), required = true),
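
This change generalises the former Delta-specific attributes trait so the Delta and Hudi SCD2 writers can share their property definitions, leaving `rootFactoryConfKey`, `getName` and `getDescription` abstract. A sketch of how an implementor might fill them in; the object below is hypothetical and is not the actual `HudiCDCToSCD2Writer` companion from this commit.

```scala
package za.co.absa.hyperdrive.compatibility.impl.writer.cdc.hudi.scd2

import za.co.absa.hyperdrive.compatibility.impl.writer.cdc.CDCToSCD2WriterAttributes

// Hypothetical implementor, for illustration only.
object HudiCDCToSCD2WriterAttributesSketch extends CDCToSCD2WriterAttributes {
  // lazy so the value is available when the trait initialises its derived KEY_* vals
  override lazy val rootFactoryConfKey: String = "writer.hudicdctoscd2"
  override def getName: String = "Hudi SCD2 Stream Writer"
  override def getDescription: String = "This writer saves ingested CDC data as an SCD2 table in Hudi format"
}
```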
Lines changed: 184 additions & 0 deletions
@@ -0,0 +1,184 @@
/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.hyperdrive.compatibility.impl.writer.cdc

import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{BooleanType, StructField, StructType, TimestampType}
import org.apache.spark.sql.{DataFrame, SparkSession}
import za.co.absa.hyperdrive.shared.utils.FileUtils

import java.net.URI

object CDCUtil {

  private val StartDateColumn = "_start_date"
  private val EndDateColumn = "_end_date"
  private val IsCurrentColumn = "_is_current"
  private val IsOldDataColumn = "_is_old_data"
  private val SortFieldPrefix = "_tmp_hyperdrive_"
  private val OldData = "_old_data"
  private val NewData = "_new_data"
  private val SortFieldCustomOrderColumn = "_tmp_hyperdrive_sort_field_custom_order_"

  private[hyperdrive] case class SCD2Fields(keyColumn: String,
                                            timestampColumn: String,
                                            operationColumn: String,
                                            operationDeleteValues: Seq[String],
                                            precombineColumns: Seq[String],
                                            precombineColumnsCustomOrder: Map[String, Seq[String]])

  private[hyperdrive] def getStagedDataForSCD2(history: DataFrame, input: DataFrame, scd2Fields: SCD2Fields): DataFrame = {
    val uniqueChangesForEachKeyAndTimestamp = removeDuplicates(input, scd2Fields)
    val previousEvents = getPreviousEvents(history, uniqueChangesForEachKeyAndTimestamp, scd2Fields)
    val nextEvents = getNextEvents(history, uniqueChangesForEachKeyAndTimestamp, scd2Fields)

    val union = previousEvents.union(nextEvents).distinct().union(
      uniqueChangesForEachKeyAndTimestamp
        .withColumn(StartDateColumn, col(scd2Fields.timestampColumn))
        .withColumn(EndDateColumn, lit(null))
        .withColumn(IsCurrentColumn, lit(false))
        .withColumn(IsOldDataColumn, lit(false))
        .selectExpr(
          Seq(StartDateColumn, EndDateColumn, IsCurrentColumn) ++
            uniqueChangesForEachKeyAndTimestamp.columns ++
            Seq(IsOldDataColumn): _*
        )
    )

    val uniqueEvents = removeDuplicates(union, scd2Fields)
    setSCD2Fields(uniqueEvents, scd2Fields).drop(IsOldDataColumn)
  }

  private[hyperdrive] def getDataFrameWithSortColumns(dataFrame: DataFrame, sortFieldsPrefix: String, precombineColumns: Seq[String], precombineColumnsCustomOrder: Map[String, Seq[String]]): DataFrame = {
    precombineColumns.foldLeft(dataFrame) { (df, precombineColumn) =>
      val order = precombineColumnsCustomOrder.getOrElse(precombineColumn, Seq.empty[String])
      order match {
        case o if o.isEmpty =>
          df.withColumn(s"$sortFieldsPrefix$precombineColumn", col(precombineColumn))
        case o =>
          df
            .withColumn(SortFieldCustomOrderColumn, lit(o.toArray))
            .withColumn(
              s"$sortFieldsPrefix$precombineColumn",
              expr(s"""array_position($SortFieldCustomOrderColumn,$precombineColumn)""")
            ).drop(SortFieldCustomOrderColumn)
      }
    }
  }

  private[hyperdrive] def getSchemaWithSCD2Fields(input: DataFrame): StructType = {
    StructType(
      Seq(
        StructField(StartDateColumn, TimestampType, nullable = false),
        StructField(EndDateColumn, TimestampType, nullable = true),
        StructField(IsCurrentColumn, BooleanType, nullable = false)
      ).toArray ++ input.schema.fields
    )
  }

  private[hyperdrive] def isDirEmptyOrDoesNotExist(spark: SparkSession, destination: String): Boolean = {
    implicit val fs: FileSystem = FileSystem.get(new URI(destination), spark.sparkContext.hadoopConfiguration)
    if (FileUtils.exists(destination)) {
      if (FileUtils.isDirectory(destination)) {
        FileUtils.isEmpty(destination)
      } else {
        false
      }
    } else {
      true
    }
  }

  private def removeDuplicates(input: DataFrame, scd2Fields: SCD2Fields): DataFrame = {
    val dataFrameWithSortColumns = getDataFrameWithSortColumns(input, SortFieldPrefix, scd2Fields.precombineColumns, scd2Fields.precombineColumnsCustomOrder)
    val sortColumnsWithPrefix = dataFrameWithSortColumns.schema.fieldNames.filter(_.startsWith(SortFieldPrefix))
    val window = Window
      .partitionBy(s"${scd2Fields.keyColumn}", s"${scd2Fields.timestampColumn}")
      .orderBy(sortColumnsWithPrefix.map(col(_).desc): _*)
    dataFrameWithSortColumns
      .withColumn("rank", row_number().over(window))
      .where("rank == 1")
      .drop("rank")
      .drop(sortColumnsWithPrefix: _*)
  }

  private def getPreviousEvents(history: DataFrame, uniqueChangesForEachKeyAndTimestamp: DataFrame, scd2Fields: SCD2Fields): DataFrame = {
    history.as(OldData).join(
      uniqueChangesForEachKeyAndTimestamp.as(NewData),
      col(s"$NewData.${scd2Fields.keyColumn}").equalTo(col(s"$OldData.${scd2Fields.keyColumn}"))
        .and(col(s"$NewData.${scd2Fields.timestampColumn}").>=(col(s"$OldData.$StartDateColumn")))
        .and(col(s"$NewData.${scd2Fields.timestampColumn}").<=(col(s"$OldData.$EndDateColumn")))
        .or(
          col(s"$NewData.${scd2Fields.keyColumn}").equalTo(col(s"$OldData.${scd2Fields.keyColumn}"))
            .and(col(s"$NewData.${scd2Fields.timestampColumn}").>=(col(s"$OldData.$StartDateColumn")))
            .and(col(s"$OldData.$IsCurrentColumn").equalTo(true))
        )
    ).select(s"$OldData.*").withColumn(s"$IsOldDataColumn", lit(true))
  }

  private def getNextEvents(history: DataFrame, uniqueChangesForEachKeyAndTimestamp: DataFrame, scd2Fields: SCD2Fields): DataFrame = {
    val window = Window
      .partitionBy(col(s"$OldData.${scd2Fields.keyColumn}"), col(s"$NewData.${scd2Fields.timestampColumn}"))
      .orderBy(col(s"$OldData.$StartDateColumn").asc, col(s"$OldData.${scd2Fields.timestampColumn}").asc)

    history.as(OldData).join(
      uniqueChangesForEachKeyAndTimestamp.as(NewData),
      col(s"$NewData.${scd2Fields.keyColumn}").equalTo(col(s"$OldData.${scd2Fields.keyColumn}"))
        .and(col(s"$NewData.${scd2Fields.timestampColumn}").<(col(s"$OldData.$StartDateColumn")))
    ).select(s"$OldData.*", s"$NewData.${scd2Fields.timestampColumn}")
      .withColumn("rank", row_number().over(window))
      .where("rank == 1")
      .drop("rank")
      .select(s"$OldData.*")
      .withColumn(s"$IsOldDataColumn", lit(true))
  }

  private def setSCD2Fields(dataFrame: DataFrame, scd2Fields: SCD2Fields): DataFrame = {
    val idWindowDesc = org.apache.spark.sql.expressions.Window
      .partitionBy(scd2Fields.keyColumn)
      .orderBy(col(scd2Fields.timestampColumn).desc, col(IsOldDataColumn).desc)
    dataFrame
      .withColumn(
        EndDateColumn,
        when(
          col(IsOldDataColumn).equalTo(true).and(
            lag(scd2Fields.keyColumn, 1, null).over(idWindowDesc).isNull
          ),
          col(EndDateColumn)
        ).when(
          col(IsOldDataColumn).equalTo(true).and(
            lag(IsOldDataColumn, 1, false).over(idWindowDesc).equalTo(true)
          ),
          col(EndDateColumn)
        ).otherwise(
          lag(StartDateColumn, 1, null).over(idWindowDesc)
        )
      )
      .withColumn(
        EndDateColumn,
        when(col(scd2Fields.operationColumn).isInCollection(scd2Fields.operationDeleteValues), col(StartDateColumn))
          .when(!col(scd2Fields.operationColumn).isInCollection(scd2Fields.operationDeleteValues), col(EndDateColumn))
          .otherwise(null)
      )
      .withColumn(
        IsCurrentColumn,
        when(col(EndDateColumn).isNull, lit(true)).otherwise(lit(false))
      )
  }

}
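
`CDCUtil` above is the format-agnostic core of this commit: `getStagedDataForSCD2` deduplicates the incoming microbatch, joins it against the overlapping slice of the existing SCD2 table and recomputes `_start_date`, `_end_date` and `_is_current`. Below is a hedged sketch of how a writer could call it, for instance from a `foreachBatch` sink; the package, object and column names are assumptions for illustration and not code from this commit (the helpers are `private[hyperdrive]`, so the sketch stays inside that namespace).

```scala
package za.co.absa.hyperdrive.compatibility.impl.writer.cdc.example

import org.apache.spark.sql.DataFrame
import za.co.absa.hyperdrive.compatibility.impl.writer.cdc.CDCUtil

object StagingSketch {
  // history: the SCD2 table as currently stored; input: one CDC microbatch
  def stage(history: DataFrame, input: DataFrame): DataFrame = {
    val fields = CDCUtil.SCD2Fields(
      keyColumn = "key",
      timestampColumn = "TIMSTAMP",
      operationColumn = "ENTTYP",
      operationDeleteValues = Seq("DL", "FD"),
      precombineColumns = Seq("ENTTYP"),
      precombineColumnsCustomOrder = Map("ENTTYP" -> Seq("PT", "FI", "RR", "UB", "UP", "DL", "FD"))
    )
    // Result: affected history rows plus the new events, with _start_date,
    // _end_date and _is_current recomputed; the caller merges it into the sink.
    CDCUtil.getStagedDataForSCD2(history, input, fields)
  }
}
```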
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.hyperdrive.compatibility.impl.writer.cdc.delta

import io.delta.tables.DeltaTable
import org.apache.spark.sql.{Row, SaveMode, SparkSession}
import org.apache.spark.sql.types.StructType
import org.slf4j.LoggerFactory
import za.co.absa.hyperdrive.compatibility.impl.writer.cdc.CDCUtil.isDirEmptyOrDoesNotExist

object DeltaUtil {
  private val logger = LoggerFactory.getLogger(this.getClass)

  private[hyperdrive] def createDeltaTableIfNotExists(sparkSession: SparkSession, destination: String, schema: StructType, partitionColumns: Seq[String]): Unit = {
    if (!DeltaTable.isDeltaTable(sparkSession, destination)) {
      if (isDirEmptyOrDoesNotExist(sparkSession, destination)) {
        logger.info(s"Destination: $destination is not a delta table. Creating new delta table.")
        sparkSession
          .createDataFrame(sparkSession.sparkContext.emptyRDD[Row], schema)
          .write
          .format("delta")
          .mode(SaveMode.Overwrite)
          .option("overwriteSchema", "true")
          .partitionBy(partitionColumns: _*)
          .save(destination)
      } else {
        throw new IllegalArgumentException(s"Could not create new delta table. Directory $destination is not empty!")
      }
    }
  }
}
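
`DeltaUtil.createDeltaTableIfNotExists` bootstraps an empty, partitioned Delta table only when the destination directory is empty or missing, and fails fast on a non-empty, non-Delta directory. A minimal sketch of that first-run step combined with `CDCUtil.getSchemaWithSCD2Fields`; the object name, destination path and empty partition list are assumptions for illustration, not code from this commit.

```scala
package za.co.absa.hyperdrive.compatibility.impl.writer.cdc.example

import org.apache.spark.sql.{DataFrame, SparkSession}
import za.co.absa.hyperdrive.compatibility.impl.writer.cdc.CDCUtil
import za.co.absa.hyperdrive.compatibility.impl.writer.cdc.delta.DeltaUtil

object DeltaInitSketch {
  def initialise(spark: SparkSession, cdcInput: DataFrame): Unit = {
    // Prepend _start_date, _end_date, _is_current to the CDC input schema.
    val scd2Schema = CDCUtil.getSchemaWithSCD2Fields(cdcInput)
    // Creates an empty Delta table only if the path is empty or missing;
    // otherwise throws before any streaming query is started.
    DeltaUtil.createDeltaTableIfNotExists(spark, "/tmp/destination", scd2Schema, Seq.empty)
  }
}
```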
