
Commit 0a881ff

Add Iceberg v2 consumer for consuming flattened data; the schema is inferred by Arrow

1 parent 8a6c2db commit 0a881ff

File tree

7 files changed: +128 -102 lines changed


pydbzengine/handlers/iceberg.py

Lines changed: 8 additions & 1 deletion

@@ -143,7 +143,8 @@ def load_table(self, table_identifier):
         table = self.catalog.create_table(identifier=table_identifier,
                                           schema=self._target_schema,
                                           partition_spec=self.DEBEZIUM_TABLE_PARTITION_SPEC)
-        self.log.info(f"Created iceberg table {'.'.join(table_identifier)} with daily partitioning on _consumed_at.")
+        self.log.info(
+            f"Created iceberg table {'.'.join(table_identifier)} with daily partitioning on _consumed_at.")
         return table

     @property
@@ -236,6 +237,7 @@ def _handle_table_changes(self, destination: str, records: List[ChangeEvent]):
             records=records
         )

+        self._handle_schema_changes(table=table, arrow_schema=enriched_arrow_data.schema)
         table.append(enriched_arrow_data)
         self.log.info(f"Appended {len(enriched_arrow_data)} records to table {'.'.join(table.name())}")

@@ -401,3 +403,8 @@ def _get_identifier_fields(self, sample_event: ChangeEvent, table_identifier: tu

         self.log.info(f"Found potential primary key fields {key_field_names} for table {table_name_str}")
         return key_field_names
+
+    def _handle_schema_changes(self, table: "Table", arrow_schema: "pa.Schema"):
+        with table.update_schema() as update:
+            update.union_by_name(new_schema=arrow_schema)
+        self.log.info(f"Schema for table {'.'.join(table.name())} has been updated.")

pydbzengine/helper.py

Lines changed: 5 additions & 4 deletions

@@ -20,7 +20,7 @@ def timeout_handler(signum, frame):
 class Utils:

     @staticmethod
-    def run_engine_async(engine, timeout_sec=22):
+    def run_engine_async(engine, timeout_sec=22, blocking=True):
         """
         Runs an engine asynchronously with a timeout.

@@ -37,11 +37,12 @@ def run_engine_async(engine, timeout_sec=22):
         signal.alarm(timeout_sec)

         try:
-            thread = threading.Thread(target=engine.run)
+            thread = threading.Thread(target=engine.run, daemon=True)
             thread.start()

-            # Wait for the thread to complete (or the timeout to occur).
-            thread.join()  # This will block until the thread finishes or the signal is received.
+            if blocking:
+                # Wait for the thread to complete (or the timeout to occur).
+                thread.join()  # This will block until the thread finishes or the signal is received.

         except TimeoutError:
             # Handle the timeout exception.
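The new blocking flag works because the engine thread is now a daemon thread: when the caller skips the join, the engine keeps consuming in the background and dies with the interpreter instead of holding the process open, while the pre-existing SIGALRM timeout (unchanged by this commit) still interrupts long runs. A self-contained sketch of that pattern with a generic worker, not the pydbzengine helper itself:

import threading
import time

def run_async(target, blocking=True, timeout_sec=5):
    # daemon=True: the thread will not keep the process alive on its own,
    # which is what makes the fire-and-forget mode safe for tests.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()
    if blocking:
        thread.join(timeout=timeout_sec)  # wait here only in blocking mode
    return thread

def worker():
    for i in range(3):
        time.sleep(0.5)
        print("worker tick", i)

t = run_async(worker, blocking=False)
print("caller continues immediately; worker alive:", t.is_alive())
t.join()  # a test would instead poll its assertions while the worker runs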

tests/base_postgresql_test.py

Lines changed: 35 additions & 26 deletions

@@ -11,35 +11,44 @@ class BasePostgresqlTest(unittest.TestCase):
     OFFSET_FILE = CURRENT_DIR.joinpath('postgresql-offsets.dat')
     SOURCEPGDB = DbPostgresql()

-    def debezium_engine_props(self, unwrap_messages=True):
+    def debezium_engine_props_dict(self, unwrap_messages=True) -> dict:
         current_dir = Path(__file__).parent
         offset_file_path = current_dir.joinpath('postgresql-offsets.dat')

-        props = Properties()
-        props.setProperty("name", "engine")
-        props.setProperty("snapshot.mode", "always")
-        props.setProperty("database.hostname", self.SOURCEPGDB.CONTAINER.get_container_host_ip())
-        props.setProperty("database.port", str(self.SOURCEPGDB.CONTAINER.get_exposed_port(self.SOURCEPGDB.POSTGRES_PORT_DEFAULT)))
-        props.setProperty("database.user", self.SOURCEPGDB.POSTGRES_USER)
-        props.setProperty("database.password", self.SOURCEPGDB.POSTGRES_PASSWORD)
-        props.setProperty("database.dbname", self.SOURCEPGDB.POSTGRES_DBNAME)
-        props.setProperty("connector.class", "io.debezium.connector.postgresql.PostgresConnector")
-        props.setProperty("offset.storage", "org.apache.kafka.connect.storage.FileOffsetBackingStore")
-        props.setProperty("offset.storage.file.filename", offset_file_path.as_posix())
-        props.setProperty("poll.interval.ms", "10000")
-        props.setProperty("converter.schemas.enable", "false")
-        props.setProperty("offset.flush.interval.ms", "1000")
-        props.setProperty("topic.prefix", "testc")
-        props.setProperty("schema.whitelist", "inventory")
-        props.setProperty("database.whitelist", "inventory")
-        props.setProperty("table.whitelist", "inventory.products")
-        props.setProperty("replica.identity.autoset.values", "inventory.*:FULL")
+        conf: dict = {}
+        conf.setdefault("name", "engine")
+        conf.setdefault("snapshot.mode", "always")
+        conf.setdefault("database.hostname", self.SOURCEPGDB.CONTAINER.get_container_host_ip())
+        conf.setdefault("database.port",
+                        str(self.SOURCEPGDB.CONTAINER.get_exposed_port(self.SOURCEPGDB.POSTGRES_PORT_DEFAULT)))
+        conf.setdefault("database.user", self.SOURCEPGDB.POSTGRES_USER)
+        conf.setdefault("database.password", self.SOURCEPGDB.POSTGRES_PASSWORD)
+        conf.setdefault("database.dbname", self.SOURCEPGDB.POSTGRES_DBNAME)
+        conf.setdefault("connector.class", "io.debezium.connector.postgresql.PostgresConnector")
+        conf.setdefault("offset.storage", "org.apache.kafka.connect.storage.FileOffsetBackingStore")
+        conf.setdefault("offset.storage.file.filename", offset_file_path.as_posix())
+        conf.setdefault("poll.interval.ms", "10000")
+        conf.setdefault("converter.schemas.enable", "false")
+        conf.setdefault("offset.flush.interval.ms", "1000")
+        conf.setdefault("topic.prefix", "testc")
+        conf.setdefault("schema.whitelist", "inventory")
+        conf.setdefault("database.whitelist", "inventory")
+        conf.setdefault("table.whitelist", "inventory.products")
+        conf.setdefault("replica.identity.autoset.values", "inventory.*:FULL")

         if unwrap_messages:
-            props.setProperty("transforms", "unwrap")
-            props.setProperty("transforms.unwrap.type", "io.debezium.transforms.ExtractNewRecordState")
-            props.setProperty("transforms.unwrap.add.fields", "op,table,source.ts_ms,sourcedb,ts_ms")
-            props.setProperty("transforms.unwrap.delete.handling.mode", "rewrite")
+            conf.setdefault("transforms", "unwrap")
+            conf.setdefault("transforms.unwrap.type", "io.debezium.transforms.ExtractNewRecordState")
+            conf.setdefault("transforms.unwrap.add.fields", "op,table,source.ts_ms,sourcedb,ts_ms")
+            conf.setdefault("transforms.unwrap.delete.handling.mode", "rewrite")
+
+        return conf
+
+    def debezium_engine_props(self, unwrap_messages=True):
+        props = Properties()
+        conf = self.debezium_engine_props_dict(unwrap_messages=unwrap_messages)
+        for k, v in conf.items():
+            props.setProperty(k, v)
         return props

     def clean_offset_file(self):
@@ -54,5 +63,5 @@ def tearDown(self):
         self.SOURCEPGDB.stop()
         self.clean_offset_file()

-    def execute_on_source_db(self, sql:str):
-        self.SOURCEPGDB.execute_sql(sql=sql)
+    def execute_on_source_db(self, sql: str):
+        self.SOURCEPGDB.execute_sql(sql=sql)
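Keeping the connector configuration as a plain dict before it is copied into Properties lets an individual test override a key or two and only then materialize the Java object Debezium expects. A hedged sketch, assuming the usual pydbzengine imports and a hypothetical subclass of BasePostgresqlTest; the override value is illustrative only.

from pydbzengine import Properties  # assumption: Properties is exported by pydbzengine

class MyHandlerTest(BasePostgresqlTest):

    def debezium_props_fast_poll(self):
        # Start from the shared defaults, tweak what this test needs, then
        # convert to java.util.Properties exactly like debezium_engine_props does.
        conf = self.debezium_engine_props_dict(unwrap_messages=True)
        conf["poll.interval.ms"] = "500"  # illustrative override for a faster feedback loop
        props = Properties()
        for k, v in conf.items():
            props.setProperty(k, v)
        return props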

tests/catalog_rest.py

Lines changed: 3 additions & 0 deletions

@@ -46,3 +46,6 @@ def list_namespaces(self):
         catalog = self.get_catalog()
         namespaces = catalog.list_namespaces()
         print("Namespaces:", namespaces)
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.stop()
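With __exit__ defined, the REST catalog container can be scoped to a with-block so it is stopped even when a test body raises. A hedged usage sketch, assuming the class also defines __enter__ returning self (not shown in this diff) and using the start/get_uri methods the tests already call; the MinIO endpoint is hypothetical.

# Sketch only: cleanup happens in __exit__, which calls stop().
with CatalogRestContainer() as rest_catalog:
    rest_catalog.start(s3_endpoint="http://localhost:9000")  # hypothetical MinIO endpoint
    print(rest_catalog.get_uri())
# leaving the block stops the container, mirroring the explicit RESTCATALOG.stop() in tearDown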

tests/db_postgresql.py

Lines changed: 3 additions & 1 deletion

@@ -39,7 +39,9 @@ def stop(self):
         pass

     def get_connection(self) -> Connection:
-        engine = sqlalchemy.create_engine(self.CONTAINER.get_connection_url())
+        url = self.CONTAINER.get_connection_url()
+        print(url)
+        engine = sqlalchemy.create_engine(url)
         return engine.connect()

     def __exit__(self, exc_type, exc_value, traceback):

tests/test_iceberg_handler.py

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ def setUp(self):
     def tearDown(self):
         self.SOURCEPGDB.stop()
         self.S3MiNIO.stop()
+        self.RESTCATALOG.stop()
         self.clean_offset_file()

     @unittest.skip

tests/test_iceberg_handlerv2.py

Lines changed: 73 additions & 70 deletions

@@ -1,9 +1,12 @@
-import unittest
+import io
+import threading
+import time

 import pandas as pd
+import pyarrow as pa
+import pyarrow.json as pj
+import waiting
 from pyiceberg.catalog import load_catalog
-from pyiceberg.schema import Schema
-from pyiceberg.types import LongType, NestedField, StringType

 from base_postgresql_test import BasePostgresqlTest
 from catalog_rest import CatalogRestContainer
@@ -24,99 +27,99 @@ def setUp(self):
         self.S3MiNIO.start()
         self.RESTCATALOG.start(s3_endpoint=self.S3MiNIO.endpoint())
         # Set pandas options to display all rows and columns, and prevent truncation of cell content
-        pd.set_option('display.max_rows', None) # Show all rows
-        pd.set_option('display.max_columns', None) # Show all columns
-        pd.set_option('display.width', None) # Auto-detect terminal width
-        pd.set_option('display.max_colwidth', None) # Do not truncate cell contents
+        pd.set_option('display.max_rows', None)  # Show all rows
+        pd.set_option('display.max_columns', None)  # Show all columns
+        pd.set_option('display.width', None)  # Auto-detect terminal width
+        pd.set_option('display.max_colwidth', None)  # Do not truncate cell contents

     def tearDown(self):
         self.SOURCEPGDB.stop()
         self.S3MiNIO.stop()
+        self.RESTCATALOG.stop()
         self.clean_offset_file()

-    @unittest.skip
-    def test_iceberg_catalog(self):
-        conf = {
-            "uri": self.RESTCATALOG.get_uri(),
-            # "s3.path-style.access": "true",
-            "warehouse": "warehouse",
-            "s3.endpoint": self.S3MiNIO.endpoint(),
-            "s3.access-key-id": S3Minio.AWS_ACCESS_KEY_ID,
-            "s3.secret-access-key": S3Minio.AWS_SECRET_ACCESS_KEY,
-        }
-        print(conf)
-        catalog = load_catalog(
-            name="rest",
-            **conf
-        )
-        catalog.create_namespace('my_warehouse')
-        debezium_event_schema = Schema(
-            NestedField(field_id=1, name="id", field_type=LongType(), required=True),
-            NestedField(field_id=2, name="data", field_type=StringType(), required=False),
-        )
-        table = catalog.create_table(identifier=("my_warehouse", "test_table",), schema=debezium_event_schema)
-        print(f"Created iceberg table {table.refs()}")
+    def test_read_json_lines_example(self):
+        json_data = """
+        {"id": 1, "name": "Alice", "age": 30}
+        {"id": 2, "name": "Bob", "age": 24}
+        {"id": 3, "name": "Charlie", "age": 35}
+        """.strip()  # .strip() removes leading/trailing whitespace/newlines
+        json_buffer = io.BytesIO(json_data.encode('utf-8'))
+        json_buffer.seek(0)
+        # =============================
+        table_inferred = pj.read_json(json_buffer)
+        print("\nInferred Schema:")
+        print(table_inferred.schema)
+        # =============================
+        explicit_schema = pa.schema([
+            pa.field('id', pa.int64()),  # Integer type for 'id'
+            pa.field('name', pa.string()),  # String type for 'name'
+        ])
+        json_buffer.seek(0)
+        po = pj.ParseOptions(explicit_schema=explicit_schema)
+        table_explicit = pj.read_json(json_buffer, parse_options=po)
+        print("\nExplicit Schema:")
+        print(table_explicit.schema)
+
+    def _apply_source_db_changes(self):
+        time.sleep(12)
+        self.execute_on_source_db("UPDATE inventory.customers SET first_name='George__UPDATE1' WHERE ID = 1002 ;")
+        # self.execute_on_source_db("ALTER TABLE inventory.customers DROP COLUMN email;")
+        self.execute_on_source_db("UPDATE inventory.customers SET first_name='George__UPDATE2' WHERE ID = 1002 ;")
+        self.execute_on_source_db("DELETE FROM inventory.orders WHERE purchaser = 1002 ;")
+        self.execute_on_source_db("DELETE FROM inventory.customers WHERE id = 1002 ;")
+        self.execute_on_source_db("ALTER TABLE inventory.customers ADD birth_date date;")
+        self.execute_on_source_db("UPDATE inventory.customers SET birth_date = '2020-01-01' WHERE id = 1001 ;")

     def test_iceberg_handler(self):
-        dest_ns1_database="my_warehouse"
-        dest_ns2_schema="dbz_cdc_data"
-        conf = {
+        dest_ns1_database = "my_warehouse"
+        dest_ns2_schema = "dbz_cdc_data"
+        catalog_conf = {
             "uri": self.RESTCATALOG.get_uri(),
-            # "s3.path-style.access": "true",
             "warehouse": "warehouse",
             "s3.endpoint": self.S3MiNIO.endpoint(),
             "s3.access-key-id": S3Minio.AWS_ACCESS_KEY_ID,
             "s3.secret-access-key": S3Minio.AWS_SECRET_ACCESS_KEY,
         }
-        catalog = load_catalog(name="rest",**conf)
-
+        catalog = load_catalog(name="rest", **catalog_conf)
         handler = IcebergChangeHandlerV2(catalog=catalog,
                                          destination_namespace=(dest_ns1_database, dest_ns2_schema,),
                                          event_flattening_enabled=True
                                          )
-
         dbz_props = self.debezium_engine_props(unwrap_messages=True)
         engine = DebeziumJsonEngine(properties=dbz_props, handler=handler)
-        with self.assertLogs(IcebergChangeHandlerV2.LOGGER_NAME, level='INFO') as cm:
-            # run async then interrupt after timeout time to test the result!
-            Utils.run_engine_async(engine=engine, timeout_sec=44)

-        # for t in cm.output:
-        #     print(t)
-        self.assertRegex(text=str(cm.output), expected_regex='.*Created iceberg table.*')
-        self.assertRegex(text=str(cm.output), expected_regex='.*Appended.*records to table.*')
+        t = threading.Thread(target=self._apply_source_db_changes)
+        t.start()
+        Utils.run_engine_async(engine=engine, timeout_sec=77, blocking=False)

-        # catalog.create_namespace(dest_ns1_database)
-        namespaces = catalog.list_namespaces()
-        self.assertIn((dest_ns1_database,) , namespaces, msg="Namespace not found in catalog")
+        test_ns = (dest_ns1_database,)
+        print(catalog.list_namespaces())
+        waiting.wait(predicate=lambda: test_ns in catalog.list_namespaces(), timeout_seconds=7.5)

-        tables = catalog.list_tables((dest_ns1_database, dest_ns2_schema,))
-        print(tables)
-        self.assertIn(('my_warehouse', 'dbz_cdc_data', 'testc_inventory_customers'), tables, msg="Namespace not found in catalog")
+        test_tbl = ('my_warehouse', 'dbz_cdc_data', 'testc_inventory_customers')
+        test_tbl_ns = (dest_ns1_database, dest_ns2_schema,)
+        waiting.wait(predicate=lambda: test_tbl in catalog.list_tables(test_tbl_ns), timeout_seconds=10.5)

-        tbl = catalog.load_table(identifier=('my_warehouse', 'dbz_cdc_data', 'testc_inventory_customers'))
-        data = tbl.scan().to_arrow()
-        self.assertIn("[email protected]", str(data))
-        self.assertIn("[email protected]", str(data))
-        self.assertEqual(data.num_rows, 4)
+        test_tbl_data = ('my_warehouse', 'dbz_cdc_data', 'testc_inventory_customers')
+        waiting.wait(predicate=lambda: "[email protected]" in str(self.red_table(catalog, test_tbl_data)),
+                     timeout_seconds=10.5)
+        waiting.wait(predicate=lambda: self.red_table(catalog, test_tbl_data).num_rows >= 4, timeout_seconds=10.5)
+
+        data = self.red_table(catalog, test_tbl_data)
         self.pprint_table(data=data)
-        #=================================================================
-        ## ==== PART 2 CONSUME CHANGES FROM BINLOG =======================
-        #=================================================================
-        self.execute_on_source_db("UPDATE inventory.customers SET first_name='George__UPDATE1' WHERE ID = 1002 ;")
-        # self.execute_on_source_db("ALTER TABLE inventory.customers DROP COLUMN email;")
-        self.execute_on_source_db("UPDATE inventory.customers SET first_name='George__UPDATE2' WHERE ID = 1002 ;")
-        self.execute_on_source_db("DELETE FROM inventory.orders WHERE purchaser = 1002 ;")
-        self.execute_on_source_db("DELETE FROM inventory.customers WHERE id = 1002 ;")
-        self.execute_on_source_db("ALTER TABLE inventory.customers ADD birth_date date;")
-        self.execute_on_source_db("UPDATE inventory.customers SET birth_date = '2020-01-01' WHERE id = 1001 ;")
-        # run
-        Utils.run_engine_async(engine=engine, timeout_sec=44)
-        # test
-        # @TODO test that new field is received and added to iceberg!
-        data = tbl.scan().to_arrow()
+        # =================================================================
+        ## ==== PART 2 CONSUME CHANGES FROM BINLOG ========================
+        # =================================================================
+        waiting.wait(predicate=lambda: self.red_table(catalog, test_tbl_data).num_rows >= 7, timeout_seconds=77)
+        data = self.red_table(catalog, test_tbl_data)
         self.pprint_table(data=data)
-        self.assertEqual(data.num_rows, 4)
+
+    def red_table(self, catalog, table_identifier) -> "pa.Table":
+        tbl = catalog.load_table(identifier=table_identifier)
+        data = tbl.scan().to_arrow()
+        self.pprint_table(data)
+        return data

     def pprint_table(self, data):
         print("--- Iceberg Table Content ---")
