-import unittest
+import io
+import threading
+import time
 
 import pandas as pd
+import pyarrow as pa
+import pyarrow.json as pj
+import waiting
 from pyiceberg.catalog import load_catalog
-from pyiceberg.schema import Schema
-from pyiceberg.types import LongType, NestedField, StringType
 
 from base_postgresql_test import BasePostgresqlTest
 from catalog_rest import CatalogRestContainer
@@ -24,99 +27,99 @@ def setUp(self):
         self.S3MiNIO.start()
         self.RESTCATALOG.start(s3_endpoint=self.S3MiNIO.endpoint())
         # Set pandas options to display all rows and columns, and prevent truncation of cell content
-        pd.set_option('display.max_rows', None)  # Show all rows
-        pd.set_option('display.max_columns', None)  # Show all columns
-        pd.set_option('display.width', None)  # Auto-detect terminal width
-        pd.set_option('display.max_colwidth', None)  # Do not truncate cell contents
+        pd.set_option('display.max_rows', None)       # Show all rows
+        pd.set_option('display.max_columns', None)    # Show all columns
+        pd.set_option('display.width', None)          # Auto-detect terminal width
+        pd.set_option('display.max_colwidth', None)   # Do not truncate cell contents
 
     def tearDown(self):
         self.SOURCEPGDB.stop()
         self.S3MiNIO.stop()
+        self.RESTCATALOG.stop()
         self.clean_offset_file()
 
-    @unittest.skip
-    def test_iceberg_catalog(self):
-        conf = {
-            "uri": self.RESTCATALOG.get_uri(),
-            # "s3.path-style.access": "true",
-            "warehouse": "warehouse",
-            "s3.endpoint": self.S3MiNIO.endpoint(),
-            "s3.access-key-id": S3Minio.AWS_ACCESS_KEY_ID,
-            "s3.secret-access-key": S3Minio.AWS_SECRET_ACCESS_KEY,
-        }
-        print(conf)
-        catalog = load_catalog(
-            name="rest",
-            **conf
-        )
-        catalog.create_namespace('my_warehouse')
-        debezium_event_schema = Schema(
-            NestedField(field_id=1, name="id", field_type=LongType(), required=True),
-            NestedField(field_id=2, name="data", field_type=StringType(), required=False),
-        )
-        table = catalog.create_table(identifier=("my_warehouse", "test_table",), schema=debezium_event_schema)
-        print(f"Created iceberg table {table.refs()}")
+    def test_read_json_lines_example(self):
+        json_data = """
+        {"id": 1, "name": "Alice", "age": 30}
+        {"id": 2, "name": "Bob", "age": 24}
+        {"id": 3, "name": "Charlie", "age": 35}
+        """.strip()  # .strip() removes leading/trailing whitespace/newlines
+        json_buffer = io.BytesIO(json_data.encode('utf-8'))
+        json_buffer.seek(0)
+        # =============================
+        table_inferred = pj.read_json(json_buffer)
+        print("\nInferred Schema:")
+        print(table_inferred.schema)
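+        # with no schema given, pyarrow infers the column types from the data
+        # (expected here: id -> int64, name -> string, age -> int64)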
+        # =============================
+        explicit_schema = pa.schema([
+            pa.field('id', pa.int64()),     # Integer type for 'id'
+            pa.field('name', pa.string()),  # String type for 'name'
+        ])
+        json_buffer.seek(0)  # rewind before reading the same buffer a second time
+        po = pj.ParseOptions(explicit_schema=explicit_schema)
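+        # note: fields absent from explicit_schema (here 'age') are still type-inferred by default;
+        # ParseOptions(..., unexpected_field_behavior='ignore') would drop them instead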
+        table_explicit = pj.read_json(json_buffer, parse_options=po)
+        print("\nExplicit Schema:")
+        print(table_explicit.schema)
+
+    def _apply_source_db_changes(self):
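+        # give the engine time to complete the initial snapshot before the changes below
+        # arrive through the CDC stream (assumption: 12s suffices in this environment)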
+        time.sleep(12)
+        self.execute_on_source_db("UPDATE inventory.customers SET first_name='George__UPDATE1' WHERE id = 1002;")
+        # self.execute_on_source_db("ALTER TABLE inventory.customers DROP COLUMN email;")
+        self.execute_on_source_db("UPDATE inventory.customers SET first_name='George__UPDATE2' WHERE id = 1002;")
+        self.execute_on_source_db("DELETE FROM inventory.orders WHERE purchaser = 1002;")
+        self.execute_on_source_db("DELETE FROM inventory.customers WHERE id = 1002;")
+        self.execute_on_source_db("ALTER TABLE inventory.customers ADD birth_date date;")
72+ self .execute_on_source_db ("UPDATE inventory.customers SET birth_date = '2020-01-01' WHERE id = 1001 ;" )
5973
     def test_iceberg_handler(self):
-        dest_ns1_database = "my_warehouse"
-        dest_ns2_schema = "dbz_cdc_data"
-        conf = {
+        dest_ns1_database = "my_warehouse"
+        dest_ns2_schema = "dbz_cdc_data"
+        catalog_conf = {
             "uri": self.RESTCATALOG.get_uri(),
-            # "s3.path-style.access": "true",
             "warehouse": "warehouse",
             "s3.endpoint": self.S3MiNIO.endpoint(),
             "s3.access-key-id": S3Minio.AWS_ACCESS_KEY_ID,
             "s3.secret-access-key": S3Minio.AWS_SECRET_ACCESS_KEY,
         }
-        catalog = load_catalog(name="rest",**conf)
-
+        catalog = load_catalog(name="rest", **catalog_conf)
         handler = IcebergChangeHandlerV2(catalog=catalog,
                                          destination_namespace=(dest_ns1_database, dest_ns2_schema,),
                                          event_flattening_enabled=True
                                          )
-
         dbz_props = self.debezium_engine_props(unwrap_messages=True)
         engine = DebeziumJsonEngine(properties=dbz_props, handler=handler)
-        with self.assertLogs(IcebergChangeHandlerV2.LOGGER_NAME, level='INFO') as cm:
-            # run async then interrupt after timeout time to test the result!
-            Utils.run_engine_async(engine=engine, timeout_sec=44)
 
-        # for t in cm.output:
-        #     print(t)
-        self.assertRegex(text=str(cm.output), expected_regex='.*Created iceberg table.*')
-        self.assertRegex(text=str(cm.output), expected_regex='.*Appended.*records to table.*')
+        # apply the source-database changes from a background thread while the engine runs
+        t = threading.Thread(target=self._apply_source_db_changes)
+        t.start()
+        Utils.run_engine_async(engine=engine, timeout_sec=77, blocking=False)
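+        # the engine now runs non-blocking in the background; the waits below poll the catalog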
 
-        # catalog.create_namespace(dest_ns1_database)
-        namespaces = catalog.list_namespaces()
-        self.assertIn((dest_ns1_database,), namespaces, msg="Namespace not found in catalog")
+        test_ns = (dest_ns1_database,)
+        print(catalog.list_namespaces())
+        waiting.wait(predicate=lambda: test_ns in catalog.list_namespaces(), timeout_seconds=7.5)
 
-        tables = catalog.list_tables((dest_ns1_database, dest_ns2_schema,))
-        print(tables)
-        self.assertIn(('my_warehouse', 'dbz_cdc_data', 'testc_inventory_customers'), tables, msg="Namespace not found in catalog")
+        test_tbl = ('my_warehouse', 'dbz_cdc_data', 'testc_inventory_customers')
+        test_tbl_ns = (dest_ns1_database, dest_ns2_schema,)
+        waiting.wait(predicate=lambda: test_tbl in catalog.list_tables(test_tbl_ns), timeout_seconds=10.5)
 
-        tbl = catalog.load_table(identifier=('my_warehouse', 'dbz_cdc_data', 'testc_inventory_customers'))
-        data = tbl.scan().to_arrow()
-        self.assertIn("[email protected]", str(data))
-        self.assertIn("[email protected]", str(data))
-        self.assertEqual(data.num_rows, 4)
+        test_tbl_data = ('my_warehouse', 'dbz_cdc_data', 'testc_inventory_customers')
+        waiting.wait(predicate=lambda: "[email protected]" in str(self.red_table(catalog, test_tbl_data)),
+                     timeout_seconds=10.5)
+        waiting.wait(predicate=lambda: self.red_table(catalog, test_tbl_data).num_rows >= 4, timeout_seconds=10.5)
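+        # the initial snapshot of inventory.customers is expected to contain 4 rows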
+
+        data = self.red_table(catalog, test_tbl_data)
         self.pprint_table(data=data)
-        #=================================================================
-        ## ==== PART 2 CONSUME CHANGES FROM BINLOG =======================
-        #=================================================================
-        self.execute_on_source_db("UPDATE inventory.customers SET first_name='George__UPDATE1' WHERE ID = 1002;")
-        # self.execute_on_source_db("ALTER TABLE inventory.customers DROP COLUMN email;")
-        self.execute_on_source_db("UPDATE inventory.customers SET first_name='George__UPDATE2' WHERE ID = 1002;")
-        self.execute_on_source_db("DELETE FROM inventory.orders WHERE purchaser = 1002;")
-        self.execute_on_source_db("DELETE FROM inventory.customers WHERE id = 1002;")
-        self.execute_on_source_db("ALTER TABLE inventory.customers ADD birth_date date;")
-        self.execute_on_source_db("UPDATE inventory.customers SET birth_date = '2020-01-01' WHERE id = 1001;")
-        # run
-        Utils.run_engine_async(engine=engine, timeout_sec=44)
-        # test
-        # @TODO test that new field is received and added to iceberg!
-        data = tbl.scan().to_arrow()
+        # =================================================================
+        # ==== PART 2: CONSUME CHANGES FROM THE CDC STREAM ================
+        # =================================================================
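+        # the CDC events emitted by _apply_source_db_changes are appended to the table,
+        # growing it beyond the 4 snapshot rows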
+        waiting.wait(predicate=lambda: self.red_table(catalog, test_tbl_data).num_rows >= 7, timeout_seconds=77)
+        data = self.red_table(catalog, test_tbl_data)
         self.pprint_table(data=data)
-        self.assertEqual(data.num_rows, 4)
+
+    def red_table(self, catalog, table_identifier) -> "pa.Table":
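+        # reload the table from the catalog on every call so that wait() predicates
+        # observe the latest committed snapshot rather than a cached scan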
+        tbl = catalog.load_table(identifier=table_identifier)
+        data = tbl.scan().to_arrow()
+        self.pprint_table(data)
+        return data
 
     def pprint_table(self, data):
         print("--- Iceberg Table Content ---")