@@ -162,7 +162,7 @@ def _diff_tables_root(self, table1: TableSegment, table2: TableSegment, info_tre
162
162
yield from self ._diff_segments (None , table1 , table2 , info_tree , None )
163
163
else :
164
164
yield from self ._bisect_and_diff_tables (table1 , table2 , info_tree )
165
- logger .info ("Diffing complete" )
165
+ logger .info (f "Diffing complete: { table1 . table_path } <> { table2 . table_path } " )
166
166
if self .materialize_to_table :
167
167
logger .info ("Materialized diff to table '%s'." , "." .join (self .materialize_to_table ))
168
168
@@ -193,8 +193,8 @@ def _diff_segments(
193
193
partial (self ._collect_stats , 1 , table1 , info_tree ),
194
194
partial (self ._collect_stats , 2 , table2 , info_tree ),
195
195
partial (self ._test_null_keys , table1 , table2 ),
196
- partial (self ._sample_and_count_exclusive , db , diff_rows , a_cols , b_cols ),
197
- partial (self ._count_diff_per_column , db , diff_rows , list (a_cols ), is_diff_cols ),
196
+ partial (self ._sample_and_count_exclusive , db , diff_rows , a_cols , b_cols , table1 , table2 ),
197
+ partial (self ._count_diff_per_column , db , diff_rows , list (a_cols ), is_diff_cols , table1 , table2 ),
198
198
partial (
199
199
self ._materialize_diff ,
200
200
db ,
@@ -205,8 +205,8 @@ def _diff_segments(
205
205
else None ,
206
206
):
207
207
assert len (a_cols ) == len (b_cols )
208
- logger .debug ("Querying for different rows" )
209
- diff = db .query (diff_rows , list )
208
+ logger .debug (f "Querying for different rows: { table1 . table_path } " )
209
+ diff = db .query (diff_rows , list , log_message = table1 . table_path )
210
210
info_tree .info .set_diff (diff , schema = tuple (diff_rows .schema .items ()))
211
211
for is_xa , is_xb , * x in diff :
212
212
if is_xa and is_xb :
@@ -227,7 +227,7 @@ def _diff_segments(
227
227
yield "+" , tuple (b_row )
228
228
229
229
def _test_duplicate_keys (self , table1 : TableSegment , table2 : TableSegment ):
230
- logger .debug ("Testing for duplicate keys" )
230
+ logger .debug (f "Testing for duplicate keys: { table1 . table_path } <> { table2 . table_path } " )
231
231
232
232
# Test duplicate keys
233
233
for ts in [table1 , table2 ]:
@@ -240,24 +240,24 @@ def _test_duplicate_keys(self, table1: TableSegment, table2: TableSegment):
240
240
241
241
unvalidated = list (set (key_columns ) - set (unique ))
242
242
if unvalidated :
243
- logger .info (f"Validating that the are no duplicate keys in columns: { unvalidated } " )
243
+ logger .info (f"Validating that the are no duplicate keys in columns: { unvalidated } for { ts . table_path } " )
244
244
# Validate that there are no duplicate keys
245
245
self .stats ["validated_unique_keys" ] = self .stats .get ("validated_unique_keys" , []) + [unvalidated ]
246
246
q = t .select (total = Count (), total_distinct = Count (Concat (this [unvalidated ]), distinct = True ))
247
- total , total_distinct = ts .database .query (q , tuple )
247
+ total , total_distinct = ts .database .query (q , tuple , log_message = ts . table_path )
248
248
if total != total_distinct :
249
249
raise ValueError ("Duplicate primary keys" )
250
250
251
251
def _test_null_keys (self , table1 , table2 ):
252
- logger .debug ("Testing for null keys" )
252
+ logger .debug (f "Testing for null keys: { table1 . table_path } <> { table2 . table_path } " )
253
253
254
254
# Test null keys
255
255
for ts in [table1 , table2 ]:
256
256
t = ts .make_select ()
257
257
key_columns = ts .key_columns
258
258
259
259
q = t .select (* this [key_columns ]).where (or_ (this [k ] == None for k in key_columns ))
260
- nulls = ts .database .query (q , list )
260
+ nulls = ts .database .query (q , list , log_message = ts . table_path )
261
261
if nulls :
262
262
if self .skip_null_keys :
263
263
logger .warning (
@@ -267,7 +267,7 @@ def _test_null_keys(self, table1, table2):
267
267
raise ValueError (f"NULL values in one or more primary keys of { ts .table_path } " )
268
268
269
269
def _collect_stats (self , i , table_seg : TableSegment , info_tree : InfoTree ):
270
- logger .debug (f"Collecting stats for table #{ i } " )
270
+ logger .debug (f"Collecting stats for table #{ i } : { table_seg . table_path } " )
271
271
db = table_seg .database
272
272
273
273
# Metrics
@@ -288,7 +288,7 @@ def _collect_stats(self, i, table_seg: TableSegment, info_tree: InfoTree):
288
288
)
289
289
col_exprs ["count" ] = Count ()
290
290
291
- res = db .query (table_seg .make_select ().select (** col_exprs ), tuple )
291
+ res = db .query (table_seg .make_select ().select (** col_exprs ), tuple , log_message = table_seg . table_path )
292
292
293
293
for col_name , value in safezip (col_exprs , res ):
294
294
if value is not None :
@@ -303,7 +303,7 @@ def _collect_stats(self, i, table_seg: TableSegment, info_tree: InfoTree):
303
303
else :
304
304
self .stats [stat_name ] = value
305
305
306
- logger .debug ("Done collecting stats for table #%s" , i )
306
+ logger .debug ("Done collecting stats for table #%s: %s " , i , table_seg . table_path )
307
307
308
308
def _create_outer_join (self , table1 , table2 ):
309
309
db = table1 .database
@@ -334,23 +334,46 @@ def _create_outer_join(self, table1, table2):
334
334
diff_rows = all_rows .where (or_ (this [c ] == 1 for c in is_diff_cols ))
335
335
return diff_rows , a_cols , b_cols , is_diff_cols , all_rows
336
336
337
def _count_diff_per_column(
    self,
    db,
    diff_rows,
    cols,
    is_diff_cols,
    table1: Optional[TableSegment] = None,
    table2: Optional[TableSegment] = None,
):
    """Count differing rows per column and store the totals in ``self.stats``.

    Runs a single query that applies ``sum_`` to every column named in
    ``is_diff_cols`` over ``diff_rows``, pairs the resulting totals with
    ``cols`` and writes the mapping to ``self.stats["diff_counts"]``.

    Args:
        db: Database object used to run the query.
        diff_rows: Query expression selecting the differing rows.
        cols: Column names reported in the result, in the same order as
            ``is_diff_cols``.
        is_diff_cols: Names of the per-column "is different" flag columns.
        table1: Optional first table segment; used only to label log output.
        table2: Optional second table segment; used only to label log output.
    """
    # Both segments are optional, so build the label only from the ones
    # that were actually supplied instead of dereferencing None.
    label = " <> ".join(f"{seg.table_path}" for seg in (table1, table2) if seg is not None)

    query_kwargs = {}
    if label:
        logger.debug("Counting differences per column: %s", label)
        query_kwargs["log_message"] = label
    else:
        logger.debug("Counting differences per column")

    is_diff_cols_counts = db.query(
        diff_rows.select(sum_(this[c]) for c in is_diff_cols),
        tuple,
        **query_kwargs,
    )

    diff_counts = {}
    for name, count in safezip(cols, is_diff_cols_counts):
        # A falsy total (e.g. None) is counted as 0; a name that appears
        # more than once in ``cols`` accumulates into a single entry.
        diff_counts[name] = diff_counts.get(name, 0) + (count or 0)
    self.stats["diff_counts"] = diff_counts
344
357
345
- def _sample_and_count_exclusive (self , db , diff_rows , a_cols , b_cols ):
358
+ def _sample_and_count_exclusive (
359
+ self ,
360
+ db ,
361
+ diff_rows ,
362
+ a_cols ,
363
+ b_cols ,
364
+ table1 : Optional [TableSegment ] = None ,
365
+ table2 : Optional [TableSegment ] = None ,
366
+ ):
346
367
if isinstance (db , (Oracle , MsSQL )):
347
368
exclusive_rows_query = diff_rows .where ((this .is_exclusive_a == 1 ) | (this .is_exclusive_b == 1 ))
348
369
else :
349
370
exclusive_rows_query = diff_rows .where (this .is_exclusive_a | this .is_exclusive_b )
350
371
351
372
if not self .sample_exclusive_rows :
352
- logger .debug ("Counting exclusive rows" )
353
- self .stats ["exclusive_count" ] = db .query (exclusive_rows_query .count (), int )
373
+ logger .debug (f"Counting exclusive rows: { table1 .table_path } <> { table2 .table_path } " )
374
+ self .stats ["exclusive_count" ] = db .query (
375
+ exclusive_rows_query .count (), int , log_message = f"{ table1 .table_path } <> { table2 .table_path } "
376
+ )
354
377
return
355
378
356
379
logger .info ("Counting and sampling exclusive rows" )
0 commit comments