@@ -519,6 +519,7 @@ def consolidate_catalog(
         start: TimestampLike | None = None,
         end: TimestampLike | None = None,
         ensure_contiguous_files: bool = True,
+        deduplicate: bool = False,
     ) -> None:
         """
         Consolidate all parquet files across the entire catalog within the specified
@@ -541,6 +542,8 @@ def consolidate_catalog(
             up to the end of time will be considered.
         ensure_contiguous_files : bool, default True
             If True, ensures that files have contiguous timestamps before consolidation.
+        deduplicate : bool, default False
+            If True, removes duplicate rows from the consolidated file.
 
         Notes
         -----
@@ -558,7 +561,13 @@ def consolidate_catalog(
         leaf_directories = self._find_leaf_data_directories()
 
         for directory in leaf_directories:
-            self._consolidate_directory(directory, start, end, ensure_contiguous_files)
+            self._consolidate_directory(
+                directory,
+                start,
+                end,
+                ensure_contiguous_files,
+                deduplicate=deduplicate,
+            )
 
     def consolidate_data(
         self,
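With the new flag threaded through, both catalog-wide and per-dataset consolidation can opt into deduplication. A minimal usage sketch, assuming a catalog instance exposing these methods; the class name, constructor argument, data class, and identifier below are illustrative, not taken from this diff:

    # Hypothetical setup: ParquetDataCatalog and QuoteTick stand in for
    # whatever catalog class and data type the surrounding codebase provides.
    catalog = ParquetDataCatalog("/path/to/catalog")

    # Consolidate every leaf data directory, dropping exact-duplicate rows.
    catalog.consolidate_catalog(deduplicate=True)

    # Or consolidate files for a single data class / instrument pair.
    catalog.consolidate_data(QuoteTick, identifier="EURUSD.SIM", deduplicate=True)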
@@ -567,6 +576,7 @@ def consolidate_data(
         start: TimestampLike | None = None,
         end: TimestampLike | None = None,
         ensure_contiguous_files: bool = True,
+        deduplicate: bool = False,
     ) -> None:
         """
         Consolidate multiple parquet files for a specific data class and instrument ID
@@ -593,6 +603,8 @@ def consolidate_data(
             up to the end of time will be considered.
         ensure_contiguous_files : bool, default True
             If True, ensures that files have contiguous timestamps before consolidation.
+        deduplicate : bool, default False
+            If True, removes duplicate rows from the consolidated file.
 
         Notes
         -----
@@ -604,14 +616,21 @@ def consolidate_data(
 
         """
         directory = self._make_path(data_cls, identifier)
-        self._consolidate_directory(directory, start, end, ensure_contiguous_files)
+        self._consolidate_directory(
+            directory,
+            start,
+            end,
+            ensure_contiguous_files,
+            deduplicate=deduplicate,
+        )
 
     def _consolidate_directory(
         self,
         directory: str,
         start: TimestampLike | None = None,
         end: TimestampLike | None = None,
         ensure_contiguous_files: bool = True,
+        deduplicate: bool = False,
     ) -> None:
         parquet_files = self.fs.glob(os.path.join(directory, "*.parquet"))
         files_to_consolidate = []
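`_consolidate_directory` discovers candidate files through `self.fs`, an fsspec-style filesystem object. A small self-contained sketch of the same glob against the local filesystem, assuming fsspec is available (the directory path is illustrative):

    import os

    import fsspec

    # Local filesystem here; a catalog may equally be backed by s3, gcs, etc.
    fs = fsspec.filesystem("file")
    directory = "/path/to/catalog/data/quote_tick/EURUSD.SIM"
    parquet_files = fs.glob(os.path.join(directory, "*.parquet"))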
@@ -643,20 +662,33 @@ def _consolidate_directory(
             _timestamps_to_filename(intervals[0][0], intervals[-1][1]),
         )
         files_to_consolidate.sort()
-        self._combine_parquet_files(files_to_consolidate, new_file_name)
+        self._combine_parquet_files(files_to_consolidate, new_file_name, deduplicate=deduplicate)
 
-    def _combine_parquet_files(self, file_list: list[str], new_file: str) -> None:
+    def _combine_parquet_files(
+        self,
+        file_list: list[str],
+        new_file: str,
+        deduplicate: bool = False,
+    ) -> None:
         if len(file_list) <= 1:
             return
 
         tables = [pq.read_table(file, memory_map=True, pre_buffer=False) for file in file_list]
         combined_table = pa.concat_tables(tables)
+
+        if deduplicate:
+            combined_table = self._deduplicate_table(combined_table)
+
         pq.write_table(combined_table, where=new_file)
 
         for file in file_list:
             if file != new_file:
                 self.fs.rm(file)
 
+    @staticmethod
+    def _deduplicate_table(table: pa.Table) -> pa.Table:
+        return table.group_by(table.column_names).aggregate([])
+
     def consolidate_catalog_by_period(
         self,
         period: pd.Timedelta = pd.Timedelta(days=1),
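The new `_deduplicate_table` helper uses a standard PyArrow idiom: grouping a table by all of its columns with an empty aggregation list yields one row per unique column combination, i.e. the table with exact-duplicate rows removed. A self-contained demonstration on toy data (column names are illustrative):

    import pyarrow as pa

    # Toy table with one exact duplicate (rows 0 and 2 are identical).
    table = pa.table(
        {
            "ts_event": [1, 2, 1, 3],
            "price": [100.0, 101.0, 100.0, 102.0],
        }
    )

    # Group by every column and aggregate nothing: the group keys are the
    # unique rows, so exact duplicates collapse to a single row.
    deduped = table.group_by(table.column_names).aggregate([])
    assert deduped.num_rows == 3

Note that a group-by does not guarantee the original row order, so time-ordered data may need re-sorting by timestamp after deduplication.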