 import argparse
 import csv
+import datetime
 import re
 import time
 from typing import Tuple
@@ -38,7 +39,8 @@ def clean_research(max_months: int, dry_run: bool) -> Tuple[int, int]:
     )
     # create a csv filename that includes the current datetime to record the deleted files
     output_csv = f"deleted_research_files_{time.strftime('%Y%m%d-%H%M%S')}" + ("_dryrun" if dry_run else "") + ".csv"
-    return _delete_data(max_months, dry_run, regex_to_delete, output_csv)
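+    # Research cleanup honors keep_until_*.lock files; production cleanup (below) does not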
+    return _delete_data(max_months, dry_run, regex_to_delete, output_csv, checkpoint_protection=True)
 
 
 def clean_production(max_months: int, dry_run: bool) -> Tuple[int, int]:
@@ -48,27 +50,84 @@ def clean_production(max_months: int, dry_run: bool) -> Tuple[int, int]:
     return _delete_data(max_months, dry_run, regex_to_delete, output_csv)
 
 
-def _delete_data(max_months: int, dry_run: bool, regex_to_delete: str, output_csv: str) -> Tuple[int, int]:
+def _delete_data(
+    max_months: int, dry_run: bool, regex_to_delete: str, output_csv: str, checkpoint_protection: bool = False
+) -> Tuple[int, int]:
     max_age = max_months * MONTH_IN_SECONDS
 
     s3 = boto3.client("s3")
     paginator = s3.get_paginator("list_objects_v2")
     total_deleted = 0
     storage_space_freed = 0
+    keep_until_dates = {}
+    # First pass: identify keep_until files, which must follow the format
+    # keep_until_YYYY-MM-DD.lock and be located in the same folder
+    # as the experiment's config.yml file
+    for page in paginator.paginate(Bucket="silnlp"):
+        for obj in page.get("Contents", []):  # a page may lack "Contents" if the prefix is empty
+            s3_filename = obj["Key"]
+            parts = s3_filename.split("/")
+
+            if parts[-1].startswith("keep_until_"):
+                try:
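+                    # e.g. "keep_until_2025-12-31.lock" -> "2025-12-31"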
+                    date_str = parts[-1].split("_")[-1].replace(".lock", "")
+                    keep_until_timestamp = datetime.datetime.strptime(date_str, "%Y-%m-%d").timestamp()
+
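+                    # Protect the whole folder that contains the lock file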
+                    folder_path = "/".join(parts[:-1])
+                    keep_until_dates[folder_path] = keep_until_timestamp
+                except ValueError:
+                    print(f"Invalid keep_until format in {s3_filename}. Should follow keep_until_YYYY-MM-DD.lock")
+
     with open(output_csv, mode="w", newline="", encoding="utf-8") as csv_file:
         csv_writer = csv.writer(csv_file)
         if dry_run:
-            csv_writer.writerow(["Filename", "LastModified", "Eligible for Deletion"])
+            csv_writer.writerow(["Filename", "LastModified", "Eligible for Deletion", "Extra Info"])
         else:
-            csv_writer.writerow(["Filename", "LastModified", "Deleted"])
+            csv_writer.writerow(["Filename", "LastModified", "Deleted", "Extra Info"])
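+        # Second pass: evaluate every object matching regex_to_delete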
6486 for page in paginator .paginate (Bucket = "silnlp" ):
6587 for obj in page ["Contents" ]:
6688 s3_filename = obj ["Key" ]
6789 if regex_to_delete .search (s3_filename ) is None :
6890 continue
91+
6992 last_modified = obj ["LastModified" ].timestamp ()
7093 now = time .time ()
94+
7195 delete = False
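+                # Honor keep_until locks: skip files whose experiment is still protected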
+                if checkpoint_protection:
+                    parts = s3_filename.split("/")
+                    if len(parts) >= 4:
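+                        # Checkpoint keys are assumed to sit three levels below the
+                        # experiment folder (the folder holding config.yml and any lock file)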
+                        experiment_folder = "/".join(parts[:-3])
+                        if experiment_folder in keep_until_dates:
+                            protect_until = keep_until_dates[experiment_folder]
+                            if now < protect_until:
+                                print(
+                                    f"Skipping {s3_filename} (Experiment '{experiment_folder}' protected until "
+                                    f"{datetime.datetime.fromtimestamp(protect_until, tz=datetime.timezone.utc)})"
+                                )
+                                csv_writer.writerow(
+                                    [
+                                        s3_filename,
+                                        last_modified,
+                                        delete,
+                                        "Protected until "
+                                        f"{datetime.datetime.fromtimestamp(protect_until, tz=datetime.timezone.utc)}",
+                                    ]
+                                )
+                                continue
+                    else:
+                        raise RuntimeError(
+                            f"Invalid checkpoint path: {s3_filename}. "
+                            "Either disable checkpoint protection "
+                            "or double-check that only checkpoints are included in regex_to_delete."
+                        )
+
             if now - last_modified > max_age:
                 delete = True
                 print(s3_filename)