Skip to content

Commit ee51962

Browse files
authored
Merge pull request #672 from sillsdev/#671_keep_checkpoints
keep_until files for checkpoints
2 parents 852dd2f + 19998ba commit ee51962

File tree

1 file changed

+56
-4
lines changed

1 file changed

+56
-4
lines changed

scripts/clean_s3.py

Lines changed: 56 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import argparse
22
import csv
3+
import datetime
34
import re
45
import time
56
from typing import Tuple
@@ -38,7 +39,7 @@ def clean_research(max_months: int, dry_run: bool) -> Tuple[int, int]:
3839
)
3940
# create a csv filename to store the deleted files that includes the current datetime
4041
output_csv = f"deleted_research_files_{time.strftime('%Y%m%d-%H%M%S')}" + ("_dryrun" if dry_run else "") + ".csv"
41-
return _delete_data(max_months, dry_run, regex_to_delete, output_csv)
42+
return _delete_data(max_months, dry_run, regex_to_delete, output_csv, checkpoint_protection=True)
4243

4344

4445
def clean_production(max_months: int, dry_run: bool) -> Tuple[int, int]:
@@ -48,27 +49,78 @@ def clean_production(max_months: int, dry_run: bool) -> Tuple[int, int]:
4849
return _delete_data(max_months, dry_run, regex_to_delete, output_csv)
4950

5051

51-
def _delete_data(max_months: int, dry_run: bool, regex_to_delete: str, output_csv: str) -> Tuple[int, int]:
52+
def _delete_data(
53+
max_months: int, dry_run: bool, regex_to_delete: str, output_csv: str, checkpoint_protection: bool = False
54+
) -> Tuple[int, int]:
5255
max_age = max_months * MONTH_IN_SECONDS
5356

5457
s3 = boto3.client("s3")
5558
paginator = s3.get_paginator("list_objects_v2")
5659
total_deleted = 0
5760
storage_space_freed = 0
61+
keep_until_dates = {}
62+
# First pass: identify keep_until lock files,
63+
# which must follow the format keep_until_YYYY-MM-DD.lock and be located in the same folder
64+
# as the experiment's config.yml file
65+
for page in paginator.paginate(Bucket="silnlp"):
66+
for obj in page["Contents"]:
67+
s3_filename = obj["Key"]
68+
parts = s3_filename.split("/")
69+
70+
if parts[-1].startswith("keep_until_"):
71+
try:
72+
date_str = parts[-1].split("_")[-1].replace(".lock", "")
73+
keep_until_timestamp = datetime.datetime.strptime(date_str, "%Y-%m-%d").timestamp()
74+
75+
folder_path = "/".join(parts[:-1])
76+
keep_until_dates[folder_path] = keep_until_timestamp
77+
except ValueError:
78+
print(f"Invalid keep_until format in {s3_filename}. Should follow keep_until_YYYY-MM-DD.lock")
79+
5880
with open(output_csv, mode="w", newline="", encoding="utf-8") as csv_file:
5981
csv_writer = csv.writer(csv_file)
6082
if dry_run:
61-
csv_writer.writerow(["Filename", "LastModified", "Eligible for Deletion"])
83+
csv_writer.writerow(["Filename", "LastModified", "Eligible for Deletion", "Extra Info"])
6284
else:
63-
csv_writer.writerow(["Filename", "LastModified", "Deleted"])
85+
csv_writer.writerow(["Filename", "LastModified", "Deleted", "Extra Info"])
6486
for page in paginator.paginate(Bucket="silnlp"):
6587
for obj in page["Contents"]:
6688
s3_filename = obj["Key"]
6789
if regex_to_delete.search(s3_filename) is None:
6890
continue
91+
6992
last_modified = obj["LastModified"].timestamp()
7093
now = time.time()
94+
7195
delete = False
96+
if checkpoint_protection:
97+
parts = s3_filename.split("/")
98+
if len(parts) >= 4:
99+
experiment_folder = "/".join(parts[:-3])
100+
if experiment_folder in keep_until_dates:
101+
protect_until = keep_until_dates[experiment_folder]
102+
if now < protect_until:
103+
print(
104+
f"Skipping {s3_filename} (Experiment '{experiment_folder}' protected until "
105+
f"{datetime.datetime.fromtimestamp(protect_until, tz=datetime.timezone.utc)})"
106+
)
107+
csv_writer.writerow(
108+
[
109+
s3_filename,
110+
last_modified,
111+
delete,
112+
f"Protected until "
113+
f"{datetime.datetime.fromtimestamp(protect_until, tz=datetime.timezone.utc)})",
114+
]
115+
)
116+
continue
117+
else:
118+
raise RuntimeError(
119+
f"Invalide checkpoint path: {s3_filename}. "
120+
f"Either disable checkpoint protection "
121+
f"or double check that only checkpoints are included in the regex_to_delete."
122+
)
123+
72124
if now - last_modified > max_age:
73125
delete = True
74126
print(s3_filename)

0 commit comments

Comments
 (0)