-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathremove_duplicates.py
More file actions
45 lines (32 loc) · 1.38 KB
/
remove_duplicates.py
File metadata and controls
45 lines (32 loc) · 1.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import hashlib
import os
# Directory whose immediate entries are processed by the loop at the bottom of
# this script; each entry is passed to remove_duplicate_files_in_subfolder.
# NOTE(review): hard-coded absolute path that looks machine-specific — confirm
# or adjust before running elsewhere.
root_dir: str = "/Users/dvir/projects/fast_eml_parse/output/carmen"
# Progress counter reported by the main loop's "Done subfolder N" message.
counter: int = 0
def calculate_file_hash(file_path):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is opened in binary mode and fed to the hasher in fixed-size
    chunks, so memory use stays bounded no matter how large the file is.
    The resulting digest is identical to hashing the whole content at once.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The hexadecimal SHA-256 digest of the file's contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_size = 1024 * 1024  # 1 MiB per read
    hasher = hashlib.sha256()
    with open(file_path, "rb") as file:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF).
        for chunk in iter(lambda: file.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
def remove_duplicate_files_in_subfolder(subfolder_path):
    """Delete files under *subfolder_path* whose content duplicates an earlier file.

    The tree is walked recursively. Every regular file is hashed with
    ``calculate_file_hash``; the first file seen for a given hash is kept and
    any later file with the same hash is removed, with its path printed.
    Directories and file names are visited in sorted order, so the copy that
    survives is the same on every run.

    Symbolic links are never hashed or removed. A link hashes to the content
    of its target, so treating it as a file could delete the real file as a
    "duplicate" of the link and leave the link dangling; a broken link would
    also make the hash step fail.

    Args:
        subfolder_path: Root of the directory tree to de-duplicate.

    Raises:
        OSError: If a file cannot be read or removed.
    """
    # Maps content hash -> path of the first file seen with that content.
    first_seen = {}
    for root, dirs, files in os.walk(subfolder_path):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        for name in sorted(files):
            file_path = os.path.join(root, name)
            if os.path.islink(file_path):
                continue
            file_hash = calculate_file_hash(file_path)
            if file_hash in first_seen:
                # Same content as a file already kept: this one is the duplicate.
                os.remove(file_path)
                print(f"removed file - {file_path}")
            else:
                first_seen[file_hash] = file_path
# De-duplicate each immediate subdirectory of root_dir independently.
# Plain files sitting directly in root_dir (which os.listdir also returns) are
# skipped so the progress counter reports real subfolders only; names are
# sorted so the processing order is stable between runs.
# NOTE: this loop runs at module level, i.e. also when the file is imported.
for counter, subfolder in enumerate(
    sorted(
        entry
        for entry in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, entry))
    ),
    start=1,
):
    subfolder_path = os.path.join(root_dir, subfolder)
    remove_duplicate_files_in_subfolder(subfolder_path)
    print(f"Done subfolder {counter}")