-
Notifications
You must be signed in to change notification settings - Fork 0
/
copycat.py
executable file
·301 lines (258 loc) · 11.1 KB
/
copycat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#!/usr/bin/env python3
import time, os, sys, hashlib, subprocess, glob, queue, shutil
from multiprocessing import Process, Queue
import platform
import sqlite3
import configparser
import json
def Ex(command, config):
    """Run an external command and return its captured output.

    stderr is redirected into stdout, so the returned bytes contain both
    streams interleaved. The command's exit status is not inspected.

    Args:
        command: argument list handed to ``subprocess.Popen``.
        config: object providing ``getboolean('debug')``; when that is
            true the captured output is echoed to the console.

    Returns:
        The captured output of the command as returned by ``communicate()``.
    """
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    out, err = proc.communicate()
    if not config.getboolean('debug'):
        return out
    if out is not None:
        print ("STDOUT: {}".format(out))
    # stderr is merged into stdout above, so communicate() reports None here
    # and this line does not print in practice.
    if err is not None:
        print ("STDERR: {}".format(err))
    return out
def get_free_space_in_dir(dir):
    """Return free-space statistics for the filesystem containing *dir*.

    Args:
        dir: path of any file or directory on the filesystem of interest.

    Returns:
        A dict with the keys
        ``blocks_free``  - free blocks (including those reserved for root),
        ``blocks_avail`` - free blocks available to unprivileged users,
        ``blocksize``    - size in bytes of the unit the block counts use,
        ``inodes_free``  - free inodes,
        ``bytes_free``   - ``blocks_free`` expressed in bytes,
        ``bytes_avail``  - ``blocks_avail`` expressed in bytes.

    Raises:
        OSError: if ``os.statvfs`` fails for *dir*.
    """
    statfs = os.statvfs(dir)
    # POSIX counts f_bfree/f_bavail in units of f_frsize; f_bsize is only the
    # preferred I/O size and differs from f_frsize on e.g. FreeBSD. Fall back
    # to f_bsize if a platform reports no fragment size.
    unit = statfs.f_frsize or statfs.f_bsize
    return {
        'blocks_free': statfs.f_bfree,
        # previously filled from f_bfree, which ignores root-reserved blocks
        'blocks_avail': statfs.f_bavail,
        'blocksize': unit,
        'inodes_free': statfs.f_ffree,
        'bytes_free': statfs.f_bfree * unit,
        'bytes_avail': statfs.f_bavail * unit,
    }
def get_disks(config):
    """Return the sorted list of device paths matching the configured patterns.

    Args:
        config: object whose ``get('patterns')`` yields a JSON-encoded list
            of glob patterns (e.g. ``["/dev/sd?", "/dev/da?"]``).

    Returns:
        Sorted list of matching paths. A path matched by several patterns is
        reported once, so callers never see the same device twice.
    """
    patterns = json.loads(config.get('patterns'))
    found = set()
    for pattern in patterns:
        found.update(glob.glob(pattern))
    return sorted(found)
def get_partitions(disk):
    """Return the sorted paths that start with *disk* but are not *disk* itself.

    Partitions are discovered by globbing ``<disk>*``, so for ``/dev/sda``
    this yields entries such as ``/dev/sda1`` and ``/dev/sda2``.

    Args:
        disk: device path used as the glob prefix.

    Returns:
        Sorted list of matching paths; empty if only *disk* itself matches.
    """
    candidates = glob.glob("{}*".format(disk))
    return sorted(entry for entry in candidates if entry != disk)
def hash_file(file, partial = False):
    """Return the SHA-512 hex digest of *file*.

    Args:
        file: path of the file to hash.
        partial: when true, hash only three 1 MiB samples (start, end and
            middle of the file, fed to the digest in that order) instead of
            the whole content. Files shorter than one sample block cannot be
            sampled that way and are hashed completely instead.

    Returns:
        The hexadecimal SHA-512 digest as a string.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    block_size = 1024 * 1024
    digest = hashlib.sha512()
    with open(file, 'rb') as f:
        if partial:
            size = f.seek(0, os.SEEK_END)
            f.seek(0)
            # Seeking one block back from the end of a shorter file would
            # land before offset 0 and raise; hash such files in full.
            partial = size >= block_size
        if partial:
            digest.update(f.read(block_size))  # start of file
            f.seek(-block_size, os.SEEK_END)
            digest.update(f.read(block_size))  # end of file
            f.seek(size // 2 - block_size // 2)
            digest.update(f.read(block_size))  # middle of file
        else:
            for chunk in iter(lambda: f.read(block_size), b''):
                digest.update(chunk)
    return digest.hexdigest()
def copylink(disk_name, location, subdir, file, backuptimestamp, q, config = None, db = None, numtry = 1):
    """Recreate a symbolic link from the source inside the backup tree.

    The link is not followed: its stored target string is read and a new
    symlink with that same target is created at
    ``<backupdir>/<backuptimestamp>/<disk_name>/<subdir>/<file>``.

    Args:
        disk_name: name of the backup sub-directory for this device.
        location: root directory the source tree lives under.
        subdir: directory of the link, relative to *location*.
        file: name of the symlink.
        backuptimestamp: name of the per-run backup directory.
        q: queue receiving progress messages.
        config: settings object; ``backupdir`` and ``verbose`` are read.
        db: unused here.
        numtry: unused here.
    """
    target_root = os.path.join(config.get('backupdir'), backuptimestamp, disk_name)
    link_target = os.readlink(os.path.join(location, subdir, file))
    new_link = os.path.join(target_root, subdir, file)
    os.makedirs(os.path.join(target_root, subdir), exist_ok=True)
    if config.getboolean('verbose'):
        q.put("linking: {} {} (to {})".format(subdir, file, link_target))
    # -s symbolic, -n do not descend into an existing link to a directory,
    # -f replace an existing destination
    Ex(["ln", "-snf", link_target, new_link], config)
def _try_hash_file(path, partial):
    """Return ``hash_file(path, partial)``, or None if the file is unreadable."""
    try:
        return hash_file(path, partial)
    except OSError:
        return None


def _is_same_file(first, second):
    """Return True if both paths exist and refer to the same file (inode)."""
    try:
        return os.path.samefile(first, second)
    except OSError:
        return False


def copyfile(disk_name, location, subdir, file, backuptimestamp, q, config = None, db = None, numtry = 1):
    """Copy one regular file into the backup tree and verify the copy.

    The file is hashed before and after copying (sampled hash for files
    larger than 32 MiB). On a mismatch, or if either hash cannot be computed,
    the copy is retried; after three failed attempts the file is reported
    through *q* and skipped. A verified copy is recorded in the ``files``
    table. With ``hardlink`` enabled, a file whose hash is already recorded
    is hard-linked to the earlier copy instead of being copied again.

    Args:
        disk_name: name of the backup sub-directory for this device.
        location: root directory the source tree lives under.
        subdir: directory of the file, relative to *location*.
        file: name of the file.
        backuptimestamp: name of the per-run backup directory.
        q: queue receiving progress and error messages.
        config: settings object; ``backupdir``, ``verbose``, ``debug`` and
            ``hardlink`` are read.
        db: database connection holding the ``files`` table.
        numtry: number of the current attempt (1-based); used internally
            when retrying.

    Raises:
        OSError: if the source file cannot be stat'ed or the destination
            directory cannot be created.
    """
    if config.getboolean('verbose'):
        q.put("copying: {} {}".format(subdir, file))
    elif config.getboolean('debug'):
        q.put("DEBUG: copyfile: {} {} {}".format(location, subdir, file))
    src = os.path.join(location, subdir, file)
    if numtry > 3:
        q.put("Could not copy {}".format(src))
        return
    backuplocation = os.path.join(config.get('backupdir'), backuptimestamp, disk_name)
    dest = os.path.join(backuplocation, subdir, file)
    os.makedirs(os.path.join(backuplocation, subdir), exist_ok=True)
    # partial hashing for files bigger than 32 MiB
    src_size = os.stat(src).st_size
    hash_is_partial = src_size > 33554432
    # A read error yields None here, which is handled like a hash mismatch
    # below instead of aborting the whole backup.
    pre_copy_file_hash = _try_hash_file(src, hash_is_partial)
    if pre_copy_file_hash is not None and config.getboolean('hardlink'):
        cur = db.cursor()
        cur.execute("SELECT target FROM files WHERE hash = ? LIMIT 1;", (pre_copy_file_hash,))
        row = cur.fetchone()
        if row is not None:
            existingfile = row[0]
            # Only link to a recorded copy that still exists and has the same
            # size; a sampled hash alone does not prove identical content.
            if os.path.isfile(existingfile) and os.path.getsize(existingfile) == src_size:
                if config.getboolean('debug'):
                    q.put("DEBUG: ln {} {}".format(existingfile, dest))
                Ex(["ln", existingfile, dest], config)
                if _is_same_file(existingfile, dest):
                    return
            # Recorded copy unusable or linking failed: fall through and copy.
    if config.getboolean('debug'):
        q.put("DEBUG: {}".format(" ".join(["cp", "-a", src, dest])))
    Ex(["cp", "-a", src, dest], config)
    post_copy_file_hash = _try_hash_file(dest, hash_is_partial)
    if pre_copy_file_hash is None or post_copy_file_hash is None or pre_copy_file_hash != post_copy_file_hash:
        # file hash does not match (or could not be computed): try again
        copyfile(disk_name, location, subdir, file, backuptimestamp, q, config, db, numtry = numtry + 1)
        return
    if config.getboolean('verbose'):
        q.put("copied: {}".format(file))
    # file hash matches, ensure file is recorded in database
    cur = db.cursor()
    info = (post_copy_file_hash, backuptimestamp, src, dest)
    cur.execute("INSERT INTO files (hash, backuptime, source, target) VALUES (?, ?, ?, ?);", info)
    db.commit()
def backup_dir(disk_name, srcmount, location, backuptimestamp, q, config = None, db = None):
    """Recursively back up one directory of a mounted device.

    Entries are handled by type: symbolic links are re-created via
    ``copylink``, directories are descended into, regular files go through
    ``copyfile``; anything else is ignored. Names starting with a dot are
    skipped when the ``copy_dotfiles`` option is false.

    Args:
        disk_name: name of the backup sub-directory for this device.
        srcmount: directory the device is mounted at.
        location: directory to back up; joined onto *srcmount* (an absolute
            path, as passed by the recursive calls, is used as-is).
        backuptimestamp: name of the per-run backup directory.
        q: queue receiving progress messages.
        config: settings object; ``backupdir`` and ``copy_dotfiles`` are read.
        db: database connection passed on to the copy helpers.
    """
    source_root = os.path.join(srcmount, location)
    target_root = os.path.join(config.get('backupdir'), backuptimestamp, disk_name)
    os.makedirs(target_root, exist_ok=True)

    def relative_subdir():
        # path of `location` relative to the mount point
        if location.startswith(srcmount):
            return location[len(srcmount):].lstrip(os.sep)
        return location.lstrip(os.sep)

    for name in os.listdir(source_root):
        if name in (".", ".."):
            continue
        path = os.path.join(source_root, name)
        if name[0] == "." and config.getboolean("copy_dotfiles") == False:
            # hidden files/dot files are only copied when explicitly enabled
            continue
        if os.path.islink(path):
            # symlinks are re-linked, never followed (checked before isdir/isfile)
            copylink(disk_name, srcmount, relative_subdir(), name, backuptimestamp, q, config, db)
        elif os.path.isdir(path):
            backup_dir(disk_name, srcmount, path, backuptimestamp, q, config, db)
        elif os.path.isfile(path):
            copyfile(disk_name, srcmount, relative_subdir(), name, backuptimestamp, q, config, db)
def backup_part(disk, disktype, backuptimestamp, q, config, db):
    """Mount one device read-only, back up its content and unmount it again.

    The device is mounted at ``<mountdir>/<device name>``. If the mount does
    not succeed, the failure is reported through *q* and the device is
    skipped, so that other partitions of the same disk can still be
    processed.

    Args:
        disk: path of the device to mount (whole disk or partition).
        disktype: label used in messages (``"disk"`` or ``"partition"``).
        backuptimestamp: name of the per-run backup directory.
        q: queue receiving progress and error messages.
        config: settings object; ``mountdir`` is read here.
        db: database connection passed on to the copy helpers.

    Raises:
        OSError: if the mount point cannot be created or removed.
    """
    disk_name = disk.split(os.sep)[-1]
    disklocation = os.path.join(config.get('mountdir'), disk_name)
    # Something still mounted here (e.g. left by an interrupted run) would be
    # indistinguishable from our own mount below, so get rid of it first.
    if os.path.ismount(disklocation):
        Ex(["umount", disklocation], config)
        if os.path.ismount(disklocation):
            q.put("Could not unmount stale mount at {}.".format(disklocation))
            return
    # Create the mount point if needed. Nothing is removed beforehand:
    # os.removedirs would also prune empty parent directories, which other
    # backup processes running in parallel may be about to use.
    os.makedirs(disklocation, exist_ok=True)
    fstypes = None
    if platform.system() == 'FreeBSD':
        # Kernel modules
        # ext2fs: ext2, ext3, ext4 (pkg: fusefs-ext2)
        # fuse,exfat-fuse: exfat (port: fusefs-exfat)
        # fusefs-ntfs: ntfs (pkg: fusefs-ntfs)
        fstypes = "msdosfs,exfat,ntfs"
    q.put("Mount and backup {} {}.".format(disktype, disk))
    mount_command = ["mount"]
    if fstypes is not None:
        # Mount with specific fstypes enabled (otherwise: fstype autodetected)
        mount_command += ["-t", fstypes]
    mount_command += ["-o", "ro", disk, disklocation]
    Ex(mount_command, config)
    time.sleep(1)
    # Ex() does not report the exit status, so check the result directly;
    # otherwise a failed mount would "back up" the empty mount point.
    if not os.path.ismount(disklocation):
        q.put("Could not mount {} {}.".format(disktype, disk))
        try:
            os.rmdir(disklocation)
        except OSError:
            # best effort: a non-empty directory is simply left in place
            pass
        return
    try:
        backup_dir(disk_name, disklocation, "", backuptimestamp, q, config, db)
    finally:
        Ex(["umount", disklocation], config)
        os.rmdir(disklocation)
def backup(disk, q, config, db):
    """Back up every partition of *disk*, or the disk itself if it has none.

    All devices handled by one call share a single timestamp, taken when the
    call starts, which names the per-run backup directory.

    Args:
        disk: path of the whole-disk device.
        q: queue receiving progress messages.
        config: settings object passed on to ``backup_part``.
        db: database connection passed on to ``backup_part``.
    """
    backuptimestamp = time.strftime("%Y-%m-%d_%H_%M-%S")
    partitions = get_partitions(disk)
    # no partitions found: treat the bare device as the filesystem to mount
    targets = [(part, "partition") for part in partitions] or [(disk, "disk")]
    for device, kind in targets:
        backup_part(device, kind, backuptimestamp, q, config, db)
if __name__ == '__main__':
    # Watch for newly attached storage devices and back each one up in its
    # own process. Settings come from config.ini next to this script, layered
    # over the defaults below.
    cp = configparser.ConfigParser(default_section='copycat')
    # default options (configparser stores every value as a string)
    cp['copycat'] = {
        'backupdir': "/mnt/copycat",
        'mountdir': "/media/copycat",
        'patterns': json.dumps(["/dev/sd?", "/dev/mmcblk?", "/dev/da?", "/dev/ada?"]),
        'blacklist': "",
        'hardlink': "yes",
        'min_free_inodes': 8*1024,
        'min_free_mib': 10*1024,
        'copy_dotfiles': "no",
        'debug': "no",
        'verbose': "yes",
    }
    configpath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')
    cp.read(configpath)
    config = cp['copycat']

    # Parse the blacklist once into a set of exact device paths. Accepted
    # forms: a JSON list (like 'patterns') or paths separated by commas
    # and/or whitespace. Testing `disk in <raw string>` would be a substring
    # match and e.g. skip /dev/sda because "/dev/sda1" is listed.
    raw_blacklist = config.get('blacklist').strip()
    try:
        parsed_blacklist = json.loads(raw_blacklist)
    except ValueError:
        parsed_blacklist = None
    if isinstance(parsed_blacklist, list):
        blacklist = {str(entry) for entry in parsed_blacklist}
    else:
        blacklist = set(raw_blacklist.replace(",", " ").split())

    processes = []
    q = Queue()
    last_disks = get_disks(config)
    print ("CopyCat is ready. Insert your storage devices!")
    if config.getboolean('debug'):
        print ("Disks already there at startup: {}".format(last_disks))
    # ensure backup directory exists
    os.makedirs(config.get('backupdir'), exist_ok=True)
    # NOTE(review): this connection is handed to the backup child processes
    # below. SQLite advises against using a connection across fork(), and the
    # object cannot be pickled for the spawn/forkserver start methods --
    # consider opening the database inside the child instead.
    db = sqlite3.connect(os.path.join(config.get('backupdir'), 'files.db'))
    # ensure table is present
    cur = db.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS files (hash TEXT, backuptime TEXT, source TEXT, target TEXT);")
    db.commit()
    while True:
        time.sleep(3)
        current_disks = get_disks(config)
        # show current state of disk list
        if config.getboolean('debug'):
            print ("Disks known: {}".format(current_disks))
        # check for enough free space (warn only, backups still start)
        free_space = get_free_space_in_dir(config.get('backupdir'))
        # warn below 'min_free_inodes' free inodes
        if free_space['inodes_free'] < config.getint('min_free_inodes'):
            print ("WARNING: only {} free inodes for backuptarget {}!".format(free_space['inodes_free'], config.get('backupdir')))
        # warn below 'min_free_mib' MiB available
        free_mib = free_space['bytes_avail'] / 1024 / 1024
        if free_mib < config.getint('min_free_mib'):
            print ("WARNING: only {} MiB free for backuptarget {}!".format(free_mib, config.get('backupdir')))
        # start a backup for every disk that appeared since the last round
        for disk in current_disks:
            if disk in last_disks or disk in blacklist:
                continue
            # give the device a moment to settle and make sure it is still there
            time.sleep(3)
            recheck_disks = get_disks(config)
            if disk in recheck_disks:
                print ("Starting backup of disk {}.".format(disk))
                p = Process(target=backup, args=(disk, q, config, db))
                p.start()
                processes.append((disk, p))
        # print everything the backup processes have reported so far
        try:
            while True:
                message = q.get(block=False)
                print (message)
        except queue.Empty:
            pass
        # reap finished backup processes
        still_running = []
        for disk, process in processes:
            process.join(timeout=1)
            exitcode = process.exitcode
            if exitcode is None:
                still_running.append((disk, process))
                continue
            # flush written data to the backup target before reporting
            Ex(['sync'], config)
            if exitcode == 0:
                print ("Backup of disk {} has finished.".format(disk))
            elif exitcode < 0:
                print ("Backup process died from signal {}".format(exitcode))
            else:
                print ("Backup process terminated with exit code {}".format(exitcode))
        processes = still_running
        last_disks = current_disks