diff --git a/README b/README
index 07a6777..cddcda5 100644
--- a/README
+++ b/README
@@ -1,5 +1,34 @@
-Ultra Fast Duplicate Files Finder
+=================================
+Ultra Fast Duplicate Files Finder
 =================================
  by Gautier Portet
+forked and extended by Gabriel Reyla
+
+The extended version works on Windows too:
+------------------------------------------------------------------------------
+$ python fileWalker.py
+which folder would you like to find the duplicates in?
+.
+.\fileWalker.py
+      size filename
+      1306 .\README
+      1306 .\test\README
+      1306 .\test\README_2
+      1306 .\test\README_3
+
+7 files checked (15.24 KiB), 1 duplicates (3.83 KiB).
+would you like to remove all duplicates?
+yes
+keeping: .\README
+duplicate: .\test\README
+?y
+deleting: .\test\README
+duplicate: .\test\README_2
+?n
+duplicate: .\test\README_3
+?n
+>>>
+------------------------------------------------------------------------------
+
-Takes a list of file from stdin.
-And print the duplicate ones.
+On Unix systems the program can be used as follows:
+it takes a list of files from stdin and prints the duplicate ones.
diff --git a/UltraFastDuplicateFilesFinder.py b/UltraFastDuplicateFilesFinder.py
index 8e7fe78..58c7bbc 100644
--- a/UltraFastDuplicateFilesFinder.py
+++ b/UltraFastDuplicateFilesFinder.py
@@ -46,7 +46,7 @@ def get_file_hash(filename, limit_size=None, buffer_size=BUFFER_SIZE):
     """
     # open file
     try:
-        f = file(filename, "rb")
+        f = open(filename, "rb")  # the file() builtin is gone in Python 3
     except IOError:
         return 'NONE'
@@ -78,7 +78,7 @@ def check_file(filename):
     Compare the given file to our lists of hashes
     """
     # compute md5
-    h = get_file_hash(filename)
+    h = get_file_hash(filename, CHUNK_SIZE//2)
 
     # increase count
     i = hashcount.get(h, 0)
@@ -105,46 +105,50 @@ def humanize_size(size):
         if hsize > 0.5:
             return '%.2f %s' % (hsize, suffix)
 
-# we start here by checking all files
-for filename in sys.stdin:
-    filename = filename.strip()
-
-    check_file(filename)
-    totalfiles += 1
-    totalsize += os.path.getsize(filename)
-
-# print the report
-print '%10s %s' % ('size', 'filename')
-
-for h, f in hashlist.iteritems():
-    if hashcount[h] < 2:
-        # present one time, skip
-        continue
-
-    # reference file
-    refsize = os.path.getsize(f[0])
-    refmd5 = get_file_hash(f[0])
-    print '%10d %s' % (refsize, f[0])
-
-
-    for filename in f[1:]:
-        # and its copies
-        size = os.path.getsize(filename)
-        md5 = get_file_hash(filename)
-
-        status = ' '
-        msg = ''
-        if md5 != refmd5:
-            status = '!'
-            msg = ' partial match only!'
-
-        print '%10d %s %s%s' % (size, status, filename, msg)
-        dupsize += size
-        dupfiles += 1
-    print
-
-# final summary
-print '%d files checked (%s), %d duplicates (%s).' % (
-    totalfiles, humanize_size(totalsize), dupfiles, humanize_size(dupsize))
-
+def main(dirWalker=sys.stdin):
+    global totalsize, totalfiles, dupfiles, dupsize
+    # check every file coming from the iterator (stdin by default)
+    for filename in dirWalker:
+        filename = filename.strip()
+        if not totalfiles % 500:
+            print('files processed:', totalfiles, filename)
+        check_file(filename)
+        totalfiles += 1
+        totalsize += os.path.getsize(filename)
+
+    # print the report
+    print('%10s %s' % ('size', 'filename'))
+
+    for h, f in hashlist.items():
+        if hashcount[h] < 2:
+            # present only once, skip
+            continue
+
+        # reference file
+        refsize = os.path.getsize(f[0])
+        refmd5 = get_file_hash(f[0])
+        print('%10d %s' % (refsize, f[0]))
+
+        for filename in f[1:]:
+            # and its copies
+            size = os.path.getsize(filename)
+            md5 = get_file_hash(filename)
+
+            status = ' '
+            msg = ''
+            if md5 != refmd5:
+                status = '!'
+                msg = ' partial match only!'
+
+            print('%10d %s %s%s' % (size, status, filename, msg))
+            dupsize += size
+            dupfiles += 1
+        print()
+
+    # final summary
+    print('%d files checked (%s), %d duplicates (%s).' % (
+        totalfiles, humanize_size(totalsize), dupfiles, humanize_size(dupsize)))
+    return hashlist
+
+if __name__ == '__main__':
+    main()
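A note on what makes the tool "ultra fast": as the README says, only the very
beginning of each file is hashed when grouping candidates (check_file() passes
a size limit to get_file_hash()), and a fuller hash is only recomputed for the
report, which is why a mismatch there is flagged "partial match only!". A
minimal, self-contained sketch of the same idea; head_hash and its limit are
illustrative names, not taken from the module:

    import hashlib

    def head_hash(path, limit=4096):
        # hash only the first `limit` bytes: cheap, but two files that
        # agree on their first bytes land in the same bucket even if
        # they differ later -- hence the README's "use with caution"
        try:
            with open(path, 'rb') as f:
                return hashlib.md5(f.read(limit)).hexdigest()
        except IOError:
            return 'NONE'
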
diff --git a/duplicate.py b/duplicate.py
new file mode 100644
index 0000000..c3a937b
--- /dev/null
+++ b/duplicate.py
@@ -0,0 +1,4 @@
+import fileWalker
+
+if __name__ == '__main__':
+    fileWalker.main()
diff --git a/fileWalker.py b/fileWalker.py
new file mode 100644
index 0000000..89d01ec
--- /dev/null
+++ b/fileWalker.py
@@ -0,0 +1,71 @@
+import os, sys
+import UltraFastDuplicateFilesFinder as ff
+
+testPath = os.path.curdir  # fallback folder when no answer is given
+
+
+def walkerAdapter(walker, hiddenFolders=False):
+    """Flatten an os.walk() iterator into file paths, skipping hidden folders."""
+    for curDir, dirList, fileList in walker:
+        for filename in fileList:
+            filepath = os.path.join(curDir, filename)
+            if not hiddenFolders and folderIsHidden(filepath):
+                continue
+            yield filepath
+
+def folderIsHidden(filepath):
+    """Return True if any path component is hidden (starts with a dot)."""
+    par = filepath
+    while True:
+        par, cd = os.path.split(par)
+        if cd.startswith('.') and cd != '.':
+            return True
+        if not par or os.path.ismount(par):
+            return False
+
+def getDirName(dirName=None):
+    """Resolve the folder to scan: argument, then argv[1], then a prompt."""
+    if dirName and os.path.isdir(os.path.normpath(dirName)):
+        return dirName
+    if sys.argv[1:]:
+        out = sys.argv[1]
+        if os.path.isdir(os.path.normpath(out)):
+            return out
+
+    while True:
+        print("which folder would you like to find the duplicates in?")
+        inp = input()
+        if not inp:  # empty answer falls back to the current folder
+            inp = testPath
+            break
+        if os.path.isdir(inp):
+            break
+    return inp
+
+
+def delete_duplicates(hashlist, interactive=True):
+    for fl in hashlist.values():
+        if len(fl) > 1:
+            print('keeping:', fl[0])
+            for filename in fl[1:]:
+                print('duplicate:', filename)
+                if interactive:
+                    inp = input('?').strip()
+                    if inp not in ('yes', 'y', 'ya'):
+                        continue
+                print('deleting:', filename)
+                os.remove(filename)
+
+def main(dirName=None):
+    root = getDirName(dirName)
+    walker = walkerAdapter(os.walk(root))
+    hashlist = ff.main(walker)
+    inp = input('would you like to remove all duplicates?\n').strip()
+    if inp in ('yes', 'y', 'ya', 'int', 'yi'):
+        # every deletion is still confirmed per file, as in the README transcript
+        delete_duplicates(hashlist, interactive=True)
+    else:
+        print('no file was deleted')
+
+if __name__ == '__main__':
+    main()
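One caution on the new delete_duplicates(): it removes a file as soon as the
per-file prompt is answered with yes, and with interactive=False it would
delete every duplicate unprompted. A hedged sketch of a dry run that only
prints the plan first; dry_run is an illustrative helper, not part of this
fork:

    def dry_run(hashlist):
        # show what delete_duplicates() would keep and remove,
        # without touching the filesystem
        for group in hashlist.values():
            if len(group) > 1:
                print('would keep  :', group[0])
                for dup in group[1:]:
                    print('would remove:', dup)
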
diff --git a/test/README b/test/README
new file mode 100644
index 0000000..cddcda5
--- /dev/null
+++ b/test/README
@@ -0,0 +1,74 @@
+=================================
+Ultra Fast Duplicate Files Finder
+=================================
+ by Gautier Portet
+forked and extended by Gabriel Reyla
+
+The extended version works on Windows too:
+------------------------------------------------------------------------------
+$ python fileWalker.py
+which folder would you like to find the duplicates in?
+.
+.\fileWalker.py
+      size filename
+      1306 .\README
+      1306 .\test\README
+      1306 .\test\README_2
+      1306 .\test\README_3
+
+7 files checked (15.24 KiB), 1 duplicates (3.83 KiB).
+would you like to remove all duplicates?
+yes
+keeping: .\README
+duplicate: .\test\README
+?y
+deleting: .\test\README
+duplicate: .\test\README_2
+?n
+duplicate: .\test\README_3
+?n
+>>>
+------------------------------------------------------------------------------
+
+On Unix systems the program can be used as follows:
+it takes a list of files from stdin and prints the duplicate ones.
+
+
+example use:
+
+    find ~/ -size +10M | ./UltraFastDuplicateFilesFinder.py
+
+to find duplicates among all files larger than 10 MB in your home folder.
+
+UltraFastDuplicateFilesFinder compares only the very beginning of each file.
+It's sufficient for most uses, but use with caution.
+On the other hand, this makes it quite useful for detecting duplicates
+even among corrupted media files...
+
+
+this is public domain.
+
+
+
+------------------------------------------------------------------------------
+example run, took less than a second to answer...
+
+
+gautier@quad:~/code/tmp$ find /home -size +10M | ./duplicate.py
+      size filename
+  12467906 /home/gautier/Photos/pict4614.mov
+  12467906 /home/gautier/Photos/Videos/PICT4614.MOV
+
+  13068570 /home/gautier/Photos/pict4588.mov
+  13068570 /home/gautier/Photos/Videos/PICT4588.MOV
+
+[...]
+
+  20865498 /home/gautier/Photos/pict4695.mov
+  20865498 /home/gautier/Photos/Videos/PICT4695.MOV
+
+  28270824 /home/gautier/tmp/tsunami 1 œ ǒǑ.flac
+  28270824 /home/gautier/tmp/tsunami-1.flac
+
+136 files checked (22.75 GiB), 8 duplicates (153.45 MiB).
+
diff --git a/test/README_2 b/test/README_2
new file mode 100644
index 0000000..cddcda5
--- /dev/null
+++ b/test/README_2
@@ -0,0 +1,74 @@
+=================================
+Ultra Fast Duplicate Files Finder
+=================================
+ by Gautier Portet
+forked and extended by Gabriel Reyla
+
+The extended version works on Windows too:
+------------------------------------------------------------------------------
+$ python fileWalker.py
+which folder would you like to find the duplicates in?
+.
+.\fileWalker.py
+      size filename
+      1306 .\README
+      1306 .\test\README
+      1306 .\test\README_2
+      1306 .\test\README_3
+
+7 files checked (15.24 KiB), 1 duplicates (3.83 KiB).
+would you like to remove all duplicates?
+yes
+keeping: .\README
+duplicate: .\test\README
+?y
+deleting: .\test\README
+duplicate: .\test\README_2
+?n
+duplicate: .\test\README_3
+?n
+>>>
+------------------------------------------------------------------------------
+
+On Unix systems the program can be used as follows:
+it takes a list of files from stdin and prints the duplicate ones.
+
+
+example use:
+
+    find ~/ -size +10M | ./UltraFastDuplicateFilesFinder.py
+
+to find duplicates among all files larger than 10 MB in your home folder.
+
+UltraFastDuplicateFilesFinder compares only the very beginning of each file.
+It's sufficient for most uses, but use with caution.
+On the other hand, this makes it quite useful for detecting duplicates
+even among corrupted media files...
+
+
+this is public domain.
+
+
+
+------------------------------------------------------------------------------
+example run, took less than a second to answer...
+
+
+gautier@quad:~/code/tmp$ find /home -size +10M | ./duplicate.py
+      size filename
+  12467906 /home/gautier/Photos/pict4614.mov
+  12467906 /home/gautier/Photos/Videos/PICT4614.MOV
+
+  13068570 /home/gautier/Photos/pict4588.mov
+  13068570 /home/gautier/Photos/Videos/PICT4588.MOV
+
+[...]
+
+  20865498 /home/gautier/Photos/pict4695.mov
+  20865498 /home/gautier/Photos/Videos/PICT4695.MOV
+
+  28270824 /home/gautier/tmp/tsunami 1 œ ǒǑ.flac
+  28270824 /home/gautier/tmp/tsunami-1.flac
+
+136 files checked (22.75 GiB), 8 duplicates (153.45 MiB).
+
diff --git a/test/README_3 b/test/README_3
new file mode 100644
index 0000000..cddcda5
--- /dev/null
+++ b/test/README_3
@@ -0,0 +1,74 @@
+=================================
+Ultra Fast Duplicate Files Finder
+=================================
+ by Gautier Portet
+forked and extended by Gabriel Reyla
+
+The extended version works on Windows too:
+------------------------------------------------------------------------------
+$ python fileWalker.py
+which folder would you like to find the duplicates in?
+.
+.\fileWalker.py
+      size filename
+      1306 .\README
+      1306 .\test\README
+      1306 .\test\README_2
+      1306 .\test\README_3
+
+7 files checked (15.24 KiB), 1 duplicates (3.83 KiB).
+would you like to remove all duplicates?
+yes
+keeping: .\README
+duplicate: .\test\README
+?y
+deleting: .\test\README
+duplicate: .\test\README_2
+?n
+duplicate: .\test\README_3
+?n
+>>>
+------------------------------------------------------------------------------
+
+On Unix systems the program can be used as follows:
+it takes a list of files from stdin and prints the duplicate ones.
+
+
+example use:
+
+    find ~/ -size +10M | ./UltraFastDuplicateFilesFinder.py
+
+to find duplicates among all files larger than 10 MB in your home folder.
+
+UltraFastDuplicateFilesFinder compares only the very beginning of each file.
+It's sufficient for most uses, but use with caution.
+On the other hand, this makes it quite useful for detecting duplicates
+even among corrupted media files...
+
+
+this is public domain.
+
+
+
+------------------------------------------------------------------------------
+example run, took less than a second to answer...
+
+
+gautier@quad:~/code/tmp$ find /home -size +10M | ./duplicate.py
+      size filename
+  12467906 /home/gautier/Photos/pict4614.mov
+  12467906 /home/gautier/Photos/Videos/PICT4614.MOV
+
+  13068570 /home/gautier/Photos/pict4588.mov
+  13068570 /home/gautier/Photos/Videos/PICT4588.MOV
+
+[...]
+
+  20865498 /home/gautier/Photos/pict4695.mov
+  20865498 /home/gautier/Photos/Videos/PICT4695.MOV
+
+  28270824 /home/gautier/tmp/tsunami 1 œ ǒǑ.flac
+  28270824 /home/gautier/tmp/tsunami-1.flac
+
+136 files checked (22.75 GiB), 8 duplicates (153.45 MiB).
+
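The three files under test/ share the README's blob (index cddcda5), i.e. they
are byte-for-byte copies, which is what lets a scan of the repository
reproduce the transcript at the top of the README. A minimal sketch of
checking the fixtures from code, assuming it is run from the repository root:

    import os
    import UltraFastDuplicateFilesFinder as ff
    from fileWalker import walkerAdapter

    # walk only the test folder; the identical fixtures should
    # collapse into a single duplicate group
    hashlist = ff.main(walkerAdapter(os.walk('test')))
    groups = [g for g in hashlist.values() if len(g) > 1]
    print(len(groups), 'duplicate group(s) found under ./test')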