Updated files to Python3000, Windows support, and interactive #1

Open · wants to merge 12 commits into master
31 changes: 30 additions & 1 deletion README
@@ -1,8 +1,37 @@
Ultra Fast Duplicate Files Finder
=================================
original Ultra Fast Duplicate Files Finder
=================================
by Gautier Portet <kassoulet gmail com>
forked and extended:
by Gabriel Reyla <[email protected]>

The extended version works on Windows too:
------------------------------------------------------------------------------
$ python fileWalker.py
which folder would you like to find the duplicates in?
.
.\fileWalker.py
size filename
1306 .\README
1306 .\test\README
1306 .\test\README_2
1306 .\test\README_3

7 files checked (15.24 KiB), 1 duplicates (3.83 KiB).
would you like to remove all duplicates?
yes
keeping: .\README
duplicate: .\test\README
?y
deleting: .\test\README
duplicate: .\test\README_2
?n
duplicate: .\test\README_3
?n
>>>
------------------------------------------------------------------------------
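
The folder can also be given up front, either on the command line
(python fileWalker.py some_folder) or from Python; a minimal sketch
(the removal prompt shown above still appears):

    import fileWalker
    fileWalker.main('.')   # '.' is just an example folder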

On Unix systems the program can be used as follows:
it takes a list of files from stdin
and prints the duplicate ones.

96 changes: 51 additions & 45 deletions UltraFastDuplicateFilesFinder.py
@@ -46,7 +46,7 @@ def get_file_hash(filename, limit_size=None, buffer_size=BUFFER_SIZE):
    """
    # open file
    try:
        f = file(filename, "rb")
        f = open(filename, "rb")
    except IOError:
        return 'NONE'

@@ -78,7 +78,7 @@ def check_file(filename):
    Compare the given file to our lists of hashes
    """
    # compute md5
    h = get_file_hash(filename)
    h = get_file_hash(filename, CHUNK_SIZE//2)

    # increase count
    i = hashcount.get(h, 0)
@@ -105,46 +105,52 @@ def humanize_size(size):
        if hsize > 0.5:
            return '%.2f %s' % (hsize, suffix)


# we start here by checking all files
for filename in sys.stdin:
    filename = filename.strip()

    check_file(filename)
    totalfiles += 1
    totalsize += os.path.getsize(filename)

# print the report
print '%10s %s' % ('size', 'filename')

for h, f in hashlist.iteritems():
    if hashcount[h] < 2:
        # present one time, skip
        continue

    # reference file
    refsize = os.path.getsize(f[0])
    refmd5 = get_file_hash(f[0])
    print '%10d %s' % (refsize, f[0])


    for filename in f[1:]:
        # and its copies
        size = os.path.getsize(filename)
        md5 = get_file_hash(filename)

        status = ' '
        msg = ''
        if md5 != refmd5:
            status = '!'
            msg = ' partial match only!'

        print '%10d %s %s%s' % (size, status, filename, msg)
        dupsize += size
        dupfiles += 1
    print

# final summary
print '%d files checked (%s), %d duplicates (%s).' % (
    totalfiles, humanize_size(totalsize), dupfiles, humanize_size(dupsize))

def main(dirWalker=sys.stdin):
    global totalsize, totalfiles, dupfiles, dupsize
    # we start here by checking all files
    for filename in dirWalker:
        filename = filename.strip()
        if not totalfiles % 500:
            print('files processed:', totalfiles, filename)
        check_file(filename)
        totalfiles += 1
        totalsize += os.path.getsize(filename)

    # print the report
    print( '%10s %s' % ('size', 'filename') )

    for h, f in hashlist.items():
        if hashcount[h] < 2:
            # present one time, skip
            continue

        # reference file
        refsize = os.path.getsize(f[0])
        refmd5 = get_file_hash(f[0])
        print( '%10d %s' % (refsize, f[0]))


        for filename in f[1:]:
            # and its copies
            size = os.path.getsize(filename)
            md5 = get_file_hash(filename)

            status = ' '
            msg = ''
            if md5 != refmd5:
                status = '!'
                msg = ' partial match only!'

            print( '%10d %s %s%s' % (size, status, filename, msg))
            dupsize += size
            dupfiles += 1
        print()

    # final summary
    print( '%d files checked (%s), %d duplicates (%s).' % (
        totalfiles, humanize_size(totalsize), dupfiles, humanize_size(dupsize)))
    return hashlist

if __name__ == '__main__':
    main()
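
The new main() accepts any iterable of file paths (it defaults to sys.stdin) and
returns the hashlist dict, so the finder can also be driven from other code.
A minimal sketch (the paths are only illustrative):

    import UltraFastDuplicateFilesFinder as ff

    paths = ['./README', './test/README']   # any iterable of filenames works
    hashlist = ff.main(paths)                # prints the report, returns {hash: [files]}
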
4 changes: 4 additions & 0 deletions duplicate.py
@@ -0,0 +1,4 @@
import fileWalker

if __name__ == '__main__':
    fileWalker.main()
80 changes: 80 additions & 0 deletions fileWalker.py
@@ -0,0 +1,80 @@
import os, sys
import UltraFastDuplicateFilesFinder as ff

testPath = os.path.join( os.path.curdir )



def walkerAdapter(walker, hiddenFolders=False):
    for curDir, dirList, fileList in walker:
        for filename in fileList:
            filepath = os.path.join( curDir, filename )
            if not hiddenFolders and folderIsHidden(filepath):
                continue
            yield filepath

def folderIsHidden(filepath):
    par = filepath
    while 1:
        par, cd = os.path.split(par)
        ## print(par, ':', cd)
        if cd.startswith('.') and not cd == '.':
            return True
        if not par or os.path.ismount(par):
            break

def getDirName(dirName=None):
    if dirName:
        if os.path.isdir( os.path.normpath( dirName )):
            return dirName
    if sys.argv[1:]:
        out = sys.argv[1]
        if os.path.isdir( os.path.normpath( out )):
            return out

    while 1:
        print("which folder would you like to find the duplicates in?")
        ## inp = print("(make sure the path you insert has double \\ in between folders.")
        inp = input()
        if not inp: # mainly for debugging
            global testPath
            inp = testPath
            break
        if os.path.isdir( inp ):
            break
    return inp


def delete_duplicates(hashlist, interactive=True, verbose=True):
    for fl in hashlist.values():
        if len(fl) > 1:
            print('keeping:', fl[0])
            for filename in fl[1:]:
                print('duplicate:', filename)
                if interactive:
                    inp = input('?').strip()
                    if inp not in ['yes', 'y', 'ya']:
                        continue
                print('deleting:', filename)
                os.remove(filename)
            pass
        pass

def main(dirName=None):
    root = getDirName(dirName)
    walker = os.walk(root)
    walker = walkerAdapter(walker)
    hashlist = ff.main(walker)
    inp = input('would you like to remove all duplicates?\n').strip()
    if inp in ['int', 'yi']:
        delete_duplicates(hashlist, True)
    elif inp in ['yes', 'y', 'ya']:
        delete_duplicates(hashlist)
    else:
        print('no file was deleted')
    ## for j in hashlist.items():
    ##     if len(j[1]) >1:
    ##         print(j)

if __name__ == '__main__':
    main()
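
walkerAdapter() flattens os.walk() into a plain stream of file paths and skips any
path containing a dot-prefixed component unless hiddenFolders=True. A minimal
usage sketch of that piece on its own:

    import os
    import fileWalker

    for path in fileWalker.walkerAdapter(os.walk('.')):
        print(path)   # every file under '.', hidden folders and files skipped
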
82 changes: 82 additions & 0 deletions test/README
@@ -0,0 +1,82 @@
=================================
original Ultra Fast Duplicate Files Finder
=================================
by Gautier Portet <kassoulet gmail com>
forked and extended:
by Gabriel Reyla <[email protected]>

The extended version works on Windows too:
------------------------------------------------------------------------------
$ python fileWalker.py
which folder would you like to find the duplicates in?
.
.\fileWalker.py
size filename
1306 .\README
1306 .\test\README
1306 .\test\README_2
1306 .\test\README_3

7 files checked (15.24 KiB), 1 duplicates (3.83 KiB).
would you like to remove all duplicates?
yes
keeping: .\README
duplicate: .\test\README
?y
deleting: .\test\README
duplicate: .\test\README_2
?n
duplicate: .\test\README_3
?n
>>>
------------------------------------------------------------------------------

On Unix systems the program can be used as follows:
it takes a list of files from stdin
and prints the duplicate ones.


example use:

find ~/ -size +10M | ./UltraFastDuplicateFilesFinder.py

to find duplicates in your home folder, among all files larger than 10 MB.

UltraFastDuplicateFilesFinder compares only the very beginning of the files.
It's sufficient for most uses, but use with caution.

On the other hand, this makes it quite useful for detecting duplicates among corrupted media files...
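
The speed comes from hashing only a small prefix of each file. Roughly, the idea
looks like this (a sketch only, not the exact code in UltraFastDuplicateFilesFinder.py):

    import hashlib

    def prefix_hash(filename, limit=4096):   # the limit value is illustrative
        md5 = hashlib.md5()
        with open(filename, 'rb') as f:
            md5.update(f.read(limit))
        return md5.hexdigest()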


this is public domain.



------------------------------------------------------------------------------
example run; it took less than a second to answer...


gautier@quad:~/code/tmp$ find /home -size +10M | ./duplicate.py
size filename
12467906 /home/gautier/Photos/pict4614.mov
12467906 /home/gautier/Photos/Videos/PICT4614.MOV

13068570 /home/gautier/Photos/pict4588.mov
13068570 /home/gautier/Photos/Videos/PICT4588.MOV

[...]

20865498 /home/gautier/Photos/pict4695.mov
20865498 /home/gautier/Photos/Videos/PICT4695.MOV

28270824 /home/gautier/tmp/tsunami 1 œ ǒǑ.flac
28270824 /home/gautier/tmp/tsunami-1.flac

136 files checked (22.75 GiB), 8 duplicates (153.45 MiB).






