Updated files to Python3000, Windows support, and interactive #1

Open · wants to merge 12 commits into master
31 changes: 30 additions & 1 deletion README
@@ -1,8 +1,37 @@
Ultra Fast Duplicate Files Finder
=================================
original Ultra Fast Duplicate Files Finder
=================================
by Gautier Portet <kassoulet gmail com>
forked and extended:
by Gabriel Reyla <[email protected]>

The extended version works on Windows too:
------------------------------------------------------------------------------
$ python fileWalker.py
which folder would you like to find the duplicates in?
.
.\fileWalker.py
size filename
1306 .\README
1306 .\test\README
1306 .\test\README_2
1306 .\test\README_3

7 files checked (15.24 KiB), 1 duplicates (3.83 KiB).
would you like to remove all duplicates?
yes
keeping: .\README
duplicate: .\test\README
?y
deleting: .\test\README
duplicate: .\test\README_2
?n
duplicate: .\test\README_3
?n
>>>
------------------------------------------------------------------------------
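
The folder can also be given up front, either on the command line
(python fileWalker.py some_folder) or from Python; a minimal sketch
(the removal prompt shown above still appears):

    import fileWalker
    fileWalker.main('.')   # '.' is just an example folder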

On Unix systems the program can be used as follows:
it takes a list of files from stdin
and prints the duplicate ones.

96 changes: 51 additions & 45 deletions UltraFastDuplicateFilesFinder.py
@@ -46,7 +46,7 @@ def get_file_hash(filename, limit_size=None, buffer_size=BUFFER_SIZE):
    """
    # open file
    try:
        f = file(filename, "rb")
        f = open(filename, "rb")
    except IOError:
        return 'NONE'

@@ -78,7 +78,7 @@ def check_file(filename):
    Compare the given file to our lists of hashes
    """
    # compute md5
    h = get_file_hash(filename)
    h = get_file_hash(filename, CHUNK_SIZE//2)

    # increase count
    i = hashcount.get(h, 0)
@@ -105,46 +105,52 @@ def humanize_size(size):
        if hsize > 0.5:
            return '%.2f %s' % (hsize, suffix)


# we start here by checking all files
for filename in sys.stdin:
    filename = filename.strip()

    check_file(filename)
    totalfiles += 1
    totalsize += os.path.getsize(filename)

# print the report
print '%10s %s' % ('size', 'filename')

for h, f in hashlist.iteritems():
    if hashcount[h] < 2:
        # present one time, skip
        continue

    # reference file
    refsize = os.path.getsize(f[0])
    refmd5 = get_file_hash(f[0])
    print '%10d %s' % (refsize, f[0])


    for filename in f[1:]:
        # and its copies
        size = os.path.getsize(filename)
        md5 = get_file_hash(filename)

        status = ' '
        msg = ''
        if md5 != refmd5:
            status = '!'
            msg = ' partial match only!'

        print '%10d %s %s%s' % (size, status, filename, msg)
        dupsize += size
        dupfiles += 1
    print

# final summary
print '%d files checked (%s), %d duplicates (%s).' % (
    totalfiles, humanize_size(totalsize), dupfiles, humanize_size(dupsize))

def main(dirWalker=sys.stdin):
    global totalsize, totalfiles, dupfiles, dupsize
    # we start here by checking all files
    for filename in dirWalker:
        filename = filename.strip()
        if not totalfiles % 500:
            print('files processed:', totalfiles, filename)
        check_file(filename)
        totalfiles += 1
        totalsize += os.path.getsize(filename)

    # print the report
    print( '%10s %s' % ('size', 'filename') )

    for h, f in hashlist.items():
        if hashcount[h] < 2:
            # present one time, skip
            continue

        # reference file
        refsize = os.path.getsize(f[0])
        refmd5 = get_file_hash(f[0])
        print( '%10d %s' % (refsize, f[0]))


        for filename in f[1:]:
            # and its copies
            size = os.path.getsize(filename)
            md5 = get_file_hash(filename)

            status = ' '
            msg = ''
            if md5 != refmd5:
                status = '!'
                msg = ' partial match only!'

            print( '%10d %s %s%s' % (size, status, filename, msg))
            dupsize += size
            dupfiles += 1
        print()

    # final summary
    print( '%d files checked (%s), %d duplicates (%s).' % (
        totalfiles, humanize_size(totalsize), dupfiles, humanize_size(dupsize)))
    return hashlist

if __name__ == '__main__':
    main()
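
The new main() accepts any iterable of file paths (it defaults to sys.stdin) and
returns the hashlist dict, so the finder can also be driven from other code.
A minimal sketch (the paths are only illustrative):

    import UltraFastDuplicateFilesFinder as ff

    paths = ['./README', './test/README']   # any iterable of filenames works
    hashlist = ff.main(paths)                # prints the report, returns {hash: [files]}
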
4 changes: 4 additions & 0 deletions duplicate.py
@@ -0,0 +1,4 @@
import fileWalker

if __name__ == '__main__':
    fileWalker.main()
80 changes: 80 additions & 0 deletions fileWalker.py
@@ -0,0 +1,80 @@
import os, sys
import UltraFastDuplicateFilesFinder as ff

testPath = os.path.join( os.path.curdir )



def walkerAdapter(walker, hiddenFolders=False):
    for curDir, dirList, fileList in walker:
        for filename in fileList:
            filepath = os.path.join( curDir, filename )
            if not hiddenFolders and folderIsHidden(filepath):
                continue
            yield filepath

def folderIsHidden(filepath):
    par = filepath
    while 1:
        par, cd = os.path.split(par)
        ## print(par, ':', cd)
        if cd.startswith('.') and not cd == '.':
            return True
        if not par or os.path.ismount(par):
            break

def getDirName(dirName=None):
    if dirName:
        if os.path.isdir( os.path.normpath( dirName )):
            return dirName
    if sys.argv[1:]:
        out = sys.argv[1]
        if os.path.isdir( os.path.normpath( out )):
            return out

    while 1:
        print("which folder would you like to find the duplicates in?")
        ## inp = print("(make sure the path you insert has double \\ in between folders.")
        inp = input()
        if not inp: # mainly for debugging
            global testPath
            inp = testPath
            break
        if os.path.isdir( inp ):
            break
    return inp


def delete_duplicates(hashlist, interactive=True, verbose=True):
    for fl in hashlist.values():
        if len(fl) > 1:
            print('keeping:', fl[0])
            for filename in fl[1:]:
                print('duplicate:', filename)
                if interactive:
                    inp = input('?').strip()
                    if inp not in ['yes', 'y', 'ya']:
                        continue
                print('deleting:', filename)
                os.remove(filename)
            pass
        pass

def main(dirName=None):
    root = getDirName(dirName)
    walker = os.walk(root)
    walker = walkerAdapter(walker)
    hashlist = ff.main(walker)
    inp = input('would you like to remove all duplicates?\n').strip()
    if inp in ['int', 'yi']:
        delete_duplicates(hashlist, True)
    elif inp in ['yes', 'y', 'ya']:
        delete_duplicates(hashlist)
    else:
        print('no file was deleted')
    ## for j in hashlist.items():
    ##     if len(j[1]) >1:
    ##         print(j)

if __name__ == '__main__':
    main()
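
walkerAdapter() flattens os.walk() into a plain stream of file paths and skips any
path containing a dot-prefixed component unless hiddenFolders=True. A minimal
usage sketch of that piece on its own:

    import os
    import fileWalker

    for path in fileWalker.walkerAdapter(os.walk('.')):
        print(path)   # every file under '.', hidden folders and files skipped
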
82 changes: 82 additions & 0 deletions test/README
@@ -0,0 +1,82 @@
=================================
original Ultra Fast Duplicate Files Finder
=================================
by Gautier Portet <kassoulet gmail com>
forked and extended:
by Gabriel Reyla <[email protected]>

The extended version works on Windows too:
------------------------------------------------------------------------------
$ python fileWalker.py
which folder would you like to find the duplicates in?
.
.\fileWalker.py
size filename
1306 .\README
1306 .\test\README
1306 .\test\README_2
1306 .\test\README_3

7 files checked (15.24 KiB), 1 duplicates (3.83 KiB).
would you like to remove all duplicates?
yes
keeping: .\README
duplicate: .\test\README
?y
deleting: .\test\README
duplicate: .\test\README_2
?n
duplicate: .\test\README_3
?n
>>>
------------------------------------------------------------------------------

On Unix systems the program can be used as follows:
it takes a list of files from stdin
and prints the duplicate ones.


example use:

find ~/ -size +10M | ./UltraFastDuplicateFilesFinder.py

to find duplicates in your home folder, among all files larger than 10 MB.

UltraFastDuplicateFilesFinder compares only the very beginning of the files.
It's sufficient for most uses, but use with caution.

On the other hand, this makes it quite useful for detecting duplicates among corrupted media files...
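
The speed comes from hashing only a small prefix of each file. Roughly, the idea
looks like this (a sketch only, not the exact code in UltraFastDuplicateFilesFinder.py):

    import hashlib

    def prefix_hash(filename, limit=4096):   # the limit value is illustrative
        md5 = hashlib.md5()
        with open(filename, 'rb') as f:
            md5.update(f.read(limit))
        return md5.hexdigest()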


this is public domain.



------------------------------------------------------------------------------
example run; it took less than a second to answer...


gautier@quad:~/code/tmp$ find /home -size +10M | ./duplicate.py
size filename
12467906 /home/gautier/Photos/pict4614.mov
12467906 /home/gautier/Photos/Videos/PICT4614.MOV

13068570 /home/gautier/Photos/pict4588.mov
13068570 /home/gautier/Photos/Videos/PICT4588.MOV

[...]

20865498 /home/gautier/Photos/pict4695.mov
20865498 /home/gautier/Photos/Videos/PICT4695.MOV

28270824 /home/gautier/tmp/tsunami 1 œ ǒǑ.flac
28270824 /home/gautier/tmp/tsunami-1.flac

136 files checked (22.75 GiB), 8 duplicates (153.45 MiB).






