今天用wxPython做了一个GUI程序,实现查找指定目录内的相同文件,主要原理是计算文件的md5值(计算前先找出文件大小相同的文件,然后计算这些文件的md5值,而不是所有文件都计算,大大减少了md5的计算量),加入了多线程功能。
以下是其脚本版本(无需安装wxPython)
UNIQFile-script.py
# -*- coding: gbk -*- '''
Author:@DoNotSpyOnMe
Blog: http://www.cnblogs.com/aaronhoo
''' import hashlib
import os
import threading def getFileSize(filePath):
return os.path.getsize(filePath) ''' 一般文件的md5计算方法,一次读取文件的全部内容'''
def CalcMD5(filepath):
with open(filepath,'rb') as f:
md5obj = hashlib.md5()
md5obj.update(f.read())
hash = md5obj.hexdigest()
return hash
'''大文件计算md5的方法,分批读取文件内容,防止内存爆掉'''
def GetFileMd5(filename):
if not os.path.isfile(filename):
return
myhash = hashlib.md5()
f = open(filename,'rb')
while True:
b = f.read(8*1024)
if not b :
break
myhash.update(b)
f.close()
return myhash.hexdigest() def GetAllFiles(directory):
files=[]
for dirpath, dirnames,filenames in os.walk(directory):
if filenames!=[]:
for file in filenames:
files.append(dirpath+'\\'+file)
files.sort(key=len)#按照文件名的长度排序
return files def findSameSizeFiles(files):
dicSize={}
for f in files:
size=getFileSize(f)
if not dicSize.has_key(size):
dicSize[size]=f
else:
dicSize[size]=dicSize[size]+';'+f
dicCopy=dicSize.copy()
for k in dicSize.iterkeys():
if dicSize[k].find(';')==-1:
dicCopy.pop(k)
del dicSize
return dicCopy def findSameMD5Files(files):
dicMD5={}
for f in files:
print 'calculating the md5 value of file %s'%f
md5=GetFileMd5(f)
if not dicMD5.has_key(md5):
dicMD5[md5]=f
else:
dicMD5[md5]=dicMD5[md5]+';'+f
dicCopy=dicMD5.copy()
for k in dicMD5.iterkeys():
if dicMD5[k].find(';')==-1:
dicCopy.pop(k)
del dicMD5
return dicCopy def removeSameFile(mydir):
msg=''
msgUniq='Result:No file is removed since they are all uniq.'
try:
existsFlag=False
files=GetAllFiles(mydir)
print'%s files found in directory %s\n'%(len(files),mydir)
dicFileOfSameSize=findSameSizeFiles(files)
if dicFileOfSameSize=={}:
print msgUniq
return
else:
#list the duplicated files first:
dicFiltered={}
for k in dicFileOfSameSize.iterkeys():
filesOfSameSize=dicFileOfSameSize[k].split(';')
dicSameMD5file=findSameMD5Files(filesOfSameSize)
if dicSameMD5file!={}:
existsFlag=True
for k in dicSameMD5file.iterkeys():
msg=msg+'md5 %s: %s'%(k,dicSameMD5file[k])+'\n'
dicFiltered[k]=dicSameMD5file[k]
if not existsFlag:
msg=msgUniq
return
else:
msg='Duplicated files:\n'+msg+'\n'
#then remove the duplicated files:
removeCount=0
for k in dicFiltered.iterkeys():
sameFiles=dicFiltered[k].split(';')
flagRemove=False
for f in sameFiles:
if not flagRemove:
flagRemove=True
else:
msg=msg+'Removing file: %s'%f+'\n'
os.remove(f)
removeCount=removeCount+1
msg=msg+'%s files are removed.\n'%removeCount
except Exception,e:
print e
# msg='Exception occured.'
finally:
print msg+'\n'+'Operation finished.' def listSameFile(mydir):
msg=''
msgUniq='Result:All files are uniq.'
try:
existsFlag=False
files=GetAllFiles(mydir)
print '%s files found in directory %s\n'%(len(files),mydir)
dicFileOfSameSize=findSameSizeFiles(files)
if dicFileOfSameSize=={}:
print msgUniq
return
else:
for k in dicFileOfSameSize.iterkeys():
filesOfSameSize=dicFileOfSameSize[k].split(';')
dicSameMD5file=findSameMD5Files(filesOfSameSize)
if dicSameMD5file!={}:
existsFlag=True
for k in dicSameMD5file.iterkeys():
msg=msg+'md5 %s: %s'%(k,dicSameMD5file[k])+'\n'
if not existsFlag:
msg=msgUniq
else:
msg='Duplicated files:\n'+msg
except Exception,e:
print e
# msg='Exception occured.'
finally:
print msg+'\n'+'Operation finished.' if __name__=="__main__":
print 'This program is designed for clearing the duplicated files and saving memory space.Select a directory and we will find or remove the duplicated files.'
print 'All rights are reserved by @DoNotSpyOnMe'
print '\n' print "You have three options:"
print "'f' for finding the duplicated files in the directory that you're required to enter later,or"
print "'r' for finding and the removing the duplicated file,or"
print "'q' to quit"
while True:
option=raw_input('Please enter your option:\n')
option=option.lower()
while option!='f' and option!='r' and option!='q':
option=raw_input('Please enter your option:\n')
if option=='f' or option=='r':
mydir=raw_input('Please enter the direcotry containing files:\n')
mydir=mydir.lower()
while mydir.find('\\')==-1 or not os.path.isdir(mydir):
mydir=raw_input('Please enter a valid direcotry containing files:\n')
if option=='f':
listSameFile(mydir)
else:
removeSameFile(mydir)
elif option=='q':
exit(0)
print ''