from urllib import request
import threading
from time import sleep, ctime

# URLs of the images to download (placeholders from the original article).
flist = [
    "http://xxxx.com/upload/d1b69b20.jpg",
    "http://xxxx.com/upload/c9443ddb.jpg",
    "http://xxxx.com/upload/99ca06f1.jpg",
]
def downjpg(filepath, FileName="default.jpg"):
    """Download the image at *filepath* and save it as D:\\image\\FileName.

    Args:
        filepath: URL of the image to fetch.
        FileName: name of the destination file inside D:\\image\\.
    """
    web = request.urlopen(filepath)
    print("访问网络文件" + filepath + "\n")
    jpg = web.read()
    web.close()  # fix: the response object was never closed
    DstDir = "D:\\image\\"
    print("保存文件" + DstDir + FileName)
    # "with" guarantees the file handle is closed even if write() raises.
    with open(DstDir + FileName, "wb") as out_file:
        out_file.write(jpg)
def downjpgmutithread(filepathlist):
    """Download every URL in *filepathlist* concurrently, one thread per URL.

    Files are saved as 1.jpg, 2.jpg, ... in the order the URLs appear.
    """
    print("共有%d个文件需要下载" % len(filepathlist))
    for url in filepathlist:
        print(url)
    print("开始多线程下载")
    # enumerate(start=1) replaces the original hand-rolled counter.
    task_threads = [
        threading.Thread(target=downjpg, args=(url, "%d.jpg" % idx))
        for idx, url in enumerate(filepathlist, start=1)
    ]
    for task in task_threads:
        task.start()
    # Block until every download thread has finished.
    for task in task_threads:
        task.join()
    print("已经完成所有任务")
def main():
    """Entry point: download every URL in the module-level flist."""
    # NOTE(review): the original nested the __main__ guard inside main(),
    # leaving main() with no real body; reconstructed as the conventional form.
    downjpgmutithread(flist)


if __name__ == "__main__":
    main()
还需要添加的功能:
- 解析网页中的图片链接
- 对图片链接进行检测,如果图片格式、图片大小不符合要求,则不下载
- 加入异常处理机制
- 自动文件名提取,从图片链接直接提取文件名。

明天待续。

经过2小时奋战,基本功能实现(基于 Python 3.1 版本):
# Standard-library imports (the original fused all of these onto one line,
# which is a syntax error).
from urllib import request
import threading
from time import sleep, ctime
from html import parser
def downjpg(filepath, FileName="default.jpg"):
    """Download the image at *filepath* and save it as E:\\image\\FileName.

    Errors are reported on stdout instead of being raised, so one failed
    download cannot kill its worker thread.
    """
    try:
        web = request.urlopen(filepath)
        print("访问网络文件" + filepath + "\n")
        jpg = web.read()
        web.close()  # fix: the response object was never closed
        DstDir = "E:\\image\\"
        print("保存文件" + DstDir + FileName + "\n")
        try:
            # "with" closes the file even if write() raises (the original
            # leaked the handle on a failed write).
            with open(DstDir + FileName, "wb") as out_file:
                out_file.write(jpg)
        except IOError:
            print("error\n")
    except Exception:
        # Network failure (bad URL, timeout, ...) — best-effort: log and return.
        print("error\n")
def downjpgmutithread(filepathlist):
    """Download every URL in *filepathlist* concurrently, one thread per URL.

    Files are saved as 1.jpg, 2.jpg, ... in the order the URLs appear.
    """
    print("共有%d个文件需要下载" % len(filepathlist))
    for url in filepathlist:
        print(url)
    print("开始多线程下载")
    # enumerate(start=1) replaces the original hand-rolled counter.
    task_threads = [
        threading.Thread(target=downjpg, args=(url, "%d.jpg" % idx))
        for idx, url in enumerate(filepathlist, start=1)
    ]
    for task in task_threads:
        task.start()
    # Block until every download thread has finished.
    for task in task_threads:
        task.join()
    print("线程结束")
class parserLinks(parser.HTMLParser):
    """HTML parser that collects the ``src`` attribute of every <img> tag."""

    def __init__(self):
        super().__init__()
        # Per-instance list. The original used a class-level attribute,
        # so every parser instance shared one list and links accumulated
        # across independent parses.
        self.filelist = []

    def handle_starttag(self, tag, attrs):
        """Echo and record the src of each <img> start tag encountered."""
        if tag == 'img':
            for name, value in attrs:
                if name == 'src':
                    print(value)
                    self.filelist.append(value)

    def getfilelist(self):
        """Return the image URLs collected so far."""
        return self.filelist
def main(WebUrl):
    """Fetch *WebUrl*, extract every <img> src link, and download them all.

    Args:
        WebUrl: URL of the HTML page to scan for image links.
    """
    lparser = parserLinks()
    web = request.urlopen(WebUrl)
    try:
        for context in web.readlines():
            # NOTE(review): "%s" % bytes yields "b'...'" text; the tags still
            # parse, but decoding with the page's charset would be cleaner —
            # kept as-is to preserve behavior.
            _str = "%s" % context
            try:
                lparser.feed(_str)
            except Exception:
                # fix: parser.HTMLParseError was removed in Python 3.5, so
                # referencing it here would itself raise AttributeError.
                # Malformed markup is skipped best-effort, as the original
                # intended.
                pass
    finally:
        web.close()  # fix: close the connection even if parsing fails
    imagelist = lparser.getfilelist()
    downjpgmutithread(imagelist)
# WebUrl = "http://www.baidu.com/"  # 要抓取的网页链接,默认保存到e盘
WebUrl = "http://hi.baidu.com/%C7%A7%D2%B6%CF%C4%D1%A9/blog/item/0f119f5404428148d109062a.html"

# fix: guard the driver call so importing this module does not immediately
# start a network crawl.
if __name__ == "__main__":
    main(WebUrl)
本文转自 chengxuyonghu 51CTO 博客,原文链接:http://blog.51cto.com/6226001001/1576064,如需转载请自行联系原作者。