批量下载网站图片的Python实用小工具（下）

2022-05-21 01:25:07

### 引子

在批量下载网站图片的Python实用小工具一文中，讲解了开发一个Python小工具来实现网站图片的并发批量拉取。不过那个工具仅限于特定网站的特定规则，本文将基于其代码实现，开发一个更加通用的图片下载工具。

### 通用版

#### 思路

我们可以做成一个下载图片资源的通用框架：

制定生成网页资源的规则集合 PageRules；
根据 PageRules 抓取网站的网页内容集合 PageContents；
制定从网页内容集合 PageContents 获取资源真实地址的规则集合或路径集合 ResourceRules ；
根据资源规则集合批量获取资源的真实地址 ResourceTrulyAddresses ;
根据资源真实地址 ResourceTrulyAddresses 批量下载资源。

想象一条流水线：

初始 URLS --> 替换规则 --> 生成更多 URLS --> 抓取网页内容 --> 获取指定链接元素 A --> 中间 URLS -->  抓取网页内容 -->   获取指定链接元素  B -->  最终的图片源地址集合 C --> 下载图片

称 [A,B,C] 是找到图片源地址的规则路径。 其中 A, B 通常是 <a href="xxx" class="yyy"> , C 通常是 <img src="xxx.jpg" />

这里的 URLS 不一定是 .html 后缀，但通常是 html 文档，因此是什么后缀并不影响抓取网页内容。

为了使得图片下载更加通用，通用版做了如下工作：

将线程池和进程池抽离出来，做出可复用的基础组件，便于在各个环节和不同脚本里复用；
进一步拆分操作粒度，将 "抓取网页内容" 与 "根据规则从网页内容中获取链接元素" 分离出来成为单一的操作；
在单个操作的基础上提供批量操作，而这些批量操作是可以并发或并行完成的；使用 map 语法使表达更加凝练；
提供命令行参数的解析和执行；

值得提及的一点是，尽可能将操作粒度细化，形成可复用操作，对提供灵活多样的功能选项是非常有益的。比如说将下载图片操作做成一个单一函数，就可以提供一个选项，专门下载从文件中读取的图片地址资源集合；将获取初始Url 做成一个单一函数，就可以提供一个选项，专门从文件中读取初始URL资源；将"根据规则从网页内容中获取链接元素" 做成一个单一函数，就可以将规则抽离出来，以命令参数的方式传入；此外，可以在单一函数中兼容处理不同的情况，比如说，获取绝对资源链接地址，getAbsLink 就可以隔离处理相对路径的链接，绝对路径的链接，不合要求的链接等。

#### 代码

#!/usr/bin/python

#_*_encoding:utf-8_*_

import os

import re

import sys

import json

from multiprocessing import (cpu_count, Pool)

from multiprocessing.dummy import Pool as ThreadPool

import argparse

import requests

from bs4 import BeautifulSoup

# number of cpu cores; used as the process pool size
ncpus = cpu_count()

# Directory the downloaded pictures are written to.
# expanduser('~') is used instead of os.environ['HOME'] so that a missing
# HOME variable does not raise KeyError at import time.
saveDir = os.path.expanduser('~') + '/joy/pic/test'

def parseArgs():
    '''
       Parse the command line of the general downloader.

       Returns a tuple (init_urls, gene, rulepath):
         init_urls  list of url strings given by -u (at least one)
         gene       list of two strings [start, end] given by -g, or None
         rulepath   one-element list holding the json string given by -r, or None
    '''
    # The usage example is kept in sync with the real options: -g takes exactly
    # two numbers and the number is substituted for the literal text
    # 'placeholder' inside the -u url.
    description = '''This program is used to batch download pictures from specified urls.
                     eg python dwloadpics_general.py -u http://xxx_pplaceholder.html -g 1 10 -r '[{"img":["jpg"]}, {"class":["picLink"]}, {"id": ["HidenDataArea"]}]'
                     will search and download pictures from network urls http://xxx_p[1-10].html  by specified rulePath
                  '''
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-u','--url', nargs='+', help='At least one html urls are required', required=True)
    parser.add_argument('-g','--generate',nargs=2, help='Given range containing two number (start end) to generate more htmls if not empty ', required=False)
    parser.add_argument('-r','--rulepath',nargs=1,help='rule path to search pictures. if not given, search pictures in given urls', required=False)
    args = parser.parse_args()
    return (args.url, args.generate, args.rulepath)

def createDir(dirName):
    '''
       Create directory dirName (including parents) if it does not exist yet.

       Uses try/create instead of check-then-create so that a directory
       created concurrently between the check and the call is not an error.
       Raises OSError when the path cannot be created or exists but is not
       a directory.
    '''
    import errno
    try:
        os.makedirs(dirName)
    except OSError as e:
        # an already existing directory is the expected, harmless case
        if e.errno != errno.EEXIST or not os.path.isdir(dirName):
            raise

def catchExc(func):
    '''
       Decorator that calls func and turns any Exception into a printed
       message plus a None return value.

       The arguments are reported with %r: the previous str(*args) /
       str(**kwargs) raised TypeError inside the handler as soon as the
       wrapped function was called with several arguments or any keyword
       argument, which defeated the purpose of the decorator.
    '''
    def _deco(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            funcName = getattr(func, '__name__', repr(func))
            print("error catch exception for %s (%r, %r): %s" % (funcName, args, kwargs, e))
            return None
    return _deco

class IoTaskThreadPool(object):
    '''
       thread pool for io operations

       Thin wrapper around multiprocessing.dummy.Pool (imported as ThreadPool).
    '''

    def __init__(self, poolsize):
        # poolsize: number of worker threads
        self.ioPool = ThreadPool(poolsize)

    def execTasks(self, ioFunc, ioParams):
        '''
           Apply ioFunc to every element of ioParams and block until all are
           done. Returns the list of results in input order, or [] when
           ioParams is empty or None.
        '''
        if not ioParams:
            return []
        return self.ioPool.map(ioFunc, ioParams)

    def execTasksAsync(self, ioFunc, ioParams):
        '''
           Schedule ioFunc for every element of ioParams without waiting.

           Returns the AsyncResult of map_async so callers can wait for the
           batch or retrieve worker errors with .get(); returns [] when
           ioParams is empty or None (nothing is scheduled).
        '''
        if not ioParams:
            return []
        return self.ioPool.map_async(ioFunc, ioParams)

    def close(self):
        # no more tasks will be submitted
        self.ioPool.close()

    def join(self):
        # wait for the workers to finish; close() must be called first
        self.ioPool.join()

class TaskProcessPool(object):
    '''
       process pool for cpu operations or task assignment

       New-style class, consistent with IoTaskThreadPool.
    '''

    def __init__(self):
        # one worker process per cpu core (ncpus is a module-level constant)
        self.taskPool = Pool(processes=ncpus)

    def addDownloadTask(self, entryUrls):
        '''
           Schedule downloadAllForAPage for every url in entryUrls and return
           the AsyncResult of map_async.
        '''
        # NOTE(review): downloadAllForAPage is not defined in this script as
        # shown; calling this method raises NameError unless it is provided
        # elsewhere - confirm before using this class.
        return self.taskPool.map_async(downloadAllForAPage, entryUrls)

    def close(self):
        # no more tasks will be submitted
        self.taskPool.close()

    def join(self):
        # wait for the worker processes to finish; close() must be called first
        self.taskPool.join()

def getHTMLContentFromUrl(url, timeout=30):
    '''
       get html content from html url

       url      address to fetch
       timeout  seconds passed to requests.get; without it a stalled server
                blocks the worker thread forever

       Returns the response text, or '' when the request fails (connection
       error, timeout, invalid url) or the status code is not 200. Failures
       are reported and swallowed here because this function runs inside
       pool.map, where a single raised exception would abort the whole batch.
    '''
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print("error fetch %s: %s" % (url, e))
        return ''
    if r.status_code != 200:
        return ''
    return r.text

def batchGrapHtmlContents(urls):
    '''
       Fetch all urls concurrently on the module-level grapHtmlPool and
       return the list of page contents produced by getHTMLContentFromUrl,
       in the same order as urls.
    '''
    pool = grapHtmlPool
    return pool.execTasks(getHTMLContentFromUrl, urls)

def getAbsLink(link):
    '''
       Return the absolute url of an <a> element, or '' if it has no usable
       href.

       link  an element exposing its attributes as the dict link.attrs

       '//host/path' (protocol-relative) gets the 'http:' scheme prepended,
       '/path' is resolved against the module-level serverDomain, any other
       href is returned unchanged.
    '''
    try:
        href = link.attrs['href']
        # must be tested before the single '/' case, otherwise the link
        # would be glued onto serverDomain and point nowhere
        if href.startswith('//'):
            return 'http:' + href
        if href.startswith('/'):
            return serverDomain + href
        return href
    except (AttributeError, KeyError, TypeError):
        # no attrs, no href, or href is not a string
        return ''

def getTrueImgLink(imglink):
    '''
       Return the real address of an <img> element, or '' if it has no
       usable src.

       imglink  an element exposing its attributes as the dict imglink.attrs

         (1) '//host/path/x.jpg' (protocol-relative) gets 'http:' prepended
         (2) '/path/to/x.jpg' is resolved against the module-level
             serverDomain (e.g. http://somedomain)
         (3) if the address contains 'jpg@', everything after 'jpg' is cut
             off, e.g. http://host/a.jpg@900w_1l -> http://host/a.jpg
    '''
    try:
        href = imglink.attrs['src']
        # protocol-relative first: it also starts with '/', and prefixing it
        # with serverDomain would produce a broken address
        if href.startswith('//'):
            href = 'http:' + href
        elif href.startswith('/'):
            href = serverDomain + href
        pos = href.find('jpg@')
        if pos == -1:
            return href
        return href[0: pos+3]
    except (AttributeError, KeyError, TypeError):
        # no attrs, no src, or src is not a string
        return ''

def batchGetImgTrueLink(imgLinks):
    '''
       Resolve every img element with getTrueImgLink and return the list of
       non-empty addresses.
    '''
    resolved = [getTrueImgLink(img) for img in imgLinks]
    return [addr for addr in resolved if addr != '']

def findWantedLinks(htmlcontent, rule):
    '''
       Collect page links and picture links from one html document.

       rule is a dict whose keys select what to collect:
          { 'id':    [id1, id2, ..., idn] }   <a> elements with one of the ids
          { 'class': [c1, c2, ..., cn] }      <a> elements with one of the
                                              classes; '*' selects every <a>
          { 'img':   ['jpg', 'png', ... ] }   <img> elements whose src matches
                                              the value used as a regex
       Other keys are ignored.

       Returns a list: the resolved <a> links first, then the resolved
       picture addresses.
    '''
    soup = BeautifulSoup(htmlcontent, "lxml")

    def absLinksOf(tags):
        # resolve anchors to urls and drop the ones without a usable href
        return [url for url in map(getAbsLink, tags) if url != '']

    anchorUrls = []
    imgTags = []
    for key, values in rule.items():
        if key == 'id':
            for idValue in values:
                anchorUrls.extend(absLinksOf(soup.find_all('a', id=idValue)))
        elif key == 'class':
            for cls in values:
                if cls == '*':
                    tags = soup.find_all('a')
                else:
                    tags = soup.find_all('a', class_=cls)
                anchorUrls.extend(absLinksOf(tags))
        elif key == 'img':
            for picSuffix in values:
                imgTags.extend(soup.find_all('img', src=re.compile(picSuffix)))

    allLinks = list(anchorUrls)
    allLinks.extend(batchGetImgTrueLink(imgTags))
    return allLinks

def batchGetLinksByRule(htmlcontentList, rule):
    '''
       Apply findWantedLinks with the same rule to every html document and
       return all found links as one flat list, in document order.
    '''
    return [link
            for page in htmlcontentList
            for link in findWantedLinks(page, rule)]

def defineResRulePath():
    '''
        Return the rule path leading from the initial pages to the original
        picture addresses.

        When the pictures are reached through
           init htmls --> fetch contents --> rules1 --> intermediate htmls
                      --> fetch contents --> rules2 --> intermediate htmls
                      --> fetch contents --> rules3 --> picture addresses
        the rule path is [rules1, rules2, rules3].

        This implementation returns a new empty list.
    '''
    return list()

def findOriginAddressesByRulePath(initUrls, rulePath):
    '''
       find Origin Addresses of pics by rulePath started from initUrls

       For every rule in rulePath, in order: fetch the current urls, extract
       the links selected by the rule and use them as the urls of the next
       step. Only absolute http:// and https:// links are carried over
       (https links used to be dropped silently).

       Returns the urls left after the last rule; with an empty rulePath
       this is a copy of initUrls.
    '''
    result = initUrls[:]
    for rule in rulePath:
        htmlContents = batchGrapHtmlContents(result)
        links = batchGetLinksByRule(htmlContents, rule)
        result = [link for link in links
                  if link.startswith(('http://', 'https://'))]
    return result

def downloadFromUrls(initUrls, rulePath):
    '''
       Resolve the picture addresses reachable from initUrls via rulePath and
       schedule their download on the module-level dwPicPool (asynchronous;
       this function does not wait for the downloads).
    '''
    dwPicPool.execTasksAsync(
        downloadPic,
        findOriginAddressesByRulePath(initUrls, rulePath))

@catchExc
def downloadPic(picsrc, timeout=30):
    '''
       download pic from pic href such as
            http://img.pconline.com.cn/images/upload/upc/tx/photoblog/1610/21/c9/28691979_1477032141707.jpg

       The file is stored in saveDir under the last path segment of picsrc.

       picsrc   address of the picture
       timeout  seconds passed to requests.get so a stalled server cannot
                block the worker thread forever

       Errors raised here are reported by the catchExc decorator.
    '''
    picname = picsrc.rsplit('/',1)[1]
    saveFile = saveDir + '/' + picname
    picr = requests.get(picsrc, stream=True, timeout=timeout)
    # fail before the file is created, otherwise the body of an error page
    # (404, 403, ...) would be saved under a picture name
    picr.raise_for_status()
    # the with-block closes the file; no explicit flush/close is needed
    with open(saveFile, 'wb') as f:
        for chunk in picr.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)

def divideNParts(total, N):
    '''
       divide [0, total) into N parts:
        return [(0, total/N), (total/N, 2*total/N), ..., ((N-1)*total/N, total)]

       All bounds are integers (total/N is rounded down); the last part
       absorbs the remainder. Floor division is written explicitly so the
       result does not depend on how '/' treats integers.
    '''
    each = total // N
    parts = []
    for index in range(N):
        begin = index*each
        if index == N-1:
            end = total
        else:
            end = begin + each
        parts.append((begin, end))
    return parts

def testBatchGetLinks():
    '''
       Manual smoke test: fetch two list pages over the network and print
       every link that batchGetLinksByRule finds for a sample rule.
    '''
    urls = ['http://dp.pconline.com.cn/list/all_t145.html', 'http://dp.pconline.com.cn/list/all_t292.html']
    pages = [getHTMLContentFromUrl(url) for url in urls]
    rules = {'class':['picLink'], 'id': ['HidenDataArea'], 'img':['jpg']}
    for link in batchGetLinksByRule(pages, rules):
        print(link)

def generateMoreInitUrls(init_urls, gene):
    '''
      Expand the base urls over a number range.

      init_urls  base urls; each may contain the literal text 'placeholder'
      gene       pair (start, end) convertible with int(), or a false value

      When gene is false, init_urls is returned as is. Otherwise every number
      in [start, end] (both inclusive) replaces 'placeholder' in every base
      url, e.g.
        http://xxx.yyy?k1=v1&page=placeholder -> http://xxx.yyy?k1=v1&page=[start-end]
      The result is ordered by number first, then by base url.
    '''
    if not gene:
        return init_urls
    first, last = int(gene[0]), int(gene[1])
    return [baseUrl.replace('placeholder', str(num))
            for num in range(first, last + 1)
            for baseUrl in init_urls]

def parseRulePathParam(rulepathjson):
    '''
       Turn the value of the -r option into a rule path.

       rulepathjson  None, or a one-element list holding a json string

       Returns the parsed rule path (a list of rule dicts); when no value is
       given the default [{'img': ['jpg', 'png']}] is returned.

       Prints a message and exits with status 1 when the string is not valid
       json or is not a json list of objects. The shape is checked here
       because a wrongly shaped value would otherwise only fail later, in the
       middle of the crawl.
    '''
    rulepath = [{'img': ['jpg', 'png']}]
    if rulepathjson:
        try:
            rulepath = json.loads(rulepathjson[0])
        except ValueError as e:
            print('Param Error: invalid rulepath %s %s' % (rulepathjson, e))
            sys.exit(1)
        isListOfRules = isinstance(rulepath, list) and all(isinstance(rule, dict) for rule in rulepath)
        if not isListOfRules:
            print('Param Error: rulepath must be a json list of objects, got %s' % (rulepathjson,))
            sys.exit(1)
    return rulepath

def parseServerDomain(url):
    '''
       Return scheme and host of an absolute url,
       e.g. 'http://host/a/b.html' -> 'http://host'.
    '''
    # 'http://host/a/b' splits into ['http:', '', 'host', 'a/b']
    pieces = url.split('/', 3)
    scheme, host = pieces[0], pieces[2]
    return '%s//%s' % (scheme, host)

if __name__ == '__main__':
    #testBatchGetLinks()

    # command line -> initial urls and rule path
    init_urls, gene, rulepathjson = parseArgs()
    moreInitUrls = generateMoreInitUrls(init_urls, gene)
    print(moreInitUrls)
    rulepath = parseRulePathParam(rulepathjson)

    # module-level state read by the link helpers and the download functions
    serverDomain = parseServerDomain(init_urls[0])
    createDir(saveDir)
    grapHtmlPool = IoTaskThreadPool(20)
    dwPicPool = IoTaskThreadPool(20)

    # downloads are scheduled asynchronously; close + join waits for them
    downloadFromUrls(moreInitUrls, rulepath)
    dwPicPool.close()
    dwPicPool.join()

#### 用法

需要一个 Shell 控制台或终端模拟器，并安装 Python2.7、easy_install (pip)、argparse、requests、bs4 (BeautifulSoup) 以及 lxml（代码中 BeautifulSoup 使用 "lxml" 解析器）。

a. 当前页面已经包含了美图的真实地址，直接下载当前页面的所有美图，比如 http://bbs.voc.com.cn/topic-7477222-1-1.html , 可以使用

python dwloadpics_general.py -u http://bbs.voc.com.cn/topic-7477222-1-1.html

轻易地将该页的美图都下载下来；

b. 当前页面包含的是一系列美图的缩略图，这些缩略图指向包含美图真实地址的页面。比如 http://www.zcool.com.cn/works/33!35!!0!0!200!1!1!!!/ 打开时会呈现出一系列美图以及高清图片的链接。在浏览器控制台中查看，这些链接的 className = "image-link"，最终的高清图片是目标页面中的 <img src="xxx.jpg" /> 元素。那么找到图片真实地址的规则路径是： [{"class":["image-link"]}, {"img":["jpg"]}] ，命令行是：

python dwloadpics_general.py -u 'http://www.zcool.com.cn/works/33!35!!0!0!200!1!1!!!/' -r '[{"class":["image-link"]}, {"img":["jpg"]}]'

这里使用了单引号，使 Shell 特殊字符（如 !、空格等）不被解释，而是作为普通字符传给程序。

c. 多页拉取

假设我们对这个风光系列很感兴趣，那么可以将所有页的图片都批量下载下来。怎么做呢？　首先可以分析，

第一页是 http://www.zcool.com.cn/works/33!35!!0!0!200!1!1!!! , 　第５页是 http://www.zcool.com.cn/works/33!35!!0!0!200!1!5!!! ；以此类推，第 i 页是 http://www.zcool.com.cn/works/33!35!!0!0!200!1!i!!! , 只要生成初始 urls = http://www.zcool.com.cn/works/33!35!!0!0!200!1![1-N]!!! 即可。这时候 -g 选项就派上用场啦！

python dwloadpics_general.py -u 'http://www.zcool.com.cn/works/33!35!!0!0!200!1!placeholder!!!' -r '[{"class":["image-link"]}, {"img":["jpg"]}]' -g 1 2

-u 传入基础 url : http://www.zcool.com.cn/works/33!35!!0!0!200!1!placeholder!!! , -g 生成指定范围之间的数字 i 并替换 placeholder, 就可以拼接成目标 url 了。

### 终极杀手版

#### 思路

技术人的追求是永无止境的，尽管未必与产品和业务同学的观点一致。^_^

即使通用版，也有一些不方便的地方：

(1) 必须熟悉定义的规则路径，甚至需要了解一些CSS知识，对于普通人来说，实在难以理解；

(2) 对于千变万化的网站图片存储规则，仅从部分网站提取的规律并不能有效地推广。

因此，我又萌生一个想法：　做一个终极杀手版。

一切皆是链接。或取之，或舍之。真理的形式总是如此简洁美妙。但取舍之间，却是大智慧的体现。真理的内容又是如此错综复杂。

如果互联网是相通的一张巨大蜘蛛网，那么从一点出发，使用广度遍历算法，一定可以抵达每一个角落。通用版实际是指定了直达目标的一条路径。从另一个角度来说，实际上只要给定一个初始URL链接, 递归地获取URL链接，分析链接内容获得URL链接，提取 img 图片元素即可。

为此，定义另一种参数： loop 回合，或者深度。假设从初始URL init_url 出发，经过 init_url -> mid_1 url -> mid_2 url -> origin address of pic OAOP，那么 loop = 3. 也就是说，从 init_url 获取链接 mid_1 url ，从 mid_1 url 的文档的内容中获取链接 mid_2 url ，从 mid_2 url 的文档内容中获取图片的真实地址 OAOP，那么，称作进行了三个回合。类似于交通中的转车一样。这样，用户就不需要知道控制台，Class，规则路径之类的东东了。

现在的重点是取舍之道。取相对简单，只要获取链接元素即可，舍即是大道。对于链接元素来说，从一个网页可以链接到任意网页，如果不加限制，就会陷入失控。因此，定义了白名单网站，只获取白名单网站内的链接；对于图片元素来说，经过对几个主流网站的查看，发现最终图片基本采用 jpg 格式。而我们的目标是高清图片，那么对大小也是有要求的。可以定义大小的参数，让用户选择。更智能的，通过图片内容来分析是否是所需图片，恕才疏学浅，暂难办到。

现在打开 http://dp.pconline.com.cn/list/all_t145_p1.html ，只要使用

python dwloadpics_killer.py -u 'http://dp.pconline.com.cn/list/all_t145_p1.html' -l 3

就能下载到大量美图啦！ loop 值越大，抓取网页的范围就越大，所需流量也越大，要慎用哦！ So Crazy !

　

#### 代码

#!/usr/bin/python

#_*_encoding:utf-8_*_

import os

import re

import sys

import json

from multiprocessing import (cpu_count, Pool)

from multiprocessing.dummy import Pool as ThreadPool

import argparse

import requests

from bs4 import BeautifulSoup

import Image

# number of cpu cores; used as the process pool size
ncpus = cpu_count()

# Directory the downloaded pictures are written to.
# expanduser('~') is used instead of os.environ['HOME'] so that a missing
# HOME variable does not raise KeyError at import time.
saveDir = os.path.expanduser('~') + '/joy/pic/test'

# only links whose server part contains one of these names are followed
whitelist = ['pconline', 'zcool', 'huaban', 'taobao', 'voc']

# defaults for the -l (depth) and -s (minimal width / height) options
DEFAULT_LOOPS = 1
DEFAULT_WIDTH = 800
DEFAULT_HEIGHT = 600

def isInWhiteList(url):
    '''
       Return True when url contains at least one entry of the module-level
       whitelist as a substring, otherwise False.
    '''
    return any(name in url for name in whitelist)

def parseArgs():
    '''
       Parse the command line of the killer downloader.

       Returns a tuple (init_url, loops, size):
         init_url  the url given by -u
         loops     crawl depth given by -l as int, DEFAULT_LOOPS if omitted
         size      [width, height] given by -s as ints,
                   [DEFAULT_WIDTH, DEFAULT_HEIGHT] if omitted

       Conversion and defaults are delegated to argparse: the previous
       int(args.loop) raised TypeError whenever -l was omitted, and the -s
       values stayed strings although they are compared with pixel counts.
    '''
    description = '''This program is used to batch download pictures from specified initial url.
                     eg python dwloadpics_killer.py -u init_url
                  '''
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-u','--url', help='One initial url is required', required=True)
    parser.add_argument('-l','--loop', type=int, default=DEFAULT_LOOPS, help='download url depth')
    parser.add_argument('-s','--size', nargs=2, type=int, default=[DEFAULT_WIDTH, DEFAULT_HEIGHT], help='specify expected size that should be at least, (width height) ')
    args = parser.parse_args()
    return (args.url, args.loop, args.size)

def createDir(dirName):
    '''
       Create directory dirName (including parents) if it does not exist yet.

       Uses try/create instead of check-then-create so that a directory
       created concurrently between the check and the call is not an error.
       Raises OSError when the path cannot be created or exists but is not
       a directory.
    '''
    import errno
    try:
        os.makedirs(dirName)
    except OSError as e:
        # an already existing directory is the expected, harmless case
        if e.errno != errno.EEXIST or not os.path.isdir(dirName):
            raise

def catchExc(func):
    '''
       Decorator that calls func and turns any Exception into a printed
       message plus a None return value.

       The arguments are reported with %r: the previous str(*args) /
       str(**kwargs) raised TypeError inside the handler as soon as the
       wrapped function was called with several arguments or any keyword
       argument, which defeated the purpose of the decorator.
    '''
    def _deco(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            funcName = getattr(func, '__name__', repr(func))
            print("error catch exception for %s (%r, %r): %s" % (funcName, args, kwargs, e))
            return None
    return _deco

class IoTaskThreadPool(object):
    '''
       thread pool for io operations

       Thin wrapper around multiprocessing.dummy.Pool (imported as ThreadPool).
    '''

    def __init__(self, poolsize):
        # poolsize: number of worker threads
        self.ioPool = ThreadPool(poolsize)

    def execTasks(self, ioFunc, ioParams):
        '''
           Apply ioFunc to every element of ioParams and block until all are
           done. Returns the list of results in input order, or [] when
           ioParams is empty or None.
        '''
        if not ioParams:
            return []
        return self.ioPool.map(ioFunc, ioParams)

    def execTasksAsync(self, ioFunc, ioParams):
        '''
           Schedule ioFunc for every element of ioParams without waiting.

           Returns the AsyncResult of map_async so callers can wait for the
           batch or retrieve worker errors with .get(); returns [] when
           ioParams is empty or None (nothing is scheduled).
        '''
        if not ioParams:
            return []
        return self.ioPool.map_async(ioFunc, ioParams)

    def close(self):
        # no more tasks will be submitted
        self.ioPool.close()

    def join(self):
        # wait for the workers to finish; close() must be called first
        self.ioPool.join()

class TaskProcessPool(object):
    '''
       process pool for cpu operations or task assignment

       New-style class, consistent with IoTaskThreadPool.
    '''

    def __init__(self):
        # one worker process per cpu core (ncpus is a module-level constant)
        self.taskPool = Pool(processes=ncpus)

    def addDownloadTask(self, entryUrls):
        '''
           Schedule downloadAllForAPage for every url in entryUrls and return
           the AsyncResult of map_async.
        '''
        # NOTE(review): downloadAllForAPage is not defined in this script as
        # shown; calling this method raises NameError unless it is provided
        # elsewhere - confirm before using this class.
        return self.taskPool.map_async(downloadAllForAPage, entryUrls)

    def close(self):
        # no more tasks will be submitted
        self.taskPool.close()

    def join(self):
        # wait for the worker processes to finish; close() must be called first
        self.taskPool.join()

def getHTMLContentFromUrl(url, timeout=30):
    '''
       get html content from html url

       url      address to fetch
       timeout  seconds passed to requests.get; without it a stalled server
                blocks the worker thread forever

       Returns the response text, or '' when the request fails (connection
       error, timeout, invalid url) or the status code is not 200. Failures
       are reported and swallowed here because this function runs inside
       pool.map, where a single raised exception would abort the whole batch.
    '''
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print("error fetch %s: %s" % (url, e))
        return ''
    if r.status_code != 200:
        return ''
    return r.text

def batchGrapHtmlContents(urls):
    '''
       Fetch all urls concurrently on the module-level grapHtmlPool and
       return the list of page contents produced by getHTMLContentFromUrl,
       in the same order as urls.
    '''
    pool = grapHtmlPool
    return pool.execTasks(getHTMLContentFromUrl, urls)

def getAbsLink(link):
    '''
       Return the absolute url of an <a> element, or '' if it has none that
       can be followed.

       link  an element exposing its attributes as the dict link.attrs

       '//host/path' (protocol-relative) gets the 'http:' scheme prepended,
       '/path' is resolved against the module-level serverDomain, absolute
       http:// and https:// urls are returned unchanged (https used to be
       dropped); everything else (page-relative paths, javascript:, mailto:,
       ...) yields ''.
    '''
    try:
        href = link.attrs['href']
        if href.startswith('//'):
            return 'http:' + href
        if href.startswith('/'):
            return serverDomain + href
        if href.startswith(('http://', 'https://')):
            return href
        return ''
    except (AttributeError, KeyError, TypeError):
        # no attrs, no href, or href is not a string
        return ''

def filterLink(link):
    '''
       only search for pictures in websites specified in the whitelist

       Returns True when link is an absolute http:// or https:// url whose
       server part (scheme + host) passes isInWhiteList, otherwise False.
       The empty string is rejected.
    '''
    # also rejects '' ; https links used to be rejected here as well
    if not link.startswith(('http://', 'https://')):
        return False
    # only the server part is checked, so a whitelisted name that merely
    # appears in the path does not qualify the link
    return isInWhiteList(parseServerDomain(link))

def filterImgLink(imgLink):
    '''
       Return True when imgLink passes filterLink and ends with '.jpg',
       otherwise False.

       Always returns a bool; the previous version fell through and returned
       None when filterLink rejected the link.
    '''
    return filterLink(imgLink) and imgLink.endswith('.jpg')

def getTrueImgLink(imglink):
    '''
       Return the real address of an <img> element, or '' if it has no
       usable src.

       imglink  an element exposing its attributes as the dict imglink.attrs

         (1) '//host/path/x.jpg' (protocol-relative) gets 'http:' prepended
         (2) '/path/to/x.jpg' is resolved against the module-level
             serverDomain (e.g. http://somedomain)
         (3) if the address contains 'jpg@', everything after 'jpg' is cut
             off, e.g. http://host/a.jpg@900w_1l -> http://host/a.jpg
    '''
    try:
        href = imglink.attrs['src']
        # protocol-relative first: it also starts with '/', and prefixing it
        # with serverDomain would produce a broken address
        if href.startswith('//'):
            href = 'http:' + href
        elif href.startswith('/'):
            href = serverDomain + href
        pos = href.find('jpg@')
        if pos == -1:
            return href
        return href[0: pos+3]
    except (AttributeError, KeyError, TypeError):
        # no attrs, no src, or src is not a string
        return ''

def findAllLinks(htmlcontent, linktag):
    '''
       Return the resolved, non-empty addresses of all linktag elements in
       htmlcontent.

       'a' elements are resolved with getAbsLink, any other tag name with
       getTrueImgLink.
    '''
    soup = BeautifulSoup(htmlcontent, "lxml")
    resolve = getAbsLink if linktag == 'a' else getTrueImgLink
    resolved = [resolve(tag) for tag in soup.find_all(linktag)]
    return [addr for addr in resolved if addr != '']

def findAllALinks(htmlcontent):
    '''Return the addresses of all <a> elements in htmlcontent.'''
    return findAllLinks(htmlcontent, linktag='a')

def findAllImgLinks(htmlcontent):
    '''Return the addresses of all <img> elements in htmlcontent.'''
    return findAllLinks(htmlcontent, linktag='img')

def flat(listOfList):
    '''
       Flatten one nesting level: [[a, b], [c]] -> [a, b, c].
       Returns a new list.
    '''
    merged = []
    for sublist in listOfList:
        merged.extend(sublist)
    return merged

@catchExc
def downloadPic(picsrc, timeout=30):
    '''
       download pic from pic href such as
            http://img.pconline.com.cn/images/upload/upc/tx/photoblog/1610/21/c9/28691979_1477032141707.jpg

       The file is stored in saveDir under the last path segment of picsrc.

       picsrc   address of the picture
       timeout  seconds passed to requests.get so a stalled server cannot
                block the worker thread forever

       Returns the path of the saved file. Errors raised here are reported by
       the catchExc decorator, which then returns None.
    '''
    picname = picsrc.rsplit('/',1)[1]
    saveFile = saveDir + '/' + picname
    picr = requests.get(picsrc, stream=True, timeout=timeout)
    # fail before the file is created, otherwise the body of an error page
    # (404, 403, ...) would be saved under a picture name
    picr.raise_for_status()
    # the with-block closes the file; no explicit flush/close is needed
    with open(saveFile, 'wb') as f:
        for chunk in picr.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return saveFile

@catchExc
def removeFileNotExpected(filename):
    '''
       Delete the picture file filename when it is smaller than expected.

       The expected minimum is read from the module-level size
       (size[0] = width, size[1] = height). A picture is removed when its
       width OR its height is below the minimum.

       Errors raised here are reported by the catchExc decorator.
    '''
    # size may come straight from the command line as strings; convert so the
    # comparison below is numeric
    expectedWidth = int(size[0])
    expectedHeight = int(size[1])
    img = Image.open(filename)
    imgsize = img.size
    if imgsize[0] < expectedWidth or imgsize[1] < expectedHeight:
        os.remove(filename)

def downloadAndCheckPic(picsrc):
    '''
       Download picsrc with downloadPic and hand the saved file to
       removeFileNotExpected for the size check.
    '''
    saveFile = downloadPic(picsrc)
    # downloadPic yields None when the download failed; there is no file to
    # check then
    if saveFile is not None:
        removeFileNotExpected(saveFile)

def batchDownloadPics(imgAddresses):
    '''
       Schedule downloadAndCheckPic for every address on the module-level
       dwPicPool (asynchronous; does not wait for completion).
    '''
    pool = dwPicPool
    pool.execTasksAsync(downloadAndCheckPic, imgAddresses)

def downloadFromUrls(urls, loops):
    '''
       Run one crawl round over urls.

       Fetches all pages and returns the <a> links in them that pass
       filterLink. Only when loops == 1 (the last round) the <img> addresses
       passing filterImgLink are additionally scheduled for download.
    '''
    pages = batchGrapHtmlContents(urls)
    nextUrls = [link
                for page in pages
                for link in findAllALinks(page)
                if filterLink(link)]
    if loops == 1:
        imgAddresses = [src
                        for page in pages
                        for src in findAllImgLinks(page)
                        if filterImgLink(src)]
        batchDownloadPics(imgAddresses)
    return nextUrls

def startDownload(init_url, loops=3):
    '''
       Crawl starting at init_url for the given number of rounds.

       if init_url -> mid_1 url -> mid_2 url -> true image address
       then loops = 3 ; default loops = 3

       Each round feeds the links returned by downloadFromUrls into the next
       one while the remaining round count is passed along.

       With loops <= 0 nothing is done; the previous 'decrement, then stop at
       exactly 0' loop never terminated for such values.
    '''
    urls = [init_url]
    while loops > 0:
        urls = downloadFromUrls(urls, loops)
        loops -= 1

def divideNParts(total, N):
    '''
       divide [0, total) into N parts:
        return [(0, total/N), (total/N, 2*total/N), ..., ((N-1)*total/N, total)]

       All bounds are integers (total/N is rounded down); the last part
       absorbs the remainder. Floor division is written explicitly so the
       result does not depend on how '/' treats integers.
    '''
    each = total // N
    parts = []
    for index in range(N):
        begin = index*each
        if index == N-1:
            end = total
        else:
            end = begin + each
        parts.append((begin, end))
    return parts

def parseServerDomain(url):
    '''
       Return scheme and host of an absolute url,
       e.g. 'http://host/a/b.html' -> 'http://host'.
    '''
    # 'http://host/a/b' splits into ['http:', '', 'host', 'a/b']
    pieces = url.split('/', 3)
    scheme, host = pieces[0], pieces[2]
    return '%s//%s' % (scheme, host)

if __name__ == '__main__':
    # size is read as a module-level value by removeFileNotExpected
    init_url, loops, size = parseArgs()

    # module-level state read by the link helpers and the download functions
    serverDomain = parseServerDomain(init_url)
    createDir(saveDir)
    grapHtmlPool = IoTaskThreadPool(10)
    dwPicPool = IoTaskThreadPool(10)

    # downloads are scheduled asynchronously; close + join waits for them
    startDownload(init_url, loops)
    dwPicPool.close()
    dwPicPool.join()

### 小结

通过一个针对特定目标网站的批量图片下载工具的实现，从一个串行版本改造成一个并发的更加通用的版本，学到了如下经验：

将线程池、进程池、任务分配等基础组件通用化，才能在后续更省力地编写程序，不必一次次写重复代码；
更加通用可扩展的程序，需要更小粒度更可复用的单一微操作；
需要能够分离变量和不变量，并敏感地意识到可能的变量以及容纳的方案；
通过寻找规律，提炼规则，并将规则使用数据结构可配置化，从而使得工具更加通用；
通过探究本质，可以达到更加简洁有效的思路和实现；

实际上，图片网站的规则可谓千变万化，针对某个或某些网站提炼的规则对于其他网站不一定有效；如果要做成更强大通用的图片下载器，则需要对主流网站的图片存放及链接方式做一番调研，归纳出诸多规则集合，然后集中做成规则匹配引擎，甚至是更智能的图片下载工具。不过，对于个人日常使用来说，只要能顺利下载比较喜欢的网站的图片，逐步增强获取图片真实地址的规则集合，也是可以滴 ~~

 本文原创， 转载请注明出处，谢谢！ :)

码农公寓

引子###

通用版###

思路####

代码####

用法####

终极杀手版###

思路####

代码####

小结###

相关文章