python爬取豆瓣里面活动照片的工厂设计模式

python爬取豆瓣里面活动照片的工厂设计模式
#!/usr/bin/python
# coding: utf-8

#
# filename: pachong of douban
# 
# author: Hacker_MJW
#
# Date: 2014-02-28
#


import urllib
import urllib2

import logging
import os

import re

import time



class SuperReptile(object):
    """Base class for crawlers; intended to be subclassed.

    Sets up a file logger named after the crawler and provides a helper
    for fetching a URL. Subclasses override ``download``.
    """

    def __init__(self, init_url, pro_name):
        """Store the start URL and attach a file handler to the logger.

        Args:
            init_url: URL the crawl starts from.
            pro_name: Name of the crawler; used as the logger name and,
                with a ``.txt`` suffix, as the log file name.
        """
        self.init_url = init_url
        self.pro_name = pro_name
        # Body of the last successfully fetched page. Initialised here so
        # the attribute exists even if no fetch has succeeded yet.
        self.page = ''

        self.log = logging.getLogger(self.pro_name)
        self.log.setLevel(logging.DEBUG)

        self.handler = logging.FileHandler(filename=self.pro_name + '.txt')
        self.handler.setLevel(logging.DEBUG)

        self.formatter = logging.Formatter(
            '%(asctime)s-%(name)s-%(levelname)s-%(message)s')
        self.handler.setFormatter(self.formatter)

        self.log.addHandler(self.handler)

    def open_url(self, url):
        """Fetch *url* and return the response body.

        On success the body is also stored in ``self.page``. On failure
        the error is logged and an empty string is returned, so a failed
        request is never mistaken for the previously fetched page.

        Args:
            url: The address to request.

        Returns:
            The response body, or ``''`` if the request failed.
        """
        content = ''
        try:
            rps = urllib2.urlopen(urllib2.Request(url))
            try:
                content = rps.read()
            finally:
                # Release the connection even if reading fails.
                rps.close()
            self.page = content
            self.log.info('open %s successfully' % url)
        except urllib2.HTTPError as e:
            self.log.error('httperror %d' % e.code)
            # Back off before the caller issues the next request.
            time.sleep(100)
        except Exception:
            # Log with traceback instead of silently swallowing the cause.
            self.log.exception('unknown error')
        self.handler.flush()
        return content

    def download(self):
        """Download the collected resources; overridden by subclasses."""
        pass

    def close_log(self):
        """Detach the file handler from the logger, then flush and close it."""
        self.log.removeHandler(self.handler)
        self.handler.flush()
        self.handler.close()


class DoubanReptile(SuperReptile):
    """Crawler that collects the photos of a Douban event album.

    superclass: __init__(self, init_url, pro_name)

    Typical call order: ``compile_p()``, ``get_urls()``,
    ``get_img_src()``, ``download()``, ``close_log()``.
    """

    def __init__(self, url):
        """Fetch the album's first page and prepare patterns and output dir.

        Args:
            url: URL of the first page of the album.
        """
        super(DoubanReptile, self).__init__(url, 'DoubanReptile')
        self.start_page = super(DoubanReptile, self).open_url(url).replace('\n', '')

        # Thumbnail wrapper on an album page, up to the closing </a>.
        self.p_img = r'<div\s*class="photo_wrap"\s*>(.*?)</a>'
        # Link from a thumbnail to the photo's own page.
        self.p_img_href = r'<a\s*href="(.*?)"'
        # <img> tag on a photo page.
        self.p_img_img = r'<img\s*src="(.*?)"\s*/>'
        # Pager fragment between the current-page marker and the "next" link.
        self.p_urls_a = r'<span\s*class="thispage".*?>(.*?)<span\s*class="next">'
        # Links inside the pager fragment.
        self.p_urls_b = r'<a\s*href="(.*?)"\s*>'
        self.img_list = []

        # Always set the target directory, whether or not it existed before,
        # and build it with os.path.join so the separator fits the platform.
        self.path = os.path.join(os.getcwd(), 'doubanphotos')
        if not os.path.exists(self.path):
            try:
                os.mkdir(self.path)
            except OSError as e:
                self.log.error('cannot create %s: %s' % (self.path, e))

    def compile_p(self):
        """Compile the regex patterns; call before the other methods."""
        self.img_p = re.compile(self.p_img)
        self.img_href = re.compile(self.p_img_href)
        self.img_img = re.compile(self.p_img_img)
        self.url_a = re.compile(self.p_urls_a)
        self.url_b = re.compile(self.p_urls_b)

    def get_urls(self):
        """Collect the URLs of the other album pages into ``self.url_list``.

        ``self.url_list`` is left empty when the start page contains no
        pager fragment.
        """
        pager = self.url_a.findall(self.start_page)
        if pager:
            self.url_list = self.url_b.findall(pager[0])
        else:
            self.url_list = []

    def get_img_src(self):
        """Collect the image source URLs into ``self.img_list``.

        Visits the start page and every page in ``self.url_list``, follows
        each thumbnail link to the photo page and records the first
        ``<img>`` source found there. Thumbnails without a link, and photo
        pages without an image, are skipped.
        """
        page_list = [self.start_page]
        for url in self.url_list:
            page = super(DoubanReptile, self).open_url(url).replace('\n', '')
            page_list.append(page)

        for page in page_list:
            for div in self.img_p.findall(page):
                hrefs = self.img_href.findall(div)
                if not hrefs:
                    continue
                page_big = super(DoubanReptile, self).open_url(hrefs[0]).replace('\n', '')
                sources = self.img_img.findall(page_big)
                if not sources:
                    self.log.error('no image found at %s' % hrefs[0])
                    continue
                img_src = sources[0]
                print(img_src)
                self.img_list.append(img_src)

    def download(self):
        """Save every image in ``self.img_list`` into ``self.path``.

        Each file is named after the last path segment of its URL. A
        failed download is logged and does not stop the remaining ones.
        """
        for src in self.img_list:
            print(src)
            target = os.path.join(self.path, src.split('/')[-1])
            try:
                urllib.urlretrieve(src, target)
            except IOError as e:
                self.log.error('download of %s failed: %s' % (src, e))


class ReptileFactory(object):
    """Crawler factory: maps a site name to its crawler class."""

    def __init__(self, reptile):
        """Remember which crawler is wanted.

        Args:
            reptile: Key into ``reptile_dict``; currently only ``'douban'``.
        """
        self.value = reptile
        self.reptile_dict = {'douban': DoubanReptile}

    def get_reptile(self):
        """Return the crawler class (not an instance) registered for the key.

        Raises:
            KeyError: If the key given to the constructor is not registered.
        """
        return self.reptile_dict[self.value]

    def get_reptie(self):
        """Misspelled original name of ``get_reptile``; kept for callers."""
        return self.get_reptile()



# Script entry point: build the Douban crawler through the factory and run
# the full crawl for one album.
if __name__ == '__main__':
    factory = ReptileFactory('douban')
    douban = factory.get_reptie()('http://www.douban.com/online/11698467/album/125554615/')
    try:
        douban.compile_p()
        douban.get_urls()
        douban.get_img_src()
        douban.download()
    finally:
        # Close the log file even if one of the crawl steps raises.
        douban.close_log()
    
python爬取豆瓣里面活动照片的工厂设计模式

今天学了一下 Python 强大的 logging 模块,于是想实际用一下,接着就想到了爬虫。很多时候我们需要记录爬取的过程,日志是一个非常好的方法;虽然直接写文件也可以做到,但不如日志模块强大。

python爬取豆瓣里面活动照片的工厂设计模式,布布扣,bubuko.com

python爬取豆瓣里面活动照片的工厂设计模式

上一篇:const关键字在C和C++区别


下一篇:[C++ Basic]C++与Java的主要区别