#!/usr/bin/python
# coding: utf-8
#
# filename: douban photo crawler
# author:   Hacker_MJW
# date:     2014-02-28
#
# NOTE(review): the original source had its quote characters mangled into
# typographic quotes (a syntax error) and targeted Python 2 (urllib2, print
# statement).  Rewritten for Python 3 using only the standard library.
import logging
import os
import re
import time
import urllib.error
import urllib.request


class SuperReptile(object):
    """Base class for crawlers: owns a file-backed logger and page fetching.

    Parameters
    ----------
    init_url : str
        Starting URL of the crawl.
    pro_name : str
        Project name; doubles as the logger name and the log-file stem.
    """

    def __init__(self, init_url, pro_name):
        self.init_url = init_url
        self.pro_name = pro_name
        self.log = logging.getLogger(self.pro_name)
        self.log.setLevel(logging.DEBUG)
        # One log file per project, e.g. "DoubanReptile.txt".
        self.handler = logging.FileHandler(filename=self.pro_name + '.txt')
        self.handler.setLevel(logging.DEBUG)
        self.formatter = logging.Formatter(
            '%(asctime)s-%(name)s-%(levelname)s-%(message)s')
        self.handler.setFormatter(self.formatter)
        self.log.addHandler(self.handler)

    def open_url(self, url):
        """Fetch *url* and return the page body as text ('' on failure).

        An HTTP error triggers a long back-off sleep (preserved from the
        original, presumably to dodge rate limiting).  Every outcome is
        logged and the handler flushed so progress survives a crash.
        """
        page = ''
        req = urllib.request.Request(url)
        try:
            rps = urllib.request.urlopen(req)
            page = rps.read().decode('utf-8', errors='replace')
            self.log.info('open %s successfully' % url)
        except urllib.error.HTTPError as e:
            self.log.error('httperror %d' % e.code)
            time.sleep(100)  # back off before the caller tries further URLs
        except Exception:
            # The original swallowed everything silently; stay best-effort
            # but record the traceback.
            self.log.exception('unknown error')
        self.handler.flush()
        # BUG FIX: the original returned self.page, which raised
        # AttributeError when the very first fetch failed (and returned a
        # stale page on later failures).  Keep the attribute for backward
        # compatibility but always return this call's own result.
        self.page = page
        return page

    def download(self):
        """Hook for subclasses; the base class downloads nothing."""
        pass

    def close_log(self):
        """Detach and close the file handler, releasing the log file."""
        self.log.removeHandler(self.handler)
        self.handler.flush()
        self.handler.close()


class DoubanReptile(SuperReptile):
    """Crawler for the photos attached to a Douban event album.

    superclass: SuperReptile.__init__(self, init_url, pro_name)
    """

    def __init__(self, url):
        super(DoubanReptile, self).__init__(url, 'DoubanReptile')
        # Newlines are stripped so the regexes can match across "lines".
        self.start_page = self.open_url(url).replace('\n', '')
        # Raw strings: these patterns rely on backslash escapes.
        self.p_img = r'<div\s*class="photo_wrap"\s*>(.*?)</a>'
        self.p_img_href = r'<a\s*href="(.*?)"'
        self.p_img_img = r'<img\s*src="(.*?)"\s*/>'
        self.p_urls_a = r'<span\s*class="thispage".*?>(.*?)<span\s*class="next">'
        self.p_urls_b = r'<a\s*href="(.*?)"\s*>'
        self.img_list = []
        # BUG FIX: the original assigned self.path only in the else-branch
        # of "if not os.path.exists(...)", so the very first run (directory
        # absent) crashed later in download() with AttributeError.  Also
        # replaces the hard-coded Windows "\\" separator with os.path.join.
        self.path = os.path.join(os.getcwd(), 'doubanphotos')
        try:
            os.makedirs(self.path, exist_ok=True)
        except OSError:
            self.log.exception('could not create %s' % self.path)

    def compile_p(self):
        """Compile every regex pattern (call once, before get_urls)."""
        self.img_p = re.compile(self.p_img)
        self.img_href = re.compile(self.p_img_href)
        self.img_img = re.compile(self.p_img_img)
        self.url_a = re.compile(self.p_urls_a)
        self.url_b = re.compile(self.p_urls_b)

    def get_urls(self):
        """Collect the album's pagination URLs from the start page."""
        div = self.url_a.findall(self.start_page)
        # Guard div[0]: an unmatched pattern no longer raises IndexError.
        self.url_list = self.url_b.findall(div[0]) if div else []

    def get_img_src(self):
        """Visit every album page, then every photo page, collecting the
        full-size image source URLs into self.img_list."""
        page_list = [self.start_page]
        for url in self.url_list:
            page_list.append(self.open_url(url).replace('\n', ''))
        for page in page_list:
            for div in self.img_p.findall(page):
                img_href = self.img_href.findall(div)
                if not img_href:
                    continue
                # Each thumbnail links to a page holding the big image.
                page_big = self.open_url(img_href[0]).replace('\n', '')
                srcs = self.img_img.findall(page_big)
                if srcs:
                    print(srcs[0])
                    self.img_list.append(srcs[0])

    def download(self):
        """Download every collected image into self.path, named after the
        last path component of its URL."""
        for src in self.img_list:
            print(src)
            urllib.request.urlretrieve(
                src, os.path.join(self.path, src.split('/')[-1]))


class ReptileFactory(object):
    """Maps a crawler key (e.g. 'douban') to its crawler class."""

    def __init__(self, reptile):
        self.value = reptile
        self.reptile_dict = {'douban': DoubanReptile}

    def get_reptie(self):
        # NOTE(review): keeps the original's typo ("reptie") so existing
        # callers continue to work.
        return self.reptile_dict[self.value]


if __name__ == '__main__':
    factory = ReptileFactory('douban')
    douban = factory.get_reptie()(
        'http://www.douban.com/online/11698467/album/125554615/')
    douban.compile_p()
    douban.get_urls()
    douban.get_img_src()
    douban.download()
    douban.close_log()
今天学了一下 Python 强大的 logging 模块,于是就想着用一下,然后我就想到了爬虫。很多时候我们需要记录爬取的中间过程,日志是一个非常好的方法;虽然直接写文件也能完成同样的事,但不如 logging 模块功能强大。