一、Downloader middleware
1、Use cases
Proxies (see the sketch below)
USER_AGENT (just configure it in the settings file)
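Both use cases in a minimal sketch, assuming a downloadermd.py module like the one used later in this section; ProxyDM and the proxy address are made up for illustration and would still need an entry in DOWNLOADER_MIDDLEWARES:

# settings.py -- a global User-Agent is just a setting, no middleware needed
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'

# downloadermd.py -- per-request proxy via a downloader middleware
class ProxyDM(object):
    def process_request(self, request, spider):
        # Scrapy's built-in HttpProxyMiddleware honours this meta key
        request.meta['proxy'] = 'http://127.0.0.1:8888'
        return None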
2、Defining the middleware classes
a、process_request returns None
Execution order:
md1 request -> md2 request -> md2 response -> md1 response
class DM1(object):
    def process_request(self, request, spider):
        print('M1 request', request)
        return None

    def process_response(self, request, response, spider):
        print('M1 response', response)
        return response

    def process_exception(self, request, exception, spider):
        pass


class DM2(object):
    def process_request(self, request, spider):
        print('M2 request', request)
        return None

    def process_response(self, request, response, spider):
        print('M2 response', response)
        return response

    def process_exception(self, request, exception, spider):
        pass
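With DM1 at 543 and DM2 at 545 (see the settings in section 3), each downloaded page should print roughly the following, matching the order above (the exact request/response reprs depend on the URL being crawled):

M1 request <GET http://quotes.toscrape.com/page/1/>
M2 request <GET http://quotes.toscrape.com/page/1/>
M2 response <200 http://quotes.toscrape.com/page/1/>
M1 response <200 http://quotes.toscrape.com/page/1/>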
b、process_request returns a Response
Order: md1 request -> md2 response -> md1 response (DM2's process_request is skipped)
from scrapy.http import Response

class DM1(object):
    def process_request(self, request, spider):
        print('M1 request', request)
        # short-circuit the download by returning a Response directly
        return Response(url='http://www.test.com', status=200, headers=None, body=b'test')

    def process_response(self, request, response, spider):
        print('M1 response', response)
        return response

    def process_exception(self, request, exception, spider):
        pass


class DM2(object):
    def process_request(self, request, spider):
        print('M2 request', request)
        return None

    def process_response(self, request, response, spider):
        print('M2 response', response)
        return response

    def process_exception(self, request, exception, spider):
        pass
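Since DM1 short-circuits the download, whatever Response it returns is what the spider callback eventually receives (after passing back through the process_response chain). A minimal spider to observe that; the spider name and start URL are assumptions:

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/page/1/']

    def parse(self, response):
        # the real page is never downloaded; DM1's fabricated response arrives here
        print(response.url, response.status, response.body)  # http://www.test.com 200 b'test'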
c、process_request returns a Request
This blocks the crawl in a loop: the returned request goes back to the scheduler -> downloader middleware -> back to the scheduler again, and so on (see the sketch after the code for one way to break out).
from scrapy.http import Request

class DM1(object):
    def process_request(self, request, spider):
        print('M1 request', request)
        # replace every request with a new one; it is sent back to the scheduler
        return Request('http://quotes.toscrape.com/page/2/')

    def process_response(self, request, response, spider):
        print('M1 response', response)
        return response

    def process_exception(self, request, exception, spider):
        pass


class DM2(object):
    def process_request(self, request, spider):
        print('M2 request', request)
        return None

    def process_response(self, request, response, spider):
        print('M2 response', response)
        return response

    def process_exception(self, request, exception, spider):
        pass
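As written above, DM1 rewrites every request, including the replacement it just produced, so the crawl never makes progress. One way out, sketched below, is to tag the replacement through request.meta and let tagged requests pass through; the 'rewritten' meta key is an assumption for illustration:

from scrapy.http import Request

class DM1(object):
    def process_request(self, request, spider):
        # skip requests we already rewrote, otherwise they loop forever
        if request.meta.get('rewritten'):
            return None
        return Request('http://quotes.toscrape.com/page/2/',
                       meta={'rewritten': True},
                       dont_filter=True)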
d、Raising an exception, which must be caught by process_exception
from scrapy.exceptions import IgnoreRequest

class DM1(object):
    def process_request(self, request, spider):
        print('M1 request', request)
        raise IgnoreRequest('an exception occurred')

    def process_response(self, request, response, spider):
        print('M1 response', response)
        return response

    def process_exception(self, request, exception, spider):
        # the exception raised in process_request ends up here
        print(exception)


class DM2(object):
    def process_request(self, request, spider):
        print('M2 request', request)
        return None

    def process_response(self, request, response, spider):
        print('M2 response', response)
        return response

    def process_exception(self, request, exception, spider):
        pass
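process_exception can do more than log: per the downloader middleware contract, returning a Response from it ends the exception chain and the process_response chain runs as if the download had succeeded, while returning a Request reschedules it. A minimal recovery sketch, assuming a made-up fallback body:

from scrapy.http import Response
from scrapy.exceptions import IgnoreRequest

class DM1(object):
    def process_request(self, request, spider):
        raise IgnoreRequest('something went wrong')

    def process_exception(self, request, exception, spider):
        # swallow the exception and hand the engine a stand-in response
        return Response(url=request.url, status=200, body=b'fallback')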
3、Settings
DOWNLOADER_MIDDLEWARES = {
    # 'toscrapy.middlewares.ToscrapyDownloaderMiddleware': 543,
    'toscrapy.downloadermd.DM1': 543,
    'toscrapy.downloadermd.DM2': 545,
}
Lower numbers sit closer to the engine: DM1 (543) runs its process_request before DM2 (545) and its process_response after it, which gives the order shown in section a.
二、Spider middleware
1、Use cases
Depth and priority (this is what Scrapy's built-in DepthMiddleware handles; a simplified sketch follows)
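A simplified sketch of the idea, assuming made-up limit values; the real DepthMiddleware reads DEPTH_LIMIT and DEPTH_PRIORITY from settings:

from scrapy.http import Request

class DepthLikeMiddleware(object):
    maxdepth = 3  # assumed limit
    prio = 1      # assumed priority adjustment per level

    def process_spider_output(self, response, result, spider):
        depth = response.meta.get('depth', 0) + 1
        for item_or_request in result:
            if isinstance(item_or_request, Request):
                item_or_request.meta['depth'] = depth
                item_or_request.priority -= depth * self.prio  # deprioritize deeper pages
                if self.maxdepth and depth > self.maxdepth:
                    continue  # drop requests beyond the depth limit
            yield item_or_request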
2、Defining the class
from scrapy import signals


class MySpiderMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        # connect spider_opened to the matching signal, otherwise it is never called
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # called for each response before it enters the spider's parse method
    def process_spider_input(self, response, spider):
        print('in')
        return None

    # called once per response with everything the parse method yields
    def process_spider_output(self, response, result, spider):
        print('out')
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    # called only once, when the spider starts, with the start requests
    def process_start_requests(self, start_requests, spider):
        print('start')
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
3、Settings
SPIDER_MIDDLEWARES = {
    'toscrapy.spidermd.MySpiderMiddleware': 543,
}