源码
class DepthMiddleware(object): def __init__(self, maxdepth, stats, verbose_stats=False, prio=1): self.maxdepth = maxdepth self.stats = stats self.verbose_stats = verbose_stats self.prio = prio @classmethod def from_crawler(cls, crawler): settings = crawler.settings maxdepth = settings.getint('DEPTH_LIMIT') verbose = settings.getbool('DEPTH_STATS_VERBOSE') prio = settings.getint('DEPTH_PRIORITY') return cls(maxdepth, crawler.stats, verbose, prio) def process_spider_output(self, response, result, spider): def _filter(request): if isinstance(request, Request): depth = response.meta['depth'] + 1 request.meta['depth'] = depth if self.prio: request.priority -= depth * self.prio if self.maxdepth and depth > self.maxdepth: logger.debug( "Ignoring link (depth > %(maxdepth)d): %(requrl)s ", {'maxdepth': self.maxdepth, 'requrl': request.url}, extra={'spider': spider} ) return False else: if self.verbose_stats: self.stats.inc_value('request_depth_count/%s' % depth, spider=spider) self.stats.max_value('request_depth_max', depth, spider=spider) return True # base case (depth=0) if 'depth' not in response.meta: response.meta['depth'] = 0 if self.verbose_stats: self.stats.inc_value('request_depth_count/0', spider=spider) return (r for r in result or () if _filter(r))
配置
DEPTH_LIMIT = 2 深度限制
开启后 有输出 request_depth_0 1 2 3 4 分别收集了多少个
DEPTH_STATS_VERBOSE = True 深度状态收集
DEPTH_PRIORITY = 5 深度变化 每次深度会加5 会设置为正负数