dupefilter: deduplicating visited URLs

Step 1: in the spider file chouti.py

import scrapy
from scrapy.http import Request
from xdb.items import XdbItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']
    # start_urls = ['http://127.0.0.1:80/app01/login/']

    def parse(self, response):
        # print(response, type(response))
        # print(response.text)
        content_list = response.xpath('//div[@class="link-con"]//div[@class="link-detail"]')
        for item in content_list:
            text = item.xpath('./a/text()').extract_first()
            href = item.xpath('./a/@href').extract_first()
            yield XdbItem(text=text, href=href)
            # print(href)

        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            # before scheduling, Scrapy first calls request_seen() on the configured dupefilter (here XdbDupeFilter)
            yield Request(url=page, callback=self.parse)
            # yield Request(url=page, callback=self.parse, dont_filter=True)  # dont_filter=True bypasses the dedup rule for this request

Step 2: create a custom file dupefilters.py

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class XdbDupeFilter(BaseDupeFilter):

    def __init__(self):
        self.visited_fd = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        # hash the request URL into a fixed-length fingerprint string
        fd = request_fingerprint(request)
        if fd in self.visited_fd:
            return True  # True means the request was seen before, so it is not crawled again
        self.visited_fd.add(fd)

    def open(self):  # can return a deferred
        # called when the spider starts
        pass

    def close(self, reason):  # can return a deferred
        # called when the spider closes
        pass

    def log(self, request, spider):
        # log that a request has been filtered
        pass

Step 3: configure settings.py

# override the default dedup rule
# DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
DUPEFILTER_CLASS = 'xdb.dupefilters.XdbDupeFilter'
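
A quick sanity check of the fingerprinting step (a minimal standalone sketch, not part of the original walkthrough; the URLs are made up for illustration, and request_fingerprint is deprecated in newer Scrapy releases in favour of the REQUEST_FINGERPRINTER_CLASS mechanism, but it matches the code used above):

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

# two requests whose URLs differ only in query-parameter order
r1 = Request(url="https://dig.chouti.com/all/hot?page=2&order=new")
r2 = Request(url="https://dig.chouti.com/all/hot?order=new&page=2")

fd1 = request_fingerprint(r1)
fd2 = request_fingerprint(r2)

print(fd1)           # a 40-character sha1 hex string
print(fd1 == fd2)    # True: the URL is canonicalized before hashing, so both map to one fingerprint

This is why request_seen() compares fingerprints instead of raw URL strings: equivalent requests collapse to the same key in visited_fd.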