While crawling, we run into Baidu's anti-crawler robots.txt rules. We can tell Scrapy to ignore robots.txt by adding the following to the project's settings.py configuration file:
ROBOTSTXT_OBEY = False
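For reference, a minimal settings.py sketch. Only ROBOTSTXT_OBEY comes from this post; the DOWNLOAD_DELAY and USER_AGENT entries are illustrative additions that often help when crawling zhidao.baidu.com, not part of the original project:

# settings.py -- minimal sketch; only ROBOTSTXT_OBEY is from the original post
ROBOTSTXT_OBEY = False          # do not download or obey robots.txt

# Illustrative extras (assumptions, adjust or remove as needed):
DOWNLOAD_DELAY = 1              # throttle requests; the spider below also sets download_delay
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # browser-like UA string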
The final spider code:
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request, HtmlResponse

from items import BDzdItem

# module-level counter for the number of Q&A pairs scraped so far
qa_number = 0


class BDzdSpider(CrawlSpider):
    """Crawl bank-related Q&A pages from Baidu Zhidao."""

    name = "bankSpider"
    download_delay = 1
    allowed_domains = ["zhidao.baidu.com"]
    start_urls = [
        "https://zhidao.baidu.com/question/1796062605517856547.html?fr=iks&word=%D2%F8%D0%D0&ie=gbk"
    ]
    rules = [
        Rule(LinkExtractor(allow=('/question/.*'),
                           restrict_xpaths=('//a[@class="related-link"]')),
             callback='parse_item', follow=True)
    ]

    def _requests_to_follow(self, response):
        # Override CrawlSpider's link-following so that only links whose
        # anchor text contains "银行" (bank) are followed.
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                if link.text.find("银行") == -1:
                    continue
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                r.meta.update(rule=n, link_text=link.text)
                yield rule.process_request(r)

    def parse_item(self, response):
        sel = Selector(response)
        url = response.url
        question = sel.xpath('//span[@class="ask-title "]/text()').extract()
        answer = sel.xpath('//pre[@class="best-text mb-10"]/text()').extract()
        otherAnswer = sel.xpath('//div[@class="answer-text line"]/span/text()').extract()

        item = BDzdItem()
        # xpath().extract() returns a list of strings, so join it into a single string
        item["question"] = ''.join(question)
        if len(answer) > 0:
            item["answer"] = ''.join(answer)
        elif len(otherAnswer) > 0:
            item["answer"] = otherAnswer[0]
        else:
            return

        global qa_number
        qa_number = qa_number + 1
        item["number"] = qa_number
        print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 第" + str(qa_number) + " 条"
        print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + url
        print "##########################################" + item["question"]
        print "*******************************************" + item["answer"]
        yield item
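The spider fills in the question, answer, and number fields of BDzdItem. The post does not show items.py; a minimal sketch consistent with those field names would be:

# items.py -- assumed definition, reconstructed from the fields used in parse_item
# -*- coding: utf-8 -*-
import scrapy


class BDzdItem(scrapy.Item):
    number = scrapy.Field()    # running count of scraped Q&A pairs
    question = scrapy.Field()  # question title text
    answer = scrapy.Field()    # best answer (or first other answer) text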
If a project contains several spiders, pipelines.py can be written like this so that each spider's items are routed to its own output file:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import codecs


class TutorialPipeline(object):
    def process_item(self, item, spider):
        print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%0"
        return item


class BDzdPipeline(object):
    def __init__(self):
        # one output file per spider
        self.bankFile = codecs.open('data_bank.json', 'wb', encoding='utf-8')        # bank
        self.mobileFile = codecs.open('data_mobile.json', 'wb', encoding='utf-8')    # mobile
        self.baoxianFile = codecs.open('data_baoxian.json', 'wb', encoding='utf-8')  # insurance
        self.jinrongFile = codecs.open('data_jinrong.json', 'wb', encoding='utf-8')  # finance

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + '\n'
        # route the item to the right file based on which spider produced it
        if spider.name == 'bankSpider':
            self.bankFile.write(line.decode("unicode_escape"))
        elif spider.name == 'mobileSpider':
            self.mobileFile.write(line.decode("unicode_escape"))
        elif spider.name == 'baoxianSpider':
            self.baoxianFile.write(line.decode("unicode_escape"))
        elif spider.name == 'jinrongSpider':
            self.jinrongFile.write(line.decode("unicode_escape"))
        return item
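The pipeline still has to be enabled in settings.py before it receives any items. A sketch, assuming the Scrapy project package is named tutorial (matching TutorialPipeline above); substitute your own package name:

# settings.py -- the package name 'tutorial' is an assumption
ITEM_PIPELINES = {
    'tutorial.pipelines.BDzdPipeline': 300,   # lower number = earlier in the pipeline chain
}

With that in place, running scrapy crawl bankSpider writes the bank Q&A pairs to data_bank.json, and the other spiders (mobileSpider, baoxianSpider, jinrongSpider) would write to their own files.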