CrawlSpider: full-site data crawling
Create a CrawlSpider spider file:

- `scrapy genspider -t crawl chouti www.xxx.com`
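This command generates a spider skeleton from Scrapy's `crawl` template. The exact contents vary slightly between Scrapy versions, but the generated file looks roughly like this:

```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ChoutiSpider(CrawlSpider):
    name = 'chouti'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    # Placeholder rule; replace the allow pattern with your own
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item
```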
```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CrawSpider(CrawlSpider):
    name = 'craw'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://dig.***.com/r/scoff/hot/1']

    # Link extractor: extracts links that match the given condition
    link = LinkExtractor(allow=r'/r/scoff/hot/\d+')
    # link1 = LinkExtractor(allow=r'/pic/$')  # for pages whose first-page URL differs from the pattern

    rules = (
        # Rule: parses each page reached through the extracted links
        # according to the specified rules
        Rule(link, callback='parse_item', follow=True),
        # follow=True: keep applying the link extractor to the pages
        # reached from the links it has already extracted
        # Rule(link1, callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        print(response)
```
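To see what the link extractor alone does, you can run it against a hand-built response. This is a minimal sketch; the HTML snippet and the `dig.example.com` URL are made up for illustration:

```python
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# Fake page containing a few links; only two match the allow pattern
html = b'''
<a href="/r/scoff/hot/2">page 2</a>
<a href="/r/scoff/hot/3">page 3</a>
<a href="/about">about</a>
'''
response = HtmlResponse(url='https://dig.example.com/r/scoff/hot/1',
                        body=html, encoding='utf-8')

link = LinkExtractor(allow=r'/r/scoff/hot/\d+')
for l in link.extract_links(response):
    print(l.url)  # prints the two absolute /r/scoff/hot/N URLs
```

During a crawl, CrawlSpider applies the extractor to every response it receives, deduplicates the links, and schedules a request for each one, which is what makes the full-site traversal automatic.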
Storing the listing and the detail page when they are not the same item
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from tenPro.items import TenproItem, TenproItem_detail


class TenSpider(CrawlSpider):
    name = 'ten'
    # allowed_domains = ['www.ccc.com']
    start_urls = ['https://hr.****.com/position.php?&start=#a0']

    rules = (
        # Pagination links
        Rule(LinkExtractor(allow=r'&start=\d+#a'), callback='parse_item', follow=True),
        # Detail-page links
        Rule(LinkExtractor(allow=r'position_detail\.php\?id='), callback='parse_detail', follow=True),
    )

    def parse_item(self, response):
        # Job title and category from the listing table
        tr_list = response.xpath(
            '//table[@class="tablelist"]/tr[@class="odd"] | '
            '//table[@class="tablelist"]/tr[@class="even"]')
        for tr in tr_list:
            title = tr.xpath('./td[1]/a/text()').extract_first()
            kind = tr.xpath('./td[2]/text()').extract_first()
            item = TenproItem()
            item['title'] = title
            item['kind'] = kind
            yield item

    def parse_detail(self, response):
        # Job description: join all text nodes of the list
        desc = response.xpath('//ul[@class="squareli"]//text()').extract()
        desc = ''.join(desc)
        item = TenproItem_detail()
        item['desc'] = desc
        yield item
```
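Why two item classes? The detail pages are reached through their own Rule rather than from inside `parse_item`, so there is no request on which to carry `title` and `kind` along to the detail callback. The listing fields and the description are therefore yielded as two different item classes and related later at storage time (see the pipeline below).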
```python
import scrapy


class TenproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    kind = scrapy.Field()


class TenproItem_detail(scrapy.Item):
    desc = scrapy.Field()
```
```python
# Store the two item types separately; relate them afterwards with a
# multi-table join in the database, or during later data analysis
class TenproPipeline(object):
    def process_item(self, item, spider):
        # Dispatch on the item class to pick the right fields
        if item.__class__.__name__ == 'TenproItem_detail':
            desc = item['desc']
        else:
            title = item['title']
            kind = item['kind']
        print(item)
        return item
```
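A hedged sketch of the "store separately, join in the database" idea. The table names, columns, and connection settings here are assumptions for illustration, not part of the original project:

```python
import pymysql


class TenproMySQLPipeline(object):
    def open_spider(self, spider):
        # Assumed local MySQL instance and database name
        self.conn = pymysql.connect(host='127.0.0.1', port=3306,
                                    user='root', password='',
                                    db='tenpro', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Write each item class to its own table
        if item.__class__.__name__ == 'TenproItem_detail':
            # `desc` is a reserved word in MySQL, hence the backticks
            self.cursor.execute('insert into job_detail (`desc`) values (%s)',
                                (item['desc'],))
        else:
            self.cursor.execute('insert into job (title, kind) values (%s, %s)',
                                (item['title'], item['kind']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
```

In practice you would also store a shared key in both tables (for example, the `id` parsed from the detail URL) so that the join can actually match listing rows to their descriptions.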
Approach:

- Manual request sending: issue a request for each page's URL, fetch the page data, and parse it (a sketch follows this list).
- CrawlSpider-based: use the link extractor and rule parser to fetch every matching page and parse the specified data from it.
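For comparison, here is a minimal sketch of the manual form mentioned in the first bullet, assuming a hypothetical numeric page-URL pattern:

```python
import scrapy


class ManualSpider(scrapy.Spider):
    name = 'manual'
    # Hypothetical paginated site; the page number is the last path segment
    start_urls = ['https://dig.example.com/r/scoff/hot/1']
    url_template = 'https://dig.example.com/r/scoff/hot/%d'
    page_num = 2

    def parse(self, response):
        print(response)  # parse the page data here
        # Manually build and send the request for the next page
        if self.page_num <= 5:  # stop after five pages for the demo
            new_url = self.url_template % self.page_num
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
```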