crawlspider提取url
创建一个crawlspider爬虫
scrapy genspider -t crawl baidu baidu.com
创建的爬虫
# -*- coding: utf-8 -*-
"""Example CrawlSpider as generated by ``scrapy genspider -t crawl``."""
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CfSpider(CrawlSpider):
    """Rule-driven spider: LinkExtractor rules decide which links to follow."""

    name = 'cf'
    allowed_domains = ['circ.gov.cn']
    start_urls = ['http://circ.gov.cn/']

    # Extraction rules. follow=True keeps extracting links from matched
    # pages as well (required to reach next-page / pagination URLs).
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Parse one page matched by ``rules`` and return an item dict.

        The field extractions below are the template placeholders; the
        item is currently returned empty.
        """
        item = {}
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()
        return item