Create the spider
scrapy startproject wxapp
cd wxapp
scrapy genspider -t crawl wxapp_spider "www.wxapp-union.com"
(-t crawl generates the spider from the CrawlSpider template instead of the default basic one.)
Modify settings.py
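The post does not show the resulting settings; a minimal sketch of the usual changes for a tutorial crawler like this one follows. The header values, delay, and pipeline priority are assumptions, not taken from the original project; only the pipeline path matches the class defined further below.

# settings.py -- a minimal sketch; concrete values here are assumptions
ROBOTSTXT_OBEY = False    # the list/article pages may be disallowed by robots.txt
DOWNLOAD_DELAY = 1        # be polite: roughly one request per second

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}

# register the pipeline from pipelines.py so yielded items reach it
ITEM_PIPELINES = {
    'wxapp.pipelines.WxappPipeline': 300,
}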
Spider code
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# import path depends on your project layout; from inside the project root
# this is usually just `from wxapp.items import WxappItem`
from pa_chong.Scrapy.wxapp.wxapp.items import WxappItem


class WxappSpiderSpider(CrawlSpider):
    name = 'wxapp_spider'
    allowed_domains = ['www.wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
        Rule(LinkExtractor(allow=r'.+article-.+\.html'), callback='parse_detail', follow=False),
    )

    '''
    CrawlSpider:
    Uses Rule and LinkExtractor to decide where the crawler goes next.
    1. allow: the regex must be restricted to the URLs you actually want to
       crawl and must not also match other URLs on the site.
    2. follow: if URLs matched on a crawled page should themselves be followed,
       set follow to True; set it to False and matching URLs found on the
       current page will not be crawled any further.
    3. callback: if you only need to follow the URLs, no callback is required;
       if you need to extract data from the pages behind those URLs, pass the
       name of a parsing callback function as the callback argument.
    '''

    def parse_detail(self, response):
        title = response.xpath('//h1[@class="ph"]/text()').get()
        authors = response.xpath('//p[@class="authors"]/a/text()').get()
        time = response.xpath('//p[@class="authors"]/span/text()').get()
        article = response.xpath('//td[@id="article_content"]//text()').getall()
        article = ''.join(article).strip()
        item = WxappItem(title=title, authors=authors, time=time, article=article)
        yield item
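With these rules, the first Rule follows the paginated list URLs (mod=list&catid=2&page=N) without a callback, since we only need them to reach article links, while the second Rule hands each article-*.html page to parse_detail. Run the spider from the project directory with the standard command:

scrapy crawl wxapp_spider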
items.py code
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class WxappItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    authors = scrapy.Field()
    time = scrapy.Field()
    article = scrapy.Field()
Pipeline code
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter

'''
Use the JsonLinesItemExporter from scrapy.exporters: it writes each item as
one JSON object per line, so items can be exported as they arrive.
'''


class WxappPipeline(object):
    def __init__(self):
        # the exporter writes bytes, so the file must be opened in binary mode
        self.f = open('wxsqjc.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.f, ensure_ascii=False, encoding='utf-8')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.f.close()
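Because the exporter emits JSON Lines (one object per line) rather than a single JSON array, the output file can be consumed line by line without loading it all at once. A minimal sketch of reading back the wxsqjc.json produced above:

import json

# each line of the JSON Lines file is one article item
with open('wxsqjc.json', 'r', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        print(item['title'], item['time'])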