scrapy startproject Scrapy_crawl # 新建Scrapy项目
scrapy genspider -l # 查看全部模板
# Available templates:
# basic
# crawl
# csvfeed
# xmlfeed
scrapy genspider -t crawl china tech.china.com # 选择crawl模板创建爬虫
scrapy crawl china # 运行爬虫
# china.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class ChinaSpider(CrawlSpider):
    """Template-generated CrawlSpider for tech.china.com (crawl template)."""
    name = 'china'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/']
    # The first argument of Rule is a LinkExtractor — the same class that was
    # previously called LxmlLinkExtractor, just renamed. Note that the default
    # callback generated by the crawl template is parse_item, not parse.
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Template stub: fill item fields from the response here.
        item = {}
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()
        return item
-
修改start_urls链接:设置为爬取的第一个页面
start_urls = ['http://tech.china.com/articles/']
-
解析新闻链接:进入页面,F12使用开发者管理器查看源代码;所有的新闻链接都在ID为left_side节点中,具体就是每个class为con_item的节点里面;所有的新闻路径都是article开头
# allow:判断链接是否是新闻链接的正则;restrict_xpaths:检索新闻内容的正则;callback:回调函数,解析方法
Rule(LinkExtractor(allow='article\/.*\.html', restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'), callback='parse_item')
-
分析分页链接:分页链接都在ID为pageStyle的div中,然后不断的匹配下一页的链接文本
# 提取下一页链接
Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
# Combined rule set: article links are parsed, pagination links are followed.
rules = (
    # Article detail pages inside the left-side list. Raw string fixes the
    # invalid "\/" escape (SyntaxWarning on modern Python); "\/" and "/" match
    # identically, so the pattern's behavior is unchanged.
    Rule(LinkExtractor(allow=r'article/.*\.html',
                       restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
         callback='parse_item'),
    # "Next page" pagination links — no callback, just keep crawling.
    Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
)
from scrapy import Field, Item
class NewsItem(Item):
    """Data structure for one scraped news article."""
    title = Field()     # headline
    url = Field()       # article URL
    text = Field()      # body text
    datetime = Field()  # publication time
    source = Field()    # originating source
    website = Field()   # site name
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join, Compose
class NewsLoader(ItemLoader):  # inherits ItemLoader
    """TakeFirst returns the first non-empty value of the extracted list,
    similar to extract_first()."""
    default_output_processor = TakeFirst()
class ChinaLoader(NewsLoader):  # inherits NewsLoader
    """Output processors: the extracted values are joined with Join(), and the
    result is then stripped of surrounding whitespace."""
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
-
改写china.py的parse_item()方法:不使用通用爬虫的写法和使用通用爬虫的写法
def parse_item(self, response):
    """Parse one news page WITHOUT the generic CrawlSpider helpers: assign
    every NewsItem field manually from the response."""
    item = NewsItem()
    item['title'] = response.xpath('//h1[@id="chan_newsTitle"]/text()').extract_first()
    item['url'] = response.url
    item['text'] = ''.join(response.xpath('//div[@id="chan_newsDetail"]//text()').extract()).strip()
    # Raw strings for the regexes: "\d"/"\s" in a plain string raise
    # SyntaxWarning on modern Python (pattern itself is unchanged).
    item['datetime'] = response.xpath('//div[@id="chan_newsInfo"]/text()').re_first(r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
    # re_first returns None when the pattern does not match; guard before
    # strip() to avoid an AttributeError on pages without a source line.
    source = response.xpath('//div[@id="chan_newsInfo"]/text()').re_first('来源:(.*)')
    item['source'] = source.strip() if source else None
    item['website'] = '中华网'
    yield item
def parse_item(self, response):
    """Parse one news page WITH an ItemLoader: instantiate ChinaLoader from the
    Item and Response, add each field via add_xpath()/add_value(), then call
    load_item() to produce the populated Item."""
    loader = ChinaLoader(item=NewsItem(), response=response)
    loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
    # Raw string fixes the "\d"/"\s" SyntaxWarning; the regex is unchanged.
    loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
    loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')
    loader.add_value('website', '中华网')
    yield loader.load_item()
- 上面那些步骤只实现了爬虫的半通用化配置
- 下面抽取爬虫的通用配置
scrapy genspider -t crawl universal universal # 新建通用爬虫
python run.py china # 启动爬虫
-
建立configs文件夹,与spider文件夹并列,并创建配置文件china.json
{
"spider": "universal", # 爬虫名称
"website": "中华网科技", # 站点名称
"type": "新闻", # 站点类型
"index": "http://tech.china.com/", # 首页
"settings": { # user_agent
"USER_AGENT": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36"
},
"start_urls": [
"http://tech.china.com/articles/"
],
"allowed_domains": [
"tech.china.com"
],
"rules": "china"
}
-
新建rules.py:将所有rules单独定义在一块,实现Rule的分离
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
# Registry of reusable Rule sets, keyed by the "rules" value of each spider's
# JSON config file. Spiders look their rule set up here at __init__ time.
rules = {
    'china': (
        # Article detail links in the left-side list. Raw string fixes the
        # invalid "\/" escape (SyntaxWarning on modern Python); "\/" matches
        # exactly the same text as "/", so behavior is unchanged.
        Rule(LinkExtractor(allow=r'article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        # "Next page" pagination links — followed without a callback.
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    )
}
-
创建utils.py读取JSON配置文件:启动爬虫时需要读取配置文件然后动态加载到Spider中
from os.path import realpath, dirname
import json
def get_config(name):
    """Load and parse the JSON crawl config ``configs/<name>.json``.

    The configs directory lives next to this module. Raises FileNotFoundError
    when no config with that name exists and json.JSONDecodeError on bad JSON.
    """
    path = dirname(realpath(__file__)) + '/configs/' + name + '.json'
    with open(path, 'r', encoding='utf-8') as f:
        # json.load parses the stream directly — no need for loads(f.read()).
        return json.load(f)
-
在项目的根目录设置入口文件run.py:用来启动爬虫
import sys
from scrapy.utils.project import get_project_settings
from Scrapy_crawl.spiders.universal import UniversalSpider
from Scrapy_crawl.utils import get_config
from scrapy.crawler import CrawlerProcess
def run():
    """Launch the universal spider with the JSON config named on the CLI.

    Usage: ``python run.py <config-name>`` — e.g. ``python run.py china``.
    """
    if len(sys.argv) < 2:
        # Fail with a usage message instead of an IndexError.
        sys.exit('usage: python run.py <config-name>')
    name = sys.argv[1]
    custom_settings = get_config(name)  # per-site JSON configuration
    spider = custom_settings.get('spider', 'universal')  # spider name to run
    project_settings = get_project_settings()  # project-wide settings
    settings = dict(project_settings.copy())
    # Merge the per-site settings over the project-wide ones. The "settings"
    # key may be absent, so fall back to an empty dict instead of passing
    # None to dict.update (which would raise TypeError).
    settings.update(custom_settings.get('settings') or {})
    process = CrawlerProcess(settings)  # crawler process with merged settings
    process.crawl(spider, **{'name': name})  # spider receives the config name
    process.start()


if __name__ == '__main__':
    run()
- 解析方法parse_item()的可配置化,添加新配置信息到china.json中
"item": {
"class": "NewsItem", # Item的类名
"loader": "ChinaLoader", # Item Loader的类名
"attrs": { # attrs属性来定义每个字段的提取规则
"title": [
{
"method": "xpath", # title定义的提取方法,xpath就是相当于Item Loader的add_xpath()方法
"args": [ # 定义提取所用的XPath表达式
"//h1[@id='chan_newsTitle']/text()"
]
}
],
"url": [
{
"method": "attr", # add_value
"args": [
"url"
]
}
],
"text": [
{
"method": "xpath",
"args": [
"//div[@id='chan_newsDetail']//text()"
]
}
],
"datetime": [
{
"method": "xpath",
"args": [
"//div[@id='chan_newsInfo']/text()"
],
"re": "(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)"
}
],
"source": [
{
"method": "xpath",
"args": [
"//div[@id='chan_newsInfo']/text()"
],
"re": "来源:(.*)"
}
],
"website": [
{
"method": "value", # add_value
"args": [
"中华网"
]
}
]
}
}
- 修改配置文件china.json的start_urls参数
"start_urls": {
"type": "static", # 静态类型,直接配置URL列表
"value": [
"http://tech.china.com/articles/"
]
}
##############################################################################
"start_urls": {
"type": "dynamic", # 动态类型,调用方法生成
"method": "china",
"args": [
5, 10
]
}
-
创建urls.py模块:当start_urls定义为dynamic类型,使用china()方法,只需要传入页码参数
def china(start, end):
    """Yield the article-index page URLs for pages *start* through *end*,
    both inclusive."""
    template = 'http://tech.china.com/articles/index_{}.html'
    for page_no in range(start, end + 1):
        yield template.format(page_no)
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from Scrapy_crawl.items import *
from Scrapy_crawl.loaders import *
from Scrapy_crawl.utils import get_config
from Scrapy_crawl import urls
from Scrapy_crawl.rules import rules
class UniversalSpider(CrawlSpider):
    """Config-driven CrawlSpider.

    All site-specific pieces — rules, start_urls, allowed_domains and the
    item-parsing recipe — are read from ``configs/<name>.json`` at
    construction time, so one spider class serves every configured site.
    """
    name = 'universal'

    def __init__(self, name, *args, **kwargs):
        config = get_config(name)  # load configs/<name>.json
        self.config = config
        # Look up the Rule tuple registered under the config's "rules" key.
        # Must be set before super().__init__, which compiles the rules.
        self.rules = rules.get(config.get('rules'))
        start_urls = config.get('start_urls')
        if start_urls:
            if start_urls.get('type') == 'static':
                # Static type: the URL list is given verbatim in the config.
                self.start_urls = start_urls.get('value')
            elif start_urls.get('type') == 'dynamic':
                # Dynamic type: call the named generator defined in urls.py
                # with the configured args. getattr replaces eval() so config
                # text is never executed as code.
                generate = getattr(urls, start_urls.get('method'))
                self.start_urls = list(generate(*start_urls.get('args', [])))
        self.allowed_domains = config.get('allowed_domains')
        super(UniversalSpider, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        """Build the configured Item through the configured ItemLoader,
        applying each field's extractor entries from the "attrs" config."""
        item = self.config.get('item')
        if item:
            # Resolve the Item/Loader classes by name from this module's
            # globals (brought in by the star imports above) — safer than
            # eval() on config-supplied strings.
            item_instance = globals()[item.get('class')]()
            loader = globals()[item.get('loader')](item_instance, response=response)
            for field, extractors in item.get('attrs').items():
                for extractor in extractors:
                    method = extractor.get('method')
                    args = extractor.get('args')
                    if method == 'xpath':
                        loader.add_xpath(field, *args, re=extractor.get('re'))
                    elif method == 'css':
                        loader.add_css(field, *args, re=extractor.get('re'))
                    elif method == 'value':
                        loader.add_value(field, *args, re=extractor.get('re'))
                    elif method == 'attr':
                        # "attr" pulls a response attribute (e.g. url) by name.
                        loader.add_value(field, getattr(response, *args))
            yield loader.load_item()