Project overview
- Crawls five sections of NetEase News: Domestic, International, Military, Aviation and Drones.
- The start url is https://news.163.com/; section urls look like https://news.163.com/domestic/.
- From the start url the spider collects each section's url, then extracts the title and detail url of the news items listed in each section, and finally visits each detail url to extract the news content.
- Note that the five sections carry news of different types and topics, and their detail pages are laid out differently, so each layout needs its own XPath to extract the content.
- Selenium is used to fetch content that is loaded asynchronously.
- JavaScript is used to scroll the browser window so that more dynamically loaded urls in each section become available (see the sketch after this list).
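Before diving into the project code, here is a minimal, standalone sketch of that scrolling idea. It is independent of Scrapy; the driver setup, section url and scroll count are illustrative assumptions (the actual project drives the browser from a downloader middleware, shown later):

from time import sleep
from selenium import webdriver

def scroll_page(driver, times=3, pause=0.5):
    # Scroll to the bottom a few times so lazily loaded news cards get rendered.
    for _ in range(times):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        sleep(pause)

bro = webdriver.Chrome()  # assumes chromedriver is available on PATH
bro.get('https://news.163.com/domestic/')
scroll_page(bro)
print(len(bro.page_source))  # the page source now includes the dynamically loaded items
bro.quit()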
Project structure
wangyiPro/
    wangyiPro/
        spiders/
            wangyi.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
wangyi.py
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from wangyiPro.items import WangyiproItem


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    allowed_domains = ['news.163.com']
    start_urls = ['https://news.163.com/']
    model_urls = []

    def __init__(self):
        super().__init__()
        # Raw string so the backslashes in the Windows path are not treated as escape sequences
        self.bro = webdriver.Chrome(executable_path=r'D:\python_charm\爬虫课件\我的项目文件夹\chromedriver.exe')

    # Parse the home page and collect the urls of the five target sections
    def parse(self, response):
        li_lists = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        indexes = [3, 4, 6, 7, 8]  # positions of the five target sections in the navigation bar
        themes = []
        for li in li_lists:
            theme = li.xpath('./a/@href').extract_first()
            print('-----> section url:', theme)
            themes.append(theme)
        for i in indexes:
            model_url = themes[i]
            print('-----> requested section:', model_url)
            self.model_urls.append(model_url)
        for req_url in self.model_urls:
            yield scrapy.Request(req_url, callback=self.parse_model)

    # Parse one section page: extract each news title and its detail-page url
    def parse_model(self, response):
        div_lists = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
        for div in div_lists:
            item = WangyiproItem()
            title = div.xpath('./div[1]/div[1]/h3/a/text()').extract_first()
            item['title'] = title
            print('------> title:', title)
            new_detail_url = div.xpath('./div[1]/div[1]/h3/a/@href').extract_first()
            if new_detail_url:
                # Request the news detail page and pass the item along via meta
                print('-----> detail url:', new_detail_url)
                yield scrapy.Request(url=new_detail_url, meta={'item': item}, callback=self.parse_detail)

    # Parse the news content. Detail pages from different sections are laid out
    # differently, so more than one content XPath has to be tried.
    def parse_detail(self, response):
        print('============================== fetched page:', response.url)
        item = response.meta['item']
        contents = response.xpath('//*[@id="endText"]//p/text()').extract()
        if len(contents) == 0:
            # Fallback layout used by some of the sections
            contents = response.xpath('//*[@id="content"]/div[2]//p/text()').extract()
        contents = ''.join(contents)
        print('-----> content:', contents)
        item['content'] = contents
        yield item

    # Close the browser when the spider finishes
    def closed(self, reason):
        self.bro.quit()
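If more section-specific layouts need to be supported later, the fallback in parse_detail can be generalized into a list of candidate XPaths tried in order. A sketch of that idea, using only the two selectors already present above:

# Candidate content selectors, tried in order (the same two used in parse_detail)
CONTENT_XPATHS = [
    '//*[@id="endText"]//p/text()',
    '//*[@id="content"]/div[2]//p/text()',
]

def extract_content(response):
    # Return the joined text of the first selector that matches anything.
    for xp in CONTENT_XPATHS:
        contents = response.xpath(xp).extract()
        if contents:
            return ''.join(contents)
    return ''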
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class WangyiproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
from scrapy.http import HtmlResponse
from time import sleep
class WangyiproSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class WangyiproDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept the responses of the five section pages and replace them with
    # Selenium-rendered ones so that they contain the dynamically loaded news data.
    def process_response(self, request, response, spider):
        bro = spider.bro  # the browser object created in the spider class
        # Pick out the responses that need to be replaced:
        # the url identifies the request, and the request identifies the response.
        if request.url in spider.model_urls:
            # This response belongs to one of the five sections, so build a new
            # response object that includes the dynamically loaded news data.
            # Selenium makes it straightforward to obtain that dynamic content.
            bro.get(request.url)
            sleep(0.5)
            # Use Selenium to control the browser scroll bar
            bro.execute_script("window.scrollTo(0, 1000)")
            page_text = bro.page_source  # now contains the dynamically loaded news data
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            # Responses for all other requests are returned unchanged
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
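The single window.scrollTo(0, 1000) call above triggers only one round of lazy loading. If a section needs more items, the scroll can be repeated until the page height stops growing. Below is a sketch of such a helper; the name, loop bound and pause are assumptions, not part of the project. It could be called as scroll_until_stable(bro) right after bro.get(request.url) in process_response.

from time import sleep

def scroll_until_stable(bro, max_rounds=10, pause=0.5):
    # Scroll to the bottom repeatedly until the document height stops growing
    # (or max_rounds is reached), so that all lazily loaded items are rendered.
    last_height = bro.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        bro.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        sleep(pause)
        new_height = bro.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height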
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class WangyiproPipeline:
    def process_item(self, item, spider):
        # print(item)
        conn = pymysql.connect(host='localhost', user='root', password='rootpwd', db="spider", charset='utf8mb4')
        cursor = conn.cursor()
        try:
            # Parameterized query, so quotes inside the title/content cannot break the SQL
            sql = 'insert into wangyi(title, content) values (%s, %s)'
            print('-----> executing sql:', sql)
            cursor.execute(sql, (item['title'], item['content']))
            conn.commit()
        except Exception as e:
            print('-----> error while saving to the database:', e)
        finally:
            # Close the cursor before the connection
            cursor.close()
            conn.close()
        return item
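The pipeline above opens and closes a MySQL connection for every item processed during a `scrapy crawl wangyi` run. A common alternative is to keep a single connection open for the whole crawl via open_spider/close_spider. A sketch of that variant (the class name WangyiproMySQLPipeline is hypothetical; the table and credentials are the same ones used above):

import pymysql

class WangyiproMySQLPipeline:
    # Hypothetical variant of the pipeline above: one connection for the whole crawl.
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='root', password='rootpwd',
                                    db='spider', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute('insert into wangyi(title, content) values (%s, %s)',
                                (item['title'], item['content']))
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            spider.logger.error('failed to save item: %s', e)
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

To switch to it, point ITEM_PIPELINES at 'wangyiPro.pipelines.WangyiproMySQLPipeline' instead of the default pipeline.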
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for wangyiPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'wangyiPro'
SPIDER_MODULES = ['wangyiPro.spiders']
NEWSPIDER_MODULE = 'wangyiPro.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wangyiPro (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'wangyiPro.middlewares.WangyiproSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'