Example 1: Using a Scrapy project (collecting the scraped values with an Item)
1. Create the Scrapy project
scrapy startproject booklist
New Scrapy project 'booklist', using template directory '/usr/local/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /Users/yuanjicai/booklist

You can start your first spider with:
    cd booklist
    scrapy genspider example example.com
2. Define the fields for the content to scrape (used to collect the returned data)
cat booklist/items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class BooklistItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    author = scrapy.Field()
    publisher = scrapy.Field()
    editor_date = scrapy.Field()
    description = scrapy.Field()
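An Item defined this way behaves like a dict whose keys are restricted to the declared fields; a minimal sketch, runnable in a plain Python session inside the project (the sample values are made up):

from booklist.items import BooklistItem

book = BooklistItem()
book['name'] = 'Example Book'        # declared fields can be assigned like dict keys
book['author'] = 'Some Author'
print(dict(book))                    # {'name': 'Example Book', 'author': 'Some Author'}

try:
    book['isbn'] = '978-0-00-000000-0'   # 'isbn' was not declared in BooklistItem
except KeyError as err:
    print('undeclared field:', err)      # Scrapy rejects fields that were not declared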
3. Write the spider that performs the crawl
cat booklist/spiders/bookspider.py
import scrapy
from booklist.items import BooklistItem


class BookSpider(scrapy.Spider):
    name = 'booklist'
    start_urls = ['http://www.chinavalue.net/BookInfo/BookList.aspx?page=1']

    def parse(self, response):
        yield scrapy.Request(response.urljoin("?page=1"), callback=self.parse_page)
        for item in response.xpath('//div[@id="ctl00_ContentPlaceHolder1_pagerBook"]/a/@href').extract():
            fullurl = response.urljoin(item)
            yield scrapy.Request(fullurl, callback=self.parse_page)

    def parse_page(self, response):
        for item in response.xpath('//div[@id="divBookList"]/div/div[2]/a[1]'):
            detail_url = response.urljoin(item.xpath('@href').extract()[0])
            yield scrapy.Request(detail_url, callback=self.parse_bookdetail)

    def parse_bookdetail(self, response):
        bookinfo = BooklistItem()
        basic_info = response.xpath('//*[@id="Container"]/div[6]/div[1]/div[2]/div[1]/div[2]')
        bookinfo['name'] = basic_info.xpath('div[1]/text()').extract()[0].strip()
        bookinfo['author'] = basic_info.xpath('div[2]/text()').extract()[0].strip()
        bookinfo['publisher'] = basic_info.xpath('div[3]/text()').extract()[0].strip()
        bookinfo['editor_date'] = basic_info.xpath('div[4]/text()').extract()[0].strip()
        bookinfo['description'] = response.xpath('//*[@id="ctl00_ContentPlaceHolder1_pnlIntroBook"]/div[2]/text()').extract()[0].strip()
        yield bookinfo
The for loop in parse() follows the pagination ("next page") links (shown in the figure below).
parse_page() extracts the link behind each item's title on every page of the book list (shown in the figure below).
parse_bookdetail() parses each book's detailed attributes and description (shown in the figure below).
4. Run the project
scrapy crawl booklist -o book-info.csv
In the example above, the scraped values are yielded into the fields of the Item and then written to the book-info.csv file via the -o output option.
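As a quick sanity check (a minimal sketch, assuming the crawl above has produced book-info.csv in the project directory), the exported file can be read back with the standard csv module; the column names match the fields declared in BooklistItem:

import csv

with open('book-info.csv', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        # each row is a dict keyed by the BooklistItem field names
        print(row['name'], '-', row['author'])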
Example 2: Downloading images with Scrapy and storing the scraped information in a specified location (file, MySQL, MongoDB)
Create the project (image downloading):
bogon:scrapy yuanjicai$ scrapy startproject bookinfo
bogon:douban_booklist yuanjicai$ cd bookinfo/
cat bookinfo/items.py    # define the fields to collect in the Item, including image_urls and image_paths
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class BookinfoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    author = scrapy.Field()
    publisher = scrapy.Field()
    price = scrapy.Field()
    rating = scrapy.Field()
    editor_date = scrapy.Field()
    images = scrapy.Field()
    image_urls = scrapy.Field()
    image_paths = scrapy.Field()
vim bookinfo/settings.py    # settings.py specifies the request headers and user agent used for crawling, the enabled pipelines, where images/files are stored, the image expiry time, and the MySQL and MongoDB host/user/password/port parameters
grep -E -v '^(#|$)' bookinfo/settings.py
BOT_NAME = 'bookinfo'
SPIDER_MODULES = ['bookinfo.spiders']
NEWSPIDER_MODULE = 'bookinfo.spiders'
ROBOTSTXT_OBEY = True
#from faker import Factory
#f = Factory.create()
#USER_AGENT = f.user_agent()
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
}
ITEM_PIPELINES = {
    #'bookinfo.pipelines.BookinfoStoreMysqlPipeline': 200,
    #'bookinfo.pipelines.BookinfoStoreFilePipeline': 200,
    'bookinfo.pipelines.BookinfoStoreMongoPipeline': 200,
    'bookinfo.pipelines.BookImgsDLPipeline': 300,
}
IMAGES_STORE = '/Users/yuanjicai/Downloads/bookinfo'
IMAGES_EXPIRES = 90
#IMAGES_MIN_HEIGHT = 100
#IMAGES_MIN_WIDTH = 100
#IMAGES_THUMBS = {
#    'small': (50, 50),
#    'big': (270, 270),
#}
MYSQL_HOST = '10.18.101.104'
MYSQL_DBNAME = 'book'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123'
MYSQL_PORT = 3306
MONGODB_HOST = '10.18.101.104'
MONGODB_PORT = 27017
MONGODB_DB = 'book'
MONGODB_COLLECTION = 'bookinfo'
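Side note: the pipelines shown further below read these custom values through "from scrapy.conf import settings". Newer Scrapy releases deprecate scrapy.conf in favour of the from_crawler hook; a minimal sketch of that alternative (the class name is hypothetical, for illustration only):

# hypothetical pipeline skeleton showing how to read settings without scrapy.conf
class MongoSettingsAwarePipeline(object):
    def __init__(self, mongo_host, mongo_port):
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings exposes everything defined in settings.py
        return cls(
            mongo_host=crawler.settings.get('MONGODB_HOST'),
            mongo_port=crawler.settings.getint('MONGODB_PORT'),
        )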
(1) The spider scrapes an Item and puts the image URLs into its image_urls field.
(2) The Item returned by the spider is passed on to the Item Pipeline.
(3) When the Item reaches the ImagesPipeline, the URLs in image_urls are handed to the Scrapy scheduler and downloader. The ImagesPipeline fetches these URLs at a higher priority, and the Item stays locked until the image downloads finish.
(4) Once the downloads complete, the download path, original URL and checksum of each image are filled into the images field (a minimal configuration sketch follows below).
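This workflow already works with the stock ImagesPipeline; a minimal sketch of the two settings it needs is shown below (the store path is a placeholder, and Pillow must be installed for image handling). Note that the stock pipeline expects image_urls to be a list of URLs, while the spider in this project stores a single URL string, which is why the custom subclass BookImgsDLPipeline further below overrides get_media_requests:

# settings.py -- minimal image-download configuration (sketch)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,   # stock pipeline, no customisation
}
IMAGES_STORE = '/tmp/bookinfo-images'              # placeholder path; any writable directory works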
cat bookinfo/spiders/bookinfo_spider.py    # the spider that performs the crawl
# -*- coding: utf-8 -*-
import scrapy
import re
from bookinfo.items import BookinfoItem


class bookinfoSpider(scrapy.Spider):
    name = "bookinfo"
    start_urls = ["https://book.douban.com/top250"]

    def parse(self, response):
        yield scrapy.Request(response.url, callback=self.parse_page)
        for page_url in response.xpath('//div[@class="paginator"]/a/@href').extract():
            yield scrapy.Request(page_url, callback=self.parse_page)

    def parse_page(self, response):
        for item in response.xpath('//div[@class="article"]/div[1]/table/tr[1]'):
            # instantiate a fresh BookinfoItem inside the for loop so every scraped
            # entry gets its own container and the fields do not overwrite each other
            bookinfo = BookinfoItem()
            bookinfo['name'] = item.xpath("td[2]/div[1]/a/text()").extract()[0].strip()
            bookinfo['price'] = item.xpath("td[2]/p/text()").extract()[0].strip().split("/")[-1]
            bookinfo['editor_date'] = item.xpath("td[2]/p/text()").extract()[0].strip().split("/")[-2]
            bookinfo['publisher'] = item.xpath("td[2]/p/text()").extract()[0].strip().split("/")[-3]
            bookinfo['author'] = item.xpath("td[2]/p/text()").extract()[0].strip().split("/")[-4]
            bookinfo['rating'] = item.xpath("td[2]/div[2]/span[2]/text()").extract()[0]
            bookinfo['image_urls'] = item.xpath("td[1]/a/img/@src").extract_first()
            yield bookinfo
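When adjusting the XPath expressions above, scrapy shell is convenient for interactive testing; a minimal sketch using the same selectors as parse_page:

# run:  scrapy shell 'https://book.douban.com/top250'
# then, at the shell prompt:
rows = response.xpath('//div[@class="article"]/div[1]/table/tr[1]')
rows[0].xpath('td[2]/div[1]/a/text()').extract()[0].strip()   # book title
rows[0].xpath('td[2]/p/text()').extract()[0].strip()          # the "author / publisher / date / price" line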
cat bookinfo/pipelines.py    # the classes and methods defined in pipelines.py store the scraped information and download the images
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.conf import settings  # imported so the pipelines can read the DB parameters defined in settings.py
import scrapy
import codecs
import json
import pymysql
import pymongo


# This pipeline uses pymysql to connect to a MySQL server and write the scraped data into it
class BookinfoStoreMysqlPipeline(object):
    def __init__(self):
        pass

    def dbHandle(self):
        conn = pymysql.connect(host='10.18.101.104', db='book', user='root', passwd='123', charset='utf8')
        return conn

    def process_item(self, item, spider):
        conn = self.dbHandle()
        cursor = conn.cursor()
        insert_sql = 'insert into bookinfo(name,author,publisher,url) VALUES (%s,%s,%s,%s)'
        try:
            cursor.execute(insert_sql, (item["name"], item["author"], item["publisher"], item["image_urls"]))
            conn.commit()
        except:
            conn.rollback()
        conn.close()
        return item

    def close_spider(self, spider):
        pass


# This pipeline writes the returned data to a JSON file with json.dumps
class BookinfoStoreFilePipeline(object):
    def __init__(self):
        self.file = codecs.open('bookinfo.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()


# This pipeline writes the returned data into MongoDB with pymongo
class BookinfoStoreMongoPipeline(object):
    def __init__(self):
        self.conn = pymongo.MongoClient(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = self.conn[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        return item

    def close_spider(self, spider):
        self.conn.close()


# This pipeline downloads the images whose URLs were stored in the image_urls field during the crawl
class BookImgsDLPipeline(ImagesPipeline):
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'referer': 'https://book.douban.com/top250/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        self.default_headers['referer'] = item['image_urls']
        # pass the book name along in meta so file_path() can use it when naming the file
        yield scrapy.Request(item['image_urls'], headers=self.default_headers, meta={'bookname': item['name']})

    def file_path(self, request, response=None, info=None):
        bookname = request.meta['bookname']
        image_guid = bookname + '_' + request.url.split('/')[-1]  # custom name for the saved image
        filename = 'full/%s' % (image_guid)
        return filename

    def item_completed(self, results, item, info):
        image_paths = [value['path'] for ok, value in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths[0]
        item['images'] = [value for ok, value in results if ok]
        return item
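BookinfoStoreMysqlPipeline assumes a bookinfo table already exists in the book database; the listing above does not show its schema, so here is a minimal sketch of one that matches the INSERT statement (column types and sizes are assumptions):

import pymysql

# one-off helper that creates the table the MySQL pipeline inserts into (hypothetical schema)
conn = pymysql.connect(host='10.18.101.104', user='root', passwd='123', db='book', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS bookinfo (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255),
            author VARCHAR(255),
            publisher VARCHAR(255),
            url VARCHAR(512)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()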
Note: each entry in results has the following format:
[(True, {'url': 'https://img3.doubanio.com/mpic/s26012674.jpg', 'path': 'full/b8700497fc0014c87e085747c89476e12162c518.jpg', 'checksum': '4da0defa1ec30229ce724d691f694ad1'})]
To download images, the pipeline must be a class that inherits from ImagesPipeline, and that class must be enabled in the ITEM_PIPELINES setting.
ImagesPipeline
The methods to override in a custom ImagesPipeline subclass are get_media_requests(item, info) and item_completed(results, item, info).
As the workflow above shows, the pipeline takes the image URLs from the item and downloads them, so get_media_requests must be overridden to return a Request for each URL. Those requests are processed by the pipeline and, once the downloads finish, the results are passed to item_completed as a list of 2-tuples of the form (success, image_info_or_failure). success is a boolean; True means the download succeeded. When success is True, the image_info_or_failure dict contains the following keys:
url: the original URL the image was downloaded from
path: the local path the image was stored under
checksum: the checksum of the downloaded image
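After starting the crawl with scrapy crawl bookinfo (downloaded covers land under IMAGES_STORE in the full/ subdirectory, named by file_path above), the documents written by BookinfoStoreMongoPipeline can be inspected with a few lines of pymongo; a minimal sketch, assuming the MongoDB settings shown earlier:

import pymongo

client = pymongo.MongoClient('10.18.101.104', 27017)
collection = client['book']['bookinfo']
for doc in collection.find().limit(3):
    # title and the local path of the downloaded cover image
    print(doc['name'], doc['image_paths'])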