The key point to note is that the review data is loaded via AJAX, so the question is how to read the JSON data it returns.
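Before wiring everything into Scrapy, it helps to hit the comment endpoint by hand and confirm it returns plain JSON. A minimal sketch with requests; the sku value is a placeholder, and the URL and field names are the same ones the spider below relies on:

import requests

sku = "1234567"  # placeholder product id; use a real data-sku from the search page
url = ("https://sclub.jd.com/comment/productPageComments.action"
       "?productId={}&score=0&sortType=5&page=0&pageSize=10".format(sku))
headers = {"referer": "https://item.jd.com/{}.html".format(sku)}
data = requests.get(url, headers=headers).json()  # the endpoint returns plain JSON
print(data["maxPage"])        # total number of comment pages
for c in data["comments"]:
    print(c.get("content"))   # text of each review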
import scrapy
import json
import re
from jd.items import JdItem


class JinDongSpider(scrapy.Spider):
    name = 'jin_dong'
    allowed_domains = ['jd.com']
    start_urls = ['https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA']

    def parse(self, response):
        one_contents = response.xpath("//div[@id='J_goodsList']/ul/li")
        for one_content in one_contents:
            data_sku = one_content.xpath("./@data-sku").get()
            data_url = one_content.xpath(".//div[@class='p-img']/a/@href").get()
            # Some links already carry the https scheme and some are
            # protocol-relative, so check before prepending it
            if not data_url.startswith("http"):
                data_url = "https:" + data_url
            data_content = one_content.xpath(".//div[@class='p-img']/a/@title").get()
            # print(data_content)
            header = {
                "authority": "item.jd.com",
                "method": "GET",
                "path": "/{}.html".format(data_sku),
                # ":scheme": "https",
            }
            yield scrapy.Request(data_url, callback=self.parse_content, meta={"sku": data_sku}, headers=header)
    # Go into the product detail page
    def parse_content(self, response):
        sku = response.meta.get("sku")
        print(sku)
        url = "https://sclub.jd.com/comment/productPageComments.action?productId={}&score=0&sortType=5&page=0&pageSize=10".format(sku)
        header = {
            "authority": "sclub.jd.com",
            "method": "GET",
            "accept-language": "zh-CN,zh",
            "referer": "https://item.jd.com/{}.html".format(sku),
        }
        yield scrapy.Request(url, callback=self.parse_contents, meta={"sku": sku}, headers=header)
    # Read the total number of comment pages from the first page's JSON,
    # then request every page and hand each response to parse_cl
    def parse_contents(self, response):
        sku = response.meta.get("sku")
        jso = json.loads(response.body.decode(response.encoding))
        # print(jso['maxPage'])
        d = int(jso['maxPage'])
        print("total comment pages:", d)
        header = {
            "authority": "sclub.jd.com",
            "method": "GET",
            "accept-language": "zh-CN,zh",
            "referer": "https://item.jd.com/{}.html".format(sku),
        }
        if d >= 1:
            for i in range(d + 1):
                url = "https://sclub.jd.com/comment/productPageComments.action?productId={}&score=0&sortType=5&page={}&pageSize=10".format(sku, i)
                yield scrapy.Request(url, callback=self.parse_cl, meta={"sku": sku}, headers=header)
    # Parse the JSON data in each page of comments
    def parse_cl(self, response):
        sku = response.meta.get('sku')
        try:
            js = json.loads(response.body.decode(response.encoding))
        except ValueError:
            pass
        else:
            cc = js['comments']
            if cc:
                for l in cc:
                    user_id = l.get('id')  # reviewer id
                    # content = l['content']
                    # .get() does not raise for a missing key: it returns None,
                    # and you can supply a default, e.g. l.get('content', 'hehe')
                    # returns 'hehe' when content is absent
                    content = l.get('content')  # review text
                    pro_name = l.get('referenceName')  # product name
                    image_url = l.get('images')
                    img_list = []
                    if image_url:
                        for k in image_url:
                            ii = k.get('imgUrl')
                            img_url1 = ii.replace("n0", "shaidan")
                            # swap the thumbnail address for the full-size image address
                            img_url = re.sub(r'(\d+x\d+)', '616x415', img_url1)
                            img_list.append(img_url)
                    video_url = l.get('video')
                    video_list = []
                    if video_url:
                        for q in video_url:
                            video_list.append(q.get('remark'))
                    color = l.get('productColor')
                    size = l.get('productSize')
                    origin_phone = l.get('userClientShow')
                    item = JdItem(
                        sku=sku,  # product sku
                        user_id=user_id,
                        content=content,  # review text
                        pro_name=pro_name,
                        img_list=img_list,  # list of review image URLs
                        video_list=video_list,  # list of review video URLs
                        origin_phone=origin_phone  # client the review was posted from
                    )
                    yield item
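The spider imports JdItem from jd.items. Judging from the fields assigned above, items.py presumably looks like the sketch below; the field names are taken from the spider, everything else is boilerplate:

import scrapy


class JdItem(scrapy.Item):
    sku = scrapy.Field()           # product sku
    user_id = scrapy.Field()       # reviewer id
    content = scrapy.Field()       # review text
    pro_name = scrapy.Field()      # product name
    img_list = scrapy.Field()      # list of review image URLs
    video_list = scrapy.Field()    # list of review video URLs
    origin_phone = scrapy.Field()  # client the review was posted from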
To save the data to MongoDB, fill in the relevant parts of the settings file and the pipelines file as follows.
# These lines already exist in settings.py; just uncomment them
ITEM_PIPELINES = {
    'jd.pipelines.JdPipeline': 300,  # any new class added to pipelines.py must be registered here
}
MONGODB_HOST = '127.0.0.1'
# Port number, 27017 by default
MONGODB_PORT = 27017
# Database name
MONGODB_DBNAME = 'JD'
# Collection that stores this spider's data
MONGODB_DOCNAME = 'jin_dong'
# Contents of pipelines.py
from scrapy.conf import settings  # only available on old Scrapy; see the from_crawler sketch below
import pymongo


class JdPipeline(object):
    def __init__(self):
        # Read the host, port and database name from settings
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']
        # Create the database connection
        client = pymongo.MongoClient(host=host, port=port)
        # Select the target database
        mdb = client[dbname]
        # Select the collection that stores the data
        self.post = mdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        data = dict(item)
        # Insert the record into the collection
        self.post.insert_one(data)
        return item
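Note that from scrapy.conf import settings only works on old Scrapy releases; the module was deprecated in Scrapy 1.0 and later removed. On current Scrapy the usual pattern is a from_crawler classmethod that receives the project settings. A sketch of the equivalent pipeline, assuming the same setting names as above and pymongo 3+:

import pymongo


class JdPipeline(object):
    def __init__(self, host, port, dbname, docname):
        self.client = pymongo.MongoClient(host=host, port=port)
        self.post = self.client[dbname][docname]

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy hands the project settings in through the crawler object
        s = crawler.settings
        return cls(
            s.get('MONGODB_HOST'),
            s.getint('MONGODB_PORT'),
            s.get('MONGODB_DBNAME'),
            s.get('MONGODB_DOCNAME'),
        )

    def process_item(self, item, spider):
        # insert_one is the pymongo 3+ replacement for the removed insert()
        self.post.insert_one(dict(item))
        return item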