# 爬取豆瓣图书TOP250,爬取的数据存储到CSV文件中
from lxml import etree
import requests
import csv
# wt是python中以文本写 的方式打开,只能写文件,如果文件不存在则创建该文件
fp = open("D://pytext/douban_top250.csv","w")
writer = csv.writer(fp)
writer.writerow(('name', 'url', 'author', 'publisher', 'date', 'price', 'rate', 'comment'))
# 创建url,range的第三个参数是步长
urls = ["https://book.douban.com/top250?start={}".format(i) for i in range(0,250,25)]
# 请求头,用来模拟浏览器
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
for url in urls:
res = requests.get(url,headers=headers)
# lxml库的etree解析html
selector = etree.HTML(res.text)
# 获取的是一页中所有的书本信息,每本的所有信息都在类为item的tr下面
infos = selector.xpath("//tr[@class='item']")
for info in infos:
name = info.xpath('td/div/a/@title')[0]
url = info.xpath('td/div/a/@href')[0]
# 获取的是书本的基本信息,有作者和出版社,和出版日期...
book_infos = info.xpath('td/p/text()')[0]
# 作者
author = book_infos.split('/')[0]
# 出版社
publisher = book_infos.split('/')[-3]
# 出版日期
date = book_infos.split('/')[-2]
# 价格
price = book_infos.split('/')[-1]
# 书本的评分
rate = info.xpath('td/div/span[2]/text()')[0]
# 下面的评论
comments = info.xpath('td/p/span/text()')
# 这里单行的if语句是:如果comments的长度不为0时,则把comments的第1个元素给comment,否则就把"空"赋值给comment
comment = comments[0] if len(comments) != 0 else "空"
writer.writerow((name.encode("utf-8"), url.encode("utf-8"), author.encode("utf-8"), publisher.encode("utf-8"), date.encode("utf-8"), price.encode("utf-8"), rate.encode("utf-8"), comment.encode("utf-8")))
# 关闭
fp.close()