抓取豆瓣读书(http://book.douban.com/)中的最受关注图书,按照评分从高到低排序,并保存至txt文件中。需要抓取每本书的名称、作者、评分、体裁和一句话评论。
方法一:
# coding=utf-8
"""Scrape the popular-books list on book.douban.com and save it by rating.

Each list item on the page is read as one block of text; the blocks are
sorted by their leading rating (highest first) and written to a UTF-8
text file.
"""
import re
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By


class DoubanPopularBook:
    """Fetch popular-book entries with Chrome and export them to a text file.

    Attributes:
        dr: the Chrome WebDriver used for scraping.
        popular_books_list: list of books, each a list of the text lines of
            one list item, sorted by rating in descending order.
    """

    # Maximum number of list items to read from the page.
    BOOK_LIMIT = 10
    # Matches every <li> of the list the original script targeted.
    BOOK_ITEM_SELECTOR = "[class='list-col list-col2 list-summary s']>li"
    # Marker line that may precede the one-line review in an item's text.
    EBOOK_MARKER = '有电子书'
    # Leading decimal number of the rating line (e.g. "8.5" or "10.0").
    _RATING_RE = re.compile(r'\d+(?:\.\d+)?')

    def __init__(self):
        """Start Chrome and scrape the list immediately.

        Raises:
            Any exception raised while scraping; the browser is shut down
            first so a failed construction does not leave Chrome running.
        """
        self.dr = webdriver.Chrome()
        try:
            self.popular_books_list = self.get_douban_popular_books()
        except BaseException:
            self.dr.quit()
            raise

    def get_douban_popular_books(self):
        """Return up to BOOK_LIMIT books sorted by rating, highest first.

        Returns:
            A list of books; each book is the item's visible text split
            into lines.
        """
        self.dr.get('https://book.douban.com/')
        sleep(3)  # fixed pause to let the page finish rendering
        # Query once and slice: avoids re-running the selector per book and
        # does not fail when the page lists fewer than BOOK_LIMIT items.
        items = self.dr.find_elements(By.CSS_SELECTOR, self.BOOK_ITEM_SELECTOR)
        books = [item.text.split('\n') for item in items[:self.BOOK_LIMIT]]
        return sorted(books, key=self._rating_of, reverse=True)

    @classmethod
    def _rating_of(cls, book):
        """Return the numeric rating at the start of line 1, or 0.0 if absent."""
        if len(book) < 2:
            return 0.0
        match = cls._RATING_RE.match(book[1].strip())
        return float(match.group()) if match else 0.0

    @classmethod
    def _format_entry(cls, item):
        """Return the text record written to the file for one book."""
        # Pad so a listing with missing lines still yields a full record.
        fields = list(item) + [''] * (6 - len(item))
        name, rating, author_line, genre = fields[:4]
        # When the e-book marker occupies line 4, the review is on line 5.
        comment = fields[5] if fields[4] == cls.EBOOK_MARKER else fields[4]
        return ''.join([
            '~~~~~~~~~~~~~~~~~~~~~~~~\n',
            '书籍名称:' + name + '\n',
            '评分:' + rating + '\n',
            # Written without a label; presumably the author line.
            author_line + '\n',
            '体裁:' + genre + '\n',
            '一句话评论:' + comment + '\n',
        ])

    def get_popular_books_rank_file(self):
        """Write every scraped book to '<file_title>.txt' as UTF-8 bytes."""
        self.file_title = '豆瓣最受关注图书榜之评分排行'
        report = ''.join(
            self._format_entry(item) for item in self.popular_books_list
        )
        # The context manager closes the file even if the write fails.
        with open(self.file_title + '.txt', 'wb') as self.file:
            self.file.write(report.encode('utf-8'))

    def quit(self):
        """Shut down the browser."""
        self.dr.quit()


if __name__ == '__main__':
    popular_books = DoubanPopularBook()
    try:
        popular_books.get_popular_books_rank_file()
    finally:
        # Always release the browser, even when the export fails.
        popular_books.quit()
方法二:
# coding=utf-8
"""Scrape the popular-books list on book.douban.com field by field.

Title, rating, author, genre and one-line review are located with
separate selectors, combined per book, sorted by rating (highest first)
and written to a UTF-8 text file.
"""
import re
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By


class DoubanPopularBook:
    """Fetch popular-book fields with Chrome and export them to a text file.

    Attributes:
        dr: the Chrome WebDriver used for scraping.
        popular_books_list: list of ``[name, grade, author, genre, comment]``
            lists sorted by grade in descending order.
    """

    # Maximum number of books to read from the page.
    BOOK_LIMIT = 10
    # Leading decimal number of the rating text (e.g. "8.5").
    _RATING_RE = re.compile(r'\d+(?:\.\d+)?')

    def __init__(self):
        """Start Chrome and scrape the list immediately.

        Raises:
            Any exception raised while scraping; the browser is shut down
            first so a failed construction does not leave Chrome running.
        """
        self.dr = webdriver.Chrome()
        try:
            self.popular_books_list = self.get_douban_popular_books()
        except BaseException:
            self.dr.quit()
            raise

    def get_douban_popular_books(self):
        """Return up to BOOK_LIMIT books sorted by rating, highest first.

        Returns:
            A list of ``[name, grade, author, genre, comment]`` lists.
        """
        self.dr.get('https://book.douban.com/')
        sleep(3)  # fixed pause to let the page finish rendering
        find = self.dr.find_elements
        # Each selector runs once instead of once per book.
        columns = (
            find(By.XPATH, "//h4[@class='title']/a"),           # book name
            find(By.CSS_SELECTOR, '.average-rating'),           # rating
            find(By.XPATH, "//p[@class='author']"),             # author
            find(By.CSS_SELECTOR, '.book-list-classification'), # genre
            find(By.CSS_SELECTOR, '.reviews'),                  # review
        )
        # zip() stops at the shortest column, so a page with fewer matches
        # than BOOK_LIMIT no longer raises IndexError.
        # NOTE(review): columns are paired by position; if a book lacks one
        # of these elements the later rows would shift - confirm on the page.
        rows = list(zip(*columns))[:self.BOOK_LIMIT]
        books = [[cell.text for cell in row] for row in rows]
        return sorted(books, key=self._rating_of, reverse=True)

    @classmethod
    def _rating_of(cls, book):
        """Return the numeric rating stored at index 1, or 0.0 if absent."""
        match = cls._RATING_RE.match(book[1].strip())
        return float(match.group()) if match else 0.0

    @staticmethod
    def _format_entry(item):
        """Return the text record written to the file for one book."""
        name, grade, author, genre, comment = item
        return ''.join([
            '~~~~~~~~~~~~~~~~~~~~~~~~\n',
            '书籍名称:' + name + '\n',
            '评分:' + grade + '\n',
            author + '\n',  # author text is written without a label
            '体裁:' + genre + '\n',
            '一句话评论:' + comment + '\n',
        ])

    def get_popular_books_rank_file(self):
        """Write every scraped book to '<file_title>.txt' as UTF-8 bytes."""
        self.file_title = '豆瓣最受关注图书榜之评分排行'
        report = ''.join(
            self._format_entry(item) for item in self.popular_books_list
        )
        # The context manager closes the file even if the write fails.
        with open(self.file_title + '.txt', 'wb') as self.file:
            self.file.write(report.encode('utf-8'))

    def quit(self):
        """Shut down the browser."""
        self.dr.quit()


if __name__ == '__main__':
    popular_books = DoubanPopularBook()
    try:
        popular_books.get_popular_books_rank_file()
    finally:
        # Always release the browser, even when the export fails.
        popular_books.quit()
方法三:
# coding=utf-8
"""unittest-driven scraper for the popular-books list on book.douban.com.

``setUp`` opens Chrome, scrapes the book fields, sorts them by rating
(highest first) and writes them to a UTF-8 text file; ``tearDown`` closes
the browser.
"""
import re
import unittest
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By


class DoubanBooks(unittest.TestCase):
    """Scrape, sort and export the popular books as part of test setup."""

    # Leading decimal number of the rating text (e.g. "8.5").
    _RATING_RE = re.compile(r'\d+(?:\.\d+)?')

    def setUp(self):
        """Open Chrome, scrape the list and write the report file."""
        self.dr = webdriver.Chrome()
        try:
            self.popular_books_list = self.get_douban_popular_books()
            # The writer returns nothing, so ``books`` is None; the
            # assignment is kept so the attribute still exists.
            self.books = self.get_popular_books_rank_file()
        except BaseException:
            # unittest skips tearDown when setUp fails, so close the
            # browser here to avoid leaving Chrome running.
            self.dr.quit()
            raise

    def get_douban_popular_books(self):
        """Return all located books sorted by rating, highest first.

        Returns:
            A list of ``[name, author, grade, genre, comment]`` lists.
        """
        self.dr.get("https://book.douban.com/")
        sleep(5)  # fixed pause to let the page finish rendering
        find = self.dr.find_elements
        columns = (
            find(By.XPATH, "//h4[@class='title']/a"),                  # name
            find(By.XPATH, "//p[@class='author']"),                    # author
            find(By.XPATH, "//span[@class='average-rating']"),         # rating
            find(By.XPATH, "//p[@class='book-list-classification']"),  # genre
            find(By.XPATH, "//p[@class='reviews']"),                   # review
        )
        # zip() stops at the shortest column instead of raising IndexError
        # when one selector matches fewer elements than the titles.
        # NOTE(review): columns are paired by position; if a book lacks one
        # of these elements the later rows would shift - confirm on the page.
        books = [[cell.text for cell in row] for row in zip(*columns)]
        # Sort on the numeric rating: comparing the raw strings would rank
        # "9.0" above "10.0".
        return sorted(books, key=self._rating_of, reverse=True)

    @classmethod
    def _rating_of(cls, book):
        """Return the numeric rating stored at index 2, or 0.0 if absent."""
        match = cls._RATING_RE.match(book[2].strip())
        return float(match.group()) if match else 0.0

    @staticmethod
    def _format_entry(item):
        """Return the text record written to the file for one book."""
        name, author, grade, genre, comment = item
        return ''.join([
            '~~~~~~~~~~~~~~~~~~~~~~~~\n',  # record separator
            '书籍名称:' + name + '\n',
            author + '\n',  # author text is written without a label
            '评分:' + grade + '\n',
            '体裁:' + genre + '\n',
            '一句话评论:' + comment + '\n',
        ])

    def get_popular_books_rank_file(self):
        """Write every scraped book to '<file_title>.txt' as UTF-8 bytes."""
        self.file_title = '豆瓣最受关注图书榜之评分排行'
        report = ''.join(
            self._format_entry(item) for item in self.popular_books_list
        )
        # The context manager closes the file even if the write fails.
        with open(self.file_title + '.txt', 'wb') as self.file:
            self.file.write(report.encode('utf-8'))

    def test_books(self):
        """Check that the scraped list is ordered by descending rating."""
        ratings = [self._rating_of(book) for book in self.popular_books_list]
        self.assertEqual(ratings, sorted(ratings, reverse=True))
        print("获取完毕")

    def tearDown(self):
        """Close the browser after the test."""
        self.dr.quit()


if __name__ == "__main__":
    unittest.main()
网页如下:
生成txt效果如下: