用python+selenium抓取豆瓣读书中最受关注图书并按评分排序

抓取豆瓣读书(http://book.douban.com/)中的最受关注图书,按评分从高到低排序,并保存至txt文件中。需要抓取书籍的名称、作者、评分、体裁和一句话评论。

方法一:

 #coding=utf-8
 from selenium import webdriver
 from time import sleep

 class DoubanPopularBook:
     """Scrape the "most popular" books shown on the Douban Books home page.

     Creating an instance opens a Chrome browser, loads the home page and
     stores the scraped books, sorted by rating from high to low, in
     ``popular_books_list``.  Each book is the list of text lines of its
     ``<li>`` element; the code relies on line 0 being the title and on
     line 1 starting with the rating.
     """

     # Locator strategy handed to ``find_elements``.  The literal equals
     # selenium's ``By.CSS_SELECTOR`` and is accepted by Selenium 3 and 4,
     # whereas the ``find_elements_by_*`` helpers were removed in 4.3.
     _BY_CSS = 'css selector'
     # One <li> per book inside the popular-books list.
     _BOOK_ITEMS = "[class='list-col list-col2 list-summary s']>li"
     # Marker line that some entries carry in front of the one-line review.
     _HAS_EBOOK = '有电子书'

     def __init__(self):
         """Start Chrome and scrape the book list right away."""
         self.dr = webdriver.Chrome()
         try:
             self.popular_books_list = self.get_douban_popular_books()
         except Exception:
             # Do not leave an orphaned browser behind when scraping fails.
             self.dr.quit()
             raise

     def get_douban_popular_books(self, limit=10):
         """Return up to ``limit`` books sorted by rating, best first.

         Args:
             limit: maximum number of list entries to read (default 10).

         Returns:
             A list of books; each book is a list of text lines.
         """
         self.dr.get('https://book.douban.com/')
         sleep(3)  # crude wait for the page to finish rendering
         # Query the DOM once instead of once per book.
         entries = self.dr.find_elements(self._BY_CSS, self._BOOK_ITEMS)
         # Slicing tolerates a page that shows fewer than ``limit`` books.
         books = [entry.text.split('\n') for entry in entries[:limit]]
         return self._sort_by_rating(books)

     @staticmethod
     def _sort_by_rating(books):
         """Return ``books`` ordered by rating, highest first."""
         def rating(book):
             try:
                 # The rating is read from the first three characters of
                 # the second line, e.g. "8.5".
                 return float(book[1][0:3])
             except (IndexError, ValueError):
                 # Books without a parsable rating go to the end instead
                 # of aborting the whole run.
                 return float('-inf')

         return sorted(books, key=rating, reverse=True)

     @classmethod
     def _format_book(cls, item):
         """Return the text block written to the report for one book."""
         # Pad short entries so a missing line becomes an empty field
         # rather than an IndexError.
         fields = list(item) + [''] * (6 - len(item))
         # When the e-book marker is present the review sits one line later.
         comment = fields[5] if fields[4] == cls._HAS_EBOOK else fields[4]
         return ''.join([
             '~~~~~~~~~~~~~~~~~~~~~~~~\n',
             '书籍名称:' + fields[0] + '\n',
             '评分:' + fields[1] + '\n',
             fields[2] + '\n',
             '体裁:' + fields[3] + '\n',
             '一句话评论:' + comment + '\n',
         ])

     def get_popular_books_rank_file(self):
         """Write the ranked books to a UTF-8 text file in the working directory."""
         self.file_title = '豆瓣最受关注图书榜之评分排行'
         # newline='\n' keeps the bytes identical to the former binary-mode
         # writes; the context manager closes the file even on errors.
         with open(self.file_title + '.txt', 'w', encoding='utf-8',
                   newline='\n') as out:
             self.file = out  # kept for callers that inspect ``self.file``
             for item in self.popular_books_list:
                 out.write(self._format_book(item))

     def quit(self):
         """Close the browser."""
         self.dr.quit()

 if __name__ == '__main__':
     # Scrape the list, write the report, and always close the browser.
     popular_books = DoubanPopularBook()
     try:
         popular_books.get_popular_books_rank_file()
     finally:
         # Runs even when writing the report fails, so Chrome is not leaked.
         popular_books.quit()

方法二:

 #coding=utf-8
 from selenium import webdriver
 from time import sleep

 class DoubanPopularBook:
     """Scrape the "most popular" books shown on the Douban Books home page.

     Creating an instance opens a Chrome browser, loads the home page and
     stores the scraped books, sorted by rating from high to low, in
     ``popular_books_list``.  Each book is a list
     ``[name, rating, author, genre, comment]`` of strings.
     """

     # Locator strategies handed to ``find_elements``.  The literals equal
     # selenium's ``By.CSS_SELECTOR`` / ``By.XPATH`` and are accepted by
     # Selenium 3 and 4, whereas the ``find_elements_by_*`` helpers were
     # removed in 4.3.
     _BY_CSS = 'css selector'
     _BY_XPATH = 'xpath'

     def __init__(self):
         """Start Chrome and scrape the book list right away."""
         self.dr = webdriver.Chrome()
         try:
             self.popular_books_list = self.get_douban_popular_books()
         except Exception:
             # Do not leave an orphaned browser behind when scraping fails.
             self.dr.quit()
             raise

     def get_douban_popular_books(self, limit=10):
         """Return up to ``limit`` books sorted by rating, best first.

         Args:
             limit: maximum number of books to read (default 10).

         Returns:
             A list of ``[name, rating, author, genre, comment]`` lists.
         """
         self.dr.get('https://book.douban.com/')
         sleep(3)  # crude wait for the page to finish rendering
         find = self.dr.find_elements
         # Each locator is evaluated once; previously all five queries were
         # repeated for every single book.
         names = find(self._BY_XPATH, "//h4[@class='title']/a")
         grades = find(self._BY_CSS, '.average-rating')
         authors = find(self._BY_XPATH, "//p[@class='author']")
         genres = find(self._BY_CSS, '.book-list-classification')
         comments = find(self._BY_CSS, '.reviews')
         # zip() stops at the shortest column, so a page with fewer matches
         # than ``limit`` no longer raises IndexError.
         # NOTE(review): columns are paired by position, as before; if a
         # book lacks one of the fields the pairing may shift - confirm
         # against the live page.
         rows = list(zip(names, grades, authors, genres, comments))[:limit]
         books = [[cell.text for cell in row] for row in rows]
         return self._sort_by_rating(books)

     @staticmethod
     def _sort_by_rating(books):
         """Return ``books`` ordered by rating (index 1), highest first."""
         def rating(book):
             try:
                 return float(book[1])
             except (IndexError, ValueError):
                 # Books without a parsable rating go to the end instead
                 # of aborting the whole run.
                 return float('-inf')

         return sorted(books, key=rating, reverse=True)

     @staticmethod
     def _format_book(item):
         """Return the text block written to the report for one book."""
         # Pad short entries so a missing field becomes an empty string.
         fields = list(item) + [''] * (5 - len(item))
         return ''.join([
             '~~~~~~~~~~~~~~~~~~~~~~~~\n',
             '书籍名称:' + fields[0] + '\n',
             '评分:' + fields[1] + '\n',
             fields[2] + '\n',
             '体裁:' + fields[3] + '\n',
             '一句话评论:' + fields[4] + '\n',
         ])

     def get_popular_books_rank_file(self):
         """Write the ranked books to a UTF-8 text file in the working directory."""
         self.file_title = '豆瓣最受关注图书榜之评分排行'
         # newline='\n' keeps the bytes identical to the former binary-mode
         # writes; the context manager closes the file even on errors.
         with open(self.file_title + '.txt', 'w', encoding='utf-8',
                   newline='\n') as out:
             self.file = out  # kept for callers that inspect ``self.file``
             for item in self.popular_books_list:
                 out.write(self._format_book(item))

     def quit(self):
         """Close the browser."""
         self.dr.quit()

 if __name__ == '__main__':
     # Scrape the list, write the report, and always close the browser.
     popular_books = DoubanPopularBook()
     try:
         popular_books.get_popular_books_rank_file()
     finally:
         # Runs even when writing the report fails, so Chrome is not leaked.
         popular_books.quit()

方法三:

 # coding=utf-8
 from selenium import webdriver
 import unittest
 from time import sleep

 class DoubanBooks(unittest.TestCase):
     """unittest wrapper that scrapes Douban's popular books and writes a report.

     All work happens in ``setUp``: the home page is scraped, the books are
     sorted by rating from high to low and the result is written to a text
     file.  ``test_books`` only reports completion.
     """

     # Locator strategy handed to ``find_elements``.  The literal equals
     # selenium's ``By.XPATH`` and is accepted by Selenium 3 and 4, whereas
     # the ``find_elements_by_*`` helpers were removed in 4.3.
     _BY_XPATH = 'xpath'

     def setUp(self):
         """Open Chrome, scrape the books and write the report file."""
         self.dr = webdriver.Chrome()
         try:
             self.popular_books_list = self.get_douban_popular_books()
             self.books = self.get_popular_books_rank_file()
         except Exception:
             # unittest skips tearDown() when setUp() raises, so close the
             # browser here to avoid leaking it.
             self.dr.quit()
             raise

     def get_douban_popular_books(self):
         """Return ``[name, author, rating, genre, comment]`` lists, best rated first."""
         self.dr.get("https://book.douban.com/")
         sleep(5)  # crude wait for the page to finish rendering
         find = self.dr.find_elements
         names = find(self._BY_XPATH, "//h4[@class='title']/a")
         authors = find(self._BY_XPATH, "//p[@class='author']")
         grades = find(self._BY_XPATH, "//span[@class='average-rating']")
         genres = find(self._BY_XPATH, "//p[@class='book-list-classification']")
         comments = find(self._BY_XPATH, "//p[@class='reviews']")

         # zip() stops at the shortest column; indexing every column by the
         # number of titles raised IndexError when another column was shorter.
         books = [
             [cell.text for cell in row]
             for row in zip(names, authors, grades, genres, comments)
         ]
         return self._sort_by_rating(books)

     @staticmethod
     def _sort_by_rating(books):
         """Return ``books`` ordered by rating (index 2), highest first."""
         def rating(book):
             try:
                 # Compare numerically: as strings, "10.0" sorts below "9.5".
                 return float(book[2])
             except (IndexError, ValueError):
                 # Books without a parsable rating go to the end.
                 return float('-inf')

         return sorted(books, key=rating, reverse=True)

     @staticmethod
     def _format_book(item):
         """Return the text block written to the report for one book."""
         # Pad short entries so a missing field becomes an empty string.
         fields = list(item) + [''] * (5 - len(item))
         return ''.join([
             '~~~~~~~~~~~~~~~~~~~~~~~~\n',  # separator line
             '书籍名称:' + fields[0] + '\n',
             fields[1] + '\n',
             '评分:' + fields[2] + '\n',
             '体裁:' + fields[3] + '\n',
             '一句话评论:' + fields[4] + '\n',
         ])

     def get_popular_books_rank_file(self):
         """Write the ranked books to a UTF-8 text file in the working directory."""
         self.file_title = '豆瓣最受关注图书榜之评分排行'
         # newline='\n' keeps the bytes identical to the former binary-mode
         # writes; the context manager closes the file even on errors.
         with open(self.file_title + '.txt', 'w', encoding='utf-8',
                   newline='\n') as out:
             self.file = out  # kept for callers that inspect ``self.file``
             for item in self.popular_books_list:
                 out.write(self._format_book(item))

     def test_books(self):
         """Report completion; the scraping itself already ran in setUp()."""
         print("获取完毕")

     def tearDown(self):
         """Close the browser after the test."""
         self.dr.quit()

 # When executed as a script, hand control to unittest's command-line runner.
 if __name__ == "__main__":
     unittest.main()

网页如下:

用python+selenium抓取豆瓣读书中最受关注图书并按评分排序

用python+selenium抓取豆瓣读书中最受关注图书并按评分排序

生成txt效果如下:

用python+selenium抓取豆瓣读书中最受关注图书并按评分排序

上一篇:(原创)PBS | SGE 智能任务投递系统monitor | python实现


下一篇:openstack-networking-neutron(一)---端到端和点到点的理解