使用selenium&phantomjs+bs4抓取斗鱼直播房间信息

使用selenium&phantomjs+bs4抓取斗鱼直播房间信息

# -*- coding:utf-8 -*-

from selenium import webdriver
from bs4 import BeautifulSoup
import time


class DouyuSpider(object):
    """Crawl the Douyu "all rooms" directory and print each room's details.

    Every directory page is rendered in a PhantomJS browser driven through
    Selenium, parsed with BeautifulSoup, and the room title, streamer name
    and popularity of each listed room are printed to stdout.

    NOTE: ``webdriver.PhantomJS`` and the ``find_element_by_*`` helpers were
    removed in Selenium 4, so this class needs an older Selenium release.
    """

    def __init__(self):
        # Directory page the crawl starts from.
        self.base_url = "https://www.douyu.com/directory/all"
        # Browser object used to render the pages.
        self.driver = webdriver.PhantomJS()
        # Running total of rooms printed so far.
        self.count = 0
        # 1-based number of the page being processed; once send_request()
        # returns normally it equals the total number of pages crawled.
        self.page = 1

    def send_request(self, delay=1):
        """Walk through every directory page and hand each one to parse_data.

        The browser is always shut down when the crawl ends, whether it
        finished normally or raised, so no PhantomJS process is left behind.
        As a consequence the spider is single-use: ``self.driver`` is no
        longer usable after this method returns.

        Args:
            delay: Seconds to wait before reading each page's source, giving
                the page time to render. Defaults to 1.
        """
        try:
            self.driver.get(self.base_url)

            # Keep paging until the pager carries the
            # "shark-pager-disable-next" class, which marks the last page.
            while True:
                print('正在下载第%d页' % self.page)
                time.sleep(delay)
                data = self.driver.page_source
                self.parse_data(data)

                if 'shark-pager-disable-next' in data:
                    break

                # Click "next page" and only then advance the counter, so
                # self.page never runs one past the last page.
                self.driver.find_element_by_class_name(
                    'shark-pager-next').click()
                self.page += 1
        finally:
            # Release the browser process even if the crawl failed midway.
            self.driver.quit()

    def parse_data(self, data):
        """Print every room found in one page and update the room counter.

        Args:
            data: HTML source of one directory page.
        """
        bs = BeautifulSoup(data, 'lxml')

        # Room titles.
        home_list = bs.select('#live-list-content .ellipsis')
        # Streamer names.
        name_list = bs.select('#live-list-content .dy-name')
        # Room popularity figures.
        pop_list = bs.select('#live-list-content .dy-num')

        # zip() stops at the shortest list, so a room missing one of the
        # three fields can shift the pairing of the rows that follow it.
        for home, name, pop in zip(home_list, name_list, pop_list):
            print(home.get_text().strip())
            print(name.get_text())
            print(pop.get_text())

            self.count += 1

        # Cumulative number of rooms seen across all pages so far.
        print(self.count)

if __name__ == '__main__':
    # Script entry point: build a spider and crawl the whole directory.
    DouyuSpider().send_request()
上一篇:2018-11-26


下一篇:Windows Azure Service Bus (4) Service Bus Queue和Storage Queue的区别