# Douyu live-streaming anchor (streamer) information scraper
from selenium import webdriver
import time
from lxml import etree
from excel_utils.excel_utils import write_to_excel,append_to_excel
import os
# Load the page in the browser and parse the rendered HTML
def get_page_content_by_selenium(url):
    """Open *url* in the shared browser and return its parsed HTML tree.

    Uses the module-level ``driver`` (created in the ``__main__`` guard),
    so that global must exist before this function is called.

    Args:
        url: Address of the page to load.

    Returns:
        The result of ``etree.HTML`` applied to the browser's current
        page source.
    """
    driver.get(url)
    # Fixed pause before reading the page source; presumably gives the
    # page's scripts time to render the listing.
    time.sleep(2)
    driver.maximize_window()
    return etree.HTML(driver.page_source)
def _extract_anchor_items(page_content):
    """Build one ``{'title', 'anchor', 'focus'}`` record per listed room.

    Args:
        page_content: Parsed HTML tree of a listing page (as returned by
            ``etree.HTML``).

    Returns:
        A list of dicts, one per room, in page order.
    """
    list_root = '//section[@id="listAll"]//ul[@class="layout-Cover-list"]'
    titles = page_content.xpath(list_root + '//h3/text()')
    anchors = page_content.xpath(
        list_root + '//h2/div[@class="DyListCover-userName"]/text()'
    )
    hots = page_content.xpath(
        list_root + '//span[@class="DyListCover-hot"]/text()'
    )
    if not len(titles) == len(anchors) == len(hots):
        # The three queries are independent, so a card missing one field
        # yields lists of different lengths. Indexing by position would
        # raise IndexError; zip() below keeps the records it can pair up.
        print(
            'warning: field count mismatch '
            f'(titles={len(titles)}, anchors={len(anchors)}, hots={len(hots)})'
        )
    return [
        {'title': title, 'anchor': anchor, 'focus': hot}
        for title, anchor, hot in zip(titles, anchors, hots)
    ]


def main():
    """Crawl the Douyu LOL listing page by page and save it to ``anchor.xls``.

    For every page the room title, anchor name and popularity value are
    extracted and written to the Excel file (created on first write,
    appended afterwards). Paging continues by clicking the last item of the
    footer pager until its ``aria-disabled`` attribute is no longer
    ``'false'``.

    Relies on the module-level ``driver`` created in the ``__main__`` guard.
    """
    # Function-scope import so the file's import block is left untouched.
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3; the new form also works on Selenium 3.
    from selenium.webdriver.common.by import By

    start_url = 'https://www.douyu.com/g_LOL'
    next_btn_xpath = '//div[@class="ListFooter"]/ul/li[last()]'
    file_name = 'anchor.xls'

    page_content = get_page_content_by_selenium(start_url)
    print(driver.find_element(By.XPATH, next_btn_xpath).tag_name)

    n = 1
    while True:
        print(f'爬取第{n}页')
        anchor_list = _extract_anchor_items(page_content)

        if not os.path.exists(file_name):
            write_to_excel(anchor_list, file_name)
        else:
            append_to_excel(anchor_list, file_name)

        # Look the button up again on every page: if the pager is
        # re-rendered after a click, a cached element reference would go
        # stale and raise StaleElementReferenceException.
        next_btn = driver.find_element(By.XPATH, next_btn_xpath)
        if next_btn.get_attribute('aria-disabled') != 'false':
            break

        next_btn.click()
        # Short fixed pause before re-reading the page source after paging.
        time.sleep(0.5)
        page_content = etree.HTML(driver.page_source)
        n += 1
if __name__ == '__main__':
    # `driver` is deliberately a module-level name: the functions above
    # read it as a global.
    driver = webdriver.Chrome()
    try:
        main()
    finally:
        # Always close the browser and end the WebDriver session, even when
        # the crawl raises; otherwise the Chrome process is left running.
        driver.quit()