import time
import re
import threading
import queue
from selenium import webdriver
# URL template; the placeholder is filled with the page number.
baseurl = 'http://www....{}...html'
# Build the list of page URLs for pages 1 through 12.
url_list = list(map(baseurl.format, range(1, 13)))
# Shared work queue that the worker threads consume.
q = queue.Queue()
# Seed the queue with every page URL, in order.
for i in url_list:
    q.put(i)
class dragen(object):
    """Multi-threaded page scraper driven by a queue of URLs.

    Each worker thread pulls URLs from a queue, loads the page in a fresh
    Chrome instance, and extracts every ``"url":"...",`` value found in the
    rendered page source.
    """

    # Compiled once; matches the value of each `"url":"...",` pair.
    _URL_PATTERN = re.compile(r'"url":"(.*?)",')

    def __init__(self, max_thread=12):
        """Create a scraper.

        Args:
            max_thread: Number of worker threads started by ``many_t``.
        """
        self.max_thread = max_thread  # maximum number of worker threads

    @staticmethod
    def _extract_urls(content):
        """Return all ``"url"`` values found in *content* (possibly empty)."""
        return dragen._URL_PATTERN.findall(content)

    def getdata(self, que):
        """Process URLs from *que* until it is empty.

        For every URL the page is loaded in Chrome, the extracted values are
        printed, and the URL is put back on *que* if nothing was extracted.

        Args:
            que: A ``queue.Queue`` of URL strings shared between workers.
        """
        while True:
            # get_nowait() instead of `empty()` followed by a blocking
            # `get()`: with several workers, another thread may take the last
            # item between the check and the get, which would block this
            # thread forever and hang the final join().
            try:
                url = que.get_nowait()
            except queue.Empty:
                return
            print(url)
            options = webdriver.ChromeOptions()
            options.binary_location = r"C:\....\chrome.exe"
            driver = webdriver.Chrome(options=options)
            try:
                driver.get(url)
                # Wait for the page to finish loading so the source is complete.
                time.sleep(10)
                content = driver.page_source
            finally:
                # Always release the browser, even if loading raised.
                # quit() ends the whole WebDriver session rather than only
                # closing the current window.
                driver.quit()
            url_data = self._extract_urls(content)
            print(url_data)
            # Nothing extracted: put the URL back on the same queue to retry.
            # NOTE(review): retries are unbounded; a page that never yields
            # data will be retried forever -- confirm this is intended.
            if not url_data:
                que.put(url)

    def many_t(self):
        """Start ``max_thread`` workers on the module-level queue and wait."""
        workers = []
        # Create and start one worker per allowed thread.
        for _ in range(self.max_thread):
            worker = threading.Thread(target=self.getdata, args=(q,))
            worker.start()
            workers.append(worker)
        # Block until every worker has drained the queue and exited.
        for worker in workers:
            worker.join()

    def main(self):
        """Run the threaded scrape."""
        self.many_t()
def main():
    """Script entry point: build a scraper and run it."""
    scraper = dragen()
    scraper.main()


# Only run the scrape when executed as a script, not when imported.
if __name__ == '__main__':
    main()