import requests
from bs4 import BeautifulSoup
import concurrent.futures
import pandas as pd
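# Fetch each host's homepage concurrently, extract the <title>, and write
# the domain/title pairs to an Excel sheet.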
# Crawl one host: try plain HTTP first, then fall back to HTTPS
def craw(host):
    try:
        r = requests.get('http://' + host, timeout=10)
        r.encoding = r.apparent_encoding
        text = r.text
    except requests.RequestException:
        text = ''
    if not text:
        try:
            r = requests.get('https://' + host, timeout=10)
            r.encoding = r.apparent_encoding
            text = r.text
        except requests.RequestException:
            text = ''
    return text
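# Usage sketch (hypothetical host; assumes network access):
#   craw('example.com') returns the page's HTML, or '' if both schemes fail.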
# Parse a page's HTML and pull out its <title> text
def parser(html):
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title.text
    except Exception:
        title = 'inaccessible'
    return title
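# e.g. parser('<title>Hi</title>') -> 'Hi'; parser('') -> 'inaccessible'
# (a missing <title> raises AttributeError on soup.title, caught above).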
def multi_thread(hosts):
    result = []
    # Stage 1: fetch every page concurrently
    with concurrent.futures.ThreadPoolExecutor() as pool:
        htmls = pool.map(craw, hosts)
        htmls = list(zip(hosts, htmls))
    # Stage 2: parse the titles concurrently
    with concurrent.futures.ThreadPoolExecutor() as pool:
        futures = {}
        for host, html in htmls:
            future = pool.submit(parser, html)
            futures[future] = host
        result.extend([{'domain': host, 'title': future.result()}
                       for future, host in futures.items()])
    df = pd.DataFrame(result).drop_duplicates()
    return df
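# An equivalent single-pool variant, shown as a sketch (these helper names
# are not from the original script): fusing fetch and parse into one task
# per host lets results be collected with as_completed as each host
# finishes, instead of holding every page's HTML between two pools.
def fetch_title(host):
    # Hypothetical helper built on craw() and parser() above
    return {'domain': host, 'title': parser(craw(host))}

def multi_thread_fused(hosts):
    rows = []
    with concurrent.futures.ThreadPoolExecutor() as pool:
        futures = {pool.submit(fetch_title, h): h for h in hosts}
        for future in concurrent.futures.as_completed(futures):
            rows.append(future.result())
    return pd.DataFrame(rows).drop_duplicates()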
if __name__ == '__main__':
    # Load the host list: one bare domain per line, skipping blank lines
    with open(r'./host', 'r', encoding='utf-8') as f:
        hosts = [line.strip() for line in f if line.strip()]
    res = multi_thread(hosts)
    print(res)
    # Writing .xlsx via pandas requires the openpyxl package
    res.to_excel(r'./host_title.xlsx', index=False)
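# Expected input/output, as a sketch (assuming ./host holds bare domains
# with no scheme prefix):
#
#   ./host:
#       example.com
#       example.org
#
#   host_title.xlsx / printed DataFrame:
#       domain       title
#       example.com  <page title, or 'inaccessible' on failure>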