"""Multi-threaded crawler for web page titles (多线程爬取网页标题)."""

import requests
from bs4 import BeautifulSoup
import concurrent.futures
import pandas as pd
import time



# Fetch a site's home page
def craw(host):
    """Fetch the home page of *host* and return its decoded HTML.

    Plain HTTP is tried first; HTTPS is used as a fallback when the HTTP
    request fails.

    Args:
        host: Host name without a URL scheme, e.g. ``'example.com'``.

    Returns:
        The response body as text, or the sentinel string ``'无'`` when
        neither scheme produced a response.
    """
    for scheme in ('http://', 'https://'):
        try:
            r = requests.get(scheme + host, timeout=10)
            # Use the encoding detected from the body rather than the one
            # declared in the headers before decoding the text.
            r.encoding = r.apparent_encoding
            # Return straight away on success so a real body is never
            # confused with the failure sentinel.
            return r.text
        except Exception:
            # Best effort: any failure (connection, timeout, bad URL, ...)
            # just moves on to the next scheme. ``Exception`` rather than a
            # bare ``except`` so SystemExit/KeyboardInterrupt still propagate.
            continue
    return '无'


# Parse page content
def parser(html):
    """Extract the text of the ``<title>`` element from an HTML document.

    Args:
        html: HTML markup to parse.

    Returns:
        The title text, or ``'无法访问'`` when the markup cannot be parsed
        or contains no ``<title>`` element.
    """
    fallback = '无法访问'
    try:
        title_tag = BeautifulSoup(html, 'html.parser').title
    except Exception:
        # Best effort: unparsable input (e.g. a non-string value) yields the
        # fallback instead of aborting the whole run.
        return fallback
    # ``soup.title`` is None when the document has no <title> element.
    if title_tag is None:
        return fallback
    return title_tag.text

def multi_thread(hosts, max_workers=None):
    """Fetch and parse the page title of every host using a thread pool.

    Args:
        hosts: Iterable of host names (without URL scheme).
        max_workers: Optional size of the thread pool; ``None`` keeps the
            ``ThreadPoolExecutor`` default.

    Returns:
        A ``pandas.DataFrame`` with the columns ``domain`` and ``title``,
        one row per distinct host, in first-seen order. The columns are
        present even when *hosts* is empty.
    """
    columns = ['domain', 'title']
    # De-duplicate up front (keeping first-seen order) so a host that is
    # listed several times is only requested once.
    unique_hosts = list(dict.fromkeys(hosts))
    if not unique_hosts:
        return pd.DataFrame(columns=columns)

    def fetch_title(host):
        # One task per host: download the page, then extract its title.
        return parser(craw(host))

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # ``map`` yields results in input order, so titles line up with hosts.
        titles = list(pool.map(fetch_title, unique_hosts))

    rows = [
        {'domain': host, 'title': title}
        for host, title in zip(unique_hosts, titles)
    ]
    return pd.DataFrame(rows, columns=columns).drop_duplicates()
if __name__=='__main__':
    # Load the host list: one host per line. Blank lines are skipped so they
    # do not turn into requests for an empty host name.
    with open(r'./host', 'r', encoding='utf-8') as f:
        hosts = [line.strip() for line in f if line.strip()]
    res = multi_thread(hosts)
    print(res)
    # Write the domain/title table to an Excel file without the index column.
    res.to_excel(r'./host_title.xlsx',index=False)

上一篇:矩阵快速幂(快速幂)模板题目Decoding Genome


下一篇:ZCE futures模拟-7