# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import concurrent.futures
import requests


hd = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "www.xxxx.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
}


# Append text to the output file
def write(path, text):
    with open(path, 'ab') as f:
        f.write(bytes(text, encoding="utf-8"))


# Scrape one page and record every matching block
def start(url):
    demo = BeautifulSoup(requests.get(url, headers=hd).text, "html.parser")
    for a in demo.find_all('div', class_='textlist-body'):
        print(a)
        # write("out.txt", url)
        write("out.txt", '{}, {}\n'.format(url, a))


def Country_url():
    url = "https://www.xxxx.com/"
    demo = BeautifulSoup(requests.get(url + "airports", headers=hd).text, "html.parser")

    for i in demo.find_all('div', class_='textlist-body'):
        # Build the list of page URLs, skipping whitespace-only child nodes
        url_li = [url + x.string.replace(" ", "-") for x in i if x.string and x.string.strip()]
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = executor.map(start, url_li)  # dispatch each URL to start() on a worker thread
            # start() returns None, so draining the iterator only ensures every
            # task has finished (and re-raises any exception from a worker)
            for result in results:
                print(result)


if __name__ == "__main__":
    Country_url()