Python web crawler
In short, Beautiful Soup is a third-party Python library whose main job is to help us parse web page data.
Before we can use it we have to install it; in cmd, a pip or easy_install command is enough.
After that we also need to install lxml, the dependency used for parsing HTML:
pip install beautifulsoup4    # or: easy_install beautifulsoup4
pip install lxml
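To check that everything is installed, you can first parse a small hand-written HTML snippet; this is just a minimal sketch and the HTML string is made up for illustration:

from bs4 import BeautifulSoup

html = '<div id="content"><p>hello</p><p>world</p></div>'
bs = BeautifulSoup(html, 'lxml')        # parse with the lxml parser
div = bs.find('div', id='content')      # locate the div by its id
print(div.p)                            # first <p> tag: <p>hello</p>
print(div.get_text())                   # text only: "helloworld"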
Hands-on
Get the data inside a div with a given id:
import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
    target = 'https://www.xsbiquge.com/15_15338/8549128.html'
    req = requests.get(url=target)
    req.encoding = 'utf-8'
    html = req.text
    bs = BeautifulSoup(html, 'lxml')
    texts = bs.find('div', id='content')
    print(texts)
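If you only want the chapter text without the surrounding HTML tags, the Tag object returned by find() can be reduced to plain text; a small follow-up, assuming the request above succeeded:

    print(texts.text)            # same content with the tags stripped
    print(texts.get_text('\n'))  # join the text fragments with newlines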
Crawl the Chinese names of all of Butian's public-welfare SRC programs:
Process the JSON data that comes back.
import time
import requests
import json
import jsonpath
def src(page1):
    target = 'https://www.butian.net/Reward/pub'
    # control pagination
    data = {
        's': '1',
        'p': page1
    }
    req = requests.post(url=target, data=data)
    # the response is JSON, so parse it
    unicodeStr = json.loads(req.text)
    # pull every company_name value out of the JSON
    companyName = jsonpath.jsonpath(unicodeStr, "$..company_name")
    print("Crawling page " + str(page1))
    for i in companyName:
        with open(r'butian.txt', 'a+') as f:
            f.write(i + '\n')

if __name__ == '__main__':
    for page in range(150, 190):
        try:
            src(page)
            time.sleep(0.2)
        except Exception as e:
            print("Something went wrong!!!")
It turned out that taking the Chinese names and then trying to find the matching domains through Baidu or fofa was too hard, with too low a success rate, so I switched to another approach.
Crawl the Butian target domains via the cid parameter:
import time
import requests
from bs4 import BeautifulSoup
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Sec-Fetch-Dest': 'document',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'Cookie': 'btuc_ba52447ea424004a7da412b344e5e41a=d8073109b92496b7d78efc46467bdd04ff25554c1ede609c4082521e96ee1f74'
              '; PHPSESSID=nh7d72cedtmun5vid0b2cotqo1; PHPSESSID=21d2b4ldhu7ducl2d29vf40qg6; '
              'wzws_cid=dcbc1afc22788d55232881662a4a413a61a783caabcaf0291e6e454fa304f60b28a86eb2edbddf63fbd8a5cbbc32d10ca98df05eedda82f61bb992fbfddfff0a8b79ad64694d8cd4744dbf8e1f8cbaa7;'
              ' __q__=1642517363830',
}
def gongyi(yeshu):
    target = 'https://www.butian.net/Loo/submit?cid=' + str(yeshu)
    req = requests.get(url=target, headers=headers, timeout=5)
    html = req.text
    bs = BeautifulSoup(html, 'lxml')
    texts = bs.find('div', id='tabs')  # get the div tag whose id is "tabs"
    text2 = texts.form.div.ul.find_all('li')  # find_all returns all the li tags
    i = 0
    for text3 in text2:
        i = i + 1
        if i == 3:
            test4 = text3.input.get('value')  # get() reads a tag attribute
            if len(str(text3.input.get('value'))) == 0:
                print("Empty value")
            else:
                print(test4)
                with open(r'yuming2.txt', 'a+') as f:
                    f.write(test4 + '\n')
            break

if __name__ == '__main__':
    breakFlag = 0  # counts consecutive errors
    for page in range(30000, 65000):  # 1-7000 and 28000-28200 onwards already scanned
        print("cid=" + str(page))
        try:
            gongyi(page)
            time.sleep(0.1)
            breakFlag = 0
        except Exception as e:
            print("Something went wrong!!!")
            if breakFlag == 2:
                break
            breakFlag = breakFlag + 1
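Here breakFlag tracks consecutive failures: it is reset to 0 after every successful page, and the loop gives up once three requests in a row fail (for example when the cookie expires), so the script stops instead of hammering the site.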
Unify the http/https format
The crawled results are not in a consistent format, so prefix them all with http(s) and drop the gov.cn sites; play it safe, don't stir up trouble.
The results look like these, for example:
www.baidu.com
http://www.baidu.com
test.gov.cn
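The normalization itself is just a chain of replace() calls plus a gov.cn filter; a minimal sketch of that step alone, applied to the sample lines above (the full threaded script follows below):

raw = ['www.baidu.com', 'http://www.baidu.com', 'test.gov.cn']

for line in raw:
    if 'gov.cn' in line:
        continue  # drop gov.cn sites entirely
    host = line.replace('https://', '').replace('http://', '').replace('/', '').strip()
    print('https://' + host)  # the full script tries https first and falls back to http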
Multithreading was also added.
import queue
import sys
import threading
import time
import requests
from concurrent.futures import ThreadPoolExecutor
from requests.packages import urllib3
urllib3.disable_warnings()
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Sec-Fetch-Dest': 'document',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'Cookie': 'xxx',  # use the cookie from your current session
}
def check_ip(ip):
    if 'gov.cn' in ip:
        print("Filtering out gov.cn site: " + ip)
    else:
        ip = ip.replace('https://', '').replace('http://', '').replace('/', '').replace('\n', '')
        try:
            url = "https://" + ip
            print(url)
            ip_code = requests.get(url, headers=headers, verify=False, timeout=1).status_code
            print(ip_code)
            with open(ip_file_result, 'a+') as f:
                f.write(url + '\n')
        except Exception as e:
            try:
                urls = "http://" + ip
                print(urls)
                ip_code = requests.get(urls, headers=headers, verify=False, timeout=2).status_code
                print(ip_code)
                with open(r'50000http.txt', 'a+') as f:
                    f.write(urls + '\n')
            except Exception as e:
                print("Something went wrong!")
                time.sleep(0.2)

# multithreaded worker: keep pulling targets off the queue until it is empty
def gaoshi():
    while not q.empty():
        item = q.get()
        check_ip(item)
if __name__ == '__main__':
    ip_file = sys.argv[1]
    # ip_file = "55000.txt"
    ip_file_result = "Result_" + ip_file
    titles = list(set([x.strip() for x in open(ip_file).readlines()]))
    print('Total targets: {}'.format(len(titles)))
    thread_x = 100  # number of threads
    q = queue.Queue()
    for ip in open(ip_file):
        q.put(ip)  # push every line into the queue for the worker threads
    print(q.get())  # note: this prints and also removes the first entry from the queue
    for x in range(int(thread_x)):
        t = threading.Thread(target=gaoshi)
        t.start()
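To run it, pass the host list as the first command-line argument, for example python check_urls.py 55000.txt (the script name here is just a placeholder); hosts that answer over https are appended to Result_<input file>, and the http fallbacks go to 50000http.txt.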