# -*- coding: utf-8 -*-
"""
@author: Dell Created on Sun Dec 29 17:26:43 2019
"""
import re
import time
import queue
import threading
import requests
def getpagesource(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a crawler thread forever.

    Returns:
        The decoded page text, or ``""`` when the request fails, the
        response status is not 200, or the body is not valid UTF-8.
    """
    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code != 200:
            return ""
        return resp.content.decode("utf-8")
    except (requests.RequestException, ValueError):
        # RequestException: network errors, timeouts and malformed URLs.
        # ValueError: covers UnicodeDecodeError for bodies that are not UTF-8.
        return ""
def getemaillist(page_source):
    """Extract every e-mail address found in a page's source text.

    Args:
        page_source: HTML/text of a page.

    Returns:
        A list of the matched addresses in order of appearance
        (duplicates are kept). Always a list: ``[]`` when nothing matches
        or when *page_source* is not a string (e.g. ``None`` from a failed
        download).
    """
    if not isinstance(page_source, str):
        return []
    pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
    return re.findall(pattern, page_source, re.IGNORECASE)
def gethostname(url):
    """Return the scheme-and-host prefix of *url*.

    Both ``http`` and ``https`` URLs are recognised, and a trailing slash
    after the host is not required.

    Args:
        url: Text containing an absolute URL.

    Returns:
        The prefix such as ``"http://example.com"`` (no trailing slash),
        or ``""`` when *url* is not a string or contains no http(s) URL.
    """
    if not isinstance(url, str):
        return ""
    # The host ends at the first "/", "?", "#" or whitespace.
    match = re.search(r"https?://[^/\s?#]+", url, re.IGNORECASE)
    return match.group(0) if match else ""
def getabsurl(page_source):
    """Collect the absolute http/https links that appear in a page.

    A link starts at ``http://`` or ``https://`` and runs up to (not
    including) the first ``"``, ``'``, ``<``, ``>``, ``|`` or ``)`` that
    follows it without intervening whitespace.

    Args:
        page_source: HTML/text of a page.

    Returns:
        A list of the matched URLs in order of appearance (duplicates are
        kept); ``[]`` when nothing matches or *page_source* is not a string.
    """
    if not isinstance(page_source, str):
        return []
    return re.findall(r"(https?://\S*?)[\"'<>|)]", page_source, re.IGNORECASE)
def getrelurl(url, page_source):
    """Return the page's relative ``href`` links as absolute URLs.

    Every double-quoted ``href`` value is inspected. Values containing
    ``http://``, ``https://``, ``javascript``, ``#`` or ``mailto:`` are
    dropped (absolute links and non-navigable targets); the rest are
    prefixed with the scheme-and-host of *url*.

    Args:
        url: An absolute URL whose host is used as the base for the links.
        page_source: HTML/text of the page.

    Returns:
        A list of absolute URLs in order of appearance. ``[]`` when
        *page_source* is not a string or when no host can be extracted
        from *url*, because a relative link cannot be fetched without one.
    """
    if not isinstance(page_source, str):
        return []
    hostname = gethostname(url)
    if not hostname:
        # gethostname signals failure with "", not None.
        return []
    skip_markers = ("http://", "https://", "javascript", "#", "mailto:")
    resolved = []
    for link in re.findall(r'href="(.*?)"', page_source, re.IGNORECASE):
        if any(marker in link for marker in skip_markers):
            continue
        if link.startswith("/"):
            resolved.append(hostname + link)
        else:
            resolved.append(hostname + "/" + link)
    return resolved
def getallurl(page_source):
    """Gather every hyperlink of a page: absolute ones, then relative ones.

    Relative links are resolved against the first absolute link found on
    the page; when the page has no absolute link, relative links are not
    collected at all.

    Args:
        page_source: HTML/text of the page.

    Returns:
        A new list holding the absolute links followed by the resolved
        relative links.
    """
    absolute = getabsurl(page_source)
    if len(absolute) > 0:
        relative = getrelurl(absolute[0], page_source)
    else:
        relative = []
    return [*absolute, *relative]
def saveemail():
    """Append queued e-mail addresses to ``mail.txt`` every five seconds.

    Reads the module-level ``email_queue``. The function loops forever;
    on each cycle it sleeps 5 seconds, drains whatever is queued, writes
    one address per line (CRLF terminated, UTF-8) and flushes the batch.
    The file is opened in a ``with`` block so the handle is closed even
    if a write raises.
    """
    with open("mail.txt", "ab") as out:
        while True:
            time.sleep(5)
            while True:
                try:
                    email = email_queue.get_nowait()
                except queue.Empty:
                    break
                out.write((email + "\r\n").encode("utf-8", "ignore"))
            # One flush per drained batch pushes the addresses to disk.
            out.flush()
# URLs that have already been dequeued or enqueued by BFS, shared by all
# worker threads so that each page is crawled at most once.
_visited_urls = set()
_visited_lock = threading.Lock()


def _first_visit(url):
    """Record *url* as seen; return True only the first time it is seen."""
    with _visited_lock:
        if url in _visited_urls:
            return False
        _visited_urls.add(url)
        return True


def BFS(url, email_queue, url_queue):
    """Process one URL from *url_queue*: harvest e-mails and enqueue links.

    Args:
        url: Unused; the URL to crawl is always taken from *url_queue*.
            The parameter is kept so existing callers keep working.
        email_queue: Queue that receives every address found on the page.
        url_queue: Queue of URLs awaiting a crawl; newly discovered links
            are appended to it.

    Returns:
        None. Returns immediately when *url_queue* is empty instead of
        blocking, so surplus worker threads finish rather than wait forever.
    """
    try:
        url = url_queue.get_nowait()
    except queue.Empty:
        return
    # The seed URL is queued by the caller, so mark it here as well.
    _first_visit(url)

    page_source = getpagesource(url)
    if not page_source:
        return

    for email in getemaillist(page_source):
        email_queue.put(email)
        print(email)

    for link in getallurl(page_source):
        # Skip links already seen, otherwise the crawl never converges.
        if _first_visit(link):
            url_queue.put(link)
def executeBFS(url, email_queue, url_queue):
    """Seed the crawl with *url* and dispatch worker threads running BFS.

    One permit of the module-level semaphore ``sem`` is taken before each
    worker starts and released when that worker finishes, so the number of
    concurrently running workers never exceeds the semaphore's permits.

    Args:
        url: Start page; it is put on *url_queue* before any worker runs.
        email_queue: Queue handed to every worker for harvested addresses.
        url_queue: Queue of pending URLs shared by all workers.

    Returns:
        None, once *url_queue* is empty and no worker is alive (only
        workers add URLs, so nothing more can arrive at that point).
    """
    url_queue.put(url)

    def _run_one():
        # Give the permit back even if BFS raises.
        try:
            BFS(url, email_queue, url_queue)
        finally:
            sem.release()

    workers = []
    while True:
        workers = [w for w in workers if w.is_alive()]
        if url_queue.empty():
            if not workers:
                return
            # Wait briefly for running workers to enqueue more links
            # instead of spinning on an empty queue.
            time.sleep(0.05)
            continue
        sem.acquire()
        worker = threading.Thread(target=_run_one)
        worker.start()
        workers.append(worker)
if __name__ == "__main__":
    # Start page of the crawl.
    url = "http://bbs.tianya.cn/post-140-393974-1.shtml"

    # Shared state: saveemail reads email_queue as a module global and
    # executeBFS reads sem the same way.
    url_queue = queue.Queue()
    email_queue = queue.Queue()
    sem = threading.Semaphore(20)  # 20 permits

    # After a 5 second delay, run saveemail on its own timer thread.
    timerthd = threading.Timer(5, saveemail)
    timerthd.start()

    executeBFS(url, email_queue, url_queue)