小小脚本记录(二、批量网站存活检测)

我写这个脚本的初衷是验证一批历史网站是否还存活,或者是否已被跳转。
直接上脚本

import html
import os
import re
import socket
import time
from urllib.parse import urlparse

import requests
from selenium import webdriver

# Summary string ("Server:...\tCode:...") for the site currently being
# processed. The main loop assigns it and outPut() reads it when writing the
# report; initialising it here keeps outPut() usable before the loop has run.
info = None

# Output directories for the HTML reports and the page screenshots.
# os.makedirs(..., exist_ok=True) is portable and silent when the directory
# already exists (os.system("mkdir ...") never raises, so the old try/except
# could not catch anything and a failing shell command went unnoticed).
os.makedirs("reports", exist_ok=True)
os.makedirs("images", exist_ok=True)
# HTML preamble written once at the top of a newly created report file:
# the page title, a UTF-8 charset declaration, and the CSS used by the
# per-site entries (an 800x600 centered flex container whose screenshot
# image is scaled to the container width).
htmlHeader = """
<title>网站扫描报告</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> 
<style>
.reports{
width:800px;
height:600px;
display:flex;
justify-content:center;
}

.reports img{
    width:100%;
    height:auto;
}
</style>
"""



# Name of today's report file, e.g. "2024-01-31.html"; every run on the same
# day appends to the same file.
reportFile = time.strftime("%Y-%m-%d") + ".html"

# Write the HTML preamble only when the report is first created. The file is
# written as UTF-8 explicitly because the preamble declares charset=utf-8 and
# contains non-ASCII text; relying on the platform default encoding would
# produce a garbled title on systems whose default is not UTF-8.
if not os.path.exists("./reports/" + reportFile):
    with open("./reports/" + reportFile, "w", encoding="utf-8") as f:
        f.write(htmlHeader)

def getIP(domain):
    """Resolve *domain* and return the address of its first lookup result.

    The lookup is done with ``socket.getaddrinfo`` for the ``http`` service;
    any exception it raises propagates to the caller.
    """
    first_result = socket.getaddrinfo(domain, 'http')[0]
    # Each result is (family, type, proto, canonname, sockaddr); the host
    # address is the first item of sockaddr.
    sockaddr = first_result[4]
    return str(sockaddr[0])

def _get_header(res, name):
    """Return header *name* from ``res.headers``, or None when unavailable."""
    try:
        return res.headers[name]
    except (AttributeError, KeyError, TypeError):
        # Missing header, or an object without a usable ``headers`` mapping.
        # Only these lookup failures are treated as "not present"; a bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return None


def getInfo(res):
    """Summarise the server fingerprint headers of a response.

    Args:
        res: a response object exposing a ``headers`` mapping.

    Returns:
        The string ``"Server:<Server>\\tCode:<X-Powered-By>"``; a header that
        is absent is rendered as ``None``.
    """
    server = _get_header(res, "Server")
    powered_by = _get_header(res, "X-Powered-By")
    return "Server:" + str(server) + "\tCode:" + str(powered_by)

def scanurl(url):
    """Issue a GET request to *url* and return the ``requests`` response.

    The request carries a desktop-browser User-Agent header and uses a
    1-second timeout; exceptions raised by ``requests.get`` propagate.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'
    )
    return requests.get(url=url, headers={'User-Agent': user_agent}, timeout=1)

def urlformat(site):
    """Normalise one line of the target list into a URL.

    Double quotes, newlines and surrounding whitespace are removed. An entry
    that already starts with ``http://`` or ``https://`` is returned as is;
    any other non-empty entry is treated as a bare host and wrapped as
    ``http://<host>/``.

    Args:
        site: raw entry, e.g. ``example.com`` or ``"http://example.com/"``.

    Returns:
        The URL string, or None when the entry is blank.
    """
    site = site.replace("\"", "").replace("\n", "").strip()
    if not site:
        return None
    # Test the scheme prefix rather than searching for the substring "http":
    # a bare host such as "httpbin.org" contains "http" but still needs a
    # scheme added.
    if site.lower().startswith(("http://", "https://")):
        return site
    return "http://" + site + "/"

def outPut(target, title, imageName):
    """Append one site entry to today's HTML report.

    The entry contains the URL as a link, the resolved IP address, the page
    title, the module-level ``info`` summary and the screenshot image.

    Args:
        target: URL of the site that was scanned.
        title: page title, as ``str`` or UTF-8 encoded ``bytes``.
        imageName: screenshot file name inside ``images/`` without ``.png``.

    Raises:
        ValueError: if no host name can be extracted from *target*.
        OSError: if the host cannot be resolved (raised by ``getIP``) or the
            report file cannot be written.
    """
    # urlparse copes with https URLs, URLs without a trailing slash, and
    # strips any ":port" suffix, all of which broke the former
    # 'http://(.*?)/' regular expression or the DNS lookup that followed it.
    host = urlparse(target).hostname
    if not host:
        raise ValueError("cannot extract a host name from URL: " + str(target))
    ip = getIP(host)

    # str() on bytes would render as "b'...'" in the report; decode instead.
    if isinstance(title, bytes):
        title = title.decode("utf-8", errors="replace")

    # ``info`` is assigned by the main loop; tolerate it not being set yet.
    scan_info = globals().get("info")

    # Everything below comes from scanned (untrusted) sites, so escape it
    # before embedding it in the report markup.
    safe_target = html.escape(str(target), quote=True)
    entry = "".join([
        '<h1>URL:<a href="', safe_target, '" target="_blank">', safe_target, '</a></h1>\n',
        '<h3>ip:', html.escape(str(ip)), '</h3>',
        '<h3>TITLE:', html.escape(str(title)), '</h3>',
        '<h4>', html.escape(str(scan_info)), '</h4>',
        '<div class="reports"><img src="../images/',
        html.escape(str(imageName), quote=True),
        '.png"/></div><hr/>',
    ])

    # The report declares charset=utf-8, so write it as UTF-8 explicitly.
    with open("./reports/" + reportFile, "a", encoding="utf-8") as f:
        f.write(entry)

def screenshot(target):
    """Capture *target* in headless Firefox and add it to the report.

    The page is loaded, a PNG named after the URL's host part is saved under
    ``images/`` (unless a file with that name already exists), and the page
    title plus image name are handed to ``outPut``.

    Args:
        target: URL of the page to capture.

    Raises:
        ValueError: if no host part can be extracted from *target*.
        Exception: errors from starting the browser, loading the page, or
            from ``outPut`` propagate to the caller.
    """
    # netloc equals what the former 'http://(.*?)/' regex captured for
    # "http://host/" URLs, and also works for https and for URLs without a
    # trailing slash.
    imageName = urlparse(target).netloc
    if not imageName:
        raise ValueError("cannot extract a host name from URL: " + str(target))
    imagePath = './images/' + imageName + '.png'

    option = webdriver.FirefoxOptions()
    # add_argument("-headless") and the ``options=`` keyword are accepted by
    # both Selenium 3.8+ and Selenium 4; set_headless() and the
    # ``firefox_options=`` keyword were removed in Selenium 4.
    option.add_argument("-headless")
    driver = webdriver.Firefox(options=option)
    try:
        driver.get(target)
        try:
            if os.path.exists(imagePath):
                print("screenshot exists")
            elif driver.get_screenshot_as_file(imagePath):
                print("screenshot success" + "\n")
            else:
                # get_screenshot_as_file reports an I/O failure by returning
                # False instead of raising.
                print("screenshot failed: " + imagePath)
        except Exception as msg:
            # A failed screenshot must not prevent the report entry.
            print(msg)
        # Keep the title as text; encoding it to bytes made it show up as
        # "b'...'" in the report.
        title = driver.title
    finally:
        # Always shut the browser down, even when loading the page fails,
        # so no Firefox process is left behind.
        driver.quit()
    outPut(target, title, imageName)

if __name__ == '__main__':
    # Driver: read targets from sites.txt (one per line), probe each one,
    # append it to "<status code>.txt", and for HTTP 200 responses take a
    # screenshot and add an entry to the HTML report. A failure on one site
    # is reported and the scan moves on to the next site.
    with open("sites.txt") as sites:
        for site in sites:
            site = site.replace("\r", "").replace("\n", "").replace(" ", "")
            if not site:
                continue
            url = urlformat(site)
            if not url:
                continue

            try:
                res = scanurl(url)
            except (requests.RequestException, ValueError) as err:
                # Unreachable host, timeout, malformed URL, ...
                print("[!]request failed\t" + url + "\t" + str(err))
                continue

            # ``info`` is a module-level global: outPut() reads it when
            # screenshot() writes the report entry for this site.
            try:
                info = getInfo(res)
            except Exception as err:
                print("[!]header summary failed\t" + url + "\t" + str(err))
                info = None

            print("[*]" + str(res.status_code) + "\t" + url)
            print(info)

            # Results are grouped into one text file per status code.
            filename = str(res.status_code) + ".txt"
            try:
                with open(filename, "a", encoding="utf-8") as f:
                    # str() so the URL is still recorded when info is None.
                    f.write(url + "\t\t" + str(info) + "\n")
            except OSError as err:
                print("[!]cannot write " + filename + "\t" + str(err))

            if res.status_code == 200:
                try:
                    screenshot(url)
                except Exception as err:
                    # Browser, DNS or report errors: report and keep scanning.
                    print("[!]screenshot failed\t" + url + "\t" + str(err))


小小脚本记录(二、批量网站存活检测)
需要扫描的内容放在sites.txt中,最终结果以状态码进行分类。

上一篇:Ajax请求php返回json对象数据中包含有数字索引和字符串索引,在for in循环中取出数据的顺序问题


下一篇:指针和数组经典笔试题解析