Python Learning Journey: Day 5

Combining bs4 with requests: scrape free proxies and run a directory scan through the proxy pool.

import time

import requests
from bs4 import BeautifulSoup


def scan():
    header = {

        'Host': 'www.xxx.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    i_list = {}
    for dirs in open('zidian.txt'):
        urls = 'http://www.baidu.com'  # placeholder target; adjust together with the Host header above
        # str.replace returns a new string, so assign the result (the original call discarded it)
        url_dir = urls + dirs.replace('\n', '')
        for ips in open('ip.txt'):
            ips = ips.replace('\n', '')
            # requests expects a lowercase scheme key in the proxies dict
            i_list['http'] = 'http://' + ips

            try:
                # the keyword argument is headers, not header
                code = requests.get(url_dir, headers=header, proxies=i_list, verify=False).status_code
                if code == 200 or code == 403:
                    print(url_dir)

            except requests.exceptions.ConnectionError:
                print('ConnectionError -- waiting 1 second')
                time.sleep(1)
            except requests.exceptions.ChunkedEncodingError:
                print('ChunkedEncodingError -- waiting 1 second')
                time.sleep(1)
            except Exception:
                print('Unexpected error -- waiting 1 second')
                time.sleep(1)
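
Because verify=False is passed to requests.get, urllib3 prints an InsecureRequestWarning for every request. A minimal, optional sketch to silence it (urllib3 ships as a dependency of requests):

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)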

def getip():
    for x in range(1, int(i) + 1):  # i comes from input() as a string, so convert it; loop over pages 1..i
        url = 'https://www.kuaidaili.com/free/inha/'+str(x)
        rep = requests.get(url).content
        time.sleep(1)
        soup = BeautifulSoup(rep,"lxml")
        ip = soup.select('td[data-title="IP"]')
        port = soup.select('td[data-title="PORT"]')

        # open the file once per page instead of once per proxy
        with open('ip.txt', 'a+') as file:
            for ips, ports in zip(ip, port):
                file.write(ips.string + ":" + ports.string + '\n')
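
The free proxies scraped above die quickly, so it can help to test each one before writing it to ip.txt. A minimal sketch of such a check; the helper name check_proxy and the test URL http://httpbin.org/ip are my own illustration, not part of the original script:

def check_proxy(ip_port):
    # returns True if the proxy answers within 5 seconds (hypothetical helper)
    proxy = {'http': 'http://' + ip_port}
    try:
        return requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5).status_code == 200
    except requests.exceptions.RequestException:
        return False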

if __name__ == '__main__':

    i = input("How many pages of proxies to fetch: ")
    getip()
    scan()
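
For reference, getip() writes one proxy per line to ip.txt in IP:PORT form (e.g. 1.2.3.4:8080, an illustrative value), and scan() expects zidian.txt to be a plain wordlist with one path per line (e.g. /admin/), since each line is appended directly to the base URL.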