使用队列多线程爬取代理IP以及验证IP可用性

#!/usr/bin/python
#coding=utf-8

import requests
from bs4 import BeautifulSoup
import random
import lxml
import re
import threading
from  queue import  Queue
from  threading import Lock
import time



# Pool of browser User-Agent strings; get_headers() picks one at random for
# each request it builds.
userlist= [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

# Global queue holding every scraped proxy as an "ip:port" string. A Queue is
# used instead of a list because its put/get operations are thread-safe, so no
# extra lock is needed around them.
q= Queue()
# Global counter: how many proxies have been taken for testing, summed over
# all worker threads.
count = 0
# Global lock guarding updates to ``count``.
mylock = Lock()
# Queue collecting the proxies that passed verification.
q_ok= Queue()
# List of the worker threads. They are joined together only after all of them
# have been started; joining each thread right after creating it would run
# them one at a time instead of concurrently.
mythread_list=[]

def get_headers():
    """Build the HTTP request headers, with a randomly chosen User-Agent.

    Returns:
        dict: the ``User-Agent`` (picked at random from ``userlist``),
        ``Accept`` and ``Accept-Encoding`` header values.
    """
    # A fresh random pick on every call rotates the User-Agent per request.
    user_agent = random.choice(userlist)
    return {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
    }

# Scrape the proxy listing pages in [begin, end) and push every "ip:port"
# found onto the global queue ``q``.
def run(begin, end, timeout=10):
    """Scrape proxy addresses from listing pages ``begin`` .. ``end - 1``.

    Each page is fetched from ``http://www.xicidaili.com/wt/<page>`` and
    parsed with BeautifulSoup; every table row that has an IP cell and a port
    cell is turned into an ``"ip:port"`` string and put on the global
    queue ``q``.

    Args:
        begin: First page number to fetch (inclusive).
        end: Page number to stop at (exclusive, as with ``range``).
        timeout: Seconds to wait for each page request before giving up.

    Returns:
        None. Results are delivered through the global queue ``q``. A page
        that fails to download or parse is reported with ``print`` and
        skipped; the remaining pages are still processed.
    """
    url = "http://www.xicidaili.com/wt/"
    for page in range(begin, end):
        page_url = url + str(page)
        print(page_url)
        try:
            # Without a timeout a stalled server would block the crawl forever.
            ret = requests.get(url=page_url, headers=get_headers(), timeout=timeout)
            ret.raise_for_status()  # raise on any non-success HTTP status
            ret.encoding = ret.apparent_encoding  # decode with the detected page encoding
            soup = BeautifulSoup(ret.text, 'lxml')
            # The first <tr> is the table header, so start from the second.
            for tr in soup.find_all(name='tr')[1:]:
                # The first <td> holds the country flag; IP and port follow it.
                td_list = tr.find_all(name='td')[1:]
                if len(td_list) < 2:
                    # Malformed row: skip it instead of losing the rest of the page.
                    continue
                q.put(td_list[0].text.strip() + ":" + td_list[1].text.strip())
        except Exception as e:
            # Best-effort crawl: report the failure and move on to the next page.
            print(e)

# Verify that a proxy is usable. Candidate test URLs:
# http://httpbin.org/ip  or  http://2018.ip138.com/ic.asp
def check_ip(ip, time_out, test_url="http://2019.ip138.com/ic.asp"):
    """Check whether the proxy ``ip`` works and record it in ``q_ok`` if so.

    ``test_url`` is requested through the proxy. The proxy counts as usable
    when the response status is 200 and the first IPv4-looking string in the
    response body equals the proxy's own IP.

    Args:
        ip: Proxy address as an ``"ip:port"`` string.
        time_out: Request timeout in seconds.
        test_url: URL of a page that echoes the caller's IP address.

    Returns:
        None. A usable proxy is announced with ``print`` and put on the
        global queue ``q_ok``; request errors are printed and swallowed so a
        dead proxy never kills the calling thread.
    """
    # Requests documents that proxy URLs must include a scheme.
    proxy_url = ip if "://" in ip else "http://" + ip
    proxies = {'http': proxy_url}
    ip_s = ip.split(":")[0]    # bare proxy IP, compared with the echoed IP below
    try:
        ret = requests.get(url=test_url, headers=get_headers(), proxies=proxies, timeout=time_out)
        if ret.status_code != 200:
            return
        ret.encoding = ret.apparent_encoding
        # Raw string: '\d' and '\.' are invalid escapes in a normal literal.
        match = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ret.text)
        if match is None:
            # The response carries no IP address at all: proxy is not usable.
            return
        if match.group(0) == ip_s:
            print ("线程:{0} 验证 {1} 可用:".format(threading.current_thread().name,ip))
            q_ok.put(ip)
    except Exception as e:
        # Unreliable proxies fail in many ways; report and carry on.
        print (e)


# Worker thread class: takes the scraped "ip:port" entries off the global
# queue ``q`` and verifies each one with check_ip().
class Mythread(threading.Thread):
    """Worker thread that drains the proxy queue ``q`` and tests each entry.

    Args:
        name: Thread name (shown in check_ip's success message).
        args: Forwarded to ``threading.Thread``; unused because ``run`` is
            overridden.
    """

    def __init__(self, name, args):
        threading.Thread.__init__(self, name=name, args=args)

    def run(self):
        """Test proxies from ``q`` until the queue is empty, then return."""
        global count
        from queue import Empty  # local import: the file only imports Queue

        while True:
            # Take an item without blocking. Checking qsize() and then calling
            # a blocking get() lets another worker empty the queue in between,
            # which would leave this thread (and the join() on it) stuck.
            try:
                value = q.get_nowait()
            except Empty:
                break
            with mylock:
                count = count + 1
                checked = count  # snapshot, so each milestone prints exactly once
            check_ip(ip=value, time_out=5)
            if checked % 50 == 0:
                # Progress report every 50 proxies. ``qsize`` is the module
                # global assigned in the __main__ section of this script.
                print("已经检查了代理IP{0}个,总共{1}个:".format(checked,qsize))

if __name__ == '__main__':
    # Script entry point: scrape the proxy pages, verify every proxy with a
    # pool of worker threads, then print the proxies that passed.
    begin_time = time.time()

    # Scrape listing pages 1 through 7 (the end page is exclusive).
    run(1, 8)
    qsize = q.qsize()  # total number scraped; also read by Mythread for progress output
    print('now get {0} counts ip to used'.format(qsize))

    # Start 50 worker threads.
    for index in range(50):
        print('启动第{0}号线程'.format(index))
        thread_name = '{0}号线程'.format(index)
        worker = Mythread(thread_name, ())
        worker.start()
        mythread_list.append(worker)

    # The main thread waits here until every worker has finished.
    for worker in mythread_list:
        worker.join()

    elapsed = time.time() - begin_time
    print("检查可用的代理IP如下:共{0}个,耗时{1} 秒".format(q_ok.qsize(), elapsed))
    # Print every verified proxy, emptying the result queue.
    while q_ok.qsize() > 0:
        print(q_ok.get())
    print("所有子线程执行完成,程序退出!")
上一篇:webservice远程调试开启


下一篇:如何让NoSQL内存数据库适合企业级应用