基于Python的urllib2模块的多线程网络爬虫程序

2022-09-17 08:40:47
# NOTE: this script targets Python 2 (urllib2, Queue and StringIO modules).
import socket
import time
import urllib2
from gzip import GzipFile
from Queue import Queue
from StringIO import StringIO
from threading import Lock, Thread
  6 class ContentEncodingProcessor(urllib2.BaseHandler):
  7   """A handler to add gzip capabilities to urllib2 requests """
  8  
  9   # add headers to requests
 10   def http_request(self, req):
 11     req.add_header("Accept-Encoding", "gzip, deflate")
 12     return req
 13  
 14   # decode
 15   def http_response(self, req, resp):
 16     old_resp = resp
 17     
 18    # if(resp.geturl() != req):
 19     #    print ‘no‘
 20      #   return 1
 21     # gzip
 22     if resp.headers.get("content-encoding") == "gzip":
 23         gz = GzipFile(
 24                     fileobj=StringIO(resp.read()),
 25                     mode="r"
 26                   )
 27         resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
 28         resp.msg = old_resp.msg
 29     # deflate
 30     if resp.headers.get("content-encoding") == "deflate":
 31         gz = StringIO( deflate(resp.read()) )
 32         resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)  # ‘class to add info() and
 33         resp.msg = old_resp.msg
 34     return resp
 35 
 36 # deflate support
 37 import zlib
 38 def deflate(data):   # zlib only provides the zlib compress format, not the deflate format;
 39   try:               # so on top of all there‘s this workaround:
 40     return zlib.decompress(data, -zlib.MAX_WBITS)
 41   except zlib.error:
 42     return zlib.decompress(data)
 43 
 44 
 45 #(set timeout)
 46 socket.setdefaulttimeout(10)
 47 
 48 encoding_support = ContentEncodingProcessor
 49 opener = urllib2.build_opener( encoding_support, urllib2.HTTPHandler)
 50 
 51 class Fetcher:
 52     def __init__(self,threads):
 53         self.opener = urllib2.build_opener(urllib2.HTTPHandler)
 54         self.lock = Lock() #线程锁
 55         self.q_req = Queue() #任务队列
 56         self.q_ans = Queue() #完成队列import socket
 57         self.threads = threads
 58         for i in range(threads):
 59             t = Thread(target=self.threadget)
 60             t.setDaemon(True)
 61             t.start()
 62         self.running = 0
 63  
 64     def __del__(self): #解构时需等待两个队列完成
 65         time.sleep(0.5)
 66         self.q_req.join()
 67         self.q_ans.join()
 68  
 69     def taskleft(self):
 70         return self.q_req.qsize()+self.q_ans.qsize()+self.running
 71  
 72     def push(self,req):
 73         self.q_req.put(req)
 74  
 75     def pop(self):
 76         return self.q_ans.get()
 77  
 78     def threadget(self):
 79         while True:
 80             ans = ‘‘
 81             req = self.q_req.get()
 82      #       print req
 83 
 84             with self.lock: #要保证该操作的原子性，进入critical area
 85                 self.running += 1
 86 
 87             try:
 88 #               ans = self.opener.open(req).read()
 89                #content =  opener.open(req).read()
 90                 content = urllib2.urlopen(req).read()
 91             #    print temp.geturl()
 92             #    print req
 93             #    add gzip support from here
 94                 ans = str(content)
 95             except Exception, what:
 96                 print what 
 97                 pass
 98 
 99             self.q_ans.put((ans,req))
100             with self.lock:
101                 self.running -= 1
102             self.q_req.task_done()
103             time.sleep(0.01) # don‘t spam
104  
105 if __name__ == "__main__":
106     a = [0] * 3600000
107     links = [ ‘http://www.songtaste.com/song/%d/‘%i for i in range(1,3600000) ]
108     f = Fetcher(threads=50)
109     for url in links:
110         f.push(url)
111     while f.taskleft():  
112         the_page,x =f.pop()
113        # print the_page
114         try:
115           npos = the_page.index(‘chart#fav‘)
116         except :
117           pass
118         else:
119            for j in range(npos,1,-1):
120             if the_page[j] == ‘,‘: 
121                 k = j 
122                 break
123            sum = 0 ;
124            t = 1 ; 
125            for j in range(k-1,1,-1):
126               if  the_page[j] <= ‘9‘ and the_page[j] >=‘0‘:
127                   sum = sum + (int(the_page[j]) - int(‘0‘)) * t
128                   t *= 10;
129               else :
130                   break
131            p = int(x[30:-1])
132            if(p % 10000 <= 5  )
133            a[p] = sum
134            if sum != 0:
135                 print p
136                 print sum
137
View Code
基于Python的urllib2模块的多线程网络爬虫程序,布布扣,bubuko.com
基于Python的urllib2模块的多线程网络爬虫程序
码农公寓

相关文章