Downloading resource files from GitHub (a Python crawler)

# Python 2 script: urllib2 (and the old-style except clauses below) only exist on Python 2
import re
import os
import sys
import time
import threading
import socket
import urllib
import urllib2


# Local proxy address/port (they match the commented-out ProxyHandler in getContent) and socket timeout
server = '127.0.0.1'
port = '8087'
timeout = 720
socket.setdefaulttimeout(timeout)

class timer(threading.Thread):  # The timer class is derived from threading.Thread
	def __init__(self, num, interval, dir, url):
		threading.Thread.__init__(self)
		self.thread_num = num
		self.interval = interval
		self.url = url
		self.dir = dir
		self.thread_stop = False
	def run(self):  # Overwrite run(): each thread crawls one start URL
		DownloadFile(self.interval, self.url, self.dir)
		#print 'Thread Object(%d), Time:%s' % (self.thread_num, time.ctime())
		#time.sleep(self.interval)
	def stop(self):
		self.thread_stop = True

def getContent(url, type):
	# Fetch a URL and return its body; return 'EOF' for skipped or failed requests
	print(">>start connecting:%s" % url)
	from urllib2 import URLError, HTTPError
	#proxy = urllib2.ProxyHandler({'http': 'http://127.0.0.1:8087'})
	proxy = urllib2.ProxyHandler({})  # empty dict = direct connection, no proxy
	opener = urllib2.build_opener(proxy, urllib2.HTTPHandler)
	urllib2.install_opener(opener)
	try:
		urlHandler = urllib2.urlopen(url)
		# read the Content-Length header so that small images can be skipped
		headers = urlHandler.info().headers
		length = 0
		for header in headers:
			if header.find('Length') != -1:
				length = int(header.split(':')[-1].strip())
		if type == "img" and length < 15000:
			print(" >>>>>>>>%d" % length)
			dataStr = 'EOF'
		else:
			print(" ++++++++%d" % length)
			dataStr = urlHandler.read()
	except HTTPError, e:
		print("The server couldn't fulfill the request.")
		print("Error code: %s" % e.code)
		return 'EOF'
	except URLError, e:
		print("We failed to reach a server.")
		print("Reason: %s" % e.reason)
		return 'EOF'
	else:
		return dataStr
def DownloadFile(interval, url, dir):
	# Crawl one GitHub "tree" page: save every file linked from it, then recurse into sub-directories
	strinfo = re.compile(r'\S*/blob/master/')
	dataStr = getContent(url, "html")
	print("...:%s" % url)
	if dataStr == 'EOF':  # the page could not be fetched, nothing to do
		return
	# download files: file links on a tree page use the /blob/master/ form
	base = url.replace('https://github.com', '').replace('/tree/master/', '/blob/master/').strip()
	reg = r'href="%s(\S+)"' % base
	imgre = re.compile(reg)
	imglist = imgre.findall(dataStr)
	x = 0
	for fileName in imglist:
		# rewrite the blob URL into its raw.githubusercontent.com form to fetch the file content
		javaFileUrl = "%s%s%s" % ('https://raw.githubusercontent.com', base.replace('/blob/master/', '/master/'), fileName)
		imgdata = getContent(javaFileUrl, "html")
		if imgdata != 'EOF':
			outputFile = '%s%s' % (dir, strinfo.sub('/', base))
			if not os.path.exists(outputFile):
				os.makedirs(outputFile)
			f = open('%s%s' % (outputFile, fileName), 'wb')
			f.write(imgdata)
			f.close()
			x = x + 1
	time.sleep(interval)
	# download recursively: sub-directory links keep the /tree/master/ form of the current URL
	base = url.replace('https://github.com', '').strip()
	reg = r'href="%s(\S+)"' % base
	imgre = re.compile(reg)
	imglist = imgre.findall(dataStr)
	for fileDir in imglist:
		DownloadFile(interval, '%s%s' % (url, fileDir), dir)
	
#https://raw.githubusercontent.com/vogella/vogella/master/de.vogella.rcp.editor.example/src/de/vogella/rcp/editor/example/Application.java
#https://raw.githubusercontent.com/clojure/clojure/master/src/jvm/clojure/lang/Util.java
#https://github.com/vogella/vogella/blob/master/de.vogella.rcp.editor.example/src/de/vogella/rcp/editor/example/Activator.java
url_ = "https://github.com/vogella/vogella/tree/master/de.vogella.rcp.editor.example/src"
n = 1
thread = []
for i in range(0, n):
	url = url_
	dir = 'FILE'  # output directory prefix
	thread.append(timer(1, 1, dir, url))
for i in range(0, n):
	thread[i].start()
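
The listing above only runs on Python 2, since it relies on urllib2 and old-style except clauses. As a rough sketch of the same fetch-and-save step on Python 3, the snippet below uses urllib.request; the fetch() helper, the output directory name, and the error handling are illustrative choices rather than part of the original script, and only the raw file URL is taken from the comments above.

# Python 3 sketch of the core download step, assuming urllib.request in place of urllib2.
# fetch(), out_dir and raw_url are illustrative; only the URL comes from the original comments.
import os
import urllib.request
from urllib.error import HTTPError, URLError

def fetch(url, timeout=30):
    """Return the response body as bytes, or None if the request fails."""
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return resp.read()
    except (HTTPError, URLError) as e:
        print("request failed: %s (%s)" % (url, e))
        return None

raw_url = ("https://raw.githubusercontent.com/vogella/vogella/master/"
           "de.vogella.rcp.editor.example/src/de/vogella/rcp/editor/example/Application.java")
data = fetch(raw_url)
if data is not None:
    out_dir = "FILE"  # same output prefix as the original script
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, os.path.basename(raw_url)), "wb") as f:
        f.write(data)

The directory-walking part of the original script would also need its href regexes revisited on a port, since GitHub's tree-page HTML may no longer match them.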
