I want to grab a lot of pages from a website, for example:
curl "http://farmsubsidy.org/DE/browse?page=[0000-3603]" -o "de.#1"
but I want the page data in Python, not files on disk.
Can someone post pycurl code to do this,
or fast urllib2 (not fetching one page at a time) if that's possible,
or else just say "forget it, curl is faster and more robust"? Thanks.
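For a single page, fetching into memory with pycurl looks roughly like this (a sketch, untested):

import pycurl
from io import BytesIO

buf = BytesIO()
c = pycurl.Curl()
c.setopt(c.URL, 'http://farmsubsidy.org/DE/browse?page=0')
c.setopt(c.WRITEFUNCTION, buf.write)  # write the response body to the buffer, not to disk
c.perform()
c.close()
html = buf.getvalue()

What I'm after is running ~3600 of these fetches quickly.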
Solution:
Here's a solution based on urllib2 and threads.
import urllib2
from threading import Thread

BASE_URL = 'http://farmsubsidy.org/DE/browse?page='
NUM_RANGE = range(0, 3603)  # NB: unlike curl's [0000-3603], these numbers are not zero-padded
THREADS = 2

def main():
    # Split the page numbers across THREADS spiders and start one thread per slice.
    for nums in split_seq(NUM_RANGE, THREADS):
        t = Spider(BASE_URL, nums)
        t.start()

def split_seq(seq, num_pieces):
    # Yield num_pieces contiguous slices of seq, as evenly sized as possible.
    start = 0
    for i in xrange(num_pieces):
        stop = start + len(seq[i::num_pieces])
        yield seq[start:stop]
        start = stop

class Spider(Thread):
    def __init__(self, base_url, nums):
        Thread.__init__(self)
        self.base_url = base_url
        self.nums = nums

    def run(self):
        # Fetch each assigned page and print its body to stdout.
        for num in self.nums:
            url = '%s%s' % (self.base_url, num)
            data = urllib2.urlopen(url).read()
            print data

if __name__ == '__main__':
    main()
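For reference, split_seq partitions the page numbers into contiguous, near-equal slices, one per thread:

>>> list(split_seq(range(10), 3))
[[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]

Raising THREADS splits the 3603 pages across more concurrent fetchers if two turn out to be too slow.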