# -*- coding: utf-8 -*-
# http://weixin.sogou.com/
import re
import urllib.request
import time # sleep()方法 实现延时
import urllib.error
# Fetch a URL through an HTTP proxy server.
def use_proxy(proxy_addr, url):
    """Download *url* through the HTTP proxy at *proxy_addr*.

    Args:
        proxy_addr: Proxy address (e.g. ``"127.0.0.1:8888"``). It is
            registered for the ``http`` scheme only.
        url: The URL to fetch.

    Returns:
        The raw response body as ``bytes``, or ``None`` if the request
        failed. Failures are printed, and the call sleeps before
        returning (10s for ``URLError``, 1s for any other exception).
    """
    # Everything stays inside the try: Request() itself raises ValueError
    # for a malformed URL, and that must be reported, not propagated.
    try:
        req = urllib.request.Request(url)
        # Send a browser-like User-Agent header.
        # NOTE(review): the UA string looks malformed (unbalanced
        # parenthesis); it is kept byte-for-byte to avoid changing what
        # the server sees.
        req.add_header("User-Agent", "Mozilla/5.0(Windows NT 6.1;WOW64) AppleWebKit/537.36(KHTML,like Google Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
        # Configure the proxy server.
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # The opener is installed globally, so urlopen() below (and any
        # later urlopen() call) goes through it.
        urllib.request.install_opener(opener)
        # Close the response deterministically instead of leaking it.
        with urllib.request.urlopen(req) as response:
            return response.read()
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # On URLError, pause for 10 seconds before continuing.
        time.sleep(10)
    except Exception as e:
        print("exception:" + str(e))
        # On any other exception, pause for 1 second before continuing.
        time.sleep(1)
    # Reached only after a handled failure.
    return None
# Search keyword.
key = "Python"
# Proxy server address. It may stop working; replace it with a live proxy
# when that happens.
proxy = "127.0.0.1:8888"
# URL-encode the keyword exactly once. Quoting inside the page loop would
# re-encode the '%' of an already-quoted keyword on every iteration.
key = urllib.request.quote(key)
# Captures the target of every <a href="..."> in a result page.
pat1 = '<a href="(.*?)"'
link_pattern = re.compile(pat1, re.S)
# Number of result pages to crawl.
for i in range(0, 10):
    thisPageUrl = "http://weixin.sogou.com/weixin?type=2&query=" + key + "&page=" + str(i)
    thisPageData = use_proxy(proxy, thisPageUrl)
    # use_proxy returns None when the request failed.
    if thisPageData is None:
        print("此次("+str(i)+"页)没成功")
        continue
    # Decode the bytes instead of matching against their repr (b'...'),
    # which would leave escape sequences in the extracted links.
    pageText = thisPageData.decode("utf-8", errors="ignore")
    print(len(pageText))
    rs1 = link_pattern.findall(pageText)
    if not rs1:
        print("此次("+str(i)+"页)没成功")
        continue
    for j, thisUrl in enumerate(rs1):
        # Turn HTML-escaped "&amp;" separators back into "&".
        thisUrl = thisUrl.replace("amp;", "")
        file = "F:/爬虫信息/result/第"+str(i)+"页第"+str(j)+"篇文章.html"
        thisData = use_proxy(proxy, thisUrl)
        # Skip failed downloads before opening the file, so no empty
        # file is created for them.
        if thisData is None:
            print("第"+str(i)+"页第"+str(j)+"篇文章失败!")
            continue
        try:
            # The with-block closes the file even if the write fails.
            with open(file, "wb") as fh:
                fh.write(thisData)
            print("第"+str(i)+"页第"+str(j)+"篇文章成功!")
        except OSError as e:
            print(e)
            print("第"+str(i)+"页第"+str(j)+"篇文章失败!")
# 问题如下: (The problem is as follows:)