昨天能够登陆成功,但是不能使用cookies,今天试了一下requests库的Session(),发现可以保持会话了,代码只是稍作改动。
#-*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import html5lib
import re
import urllib s = requests.Session()
url1 = 'http://accounts.douban.com/login'
url2 = 'http://www.douban.com/people/****/contacts'
formdata={
"redir":"http://www.douban.com/",
"form_email":"*******",
"form_password":"******",
#'captcha-solution':'blood',
#'captcha-id':'cRPGXEYPFHjkfv3u7K4Pm0v1:en',
"login":"登录"
} headers = {
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Encoding":"gzip, deflate, sdch",
"Accept-Language":"zh-CN,zh;q=0.8",
"Referer":"http://accounts.douban.com/login",
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
} r1 = s.post(url1,data=formdata,headers=headers)
rcontent = r1.text
soup = BeautifulSoup(rcontent,"html5lib")
#安装了html5lib没用python本身的html解析库
captchaAddr = soup.find('img',id='captcha_image')['src']
reCaptchaID = r'<input type="hidden" name="captcha-id" value="(.*?)"/'
captchaID = re.findall(reCaptchaID,rcontent)
print(captchaID)
urllib.request.urlretrieve(captchaAddr,"captcha.jpg")
captcha = input('please input the captcha:')
formdata['captcha-solution'] = captcha
formdata['captcha-id'] = captchaID
r1 = s.post(url1,data=formdata,headers=headers)
r2 = s.get(url2)
f = open('spider2.txt','w',encoding='utf-8')
f.write(r2.text)
f.close()