1. Making HTTP requests with the built-in urllib2 and urllib modules
Example:
import urllib2
res = urllib2.urlopen('http://www.zhihu.com')
html = res.read()
print html
Or:
import urllib2
request = urllib2.Request('http://www.zhihu.com')
res = urllib2.urlopen(request)
html = res.read()
print html
Or:
import urllib2
import urllib
url = 'http://test.vfast.com/login'
data = {'username': 'vfast', 'password': 'vfast'}
# passing encoded data to Request makes urllib2 send a POST request
data = urllib.urlencode(data)
result = urllib2.Request(url, data)
response = urllib2.urlopen(result)
html = response.read()
print html
Adding request headers
import urllib2
import urllib
url = 'https://mail.qq.com/'
data = {'username': 'xxxx@qq.com', 'password': 'xxxx'}
user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
referer = 'https://mail.qq.com/'
headers = {'User-Agent': user_agent, 'Referer': referer}
data = urllib.urlencode(data)
result = urllib2.Request(url, data, headers)
response = urllib2.urlopen(result)
html = response.read()
print html
Or:
import urllib2
import urllib
url = 'https://mail.qq.com/'
data = {'username': 'xxxx@qq.com', 'password': 'xxxx'}
user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
referer = 'https://mail.qq.com/'
data = urllib.urlencode(data)
result = urllib2.Request(url)
result.add_header('User-Agent', user_agent)
result.add_header('Referer', referer)
result.add_data(data)
response = urllib2.urlopen(result)
html = response.read()
print html
Getting cookie values
import urllib2
import cookielib
cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
response = opener.open('http://www.zhihu.com')
for item in cookie:
    print item.name + ':' + item.value
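To keep those cookies across runs, cookielib also provides MozillaCookieJar, which can write them to a file. A minimal sketch, assuming an arbitrary local filename cookie.txt:
import urllib2
import cookielib
filename = 'cookie.txt'  # arbitrary local file for storing cookies
cookie = cookielib.MozillaCookieJar(filename)
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
response = opener.open('http://www.zhihu.com')
# persist the cookies, including session cookies and expired ones
cookie.save(ignore_discard=True, ignore_expires=True)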
Setting custom cookie content
import urllib2
opener = urllib2.build_opener()
opener.addheaders.append(('Cookie', 'email=' + 'xxxxx'))
request = urllib2.Request('http://www.zhihu.com/')
response = opener.open(request)
print response.headers
data = response.read()
print data
Setting a timeout
Before Python 2.6, the urllib2 API did not expose a timeout parameter; instead you can change the global socket timeout:
import urllib2
import socket
socket.setdefaulttimeout(10)           # timeout in seconds
urllib2.socket.setdefaulttimeout(10)   # equivalent: socket as imported inside urllib2
From Python 2.6 on, urlopen provides a timeout parameter:
import urllib2
request = urllib2.Request('http://www.zhihu.com')
response = urllib2.urlopen(request, timeout=2)
html = response.read()
print html
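When the timeout is exceeded, urlopen raises an exception; a minimal sketch of catching it, assuming the same URL and the same arbitrary 2-second limit:
import socket
import urllib2
request = urllib2.Request('http://www.zhihu.com')
try:
    response = urllib2.urlopen(request, timeout=2)
    print response.read()
except urllib2.URLError as e:
    # a connect timeout surfaces as URLError whose reason is a socket.timeout
    print 'request failed:', e.reason
except socket.timeout:
    # read() can also time out after the connection is established
    print 'request timed out'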
Getting the HTTP response code
import urllib2
try:
    response = urllib2.urlopen('http://www.zhihu.com')
    print response.code
except urllib2.HTTPError as e:
    if hasattr(e, 'code'):
        print 'Error code:', e.code
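HTTPError only covers responses that came back with an error status; a network-level failure (DNS error, refused connection, timeout) raises URLError, which has no code attribute. Since HTTPError is a subclass of URLError it has to be caught first; a minimal sketch handling both, assuming the same URL:
import urllib2
try:
    response = urllib2.urlopen('http://www.zhihu.com')
    print response.code
except urllib2.HTTPError as e:
    # the server replied, but with an error status (4xx/5xx)
    print 'Error code:', e.code
except urllib2.URLError as e:
    # connection-level failure, no status code available
    print 'Reason:', e.reason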
Redirects
import urllib2
response = urllib2.urlopen('http://www.zhihu.com')
# urllib2 follows redirects automatically; if the final URL differs from the
# requested one, a redirect happened
isRedirected = response.geturl() != 'http://www.zhihu.com'
To customize redirect handling instead of following redirects automatically, subclass HTTPRedirectHandler:
import urllib2

class RedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        # ignore permanent (301) redirects instead of following them
        pass
    def http_error_302(self, req, fp, code, msg, headers):
        # let the base handler process the redirect, then record the
        # original status and the final URL on the result
        result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        result.status = code
        result.newurl = result.geturl()
        return result

opener = urllib2.build_opener(RedirectHandler)
opener.open('http://www.zhihu.com')
Setting a proxy
import urllib2
proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy)
# install the opener globally so plain urllib2.urlopen() goes through the proxy
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.zhihu.com/')
print response.read()
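If you would rather not change the global default, skip install_opener and call the opener directly, so only selected requests go through the proxy; a minimal sketch assuming the same local proxy address:
import urllib2
proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy)
# only this request uses the proxy; plain urllib2.urlopen() is unaffected
response = opener.open('http://www.zhihu.com/')
print response.read()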
2. Making HTTP requests with httplib combined with urllib
Create a connection object: class httplib.HTTPConnection(host[, port[, strict[, timeout[, source_address]]]])
Send a request: HTTPConnection.request(method, url[, body[, headers]])
Get the response: HTTPConnection.getresponse()
Read the response body: HTTPResponse.read([amt])
Get a specific header: HTTPResponse.getheader(name[, default])
Get all response headers as a list of (header, value) tuples: HTTPResponse.getheaders()
Get the underlying socket's file descriptor: HTTPResponse.fileno()
Get the response headers object: HTTPResponse.msg
Get the HTTP protocol version of the response: HTTPResponse.version
Get the response status code: HTTPResponse.status
Get the response reason phrase: HTTPResponse.reason
Examples:
GET request
import httplib

conn = None
try:
    conn = httplib.HTTPConnection('www.zhihu.com')
    conn.request('GET', '/')
    response = conn.getresponse()
    print response.status, response.reason
    print '-' * 40
    headers = response.getheaders()
    for h in headers:
        print h
    print '-' * 40
    print response.msg
except Exception as e:
    print e
finally:
    if conn:
        conn.close()
POST request
import httplib
import urllib

conn = None
try:
    params = urllib.urlencode({'name': 'vfast', 'age': 22})
    headers = {'Content-type': 'application/x-www-form-urlencoded', 'Accept': 'text/plain'}
    conn = httplib.HTTPConnection('www.zhihu.com', 80, timeout=3)
    conn.request('POST', '/login', params, headers)
    response = conn.getresponse()
    print response.getheaders()
    print response.status
    print response.read()
except Exception as e:
    print e
finally:
    if conn:
        conn.close()
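The two examples above do not exercise every response method listed earlier; a minimal sketch of getheader, version, and read with a byte limit, assuming the same host:
import httplib

conn = httplib.HTTPConnection('www.zhihu.com', timeout=3)
conn.request('GET', '/')
response = conn.getresponse()
print response.getheader('Content-Type', 'unknown')  # one header, with a default value
print response.version                               # 10 for HTTP/1.0, 11 for HTTP/1.1
print response.status, response.reason
print response.read(200)                             # read at most 200 bytes of the body
conn.close()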
Learning notes on Python web scraping and project practice