http访问请求的实现


1、urllib2和urllib两个内置模块实现http请求

例子:

import urllib2
# Fetch the page with a plain GET request and print the raw response body.
page = urllib2.urlopen('http://www.zhihu.com')
body = page.read()
print(body)

或者

import urllib2
# Same GET as a two-step call: build a Request object first, then open it.
req = urllib2.Request('http://www.zhihu.com')
page = urllib2.urlopen(req)
print(page.read())

或者

import urllib2
import urllib

# Submit a login form. Supplying a body to Request makes urllib2 send a POST.
login_url = 'http://test.vfast.com/login'
form_body = urllib.urlencode({'username': 'vfast', 'password': 'vfast'})

req = urllib2.Request(login_url, form_body)
reply = urllib2.urlopen(req)
print(reply.read())

添加header头

import urllib2
import urllib

url = 'https://mail.qq.com/'
data = {'username': 'xxxx@qq.com',
        'password': 'xxxx'
        }
user_agent = 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
referer = 'https://mail.qq.com/'

headers = {'User-Agent': user_agent, 'Referer': referer}
data = urllib.urlencode(data)
result = urllib2.Request(url, data, headers)
response = urllib2.urlopen(result)
html = response.read()
print html

或者

import urllib2
import urllib

url = 'https://mail.qq.com/'
data = {'username': 'xxxx@qq.com',
        'password': 'xxxx'
        }
user_agent = 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
referer = 'https://mail.qq.com/'

headers = {'User-Agent': user_agent, 'Referer': referer}
data = urllib.urlencode(data)
result = urllib2.Request(url)

result.add_header('User-Agent', user_agent)
result.add_header('Referer', referer)
result.add_data(data)

response = urllib2.urlopen(result)
html = response.read()
print html

获取cookie的值

import urllib2
import cookielib

# Route the request through an opener that stores received cookies in a
# CookieJar, then list every cookie as "name:value".
jar = cookielib.CookieJar()
cookie_opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
cookie_opener.open('http://www.zhihu.com')
for entry in jar:
    print(':'.join((entry.name, entry.value)))

 

自定义cookie内容

import urllib2

# Add a hand-written Cookie header to the opener's default headers, so it is
# sent with every request made through this opener.
opener = urllib2.build_opener()
cookie_header = ('Cookie', 'email=' + 'xxxxx')
opener.addheaders.append(cookie_header)

req = urllib2.Request('http://www.zhihu.com/')
reply = opener.open(req)
print(reply.headers)
print(reply.read())

timeout设置超时

python2.6版本之前,urllib2的api没有暴露timeout参数,只能通过更改socket的全局timeout来设置超时

import urllib2
import socket

# Set the process-wide default socket timeout (in seconds). The call is made
# through both spellings shown in this tutorial: the socket module directly
# and the socket module as referenced from urllib2.
timeout_seconds = 10
socket.setdefaulttimeout(timeout_seconds)
urllib2.socket.setdefaulttimeout(timeout_seconds)

python2.6之后,urlopen函数提供了timeout设置

import urllib2

# Per-call timeout: give up if the server does not respond within 2 seconds.
req = urllib2.Request('http://www.zhihu.com')
reply = urllib2.urlopen(req, timeout=2)
print(reply.read())

获取http响应码

import urllib2

# Print the HTTP status code: from the response on success, or from the
# HTTPError raised by urlopen when the request fails with an HTTP error.
try:
    reply = urllib2.urlopen('http://www.zhihu.com')
except urllib2.HTTPError as err:
    if hasattr(err, 'code'):
        print('Error code: %s' % err.code)
else:
    print(reply.code)

重定向

import urllib2

requested_url = 'http://www.zhihu.com'
response = urllib2.urlopen(requested_url)
# geturl() is the URL that was actually retrieved; a redirect was followed
# exactly when it differs from the URL that was requested.
isRedirected = response.geturl() != requested_url  # check for a redirect

自定义HTTPRedirectHandler类,不自动重定向

import urllib2


class RedirectHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that does not follow 301s and annotates 302 results.

    Uses urllib2 (Python 2), like every other example in this tutorial;
    ``urllib.request`` only exists in Python 3.
    """

    def http_error_301(self, req, fp, code, msg, headers):
        # Returning None declines to handle the 301, so the redirect is not
        # followed and the opener falls back to its default error handling
        # (which raises urllib2.HTTPError for the 301 response).
        pass

    def http_error_302(self, req, fp, code, msg, headers):
        # Let the stock handler follow the redirect, then record the original
        # status code and the final URL on the returned response object.
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        result.newurl = result.geturl()
        return result


# build_opener() uses this handler in place of the default redirect handler.
opener = urllib2.build_opener(RedirectHandler)
opener.open('http://www.zhihu.com')

Proxy代理设置

import urllib2

proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy,)
response = urllib2.urlopen('http://www.zhihu.com/')
print response.read()

2、httplib结合urllib实现http请求

创建HTTPConnection对象:class httplib.HTTPConnection(host[,port[,strict[,timeout[,source_address]]]])

发送请求:HTTPConnection.request(method,url[,body[,headers]])

获取响应:HTTPConnection.getresponse()

读取响应信息:HTTPResponse.read([amt])

获得指定头信息:HTTPResponse.getheader(name[,default])

获取响应头(header,value)元组的列表:HTTPResponse.getheaders()

获得底层socket文件描述符:HTTPResponse.fileno()

获得头内容:HTTPResponse.msg

获得头http版本:HTTPResponse.version

获得返回状态码:HTTPResponse.status

获得返回说明:HTTPResponse.reason

例子:

get请求

import httplib

# GET "/" over a raw HTTPConnection and print the status line, each response
# header tuple, and the message headers object. The connection is always closed.
conn = None
try:
    conn = httplib.HTTPConnection('www.zhihu.com')
    conn.request('GET', '/')
    reply = conn.getresponse()
    separator = '-' * 40
    print('%s %s' % (reply.status, reply.reason))
    print(separator)
    for header in reply.getheaders():
        print(header)
    print(separator)
    print(reply.msg)
except Exception as err:
    print(err)
finally:
    if conn:
        conn.close()

post请求

import httplib,urllib

# POST a url-encoded form to /login over a raw HTTPConnection (3 s timeout)
# and print the response headers, status code and body.
conn = None
try:
    form_body = urllib.urlencode({'name':'vfast','age':22})
    request_headers = {
        'Content-type': 'application/x-www-form-urlencoded',
        'Accept': 'text/plain',
    }
    conn = httplib.HTTPConnection('www.zhihu.com', 80, timeout=3)
    conn.request('POST', '/login', form_body, request_headers)
    reply = conn.getresponse()
    print(reply.getheaders())
    print(reply.status)
    print(reply.read())
except Exception as err:
    print(err)
finally:
    if conn:
        conn.close()

 python爬虫与项目实战学习记录

上一篇:爬虫实战:urllib2 应用之如何实现对某贴吧数据针对爬取、并完成实现代码封装?


下一篇:python-初学爬虫