python3 urllib模块使用

2022-06-25 23:12:18

urllib模块使用

urllib.request

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

import urllib.request

url = 'http://httpbin.org/ip'

response = urllib.request.urlopen(url)

html = response.read()  # 返回bytes类型数据

print(html)

url = 'http://www.baidu.com'

response = urllib.request.urlopen(url)

html = response.read().decode('utf-8') # 通过decode()方法将bytes类型数据转化为str类型数据

print(html)

发送post数据

import urllib.request

import urllib.parse

url = 'http://httpbin.org/post'

data = {

    'name' : "小明",

    'age' : 30

}

# data = urllib.parse.urlencode(data)  # Error: POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str

# data = urllib.parse.urlencode(data).encode('utf-8')

data = bytes(urllib.parse.urlencode(data),encoding="utf-8")

response = urllib.request.urlopen(url, data=data)

html = response.read().decode('utf-8')

print(html)

设置timeout

import urllib.request

url = 'http://httpbin.org/get'

response = urllib.request.urlopen(url, timeout=1)

html = response.read().decode('utf-8')

print(html)

import socket
import urllib.error
import urllib.request

url = 'http://httpbin.org/get'

# Demonstrates catching a request timeout; the timeout is deliberately tiny
# so that the request is expected to fail.
try:
    # The context manager closes the connection even when read() fails.
    with urllib.request.urlopen(url, timeout=0.1) as response:
        html = response.read().decode('utf-8')
    print(html)
except urllib.error.URLError as e:
    # Failures while sending the request are wrapped in URLError; e.reason
    # holds the underlying exception (or a message string).
    print("捕获异常....")
    print(e.reason)
    if isinstance(e.reason, socket.timeout):
        print("请求超时")
except socket.timeout:
    # A timeout while waiting for or reading the response is raised directly
    # and is NOT wrapped in URLError, so it needs its own handler.
    print("捕获异常....")
    print("请求超时")

响应

响应类型、状态码、响应头、实际获取的url

import urllib.request

url = 'http://www.python.org'

response = urllib.request.urlopen(url)

# 响应类型

response_type = type(response)

print(response_type)  # <class 'http.client.HTTPResponse'>

# 状态码

status_code = response.getcode()

print(status_code)

# 状态码对应的信息

status = response.reason

print(status)    # 比如 200对应Ok, 404对应Not Found

# 响应头

response_headers = response.getheaders()  # 返回列表

print(response_headers)

server_type = response.getheader('Server') # getheader()获取响应头的指定部分信息

print(server_type)

print(type(response.headers))  # <class 'http.client.HTTPMessage'>

content_type = response.headers['Content-Type'] # 获取Content-Type

print(content_type)

# 实际获取的url, 可以用来判断是否发生重定向

actual_url = response.geturl()

print(actual_url)

class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)

import urllib.request

url = 'http://httpbin.org/get'

request = urllib.request.Request(url)  # 创建请求对象

response = urllib.request.urlopen(request) # 发送请求

html = response.read().decode('utf-8')

print(html)

# 默认的User-Agent为"Python-urllib/x.x" # x.x为python版本号

发送post数据

import urllib.request

import urllib.parse

url = 'http://httpbin.org/post'

headers = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',

}

data = {

    'name' : 'peter',

    'age' : 20

}

data = bytes(urllib.parse.urlencode(data), encoding="utf-8") # POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str

request = urllib.request.Request(url, data=data, headers=headers)

response = urllib.request.urlopen(request)

html = response.read().decode('utf-8')

print(html)

# post数据时  "Content-Type": "application/x-www-form-urlencoded"

urllib.request.Request 对象方法

import urllib.request

url = 'http://httpbin.org/get'

request = urllib.request.Request(url)

# add_header(key, val)   # 添加请求头信息

request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36')

response = urllib.request.urlopen(request)

html = response.read().decode('utf-8')

print(html)

Handlers

ProxyHandler(代理)

import urllib.request

# Proxy map: the key is the URL scheme, the value is the proxy's "ip:port".
proxy_dict = {
    'http': '127.0.0.1:6688',
    'https': '127.0.0.1:6688',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}

proxy_handler = urllib.request.ProxyHandler(proxy_dict)
opener = urllib.request.build_opener(proxy_handler)

# addheaders is a list of (name, value) tuples sent with every request made
# through this opener; assigning it replaces the opener's existing headers.
opener.addheaders = list(headers.items())

# Install the opener globally so that urllib.request.urlopen() uses it.
urllib.request.install_opener(opener)

# IP echo page: shows the proxy's IP when the request goes through the proxy.
url = 'http://www.whatismyip.com.tw/'

# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(url) as response:
    print(response.read().decode('utf-8'))

# 常见错误:

# HTTPError: HTTP Error 403: Forbidden : 很可能代理服务器设置了权限，当前ip不在代理服务器允许访问列表中

代理需要身份认证

# 错误提示: HTTPError: HTTP Error 407: Proxy Authentication Required

#方法1: 代理ip设置格式 http://用户名:密码@ip地址:端口号

import urllib.request

# Proxy map: the key is the URL scheme, the value is the proxy URL with the
# credentials embedded, i.e. "http://user:password@ip:port".
proxy_dict = {
    'http': 'http://name:password@127.0.0.1:6688',
    'https': 'http://name:password@127.0.0.1:6688',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}

proxy_handler = urllib.request.ProxyHandler(proxy_dict)
opener = urllib.request.build_opener(proxy_handler)

# addheaders is a list of (name, value) tuples sent with every request made
# through this opener; assigning it replaces the opener's existing headers.
opener.addheaders = list(headers.items())

# Install the opener globally so that urllib.request.urlopen() uses it too.
urllib.request.install_opener(opener)

# IP echo page: shows the proxy's IP when the request goes through the proxy.
url = 'http://www.whatismyip.com.tw/'

# The context manager closes the connection once the body has been read.
with opener.open(url) as response:
    print(response.read().decode('utf-8'))

#方法2: 使用ProxyBasicAuthHandler用于代理登陆验证(需要提供相应的用户名和密码)

import urllib.request

# Proxy map: the key is the URL scheme, the value is the proxy URL.
# Credentials are supplied separately through the password manager below.
proxy_dict = {
    'http': 'http://127.0.0.1:6688',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}

proxy_handler = urllib.request.ProxyHandler(proxy_dict)

# Register the user name and password for the proxy; the realm is simply
# left as None.
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, 'http://127.0.0.1:6688', 'name', 'password')
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler(password_mgr)

opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)

# addheaders is a list of (name, value) tuples sent with every request made
# through this opener; assigning it replaces the opener's existing headers.
opener.addheaders = list(headers.items())

# Install the opener globally so that urllib.request.urlopen() uses it too.
urllib.request.install_opener(opener)

# IP echo page: shows the proxy's IP when the request goes through the proxy.
url = 'http://www.whatismyip.com.tw/'

# The context manager closes the connection once the body has been read.
with opener.open(url) as response:
    print(response.read().decode('utf-8'))

HTTPBasicAuthHandler

用于访问web服务器时的身份验证

import urllib.request

url = 'http://127.0.0.1/test/'

password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()

password_mgr.add_password(None, url, 'admin','password')  # 添加对应url的用户名和密码

http_auth_handler = urllib.request.HTTPBasicAuthHandler(password_mgr)

opener = urllib.request.build_opener(http_auth_handler)

response = opener.open(url)

print(response.read().decode('utf-8'))

FTPHandler

import urllib.request

url = 'ftp://ftp1.linuxidc.com'

username = 'ftp1.linuxidc.com'

password = 'www.linuxidc.com'

ftp_url = 'ftp://%s:%s@ftp1.linuxidc.com' %(username, password)

ftp_handler = urllib.request.FTPHandler()

opener = urllib.request.build_opener(ftp_handler)

response = opener.open(ftp_url)

print(response.read().decode('utf-8', 'ignore'))

HTTPHandler、HTTPSHandler

import urllib.request

url = 'http://www.baidu.com'

# 通过将debuglevel=1,将debug Log 打开,这样收发包的内容就会在屏幕上打印出来，方便调试

http_handler = urllib.request.HTTPHandler(debuglevel=1)

https_handler = urllib.request.HTTPSHandler(debuglevel=1)

opener = urllib.request.build_opener(http_handler, https_handler)

response = opener.open(url)

'''

效果:

send: b'GET / HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: www.baidu.com\r\nUser-Agent: Python-urllib/3.6\r\nConnection: close\r\n\r\n'

reply: 'HTTP/1.1 200 OK\r\n'

header: Date header: Content-Type header: Transfer-Encoding header: Connection header: Vary header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: P3P header: Cache-Control header: Cxy_all header: Expires header: X-Powered-By header: Server header: X-UA-Compatible header: BDPAGETYPE header: BDQID header: BDUSERID

'''

Cookie

CookieJar

import urllib.request

import http.cookiejar

url = 'http://www.baidu.com'

cookie = http.cookiejar.CookieJar()

cookie_handler = urllib.request.HTTPCookieProcessor(cookie)

opener = urllib.request.build_opener(cookie_handler)

response = opener.open(url)

print(response.getcode())

for item in cookie:  # item为<class 'http.cookiejar.Cookie'>

    print(item.name, item.value, sep=" : ")

MozillaCookieJar

创建与Mozilla cookies.txt文件兼容的FileCookieJar实例

import urllib.request

import http.cookiejar

url = 'https://www.zhihu.com/settings/profile'

headers = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'

}

cookie = http.cookiejar.MozillaCookieJar("zhihu_cookie.txt")

cookie_handler = urllib.request.HTTPCookieProcessor(cookie)

opener = urllib.request.build_opener(cookie_handler)

opener.addheaders = headers.items()

try:

    cookie.load()    # 将cookie数据从文件加载到内存  很重要

except http.cookiejar.LoadError as e:

    print('cookie文件加载失败')

except IOError as e:

    print("cookie文件不存在")

response = opener.open(url)

print(response.geturl())  # 将geturl()返回的结果和url比对，判断是否登陆成功，失败会转到知乎登陆界面

html = response.read().decode('utf-8')

print(html)

# 对于登陆成功，需要调用MozillaCookieJar对象的save()方法，将数据从内存保存到文件中

LWPCookieJar

创建与libwww-perl Set-Cookie3文件兼容的FileCookieJar实例

import urllib.request

import http.cookiejar

url = 'http://www.baidu.com'

cookie = http.cookiejar.LWPCookieJar("cookies.txt")

opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))

response = opener.open(url)

# 必须调用save方法 将内存中的cookie对象保存到本地文件中, 下次再次使用cookie，只需调用load方法将其加载到内存中即可

cookie.save(ignore_discard=True, ignore_expires=True)

异常处理

URLError

引起URLError的原因通常有: 无网络连接(即本机无法上网)、访问的目标服务器不存在。在这种情况下，异常对象会有reason属性，它可以是一个描述错误的字符串，也可以是另一个异常实例(例如下面示例中的socket.gaierror)。捕获异常方法如下:

import urllib.error
import urllib.request

# Demonstrates URLError: the request below targets a host that is expected
# not to resolve.
try:
    response = urllib.request.urlopen('http://www.hello_world.org')
except urllib.error.URLError as e:
    # URLError lives in urllib.error; e.reason is a message string or the
    # underlying exception instance.
    print(type(e.reason))  # e.g. <class 'socket.gaierror'>
    print(e.reason)  # e.g. [Errno 11001] getaddrinfo failed
else:
    # The request unexpectedly succeeded: release the connection.
    response.close()

HTTPError

HTTPError是URLError的子类。每次调用urlopen方法发出请求时，服务器都会返回对应的response，其中包含一个数字"状态码"。

常见的状态码有200(请求成功),302(重定向)，304(文档的内容(自上次访问以来或者根据请求的条件)并没有改变)

这些状态码有的表示服务器无法完成请求。如果无法处理请求，urlopen会抛出HTTPError。

典型的错误包括404(页面没有找到)、403(请求被禁止)、401(当前请求需要用户认证)、407(需要代理验证)、500(服务器内部错误)

# 方式1

import urllib.request

import urllib.error

url = 'http://www.hello_world.org'

# url = 'http://example.com/test.html'

try:

    response = urllib.request.urlopen(url)

# HTTPError是URLError子类，要放到前面处理

except urllib.error.HTTPError as e:

    print("The server cannot fulfill the request...")

    print("Error code: ", e.code)

    print("Reason: ", e.reason)

except urllib.error.URLError as e:

    print("failed to fetch the server...")

    print("Reason: ", e.reason)

# 方式2

import urllib.request

import urllib.error

url = 'http://www.hello_world.org'

# url = 'http://example.com/test.html'

try:

    response = urllib.request.urlopen(url)

except urllib.error.URLError as e:

    if hasattr(e, 'code'):

        print("The server cannot fulfill the request...")

        print("Error code: ", e.code)

        print("Reason: ", e.reason)

    else:

        print("failed to fetch the server...")

        print("Reason: ", e.reason)

urllib.parse

urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

负责解析URL

from urllib.parse import urlparse

# Signature: urlparse(url, scheme='', allow_fragments=True)
# Splits a URL of the form
#     <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# into the 6-tuple (scheme, netloc, path, params, query, fragment).
sample_url = 'http://www.baidu.com/index.html;user?id=100#comment'

result = urlparse(sample_url)
print(type(result))  # <class 'urllib.parse.ParseResult'>
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')

# netloc is only recognised when it is introduced by "//".
result = urlparse('//www.baidu.com/index.html;user?id=100#comment', 'https')
print(result)  # ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')

result = urlparse('www.baidu.com/index.html;user?id=100#comment', 'https')
print(result)  # ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=100', fragment='comment')

# A scheme already present in the URL takes precedence over the argument.
result = urlparse(sample_url, 'https')
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')

# With allow_fragments=False the "#..." part is not split off: it stays in
# the query when there is one, otherwise in the path.
result = urlparse(sample_url, allow_fragments=False)
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100#comment', fragment='')

result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')

urllib.parse.urlunparse(parts)

from urllib.parse import urlunparse

# urlunparse() takes the six URL components in this order:
# scheme, netloc, path, params, query, fragment.
scheme, netloc, path = "http", 'www.baidu.com', 'index.html'
params, query, fragment = 'user', 'id=100', 'comment'

data = (scheme, netloc, path, params, query, fragment)
url = urlunparse(data)
print(url)  # http://www.baidu.com/index.html;user?id=100#comment

urllib.parse.urljoin(base, url, allow_fragments=True)

# urljoin(base, url) resolves `url` against `base`: whatever `url` lacks
# (scheme, netloc, ...) is taken from `base`, while a `url` that is already
# absolute is returned unchanged. The expected output follows each call.
from urllib.parse import urljoin

print(urljoin("http://www.baidu.com", "FAQ.html"))  # http://www.baidu.com/FAQ.html
print(urljoin("http://www.baidu.com/index.html", "FAQ.html"))  # http://www.baidu.com/FAQ.html

# An absolute second argument wins; the base (including its query) is ignored.
print(urljoin("http://www.baidu.com/index.html", "http://www.google.com/FAQ.html"))  # http://www.google.com/FAQ.html
print(urljoin("http://www.baidu.com/index.html", "http://www.google.com/FAQ.html?question=2"))  # http://www.google.com/FAQ.html?question=2
print(urljoin("http://www.baidu.com/index.html?wd=abc", "http://www.google.com/FAQ.html"))  # http://www.google.com/FAQ.html

# The query and fragment come from the second argument; the fragment of the
# base is dropped.
print(urljoin("http://www.baidu.com/", "?category=5#comment"))  # http://www.baidu.com/?category=5#comment
print(urljoin("http://www.baidu.com/#comment", "?category=5"))  # http://www.baidu.com/?category=5

urllib.parse.urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus)

 from urllib.parse import urlencode

basic_url = 'http://httpbin.org/get'

data = {

    "key": '天气',

}

data = urlencode(data)

full_url = '%s?%s' % (basic_url, data)

print(full_url) # http://httpbin.org/get?key=%E5%A4%A9%E6%B0%94

码农公寓

urllib模块使用

urllib.request

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

发送post数据

设置timeout

响应

响应类型、状态码、响应头、实际获取的url

class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)

发送post数据

urllib.request.Request 对象方法

Handlers

ProxyHandler(代理)

代理需要身份认证

HTTPBasicAuthHandler

用于访问web服务器时的身份验证

FTPHandler

HTTPHandler、HTTPSHandler

Cookie

CookieJar

MozillaCookieJar

创建与Mozilla cookies.txt文件兼容的FileCookieJar实例

LWPCookieJar

创建与libwww-perl Set-Cookie3文件兼容的FileCookieJar实例

异常处理

URLError

HTTPError

urllib.parse

urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

负责解析URL

urllib.parse.urlunparse(parts)

urllib.parse.urljoin(base, url, allow_fragments=True)

urllib.parse.urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus)

相关文章