urllib 是 Python 的基本库之一,内置四大模块,即 request、error、parse 和 robotparser。其中常用的是 request 和 error:前者用于发送 HTTP 请求,后者用于处理请求的错误。parse 用于对 URL 的处理,如拆分、合并等。
1、urllib库之urlopen函数
"""Demonstrate the urlopen function of the urllib library."""

# Equivalent alternative: from urllib import request
import urllib.request

# Fetch the page source with urlopen. The with-statement closes the
# underlying connection once the response has been consumed.
with urllib.request.urlopen('http://www.baidu.com/') as resp:
    # resp.read() returns the raw, still-encoded bytes, so decode them
    # to obtain text.
    print(resp.read().decode('utf-8'))
    # <class 'http.client.HTTPResponse'>
    print(type(resp))

    # Other ways to consume the body instead of read():
    #   resp.readline()   reads one line
    #   resp.readlines()  reads multiple lines
2、urllib库之urlretrieve下载数据
"""Demonstrate downloading data with urlretrieve from the urllib library."""

import urllib.request

# Download a web page with urlretrieve and save it as a local file.
urllib.request.urlretrieve('http://www.baidu.com', 'baidu.html')

# Download an image with urlretrieve and save it as a local file.
urllib.request.urlretrieve(
    'http://img01.tooopen.com/Downs/images/2011/10/30/sy_20111030205827520061.jpg',
    'image.jpg',
)
3、urllib库之解码与编码
"""Demonstrate encoding and decoding with the urllib library."""

import urllib.parse
import urllib.request

# Usage of urlencode: turn a mapping into a percent-encoded query string.
params = {'name': '张三', 'age': 18, '年级': '一年级'}
# Encode
result = urllib.parse.urlencode(params)
# name=%E5%BC%A0%E4%B8%89&age=18&%E5%B9%B4%E7%BA%A7=%E4%B8%80%E5%B9%B4%E7%BA%A7
print(result)

# Instead of writing the non-ASCII query by hand, as in
# 'https://www.baidu.com/s?wd=刘德华', build it with urlencode.
url = 'https://www.baidu.com/s?'
param = {'wd': '刘德华'}
qs = urllib.parse.urlencode(param)
url += qs
print(url)
# Only hit the network when run as a script, so the pure encoding and
# decoding examples in this file also work offline or when imported.
if __name__ == '__main__':
    # The with-statement closes the connection after the body is read.
    with urllib.request.urlopen(url) as resp:
        print(resp.read().decode('utf-8'))


params = {'name': '张三', 'age': 18, '年级': '一年级'}
# Encode
result = urllib.parse.urlencode(params)
# name=%E5%BC%A0%E4%B8%89&age=18&%E5%B9%B4%E7%BA%A7=%E4%B8%80%E5%B9%B4%E7%BA%A7
print(result)
# Decode: parse_qs maps every key to a list of string values.
result2 = urllib.parse.parse_qs(result)
# {'name': ['张三'], 'age': ['18'], '年级': ['一年级']}
print(result2)


params2 = "张三李四"
# Encode a plain string (not a mapping) with quote.
rs = urllib.parse.quote(params2)
# Apply the % operator: passing rs as a second print argument would
# print the literal text "rs=%s" followed by the value.
print('rs=%s' % rs)
# Decode
rs2 = urllib.parse.unquote(rs)
print('rs2=%s' % rs2)
4、urllib库之urlparse
"""Demonstrate urlparse and related functions of the urllib library."""

import urllib.parse

# urlparse splits a URL into six components.
url = 'http://www.google.com/search;hello?hl=en&q=urlparse&btnG=Google+Search#1'
result = urllib.parse.urlparse(url)
print('result = {}'.format(result))
print(result.scheme)    # http
print(result.netloc)    # www.google.com
print(result.path)      # /search
print(result.params)    # hello (rarely used)
print(result.query)     # hl=en&q=urlparse&btnG=Google+Search
print(result.fragment)  # 1
# urlunparse reassembles the components into a URL string.
result2 = urllib.parse.urlunparse(result)
print('result2 = {}'.format(result2))


# urlsplit is like urlparse but does not separate params from the path,
# so its result has five components and the path keeps ';hello'.
url2 = 'http://www.google.com/search;hello?hl=en&q=urlparse&btnG=Google+Search#1'
result3 = urllib.parse.urlsplit(url2)
print('result3 = {}'.format(result3))
# Read from result3 (the urlsplit result), not from the earlier urlparse
# result, so the printed values actually show what urlsplit produced.
print(result3.scheme)    # http
print(result3.netloc)    # www.google.com
print(result3.path)      # /search;hello
print(result3.query)     # hl=en&q=urlparse&btnG=Google+Search
print(result3.fragment)  # 1
# urlunsplit reassembles the five components into a URL string.
result4 = urllib.parse.urlunsplit(result3)
print('result4 = {}'.format(result4))


# urljoin resolves its second argument relative to the base URL.
# Without a leading '?', the second argument is treated as a relative
# path and replaces the last path segment ('search') of the base.
url3 = urllib.parse.urljoin('http://www.google.com/search?', 'hl=en&q=urlparse&btnG=Google+Search#1')
print('url3 = {}'.format(url3))
# Here '/' is the query of the base URL, which is dropped because the
# second argument supplies a path; the result equals url3.
url4 = urllib.parse.urljoin('http://www.google.com/search?/', 'hl=en&q=urlparse&btnG=Google+Search#1')
print('url4 = {}'.format(url4))
# With a leading '?', the second argument is a query, so the base path
# '/search' is kept and the query is appended to it.
url5 = urllib.parse.urljoin('http://www.google.com/search', '?hl=en&q=urlparse&btnG=Google+Search#1')
print('url5 = {}'.format(url5))