1,Python如何访问互联网?
url + lib = urllib
>>> # 使用urllib包下的request模块
>>> import urllib.request
>>>
>>> # 使用函数 .urlopen(),第一个参数是 string形式的地址,或者Request对象
>>> response = urllib.request.urlopen("http://www.baidu.com/")
>>> print(response)
<http.client.HTTPResponse object at 0x02927610>
>>> # 读取返回的信息
>>> # 这里返回的是二进制数据,需要解码
>>> html = response.read()
>>> html = html.decode("utf-8")
>>> print(html)
2, 下载一只猫?
我们可以访问网站 http://placekitten.com/ ,我们只需要加上 宽度和高度参数就可以得到一张量身定制的猫的图片.
如下形式 : http://placekitten.com/g/200/300 http://placekitten.com/400/300
# download_cat.py -- download a single cat picture and save it to disk.
import urllib.request

# The response object is a context manager; leaving the block closes the
# underlying connection instead of leaking it.
with urllib.request.urlopen("http://placekitten.com/g/200/300") as response:
    # read() returns the raw bytes of the image.
    cat_img = response.read()

# Binary mode ("wb") because the payload is image data, not text.
with open("cat_200_300.jpg", "wb") as f:
    f.write(cat_img)
>>> # 除了read()方法之外,还可以使用以下方法:
>>> # geturl() info() getcode()
>>>
>>> response.geturl()
'http://placekitten.com/g/200/300'
>>> response.info()
<http.client.HTTPMessage object at 0x028A6E50>
>>> print(response.info())
Date: Tue, 02 Aug 2016 08:57:00 GMT
Content-Type: image/jpeg
Content-Length: 9162
Connection: close
Set-Cookie: __cfduid=d58fa9ee9079943b9db4ce64366aa85f61470128220; expires=Wed, 02-Aug-17 08:57:00 GMT; path=/; domain=.placekitten.com; HttpOnly
Accept-Ranges: bytes
X-Powered-By: PleskLin
Access-Control-Allow-Origin: *
Cache-Control: public
Expires: Thu, 31 Dec 2020 20:00:00 GMT
Server: cloudflare-nginx
CF-RAY: 2cc051e22cad22a0-LAX

>>> response.getcode()
200
>>>
3,模拟有道翻译 POST请求
如果 urllib.request.urlopen(url,data) data参数被赋值时,就会使用POST请求,并且data参数是基于 application/x-www-form-urlencoded格式,可以使用urllib.parse.urlencode()处理data
# Simulate the Youdao web translator by sending a POST request.
import json
import urllib.parse
import urllib.request

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null'

content = input('请输入要翻译的内容:')

# Form fields sent with the request; 'i' carries the text to translate.
form_fields = {'type': 'AUTO', 'i': content, 'doctype': 'json', 'xmlVersion': '1.8',
               'keyfrom': 'fanyi.web', 'ue': 'UTF-8', 'action': 'FY_BY_CLICKBUTTON',
               'typoResult': 'true'}

# urlencode() builds the application/x-www-form-urlencoded string;
# encode() turns that str into the bytes urlopen() requires for a POST body.
data = urllib.parse.urlencode(form_fields).encode('utf-8')

# Passing data makes urlopen() issue a POST; the with-block closes the
# connection once the body has been read.
with urllib.request.urlopen(url, data) as response:
    # decode() turns the received bytes back into str.
    html = response.read().decode("utf-8")

# The body is a JSON document.
target = json.loads(html)
print("翻译结果: %s" % (target['translateResult'][0][0]['tgt']))
Python编码问题的解决方案总结 http://bbs.fishc.com/thread-56452-1-1.html
4,headers 设置
headers是一个字典形式的数据,有两种设置方式:第一种是在创建 Request 对象时,通过
urllib.request.Request(url,data,headers) 的第三个参数设置(注意 urlopen() 本身不接受 headers 参数,它的第三个参数是 timeout);第二种方式是生成Request对象之后,调用 add_header(key,value) 添加
服务器一般是通过请求头中的 User-Agent 来检查是程序访问还是浏览器访问,例如:
-
User-Agent:Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36
# Option 1: hand the headers dict to Request() as its third argument.
# `url` and `data` are the values prepared for the translation request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
}
req = urllib.request.Request(url, data, header)
response = urllib.request.urlopen(req)
>>> req.headers
{'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'}
>>>
# Option 2: create the Request first, then attach the header with add_header().
# `url` and `data` are the values prepared for the translation request.
req = urllib.request.Request(url, data)
req.add_header(
    'User-Agent',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
)
time 模块的 sleep(5) , 表示睡5秒钟
5,代理
步骤:
1,参数是一个字典{'类型':'代理IP:端口port'}
proxy_support = urllib.request.ProxyHandler({})
2,定制、创建一个opener
opener = urllib.request.build_opener(proxy_support )
3a. 安装opener
urllib.request.install_opener(opener)
3b.调用opener
opener.open(url)
在网上搜索 代理IP,可以搜索到很多免费的代理IP 比如: http://www.xicidaili.com/
# Fetch a "what is my IP" page through a randomly chosen HTTP proxy.
import random
import urllib.request

# This address answers with the IP the request appears to come from.
url = 'http://www.whatismyip.com.tw'

iplist = ['183.129.178.14:8080', '123.57.190.51:7777', '101.231.250.102:80']

# Step 1: the handler takes a dict of {'scheme': 'proxy_ip:port'}.
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})

# Step 2: build an opener around that handler and give it a browser-like
# User-Agent header.
opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')]

# Step 3: install it so that plain urlopen() calls go through it.
urllib.request.install_opener(opener)

# The with-block closes the connection once the body has been read.
with urllib.request.urlopen(url) as response:
    html = response.read().decode("utf-8")

print(html)
小应用,下载煎蛋网的妹子图 http://jandan.net/
"""Download the pictures posted on the jandan.net "ooxx" board.

Every request carries a browser-like User-Agent header, and http:// requests
are routed through a proxy picked at random from PROXY_LIST.
"""
import os
import random
import re
import urllib.parse
import urllib.request

# Candidate HTTP proxies; one is chosen at random for every request.
PROXY_LIST = ['121.193.143.249:80', '119.6.136.122:80', '101.231.250.102:80']

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'

# Captures the address of <img src="....jpg">.  [^"] keeps the match inside
# one attribute value, so a non-jpg image can never swallow the next tag.
IMG_SRC_PATTERN = re.compile(r'img src="([^"]+?\.jpg)')


def open_url(url):
    """Fetch *url* and return the response body as bytes.

    An opener using a random proxy from PROXY_LIST (for the http scheme) and
    the USER_AGENT header is built and installed globally before the request
    is made.
    """
    proxy_support = urllib.request.ProxyHandler({'http': random.choice(PROXY_LIST)})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent', USER_AGENT)]
    urllib.request.install_opener(opener)
    # The with-block closes the connection as soon as the body is read.
    with urllib.request.urlopen(url) as response:
        return response.read()


def get_page(url):
    """Return the current page number shown on *url*, as a string.

    The number is read from markup of the form
    ``<span class="current-comment-page">[2081]</span>``.

    Raises:
        ValueError: if that markup is not present in the page.
    """
    html = open_url(url).decode('utf-8')
    marker = 'current-comment-page'
    marker_pos = html.find(marker)
    if marker_pos == -1:
        raise ValueError('current-comment-page marker not found in %s' % url)
    # Skip the marker itself plus the three characters '">[' that follow it.
    start = marker_pos + len(marker) + 3
    end = html.find(']', start)
    if end == -1:
        raise ValueError('unterminated page number in %s' % url)
    return html[start:end]


def find_imgs(page_url):
    """Return the addresses of the .jpg images in the comment list of *page_url*.

    Only ``<img src="....jpg">`` tags located between ``ol class="commentlist"``
    and the following ``/ol`` are considered.  Each address is resolved
    against *page_url*, so relative and protocol-relative addresses become
    directly downloadable.  An empty list is returned when the page has no
    comment list.
    """
    html = open_url(page_url).decode('utf-8')
    list_start = html.find('ol class="commentlist"')
    if list_start == -1:
        return []
    list_end = html.find('/ol', list_start)
    if list_end == -1:
        list_end = len(html)
    return [urllib.parse.urljoin(page_url, src)
            for src in IMG_SRC_PATTERN.findall(html, list_start, list_end)]


def save_imgs(folder, img_addrs):
    """Download every address in *img_addrs* into the current directory.

    *folder* is only printed; files are written relative to the current
    working directory, named after the last path segment of each address.
    """
    print(folder)
    for each in img_addrs:
        # The file name is whatever follows the last slash of the address.
        img_name = each.split('/')[-1]
        # Download before opening the file so that a failed request does not
        # leave an empty file behind.
        img = open_url(each)
        with open(img_name, 'wb') as f:
            f.write(img)


def download_mm(folder='ooxx', pages=10, url='http://jandan.net/ooxx/'):
    """Download the images of the newest *pages* pages into *folder*.

    *folder* is created when missing and becomes the current working
    directory.  *url* is the board address; page addresses are built as
    ``url + 'page-<n>#comments'``, counting down from the newest page.
    """
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    newest_page = int(get_page(url))
    for offset in range(pages):
        # Subtract the offset from the fixed newest page number; decrementing
        # a running counter by the offset would skip pages (N, N-1, N-3, ...).
        page_num = newest_page - offset
        if page_num < 1:
            break
        page_url = url + 'page-' + str(page_num) + '#comments'
        image_addrs = find_imgs(page_url)
        save_imgs(folder, image_addrs)
    print('--download__over--')


if __name__ == '__main__':
    download_mm()
网络异常 URLError HTTPError , HTTPError 是 URLError的子类