文章目录
爬虫笔记
一,urllib库的使用
urllib库包含多个功能的子模块:
urllib.request:实现基本HTTP请求的模块
urllib.error:异常处理模块
urllib.parse:解析url模块
python/demo
import urllib
from urllib import parse, request, error
from urllib.request import urlopen
# Basic GET request: fetch a page and print its decoded source.
url = "https://www.baidu.com"  # site to fetch

# The context manager closes the HTTP response once the body has been read.
with urlopen(url=url) as response:
    print(response.read().decode('utf-8'))  # read the raw bytes and decode them as UTF-8 text
HTTPResponse的属性和方法:
import urllib
from urllib import parse, request, error
from urllib.request import urlopen
# Inspect the response object returned by urlopen(): body, status and headers.
url = "http://www.baidu.com"

# The context manager closes the HTTP response when the block exits.
with urlopen(url=url) as response:
    print(response.read().decode('utf-8'))
    print(response.status)  # HTTP status code
    print(response.getheaders())  # all response headers
    print(response.getheader('Accept-Ranges'))  # value of one specific response header
urlopen默认发送GET请求;如需发送POST请求,则需传入data参数:urlopen(url=url, data=data)
上代码:
import urllib
from urllib import parse, request, error
from urllib.request import urlopen
# POST request: passing `data` to urlopen() sends a POST instead of the default GET.
url = "https://httpbin.org/post"

# URL-encode the form fields, then encode the result as UTF-8 bytes for the request body.
data = parse.urlencode({'hello': 'python'}).encode('utf-8')

# The context manager closes the HTTP response once the body has been read.
with urlopen(url=url, data=data) as response:
    print(response.read().decode('utf-8'))
伪装请求头信息:上代码
import urllib
from urllib import parse, request, error
from urllib.request import urlopen, Request
# POST request with a browser-like User-Agent header, built through a Request object.
url = "https://httpbin.org/post"
headers = {
    'User-agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
}

# URL-encode the form fields, then encode the result as UTF-8 bytes for the request body.
data = parse.urlencode({'hello': 'python'}).encode('utf-8')

# A Request object carries the URL, body, headers and HTTP method together.
r = Request(url=url, data=data, headers=headers, method='POST')

# The context manager closes the HTTP response once the body has been read.
with urlopen(r) as response:
    print(response.read().decode('utf-8'))
模拟登录过程中获取cookie
import urllib
from urllib import parse, request, error
from urllib.request import urlopen, Request
import http.cookiejar
import json
# Simulated login: post the credentials and print the cookies the server sets.
url = "http://site2.rjkflm.com:666/index/index/chklogin.html"
# headers = {
# 'User-agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}

# Login form fields, URL-encoded and converted to UTF-8 bytes.
data = parse.urlencode(
    {'username': 'mrsoft', 'password': 'mrsoft'}).encode('utf-8')

cookie = http.cookiejar.CookieJar()  # cookie container
cookie_processor = request.HTTPCookieProcessor(cookie)  # handler that stores cookies in the jar
opener = request.build_opener(cookie_processor)  # opener that uses the cookie handler

# Send the login request; the context manager closes the response afterwards.
with opener.open(url, data=data) as response:
    # The body is parsed as JSON and its 'msg' field is kept under a separate
    # name so that `response` is not rebound to a string.
    msg = json.loads(response.read().decode('utf-8'))['msg']

# On a successful login, list every cookie collected in the jar.
if msg == '登录成功!':
    for i in cookie:
        print(i.name + '=' + i.value)
"""
urlopen使用默认的opener,不支持自定义Cookie处理器、代理等处理器(Handler),所以这里通过自定义opener对象的open方法发送请求
"""
将cookie信息保存为LWP格式的文件
import urllib
from urllib import parse, request, error
from urllib.request import urlopen, Request
import http.cookiejar
import http
import json
# Log in and persist the received cookies to a file in LWP format.
url = "http://site2.rjkflm.com:666/index/index/chklogin.html"

# Login form fields, URL-encoded and converted to UTF-8 bytes.
data = parse.urlencode(
    {'username': 'zero', 'password': "123456"}).encode('utf-8')

cookie_file = "cookie.txt"  # file the cookies are written to
cookie = http.cookiejar.LWPCookieJar(cookie_file)  # cookie jar bound to that file
cookie_processor = request.HTTPCookieProcessor(cookie)  # handler that stores cookies in the jar
opener = request.build_opener(cookie_processor)  # opener that uses the cookie handler

# Send the login request; the context manager closes the response afterwards.
with opener.open(url, data=data) as response:
    # The body is parsed as JSON and its 'msg' field is kept under a separate
    # name so that `response` is not rebound to a string.
    msg = json.loads(response.read().decode('utf-8'))['msg']

if msg == '登录成功!':
    # Write the cookies to cookie_file, including ones marked to be discarded
    # and ones that have already expired.
    cookie.save(ignore_discard=True, ignore_expires=True)
使用cookie
import urllib
from urllib import parse, request, error
from urllib.request import urlopen, Request
import http.cookiejar
import http
import json
# Reuse previously saved cookies to request a page.
url = "http://site2.rjkflm.com:666/index/index/index.html"
cookie_file = "cookie.txt"

cookie = http.cookiejar.LWPCookieJar()  # LWP-format cookie jar
# Load the cookies from the file, including ones marked to be discarded
# and ones that have already expired.
cookie.load(cookie_file, ignore_discard=True, ignore_expires=True)

handler = request.HTTPCookieProcessor(cookie)  # handler that sends the loaded cookies
opener = request.build_opener(handler)  # opener that uses the cookie handler

# Send the request; the context manager closes the response afterwards.
with opener.open(url) as response:
    # 'ignore' drops any bytes that cannot be decoded as GBK instead of raising.
    print(response.read().decode('GBK', 'ignore'))