爬虫笔记《一》

文章目录

爬虫笔记

一,urllib库的使用

​ urllib库包含多个功能的子模块:

urllib.request: 实现基本HTTP请求的模块

urllib.error:异常处理模块

urllib.parse:解析url模块

​ python/demo

import urllib
from urllib import parse, request, error
from urllib.request import urlopen

url = "https://www.baidu.com"  # Site whose page source we want to fetch.

# Open the URL inside a with-block so the underlying connection is closed
# even if reading or decoding raises.
with urlopen(url=url) as response:
    # read() returns raw bytes; decode them to get the page source as text.
    print(response.read().decode('utf-8'))

HTTPResponse 对象的常用属性和方法:

import urllib
from urllib import parse, request, error
from urllib.request import urlopen

url = "http://www.baidu.com"

# The with-block closes the connection once we are done inspecting the
# response, including when one of the calls below raises.
with urlopen(url=url) as response:
    print(response.read().decode('utf-8'))  # Page source, decoded from bytes.
    print(response.status)  # HTTP status code.
    print(response.getheaders())  # All response headers.
    print(response.getheader('Accept-Ranges'))  # A single header, looked up by name.

urlopen 默认发送 GET 请求;如需发送 POST 请求,需要传入 data 参数,即 urlopen(url=url, data=data)。代码如下:

import urllib
from urllib import parse, request, error
from urllib.request import urlopen

url = "https://httpbin.org/post"

# urlencode() builds the form body as text; urlopen() requires bytes, so the
# text is encoded as UTF-8 before sending.
data = parse.urlencode({'hello': 'python'}).encode('utf-8')

# Supplying data makes urlopen() send a POST request instead of a GET.
# The with-block closes the connection when the body has been read.
with urlopen(url=url, data=data) as response:
    print(response.read().decode('utf-8'))

伪装请求头信息:上代码

import urllib
from urllib import parse, request, error
from urllib.request import urlopen, Request

url = "https://httpbin.org/post"

# Browser-like User-Agent sent in place of urllib's default one.
headers = {
    'User-agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}

# Form body: URL-encoded text converted to UTF-8 bytes.
data = parse.urlencode({'hello': 'python'}).encode('utf-8')

# A Request object bundles URL, body, headers and HTTP method, which plain
# urlopen(url, data) cannot express.
r = Request(url=url, data=data, headers=headers, method='POST')

# The with-block closes the connection when the body has been read.
with urlopen(r) as response:
    print(response.read().decode('utf-8'))

模拟登录过程中获取cookie

import urllib
from urllib import parse, request, error
from urllib.request import urlopen, Request
import http.cookiejar
import json

url = "http://site2.rjkflm.com:666/index/index/chklogin.html"

# Login form fields, URL-encoded and converted to UTF-8 bytes.
data = parse.urlencode(
    {'username': 'mrsoft', 'password': 'mrsoft'}).encode('utf-8')

# urlopen() cannot be given extra handlers (cookies, proxies, ...), so the
# request is sent through a custom opener that has a cookie handler installed.
cookie = http.cookiejar.CookieJar()  # In-memory store for received cookies.
cookie_processor = request.HTTPCookieProcessor(cookie)  # Handler that fills the jar.
opener = request.build_opener(cookie_processor)

# Send the login request; the with-block closes the connection afterwards.
# The response body is parsed as JSON and its 'msg' field is read.
with opener.open(url, data=data) as response:
    msg = json.loads(response.read().decode('utf-8'))['msg']

if msg == '登录成功!':
    # Print every cookie collected during the login as name=value.
    # The f-string also copes with a cookie whose value is None.
    for item in cookie:
        print(f"{item.name}={item.value}")

将cookie信息保存为LWP格式的文件

import urllib
from urllib import parse, request, error
from urllib.request import urlopen, Request
import http.cookiejar
import http
import json

url = "http://site2.rjkflm.com:666/index/index/chklogin.html"

# Browser-like User-Agent; attached to the login request below.
headers = {
    'User-agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}

# Login form fields, URL-encoded and converted to UTF-8 bytes.
data = parse.urlencode(
    {'username': 'zero', 'password': "123456"}).encode('utf-8')

cookie_file = "cookie.txt"  # File the cookies are written to.
cookie = http.cookiejar.LWPCookieJar(cookie_file)  # Cookie jar backed by an LWP-format file.
cookie_processor = request.HTTPCookieProcessor(cookie)  # Handler that fills the jar.
opener = request.build_opener(cookie_processor)

# Bundle URL, form body and headers so the User-Agent above is actually sent.
# Because data is supplied, the request goes out as a POST.
login_request = Request(url=url, data=data, headers=headers)

# The with-block closes the connection once the JSON body has been read.
with opener.open(login_request) as response:
    msg = json.loads(response.read().decode('utf-8'))['msg']

if msg == '登录成功!':
    # ignore_discard also saves cookies marked to be discarded;
    # ignore_expires also saves cookies that have already expired.
    cookie.save(ignore_discard=True, ignore_expires=True)

使用cookie

import urllib
from urllib import parse, request, error
from urllib.request import urlopen, Request
import http.cookiejar
import http
import json


url = "http://site2.rjkflm.com:666/index/index/index.html"
cookie_file = "cookie.txt"  # LWP-format cookie file to read.

cookie = http.cookiejar.LWPCookieJar()
# Load the stored cookies, including ones marked to be discarded or already
# expired, mirroring the flags used when the file was written.
cookie.load(cookie_file, ignore_discard=True, ignore_expires=True)
handler = request.HTTPCookieProcessor(cookie)  # Handler that sends the loaded cookies.
opener = request.build_opener(handler)

# Request the page with the stored cookies attached; the with-block closes the
# connection once the body has been read.
with opener.open(url) as response:
    # 'ignore' is the decode error handler: bytes that are not valid GBK are
    # silently dropped instead of raising UnicodeDecodeError.
    print(response.read().decode('GBK', 'ignore'))
上一篇:使用Python做爬虫时出现中文乱码


下一篇:ICPC 2018 徐州赛区网络赛