The requests module
Proxies (parameter name: proxies)
Free proxy lists: 西刺代理 (Xici), 快代理 (Kuaidaili), 全网代理 (Quanwang)
Elite (high-anonymity) IP: the target server cannot see your real IP
Transparent IP: the target server sees both the proxy IP and your real IP
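A quick way to see the difference is to ask httpbin.org what IP it reports. A minimal sketch (the proxy address is a sample from the lists above and will almost certainly be stale):
'''check_anonymity.py'''
import requests

headers = {"User-Agent": "Mozilla/5.0"}
# Sample open proxy; replace with a live one
proxies = {"http": "http://183.129.207.82:11597"}

# Without a proxy, "origin" is your real IP
print(requests.get("http://httpbin.org/get", headers=headers).json()["origin"])
# Through an elite proxy only the proxy IP appears; a transparent proxy
# also leaks the real IP (typically via an X-Forwarded-For header)
print(requests.get("http://httpbin.org/get", headers=headers,
                   proxies=proxies, timeout=5).json()["origin"])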
Basic proxy
proxies = {"protocol": "protocol://IP:port"}
'''01_普通代理示例.py'''
import requests

url = "http://www.baidu.com/"
# A basic (open) proxy: no username or password required
proxies = {"http": "http://183.129.207.82:11597"}
headers = {"User-Agent": "Mozilla/5.0"}
res = requests.get(url, proxies=proxies, headers=headers)
print(res.status_code)
Private (authenticated) proxy
proxies = {"protocol": "protocol://username:password@IP:port"}
'''02_私密代理示例.py'''
import requests

url = "http://httpbin.org/get"
headers = {"User-Agent": "Mozilla/5.0"}
# Credentials are embedded in the proxy URL: username:password@IP:port
proxies = {"http": "http://309435365:szayclhp@123.206.119.108:16817"}
res = requests.get(url, proxies=proxies, headers=headers)
res.encoding = "utf-8"
print(res.text)
Scrape Lianjia second-hand housing listings --> store them in MySQL
'''05_链家数据ToMySQL.py'''
import requests
import re
import pymysql
import warnings

class LianjiaSpider:
    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.page = 1
        self.headers = {"User-Agent": "Mozilla/5.0"}
        # Local debugging proxy (e.g. Fiddler); replace or drop as needed
        self.proxies = {"http": "http://127.0.0.1:8888"}
        self.db = pymysql.connect(host="localhost", user="root",
                                  password="ParisPython", charset="utf8")
        self.cursor = self.db.cursor()

    def getPage(self, url):
        res = requests.get(url, proxies=self.proxies,
                           headers=self.headers, timeout=5)
        res.encoding = "utf-8"
        html = res.text
        print("Page fetched, parsing...")
        self.parsePage(html)

    def parsePage(self, html):
        p = re.compile('<div class="houseInfo".*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>(.*?)</div>', re.S)
        r_list = p.findall(html)
        # e.g. [("天通苑", "480", "万"), ...]
        print("Page parsed, saving to database...")
        self.writeTomysql(r_list)

    def writeTomysql(self, r_list):
        c_db = "create database if not exists Lianjiadb character set utf8"
        u_db = "use Lianjiadb"
        c_tab = "create table if not exists housePrice( \
                 id int primary key auto_increment, \
                 housename varchar(50), \
                 totalprice int)charset=utf8"
        # Silence "already exists" warnings raised by MySQL
        warnings.filterwarnings("ignore")
        try:
            self.cursor.execute(c_db)
            self.cursor.execute(u_db)
            self.cursor.execute(c_tab)
        except Warning:
            pass
        ins = "insert into housePrice(housename,totalprice) values(%s,%s)"
        for r_tuple in r_list:
            name = r_tuple[0].strip()
            # Listed price is in units of 10,000 CNY (万)
            price = float(r_tuple[1].strip()) * 10000
            self.cursor.execute(ins, [name, price])
            self.db.commit()
        print("Saved to database")

    def workOn(self):
        while True:
            c = input("Press y to crawl (q to quit): ")
            if c.strip().lower() == "y":
                url = self.baseurl + str(self.page) + "/"
                self.getPage(url)
                self.page += 1
            else:
                self.cursor.close()
                self.db.close()
                print("Crawl finished, thanks!")
                break

if __name__ == "__main__":
    spider = LianjiaSpider()
    spider.workOn()
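To confirm the rows landed, a small hypothetical check script (database name, table, and credentials as defined above):
'''check_housePrice.py'''
import pymysql

db = pymysql.connect(host="localhost", user="root",
                     password="ParisPython", database="Lianjiadb",
                     charset="utf8")
cursor = db.cursor()
# Peek at the first few stored rows
cursor.execute("select housename,totalprice from housePrice limit 5")
for row in cursor.fetchall():
    print(row)
cursor.close()
db.close()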
Finding the URL
https://bj.lianjia.com/ershoufang/pg1/
The regex
<div class="houseInfo".*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>(.*?)</div>
Web client authentication (parameter name: auth)
auth = ("username", "password")
Example: 09_Web客户端验证.py
'''09_Web客户端验证.py'''
import requests
import re

class NoteSpider:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.url = "网址"  # placeholder: the protected page's URL goes here
        self.proxies = {"http": "http://309435365:szayclhp@123.206.119.108:16817"}
        # The auth parameter holds the username and password (must be a tuple)
        self.auth = ("账号", "密码")

    def getParsePage(self):
        res = requests.get(self.url,
                           proxies=self.proxies,
                           headers=self.headers,
                           auth=self.auth,
                           timeout=3)
        res.encoding = "utf-8"
        html = res.text
        print(html)
        p = re.compile('<a href=".*?>(.*?)</a>', re.S)
        r_list = p.findall(html)
        print(r_list)
        self.writePage(r_list)

    def writePage(self, r_list):
        print("Writing to file...")
        with open("达内科技.txt", "a", encoding="utf-8") as f:
            for r_str in r_list:
                f.write(r_str + "\n\n")
        print("Write finished")

if __name__ == "__main__":
    spider = NoteSpider()
    spider.getParsePage()
SSL certificate verification (parameter name: verify)
verify = True : the default; verify the SSL certificate
verify = False: skip certificate verification
'''10_SSL证书认证示例.py'''
import requests

url = "https://www.12306.cn/mormhweb/"
headers = {"User-Agent": "Mozilla/5.0"}
# verify=False skips SSL certificate verification
res = requests.get(url, headers=headers, verify=False)
res.encoding = "utf-8"
print(res.text)
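With verify=False, urllib3 emits an InsecureRequestWarning on every request; it can be silenced explicitly:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)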
Handler objects in urllib.request
Definition
urlopen() is a special opener that the module builds for you; it cannot handle
proxies and similar features, so Handler objects are used to build a custom opener.
Common methods
build_opener(Handler object): create an opener object
opener.open(url, ...)
import urllib.request

url = "http://www.baidu.com/"  # example URL
# Create the Handler object
http_handler = urllib.request.HTTPHandler()
# proxy_handler = urllib.request.ProxyHandler()
# Create the custom opener object
opener = urllib.request.build_opener(http_handler)
# Send the request through the opener's open() method
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))
Handler classes
HTTPHandler(): plain HTTP, no special features
ProxyHandler(basic proxy): proxy dict {"protocol": "IP:port"}; see the sketch after this list
ProxyBasicAuthHandler(password manager object): private (authenticated) proxy
HTTPBasicAuthHandler(password manager object): web client authentication
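A runnable sketch of the basic ProxyHandler flow (the proxy IP reuses the stale sample from the requests examples above):
'''ProxyHandler示例.py'''
import urllib.request

url = "http://www.baidu.com/"
# Route all http traffic through the (sample) open proxy
proxy_handler = urllib.request.ProxyHandler(
    {"http": "http://183.129.207.82:11597"})
opener = urllib.request.build_opener(proxy_handler)
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
res = opener.open(req)
print(res.read().decode("utf-8"))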
What the password manager object is used for
Private proxies
Web client authentication
Workflow
Create the password manager object
pwdmg = urllib.request.HTTPPasswordMgrWithDefaultRealm()
Add the credentials to the password manager
pwdmg.add_password(None, webserver, user, passwd)
Create the Handler object
For a private proxy:
proxy = urllib.request.ProxyBasicAuthHandler(pwdmg)
For web client authentication:
webbasic = urllib.request.HTTPBasicAuthHandler(pwdmg)
# Build the custom opener from the auth handler
opener = urllib.request.build_opener(proxy)   # or build_opener(webbasic)
# Send the request via the opener's open() method
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))
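Pieced together, the whole private-proxy flow looks like this sketch (reusing the sample credentials from the requests examples above, which are almost certainly expired):
'''私密代理Handler示例.py'''
import urllib.request

url = "http://httpbin.org/get"
proxyserver = "123.206.119.108:16817"
# 1. Password manager holding the proxy credentials
pwdmg = urllib.request.HTTPPasswordMgrWithDefaultRealm()
pwdmg.add_password(None, proxyserver, "309435365", "szayclhp")
# 2. Auth handler that answers the proxy's 407 challenge
proxy_auth = urllib.request.ProxyBasicAuthHandler(pwdmg)
# 3. ProxyHandler that actually routes traffic through the proxy
proxy = urllib.request.ProxyHandler({"http": "http://" + proxyserver})
# 4. Build the opener from both handlers and send the request
opener = urllib.request.build_opener(proxy, proxy_auth)
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
res = opener.open(req)
print(res.read().decode("utf-8"))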
Scrape the Maoyan top-100 movie chart into MongoDB
'''06_猫眼电影top100抓取.py'''
import requests
import re
import pymongo

class MaoyanSpider:
    def __init__(self):
        self.baseurl = "http://maoyan.com/board/4?offset="
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.page = 1
        self.offset = 0
        self.proxies = {"http": "http://309435365:szayclhp@123.206.119.108:16817"}
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.Film
        self.myset = self.db.top100

    # Fetch the page
    def loadPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    # Parse the page
    def parsePage(self, html):
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
        r_list = p.findall(html)
        # e.g. [("霸王别姬", "张国荣", "1994-01-01"), ...]
        self.writeTomongo(r_list)

    # Store into MongoDB
    def writeTomongo(self, r_list):
        for r_tuple in r_list:
            D = {"name": r_tuple[0].strip(),
                 "star": r_tuple[1].strip(),
                 "releasetime": r_tuple[2].strip()}
            # insert_one() replaces the old insert(), which pymongo removed
            self.myset.insert_one(D)
        print("Saved to database")

    def workOn(self):
        while True:
            c = input("Press y to crawl (y/n): ")
            if c.strip().lower() == "y":
                # 10 films per page, so the offset advances by 10
                self.offset = (self.page - 1) * 10
                url = self.baseurl + str(self.offset)
                self.loadPage(url)
                self.page += 1
            else:
                print("Crawl finished, thanks!")
                break

if __name__ == "__main__":
    spider = MaoyanSpider()
    spider.workOn()
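A hypothetical read-back to verify the documents were stored (connection details as above):
'''check_top100.py'''
import pymongo

conn = pymongo.MongoClient("localhost", 27017)
myset = conn.Film.top100
# Print the first few stored documents
for doc in myset.find().limit(5):
    print(doc["name"], doc["star"], doc["releasetime"])
conn.close()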