一、爬取网站大学排名Top500
1、选择要爬取目标网站
2、分析网站所需数据的源码
3、编写python爬虫程序
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup # 网页解析,获取数据
import re # 正则表达式,进行文字匹配`
import urllib.request, urllib.error # 制定URL,获取网页数据
import xlwt # 进行excel操作
class SchoolSorting:
    """Scrape a university ranking page and export it to an ``.xls`` workbook.

    The pipeline is driven by :meth:`run`: download the page, extract the
    school names and the numeric cells with regular expressions, interleave
    them into flat ``rank, name, score`` triples and write those to a sheet.

    Attributes set by the constructor:
        baseUrl: URL of the ranking page.
        savePath: Path of the workbook written by :meth:`saveData`.
        findName: Pattern matching a ``<td>`` that holds only CJK characters.
        findScore: Pattern matching a ``<td>`` that holds an integer/decimal.
        html: Raw page text ("" until downloaded or when the download fails).
        nameLink: Every ``findName`` match, in document order.
        scoreLink: Every ``findScore`` match, in document order.
        dataList: Flat list ``[rank, name, score, rank, name, score, ...]``.
    """

    # Number of leading ``findName`` matches that are skipped before names are
    # paired with numbers.
    # NOTE(review): presumably these are the three header cells of the table,
    # which also consist only of CJK characters - confirm against the page.
    HEADER_NAME_CELLS = 3

    def __init__(self, baseUrl, savePath):
        self.baseUrl = baseUrl
        self.savePath = savePath
        self.findName = re.compile(r'<td>([\u4e00-\u9fa5]+)</td>')
        self.findScore = re.compile(r'<td>(\d+\.{0,1}\d*)</td>')
        self.html = ""  # raw page text
        self.nameLink = []  # school names
        self.scoreLink = []  # ranks and scores, alternating
        self.dataList = []

    def run(self) -> None:
        """Execute the whole pipeline: download, parse, merge, export."""
        self.askURL()
        self.getData()
        self.getListData()
        self.saveData()

    def askURL(self) -> None:
        """Download ``baseUrl`` and store the decoded text in ``self.html``.

        On ``urllib.error.URLError`` the error code/reason is printed and
        ``self.html`` is set to the empty string.
        """
        # Browser-like User-Agent header sent with the request.
        head = {
            "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36"
        }
        request = urllib.request.Request(self.baseUrl, headers=head)
        html = ""
        try:
            # The context manager closes the connection even if read/decode fails.
            with urllib.request.urlopen(request) as response:
                html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        self.html = html

    def getData(self) -> None:
        """Fill ``nameLink`` and ``scoreLink`` from the ``div.u-body`` element.

        When the element is missing, ``str(None)`` is searched instead, which
        yields two empty lists.
        """
        soup = BeautifulSoup(self.html, "html.parser")
        item = str(soup.find('div', class_="u-body"))
        self.nameLink = self.findName.findall(item)
        self.scoreLink = self.findScore.findall(item)
        print(self.nameLink)
        print(self.scoreLink)

    def getListData(self) -> None:
        """Rebuild ``dataList`` as flat ``rank, name, score`` triples.

        Numbers are consumed in pairs (rank, then score) and matched with the
        names that follow the skipped header cells. ``zip`` stops at the
        shortest input, so only complete rows are produced, and the list is
        rebuilt from scratch so repeated calls do not duplicate rows.
        """
        names = self.nameLink[self.HEADER_NAME_CELLS:]
        ranks = self.scoreLink[0::2]
        scores = self.scoreLink[1::2]
        self.dataList = [
            cell for row in zip(ranks, names, scores) for cell in row
        ]
        print(self.dataList)

    def saveData(self) -> None:
        """Write the header row and every value of ``dataList`` to ``savePath``."""
        book = xlwt.Workbook(encoding="utf-8", style_compression=0)
        sheet = book.add_sheet('中国学校排名', cell_overwrite_ok=True)
        col = ("排名", "学校名称", "总分")
        colLen = len(col)
        for column, title in enumerate(col):
            sheet.write(0, column, title)
        # Row 0 holds the header, so data cell k goes to row 1 + k // colLen.
        # Every element is written, including the last one.
        for index, value in enumerate(self.dataList):
            row, column = divmod(index, colLen)
            sheet.write(row + 1, column, value)
        book.save(self.savePath)
if __name__ == '__main__':
    # Script entry point: scrape the ranking page and export the result to an
    # .xls workbook in the current working directory.
    savePath = "中国学校排名.xls"
    baseUrl = "https://www.eol.cn/e_html/gk/dxpm/index.shtml"
    schoolSorting = SchoolSorting(savePath=savePath, baseUrl=baseUrl)
    schoolSorting.run()
4、查看结果Excel表格文件
二、爬取58同城的房价信息
1、选择要爬取目标网站
北京二手房网,北京房产网,北京二手房买卖出售交易信息-北京58同城
2、分析网站所需数据的源码
3、编写python爬虫程序
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup # 网页解析,获取数据
import re # 正则表达式,进行文字匹配`
import urllib.request, urllib.error # 制定URL,获取网页数据
import xlwt # 进行excel操作
class Home:
    """Scrape second-hand housing names and prices into an ``.xls`` workbook.

    Attributes set by the constructor:
        baseUrl: URL of the listing page.
        savePath: Path of the workbook written by :meth:`saveData`.
        html: Raw page text ("" until downloaded or when the download fails).
        findTitle: Pattern extracting the text of a community-name ``<p>``.
        findPrice: Pattern extracting (amount, unit) from a total-price ``<p>``.
        titleList: Extracted listing names, in document order.
        priceList: Extracted prices (amount + unit), in document order.

    NOTE(review): both patterns hard-code the ``data-v-196d9aea`` attribute;
    elements whose markup differs are skipped by :meth:`getData`.
    """

    def __init__(self, baseUrl, savePath):
        self.baseUrl = baseUrl
        self.savePath = savePath
        self.html = ""
        self.findTitle = re.compile(r'<p class="property-content-info-comm-name" data-v-196d9aea="">(.+)</p>')
        self.findPrice = re.compile(r'<p class="property-price-total" data-v-196d9aea=""><span '
                                    r'class="property-price-total-num" data-v-196d9aea="">(.+)</span> <span '
                                    r'class="property-price-total-text" data-v-196d9aea="">(.+)</span></p>')
        self.titleList = []  # listing names
        self.priceList = []  # prices

    def run(self) -> None:
        """Execute the whole pipeline: download, parse, export."""
        self.askURL()
        self.getData()
        self.saveData()

    def askURL(self) -> None:
        """Download ``baseUrl`` and store the decoded text in ``self.html``.

        On ``urllib.error.URLError`` the error code/reason is printed and
        ``self.html`` is set to the empty string.
        """
        # Browser-like User-Agent header sent with the request.
        head = {
            "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36"
        }
        request = urllib.request.Request(self.baseUrl, headers=head)
        html = ""
        try:
            # The context manager closes the connection even if read/decode fails.
            with urllib.request.urlopen(request) as response:
                html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        self.html = html

    def getData(self) -> None:
        """Rebuild ``titleList`` and ``priceList`` from ``self.html``.

        Each candidate element is matched on its own, so an element that does
        not fit the pattern is reported and skipped instead of silently
        aborting the extraction of everything after it.
        """
        soup = BeautifulSoup(self.html, "html.parser")

        titles = []
        for item in soup.find_all('p', class_="property-content-info-comm-name"):
            match = self.findTitle.search(str(item))
            if match is None:
                print("skipped a name element that does not match findTitle")
                continue
            titles.append(match.group(1))

        prices = []
        for item in soup.find_all('p', class_="property-price-total"):
            match = self.findPrice.search(str(item))
            if match is None:
                print("skipped a price element that does not match findPrice")
                continue
            # Amount and unit are stored as one string.
            prices.append(match.group(1) + match.group(2))

        self.titleList = titles
        self.priceList = prices
        print(self.titleList)
        print(self.priceList)

    def saveData(self) -> None:
        """Write the header row plus all names and prices to ``savePath``.

        Item ``k`` of each list is written to sheet row ``k + 1`` (row 0 is the
        header), so the first listing is exported as well. The two columns
        are written independently, so lists of different length are both
        exported in full.
        """
        book = xlwt.Workbook(encoding="utf-8", style_compression=0)
        sheet = book.add_sheet('二手房信息', cell_overwrite_ok=True)
        col = ("名称", "价格")
        for column, title in enumerate(col):
            sheet.write(0, column, title)
        for row, name in enumerate(self.titleList, start=1):
            sheet.write(row, 0, name)
        for row, price in enumerate(self.priceList, start=1):
            sheet.write(row, 1, price)
        book.save(self.savePath)
if __name__ == '__main__':
    # Script entry point: scrape the first listing page and export it to an
    # .xls workbook in the current working directory.
    savePath = "二手房信息.xls"
    baseUrl = "https://bj.58.com/ershoufang/p1/"
    home = Home(savePath=savePath, baseUrl=baseUrl)
    home.run()
4、查看结果Excel表格文件
三、爬取小说《红楼梦》
1、爬取《红楼梦》目录
<1> 选择要爬取目标网站
红楼梦_红楼梦小说全集_红楼梦小说在线阅读_古典文学网 (gdwxcn.com)
<2> 分析网站所需数据的源码
<3> 编写python爬虫程序
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup # 网页解析,获取数据
import re # 正则表达式,进行文字匹配`
import urllib.request, urllib.error # 制定URL,获取网页数据
import xlwt # 进行excel操作
class Book:
    """Scrape a novel's table of contents into an ``.xls`` workbook.

    Attributes set by the constructor:
        baseUrl: URL of the table-of-contents page.
        savePath: Path of the workbook written by :meth:`saveData`.
        html: Raw page text ("" until downloaded or when the download fails).
        findData: Pattern extracting ``(url, name)`` from a chapter ``<li>``.
        resultList: List of ``(url, name)`` tuples, in document order.
    """

    def __init__(self, baseUrl, savePath):
        self.baseUrl = baseUrl
        self.savePath = savePath
        self.html = ""
        # Non-greedy groups: several <li> entries on one line must produce one
        # match each instead of a single match spanning all of them.
        self.findData = re.compile(r'<li><a href="(.+?)" target="_blank">(.+?)</a></li>')
        self.resultList = []

    def run(self) -> None:
        """Execute the whole pipeline: download, parse, export."""
        self.askURL()
        self.getData()
        self.saveData()

    def askURL(self) -> None:
        """Download ``baseUrl`` (decoded as gb18030) into ``self.html``.

        On ``urllib.error.URLError`` the error code/reason is printed and
        ``self.html`` is set to the empty string.
        """
        # Browser-like User-Agent header sent with the request.
        head = {
            "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36"
        }
        request = urllib.request.Request(self.baseUrl, headers=head)
        html = ""
        try:
            # The context manager closes the connection even if read/decode fails.
            with urllib.request.urlopen(request) as response:
                html = response.read().decode("gb18030")
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        self.html = html

    def getData(self) -> None:
        """Fill ``resultList`` with the links found inside ``div.zhangjie``.

        When the element is missing, ``str(None)`` is searched instead, which
        yields an empty list.
        """
        soup = BeautifulSoup(self.html, "html.parser")
        zhangjie = str(soup.find('div', class_="zhangjie"))
        self.resultList = self.findData.findall(zhangjie)
        print(self.resultList)

    def saveData(self) -> None:
        """Write the header row and every ``(name, url)`` pair to ``savePath``.

        Entry ``k`` of ``resultList`` goes to sheet row ``k + 1`` (row 0 is the
        header), so the first and the last chapter are exported too.
        """
        book = xlwt.Workbook(encoding="utf-8", style_compression=0)
        sheet = book.add_sheet('红楼梦目录', cell_overwrite_ok=True)
        col = ("名称", "url")
        for column, title in enumerate(col):
            sheet.write(0, column, title)
        for row, (url, name) in enumerate(self.resultList, start=1):
            sheet.write(row, 0, name)
            sheet.write(row, 1, url)
        book.save(self.savePath)
if __name__ == '__main__':
    # Script entry point: scrape the table of contents and export it to an
    # .xls workbook in the current working directory.
    savePath = "红楼梦目录.xls"
    baseUrl = "https://www.gdwxcn.com/gdxs/hlm/"
    book = Book(savePath=savePath, baseUrl=baseUrl)
    book.run()
<4>查看结果Excel表格文件
2、爬取文章
<1> 查看url规律:
<2> 分析html 源码
<3>编写python爬虫代码:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup # 网页解析,获取数据
import re # 正则表达式,进行文字匹配`
import urllib.request, urllib.error # 制定URL,获取网页数据
import xlwt # 进行excel操作
class BookContent:
    """Download a range of chapters and save each one as a UTF-8 text file.

    Attributes set by the constructor:
        baseUrl: URL prefix; a page id and ``".html"`` are appended per chapter.
        savePath: String prefix of every output file (e.g. ``"temp/"``).
        chapter: Dict with the keys ``"startNamber"`` (page id of chapter 1),
            ``"start"`` and ``"end"`` (first and last chapter number).
        findData: Pattern extracting the content of each ``<p>`` paragraph.
        findTitle: Pattern extracting the content of the ``<title>`` element.
    """

    def __init__(self, baseUrl, savePath, chapter):
        self.baseUrl = baseUrl
        self.savePath = savePath
        self.chapter = chapter
        # Non-greedy: several paragraphs on one line must stay separate.
        self.findData = re.compile(r'<p>(.+?)</p>')
        self.findTitle = re.compile(r'<title>(.+)</title>')

    def _chapterIds(self):
        """Return the page ids for chapters ``start``..``end``, in chapter order.

        Page ids count down from ``startNamber``: chapter ``k`` is fetched from
        ``startNamber - (k - 1)``. An empty range results when ``end < start``.
        NOTE(review): the descending-id scheme is taken from the original loop;
        confirm it against the site's URLs.
        """
        base = self.chapter['startNamber']
        first = base - (self.chapter['start'] - 1)
        last = base - (self.chapter['end'] - 1)
        return range(first, last - 1, -1)

    def run(self) -> None:
        """Download, parse and save every requested chapter.

        A chapter whose download produced no text is reported and skipped.
        """
        for pageId in self._chapterIds():
            url = self.baseUrl + str(pageId) + ".html"
            print(url)
            html = self.askURL(url)
            if not html:
                print("skipped: nothing was downloaded from " + url)
                continue
            textData = self.getData(html)
            self.saveData(textData)

    def askURL(self, url):
        """Return the text of ``url`` decoded as gb18030, or "" on failure.

        On ``urllib.error.URLError`` the error code/reason is printed.
        """
        # Browser-like User-Agent header sent with the request.
        head = {
            "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36"
        }
        html = ''
        request = urllib.request.Request(url, headers=head)
        try:
            # The context manager closes the connection even if read/decode fails.
            with urllib.request.urlopen(request) as response:
                html = response.read().decode("gb18030")
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html

    def getData(self, html):
        """Extract the page title and the paragraphs inside ``div.xstext``.

        Returns:
            ``{"title": str, "contentList": list[str]}``.

        Raises:
            IndexError: if the page has no ``<title>`` content to use as the
                chapter name.
        """
        soup = BeautifulSoup(html, "html.parser")
        titles = self.findTitle.findall(str(soup.title))
        if not titles:
            raise IndexError("page has no <title> content to name the chapter")
        title = titles[0]
        print(title)
        xstext = str(soup.find('div', class_="xstext"))
        return {
            "title": title,
            "contentList": self.findData.findall(xstext),
        }

    def saveData(self, textData) -> None:
        """Write the title and all paragraphs to ``savePath + title + ".txt"``.

        The file is written as UTF-8 in a single pass so that no line is lost
        to a platform-default encoding, and a missing target directory is
        created first.
        """
        import os

        path = self.savePath + textData["title"] + ".txt"
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            f.write(textData["title"] + '\n')
            f.writelines(item + '\n' for item in textData["contentList"])
if __name__ == '__main__':
    # Script entry point: read the first and last chapter number from stdin,
    # then download that range into text files under "temp/".
    savePath = "temp/"
    baseUrl = "https://www.gdwxcn.com/gdxs/hlm/"
    start = int(input("请输入要爬取全文的起始章节:"))
    end = int(input("请输入要爬取全文的末尾章节:"))
    chapter = dict(startNamber=2164, start=start, end=end)
    bookContent = BookContent(chapter=chapter, savePath=savePath, baseUrl=baseUrl)
    bookContent.run()