'''
1. BeautifulSoup  parsing library
2. MongoDB        storage
3. requests-html
'''
'''
1. What is bs4?
A parsing library that wraps an underlying HTML/XML parser and provides
powerful extraction features, improving both data-extraction efficiency
and crawler-development efficiency.
'''
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Import BeautifulSoup from bs4
from bs4 import BeautifulSoup

# Instantiate a soup object
# Arg 1: the text to parse; Arg 2: the parser ('html.parser' or 'lxml')
soup = BeautifulSoup(html_doc, 'lxml')
print(soup)
print('*' * 100)
print(type(soup))

# Pretty-print the document
html_doc = soup.prettify()
print(html_doc)
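
# A minimal sketch (my own addition, not from the original notes): 'lxml' is a
# third-party parser and may not be installed; bs4 raises FeatureNotFound in
# that case, so you can fall back to Python's built-in 'html.parser'.
from bs4 import BeautifulSoup, FeatureNotFound

try:
    soup = BeautifulSoup(html_doc, 'lxml')
except FeatureNotFound:
    soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.title.text)  # The Dormouse's story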
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>lyj</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>""" from bs4 import BeautifulSoup soup = BeautifulSoup(html_doc,'lxml') ''' 1. 直接使用 2.获取标签的名称 3. 获取标签的属性 4. 获取标签的内容 5. 嵌套选择 6. 子节点、子孙节点 7. 父节点、祖先节点 8. 兄弟节点 ''' # # 直接使用 # print(soup.p) # 查找第一个p标签 # print(soup.a) # 查找第一个a标签 # # # 2.获取标签的名称 # print(soup.head.name) #获取head标签的名称 # # # 3. 获取标签的属性 # print(soup.a.attrs) # 获取a标签中的所有属性 # print(soup.a.attrs['href']) # 获取a标签中的href的属性 # # # 4. 获取标签的内容 # print(soup.p.text) # $37 # # # 5. 嵌套选择 # print(soup.html.head) # # # 6. 子节点、子孙节点 # print(soup.body.children) # body所有子节点,返回的是一个迭代器对象 # print(list(soup.body.children)) # 强转成列表类型 # print(soup.body.descendants) # print(list(soup.body.descendants)) # # 7. 父节点、祖先节点 # print(soup.p.parent) # 获取p标签的父亲节点 # print(soup.p.parents) # 获取p标签的所有祖先节点,返回的是迭代器对象 # print(list(soup.p.parents)) # # 8. 兄弟节点 # # 找下一个兄弟 # print(soup.p.next_sibling) # # 找下面所有兄弟,返回的是生成器对象 # print(soup.p.next_siblings) # print(list(soup.p.next_siblings)) # 找上一个兄弟 print(soup.a.previous_sibling) # 文本是一个节点,也是他的兄弟 # 找到a标签上面的所有兄弟节点 print(soup.a.previous_siblings) print(list(soup.a.previous_siblings)) # 返回时生成器
'''
find()      find one
find_all()  find all
'''
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')

# # String filter
# p_tag = soup.find(name='p')
# print(p_tag)  # find one tag by the name string 'p'
# p_tags = soup.find_all(name='p')
# print(p_tags)  # find all p tags

# # attrs
# # Find the first node whose class is 'sister'
# p = soup.find(attrs={'class': 'sister'})
# print(p)
# # Find all nodes whose class is 'sister'
# tag_s = soup.find_all(attrs={'class': 'sister'})
# print(tag_s)

# # text
# text = soup.find(text="$37")
# print(text)

# # Combined filters:
# # Find one a tag whose id is 'link2' and whose text is 'Lacie'
# a = soup.find(name='a', attrs={'id': 'link2'}, text='Lacie')
# print(a)

# # Regex filter
# import re
# # name
# p_tag = soup.find(name=re.compile('p'))
# print(p_tag)

# # List filter
# import re
# tags = soup.find_all(name=['p', 'a', re.compile('html')])
# print(tags)

# Bool filter
# True matches any value: find a p tag that has an id attribute
p = soup.find(name='p', attrs={'id': True})
print(p)

# # Function filter
# # Match tags named 'a' that have both an id and a class attribute
# def have_id_class(tag):
#     if tag.name == 'a' and tag.has_attr('id') and tag.has_attr('class'):
#         return tag
#
# tag = soup.find(name=have_id_class)
# print(tag)
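
# A minimal sketch (my own addition): bs4 also supports CSS selectors via
# select() / select_one(), which cover many of the same cases as find()/find_all().
print(soup.select_one('p#p'))            # the p tag whose id is "p"
print(soup.select('a.sister'))           # all a tags with class "sister"
print(soup.select_one('a#link2').text)   # Lacie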
'''
Index page: icon URL, download count, size, detail-page URL
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=wu90ydNj9Q4dxxHzRq5PvALC
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=wu90ydNj9Q4dxxHzRq5PvALC
32 pages in total
'''
import re

import requests
from bs4 import BeautifulSoup


# Send a request
def get_page(url):
    response = requests.get(url)
    return response


# Parse an app's detail page
def parse_detail(text):
    soup = BeautifulSoup(text, 'lxml')

    name = soup.find(name='span', attrs={'class': 'title'}).text
    love = soup.find(name='span', attrs={'class': 'love'}).text
    commit_num = soup.find(name='a', attrs={'class': 'comment-open'}).text
    commit_content = soup.find(name='div', attrs={'class': 'con'}).text
    download_url = soup.find(name='a', attrs={'class': 'normal-dl-btn'}).attrs['href']

    print(
        f'''
        =========begin============
        app name: {name}
        rating: {love}
        comment count: {commit_num}
        editor's review: {commit_content}
        app download link: {download_url}
        ==========end=============
        '''
    )


# Parse the index page
def parse_index(data):
    soup = BeautifulSoup(data, 'lxml')

    # Get every app's li tag
    app_list = soup.find_all(name='li', attrs={'class': 'card'})
    for app in app_list:
        # Icon URL:
        # the data-original attribute of the first img tag
        img = app.find(name='img').attrs['data-original']
        print(img)

        # Download count:
        # text of the span tag whose class is 'install-count'
        down_num = app.find(name='span', attrs={'class': 'install-count'}).text
        print(down_num)

        # Size:
        # text of the span tag whose text matches "digits + MB" (\d+ means digits);
        # search within the current app card, not the whole page
        size = app.find(name='span', text=re.compile(r'\d+MB')).text
        print(size)

        # Detail-page URL:
        # the href attribute of the a tag
        detail_url = app.find(name='a').attrs['href']
        print(detail_url)

        # 3. Request the detail page
        response = get_page(detail_url)
        # 4. Parse the detail page
        parse_detail(response.text)


def main():
    for line in range(1, 33):
        url = f'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={line}&ctoken=lAd3GvU1DbFpJzYVdADWw9pS'
        # 1. Request the app-list API
        response = get_page(url)
        print('*' * 1000)
        # Deserialize the JSON response into a dict
        data = response.json()
        # Pull the app-card HTML out of the API payload
        app_li = data['data']['content']
        # print(app_li)
        # 2. Parse the app cards
        parse_index(app_li)


if __name__ == '__main__':
    main()
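
# A minimal persistence sketch (my own addition, bridging to the MongoDB notes
# below): parse_detail() could collect its fields into a dict and store each
# app in MongoDB instead of only printing it. The database and collection
# names ('wandoujia', 'apps') are hypothetical.
from pymongo import MongoClient

mongo_client = MongoClient('localhost', 27017)
apps = mongo_client['wandoujia']['apps']

def save_app(name, love, commit_num, commit_content, download_url):
    apps.insert_one({
        'name': name,
        'love': love,
        'commit_num': commit_num,
        'commit_content': commit_content,
        'download_url': download_url,
    })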
'''
1. Download and install
   https://www.cnblogs.com/kermitjam/p/10863933.html#_label1
2. Create a data/db folder on the C drive -- this is where the data is stored
3. Start the service with mongod
   Open a terminal and run `mongod` to start the MongoDB server.
4. Enter the MongoDB client with mongo
   Open a new terminal and run `mongo` to enter the client.

Database operations:
    Switch database
        SQL:     use admin;    -- switches if it exists, errors otherwise
        MongoDB: use tank;     -- switches if it exists, otherwise creates it and switches to tank
    List databases
        SQL:     show databases;
        MongoDB: show dbs      -- databases that contain no data are not listed
    Drop a database
        SQL:     drop database db_name;
        MongoDB: db.dropDatabase()

Collection operations (a collection is what MySQL calls a table):
    SQL:     create table t (f1 ..., f2 ...);
    MongoDB: db.student        -- in the current database, reference a collection with `.` to create it

Insert data:
    # insert multiple documents
    db.student.insert([{"name1": "lyj"}, {"name2": "zmm"}])
    # insert one document
    db.student.insert({"name": "lyj"})

Query data:
    # all documents in the student collection
    db.student.find({})
    db.student.find({"name": "lyj"})
'''
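
# A minimal pymongo sketch (my own mapping of the shell commands above; the
# 'tank' database name comes from the notes):
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['tank']                      # like `use tank`: created lazily on first write
db['student'].insert_one({'name': 'lyj'})
print(client.list_database_names())      # like `show dbs`
print(list(db['student'].find({})))      # like db.student.find({})
client.drop_database('tank')             # like db.dropDatabase()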
from pymongo import MongoClient

# 1. Connect to the MongoDB server
# Arg 1: MongoDB host/IP
# Arg 2: MongoDB port (default: 27017)
client = MongoClient('localhost', 27017)
# print(client)

# # 2. Enter the lyj_db database (created if it does not exist)
# print(client['lyj_db'])

# # 3. Reference a collection
# print(client['lyj_db']['people'])

# 4. Insert data into lyj_db
data1 = {
    'name': 'lyj',
    'age': '21',
    'sex': 'female'
}
data2 = {
    'name': 'zmm',
    'age': '20',
    'sex': 'female'
}
data3 = {
    'name': 'zcj',
    'age': '21',
    'sex': 'female'
}
# The legacy insert() is deprecated; insert_one()/insert_many() are the
# officially recommended methods.
# a. Insert one document with insert_one
client['lyj_db']['people'].insert_one(data1)
# b. Insert multiple documents with insert_many
client['lyj_db']['people'].insert_many([data2, data3])

# 5. Query data
# All documents: find() returns a cursor
data_s = client['lyj_db']['people'].find()
print(data_s)
# Iterate the cursor to print every document
for data in data_s:
    print(data)

# One document
data = client['lyj_db']['people'].find_one()
print(data)
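
# A small follow-up sketch (my own addition): filtering with find() is the
# pymongo equivalent of db.people.find({"name": "lyj"}) from the shell notes,
# and insert_one() returns a result object carrying the generated _id.
for doc in client['lyj_db']['people'].find({'name': 'lyj'}):
    print(doc)

result = client['lyj_db']['people'].insert_one({'name': 'lyj', 'age': '21', 'sex': 'female'})
print(result.inserted_id)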