Python爬虫实践——爬取豌豆荚“休闲益智”游戏App

"""
主页:
    图标地址、下载次数、大小、详情页地址

详情页:
    游戏名、好评率、评论数、小编点评、下载地址、简介、网友评论、1-5张截图链接地址

接口地址(通过 page 参数翻页):
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

共 32 页
"""
 15 import requests
 16 from bs4 import BeautifulSoup
 17 from pymongo import MongoClient
 18 import re
 19 
 20 #连接mongoDB数据库
 21 client=MongoClient(localhost,27017)
 22 #主页信息
 23 index_col=client[wandoujia][index]
 24 #详情页信息
 25 detail_col=client[wandoujia][detail]
 26 
 27 # 1、发送请求
 28 def get_page(url):
 29     response = requests.get(url)
 30     return response
 31 
 32 # 2、开始解析
 33 # 解析详情页
 34 def parse_detail(text):
 35     soup = BeautifulSoup(text, lxml)
 36     # print(soup)
 37 
 38     # app名称
 39     try:
 40         name = soup.find(name="span", attrs={"class": "title"}).text
 41     except Exception:
 42         name=None
 43     # print(name)
 44 
 45     # 好评率
 46     try:
 47         love = soup.find(name=span, attrs={"class": "love"}).text
 48     except Exception:
 49         love = None
 50     # print(love)
 51 
 52     # 评论数
 53     try:
 54         commit_num = soup.find(name=a, attrs={"class": "comment-open"}).text
 55     except Exception:
 56         commit_num = None
 57     # print(commit_num)
 58 
 59     # 小编点评
 60     try:
 61         commit_content = soup.find(name=div, attrs={"class": "con"}).text
 62     except Exception:
 63         commit_content = None
 64     # print(commit_content)
 65 
 66     # app下载链接
 67     try:
 68         download_url = soup.find(name=a, attrs={"class": "normal-dl-btn"}).attrs[href]
 69     except Exception:
 70         download_url = None
 71     # print(download_url)
 72 
 73     print(‘‘‘
 74         ============= tank ==============
 75         app名称:{name}
 76         好评率: {love}
 77         评论数: {commit_num}
 78         小编点评: {commit_content}
 79         app下载链接: {download_url}
 80         ============= end ==============
 81         ‘‘‘.format(name=name,love=love,commit_num=commit_num,commit_content=commit_content,download_url=download_url)
 82          )
 83 
 84     #判断所有数据都存在,正常赋值
 85     if name and love and commit_num and commit_content and download_url:
 86         detail_data={
 87             name:name,
 88             love:love,
 89             commit_num:commit_num,
 90             commit_content:commit_content,
 91             download_url:download_url,
 92         }
 93 
 94     #若love没有值,则设置为  没人点赞,很惨
 95     if not love:
 96         detail_data = {
 97             name: name,
 98             love: "没人点赞,很惨",
 99             commit_num:commit_num,
100             commit_content:commit_content,
101             download_url:download_url
102         }
103 
104     # 若download_url没有值,则设置为  没有安装包
105     if not love:
106         detail_data = {
107             name:name,
108             love:love,
109             commit_num: commit_num,
110             commit_content: commit_content,
111             download_url: "没有安装包",
112         }
113 
114     #插入详情页数据
115     detail_col.insert(detail_data)
116     print({name}app数据插入成功!)
117 
118 
119 
120 
121 # 解析主页
def parse_index(data):
    """Parse the app cards of one index page, store them and crawl each detail page.

    For every ``<li class="card">`` element the icon address, download count,
    size and detail-page address are printed and inserted into ``index_col``.
    The detail page is then fetched with ``get_page`` and handed to
    ``parse_detail``.

    Args:
        data: HTML fragment containing the app cards.
    """
    soup = BeautifulSoup(data, "lxml")
    # Matches text containing digits followed by "MB"; compiled once for all cards.
    size_pattern = re.compile(r"\d+MB")

    # Every app is rendered as an <li class="card"> element.
    for app in soup.find_all(name="li", attrs={"class": "card"}):
        # Icon address: data-original attribute of the card's first <img>.
        img = app.find(name="img").attrs["data-original"]
        print(img)

        # Download count: text of the <span class="install-count">.
        down_num = app.find(name="span", attrs={"class": "install-count"}).text
        print(down_num)

        # Size: search inside this card only, so each record gets its own
        # size rather than the first size found in the whole document.
        size_tag = app.find(name="span", string=size_pattern)
        size = size_tag.text if size_tag is not None else None
        print(size)

        # Detail-page address: href of the card's first <a>.
        detail_url = app.find(name="a").attrs["href"]
        print(detail_url)

        index_data = {
            "img": img,
            "down_num": down_num,
            "size": size,
            "detail_url": detail_url,
        }

        # Store the index record (insert_one replaces the removed Collection.insert).
        index_col.insert_one(index_data)
        print("主页数据插入成功!")

        # Fetch the app's detail page and parse it.
        response = get_page(detail_url)
        parse_detail(response.text)
174 
175 
def main(total_pages=32):
    """Crawl every page of the category API and parse the apps it lists.

    Args:
        total_pages: Number of API pages to request, starting from page 1.

    Each page is fetched as JSON; the HTML fragment under
    ``data["data"]["content"]`` is passed to ``parse_index``.  The MongoDB
    client is closed once, after the last page or on error.
    """
    try:
        for page in range(1, total_pages + 1):
            # NOTE(review): the ctoken is hard-coded; confirm it is still accepted by the API.
            url = (
                "https://www.wandoujia.com/wdjweb/api/category/more"
                f"?catId=6001&subCatId=0&page={page}"
                "&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"
            )

            # Request one page of the category API.
            response = get_page(url)
            print("*" * 1000)

            # Deserialize the JSON body into a dict.
            data = response.json()

            # HTML fragment holding the app cards of this page.
            app_li = data["data"]["content"]
            parse_index(app_li)
    finally:
        # Close the client only after all pages are done; closing it inside
        # the loop would shut the connection after the first page.
        client.close()
195 
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()

 

Python爬虫实践——爬取豌豆荚“休闲益智”游戏App

上一篇:H5 IOS 虚拟键盘不回落的问题


下一篇:javascript没那么简单(转)