python百度热搜可视化数据分析

一、选题背景

现如今,人们关注新闻、热点的方式有很多种。今日头条、QQ看点、bilibili、微博、网易新闻……此次选题通过百度热搜,根据热搜指数,进行可视化数据分析。

二、网络爬虫设计方案

爬虫名称:python百度热搜爬取

内容:通过爬虫程序爬取热搜头条、指数,然后进行数据可视化分析。

方案描述:

1、使用requests库发送请求,访问目标网页

2、解析网页,爬取数据。这里采用lxml的etree.xpath

3、数据保存,使用文件读写(open/write)将结果追加写入csv文件

三、结构特征分析

1、结构特征:内容导航型

python百度热搜可视化数据分析

 

 

 节点查找:

# Locate the title and the hot-index elements of the entry at 1-based
# position `count`. `html` is the parsed page and `count` the rank; both are
# defined by the crawler code shown later in the article.
hot_title = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div[1]'.format(count))
hot_zhishu = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[1]/div[2]'.format(count))

节点遍历:

 hot_title = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div[1]/text()'.format(count))
        for i in hot_title:
                hot_title = i
 hot_zhishu = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[1]/div[2]/text()'.format(count))
        for i in hot_zhishu:
            hot_zhishu = i

四、网络爬虫设计

1、数据爬取与采集

代码分析:

import csv
import os
import random
import re
import sys
import time

import requests
from lxml import etree
  7 
  8 USER_AGENTS = [
  9                 'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0',
 10                 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0',
 11                 'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0',
 12                 'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0',
 13                 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
 14                 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
 15                 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0',
 16                 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0',
 17                 'Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0',
 18                 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0',
 19                 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0',
 20                 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0',
 21                 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
 22                 'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0',
 23                 'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0',
 24                 'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0',
 25                 'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0',
 26                 'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0',
 27                 'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0',
 28                 'Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0',
 29                 'Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0',
 30                 'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0',
 31                 'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1',
 32                 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0)  Gecko/20100101 Firefox/18.0',
 33                 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
 34                 ]
 35 headers = {
 36     'User-Agent':random.choice(USER_AGENTS),
 37     'Connection':'keep-alive',
 38     'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'
 39     }
 40 
 41 
 42 Baidu_hotdic = {
 43     "热搜":"realtime",
 44     "小说":"novel",
 45     "电影":"movie",
 46     "电视剧":"teleplay",
 47     "动漫":"cartoon",
 48     "综艺":"variety",
 49     "纪录片":"documentary",
 50     "汽车":"car",
 51     "游戏":"game"
 52 }
 53 def realtime():
 54     # 创建hot_realtime.csv
 55     file = open("hot_realtime.csv", "a")
 56     file.write("hot_title" + "," + "hot_zhishu" + '\n')
 57     file = file.close()
 58 
 59     req = requests.get(url=url,headers=headers)
 60     # print(req.text)
 61 
 62     html = etree.HTML(req.text)
 63     count = 1
 64     print("\t标题\t\t\t\t\t\t", "指数")
 65     for i in range(1,31):
 66         hot_title = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div[1]/text()'.format(count))
 67         for i in hot_title:
 68                 hot_title = i
 69         hot_zhishu = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[1]/div[2]/text()'.format(count))
 70         for i in hot_zhishu:
 71             hot_zhishu = i
 72 
 73         with open("hot_realtime.csv", "a") as f:
 74             f.writelines(hot_title + "," + hot_zhishu + '\n')
 75             f.close()
 76         count += 1
 77         print(hot_title,hot_zhishu)
 78 
 79 def novel():
 80     # 创建hot_novel.csv
 81     file = open("hot_novel.csv", "a")
 82     file.write("hot_title" + "," + "hot_zhishu" + '\n')
 83     file = file.close()
 84     req = requests.get(url=url, headers=headers)
 85     # print(req.text)
 86 
 87     html = etree.HTML(req.text)
 88     count = 1
 89     print("\t标题\t\t", "指数")
 90     for i in range(1, 31):
 91         hot_title = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div/text()'.format(count))
 92         for i in hot_title:
 93             hot_title = i
 94         hot_zhishu = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[1]/div[2]/text()'.format(count))
 95         for i in hot_zhishu:
 96             hot_zhishu = i
 97 
 98         with open("hot_novel.csv", "a") as f:
 99             f.writelines(hot_title + "," + hot_zhishu + '\n')
100             f.close()
101 
102         count += 1
103         print(hot_title, hot_zhishu)
104 
105 def movie():
106     # 创建hot_movie.csv
107     file = open("hot_movie.csv", "a")
108     file.write("hot_title" + "," + "hot_zhishu" + '\n')
109     file = file.close()
110     req = requests.get(url=url, headers=headers)
111     # print(req.text)
112 
113     html = etree.HTML(req.text)
114     count = 1
115     print("\t标题\t\t", "指数")
116     for i in range(1, 31):
117         hot_title = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div/text()'.format(count))
118         for i in hot_title:
119             hot_title = i
120         hot_zhishu = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[1{}]/div[1]/div[2]/text()'.format(count))
121         for i in hot_zhishu:
122             hot_zhishu = i
123 
124         with open("hot_movie.csv", "a") as f:
125             f.writelines(hot_title + "," + hot_zhishu + '\n')
126             f.close()
127 
128         count += 1
129         print(hot_title, hot_zhishu)
130 
131 def teleplay():
132     # 创建hot_teleplay.csv
133     file = open("hot_teleplay.csv", "a")
134     file.write("hot_title" + "," + "hot_zhishu" + '\n')
135     file = file.close()
136     req = requests.get(url=url, headers=headers)
137     # print(req.text)
138 
139     html = etree.HTML(req.text)
140     count = 1
141     print("\t标题\t\t", "指数")
142     for i in range(1, 31):
143         hot_title = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div/text()'.format(count))
144         for i in hot_title:
145             hot_title = i
146         hot_zhishu = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[1{}]/div[1]/div[2]/text()'.format(count))
147         for i in hot_zhishu:
148             hot_zhishu = i
149 
150         with open("hot_teleplay.csv", "a") as f:
151             f.writelines(hot_title + "," + hot_zhishu + '\n')
152             f.close()
153 
154         count += 1
155         print(hot_title, hot_zhishu)
156 
157 def cartoon():
158     # 创建hot_cartoon.csv
159     file = open("hot_cartoon.csv", "a")
160     file.write("hot_title" + "," + "hot_zhishu" + '\n')
161     file = file.close()
162     req = requests.get(url=url, headers=headers)
163     # print(req.text)
164 
165     html = etree.HTML(req.text)
166     count = 1
167     print("\t标题\t\t", "指数")
168     for i in range(1, 31):
169         hot_title = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div/text()'.format(count))
170         for i in hot_title:
171             hot_title = i
172         hot_zhishu = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[1{}]/div[1]/div[2]/text()'.format(count))
173         for i in hot_zhishu:
174             hot_zhishu = i
175 
176         with open("hot_cartoon.csv", "a") as f:
177             f.writelines(hot_title + "," + hot_zhishu + '\n')
178             f.close()
179 
180         count += 1
181         print(hot_title, hot_zhishu)
182 
183 def variety():
184     # variety.csv
185     file = open("variety.csv", "a")
186     file.write("hot_title" + "," + "hot_zhishu" + '\n')
187     file = file.close()
188     req = requests.get(url=url, headers=headers)
189     # print(req.text)
190 
191     html = etree.HTML(req.text)
192     count = 1
193     print("\t标题\t\t", "指数")
194     for i in range(1, 31):
195         hot_title = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div/text()'.format(count))
196         for i in hot_title:
197             hot_title = i
198         hot_zhishu = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[1{}]/div[1]/div[2]/text()'.format(count))
199         for i in hot_zhishu:
200             hot_zhishu = i
201 
202         with open("variety.csv", "a") as f:
203             f.writelines(hot_title + "," + hot_zhishu + '\n')
204             f.close()
205 
206         count += 1
207         print(hot_title, hot_zhishu)
208 
209 def documentary():
210     # documentary.csv
211     file = open("documentary.csv", "a")
212     file.write("hot_title" + "," + "hot_zhishu" + '\n')
213     file = file.close()
214     req = requests.get(url=url, headers=headers)
215     # print(req.text)
216 
217     html = etree.HTML(req.text)
218     count = 1
219     print("\t标题\t\t", "指数")
220     for i in range(1, 31):
221         hot_title = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div/text()'.format(count))
222         for i in hot_title:
223             hot_title = i
224         hot_zhishu = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[1{}]/div[1]/div[2]/text()'.format(count))
225         for i in hot_zhishu:
226             hot_zhishu = i
227 
228         with open("documentary.csv", "a") as f:
229             f.writelines(hot_title + "," + hot_zhishu + '\n')
230             f.close()
231 
232         count += 1
233         print(hot_title, hot_zhishu)
234 
235 def car():
236     # car.csv
237     file = open("car.csv", "a")
238     file.write("hot_title" + "," + "hot_zhishu" + '\n')
239     file = file.close()
240     req = requests.get(url=url, headers=headers)
241     # print(req.text)
242 
243     html = etree.HTML(req.text)
244     count = 1
245     print("\t标题\t\t", "指数")
246     for i in range(1, 31):
247         hot_title = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div/text()'.format(count))
248         for i in hot_title:
249             hot_title = i
250         hot_zhishu = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[1{}]/div[1]/div[2]/text()'.format(count))
251         for i in hot_zhishu:
252             hot_zhishu = i
253 
254         with open("car.csv", "a") as f:
255             f.writelines(hot_title + "," + hot_zhishu + '\n')
256             f.close()
257 
258         count += 1
259         print(hot_title, hot_zhishu)
260 
261 def game():
262     # game.csv
263     file = open("game.csv", "a")
264     file.write("hot_title" + "," + "hot_zhishu" + '\n')
265     file = file.close()
266     req = requests.get(url=url, headers=headers)
267     # print(req.text)
268 
269     html = etree.HTML(req.text)
270     count = 1
271     print("\t标题\t\t", "指数")
272     for i in range(1, 31):
273         hot_title = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div/text()'.format(count))
274         for i in hot_title:
275             hot_title = i
276         hot_zhishu = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[1{}]/div[1]/div[2]/text()'.format(count))
277         for i in hot_zhishu:
278             hot_zhishu = i
279 
280         with open("game.csv", "a") as f:
281             f.writelines(hot_title + "," + hot_zhishu + '\n')
282             f.close()
283 
284         count += 1
285         print(hot_title, hot_zhishu)
286 
if __name__ == '__main__':
    # Keyword -> scraper. Lookup is by exact match: the previous
    # `hot in "热搜"` substring tests also accepted an empty string or a
    # single character (e.g. "电" always selected the movie board).
    scrapers = {
        "热搜": realtime,
        "小说": novel,
        "电影": movie,
        "电视剧": teleplay,
        "动漫": cartoon,
        "综艺": variety,
        "纪录片": documentary,
        "汽车": car,
        "游戏": game,
    }

    while True:
        hot = input("输入热搜关键词:热搜、小说、电影、电视剧、动漫、综艺、纪录片、汽车、游戏。\n").strip()
        if hot in scrapers:
            # The scraper functions read this module-level `url`.
            url = 'https://top.baidu.com/board?tab={}'.format(Baidu_hotdic[hot])
            scrapers[hot]()
        else:
            print("搜索词错误!!!")

        choice = input("继续还是结束:").strip()
        if choice == "结束":
            break
        if choice != "继续":
            # As before, an unrecognised answer is reported and the loop
            # simply asks for the next keyword.
            print("输入有误!")

执行:

python百度热搜可视化数据分析

 

python百度热搜可视化数据分析

 

python百度热搜可视化数据分析

2、数据的清洗与处理

import pandas as pd
import numpy as np

# xs: rows scraped from the novel board; rs: rows scraped from the realtime
# board. Malformed lines are skipped. `on_bad_lines` replaces the
# `error_bad_lines` flag, which was removed in pandas 2.0 (needs pandas >= 1.3).
xs = pd.read_csv('hot_novel.csv', on_bad_lines='skip', encoding='gbk')
rs = pd.read_csv('hot_realtime.csv', on_bad_lines='skip', encoding='gbk')

# Make the index numeric so the sort below is numeric, not lexicographic.
# Values that are not numbers (e.g. a header line appended again by a repeated
# crawl) become NaN and are removed by dropna().
xs['hot_zhishu'] = pd.to_numeric(xs['hot_zhishu'], errors='coerce')
rs['hot_zhishu'] = pd.to_numeric(rs['hot_zhishu'], errors='coerce')

# Drop duplicated titles, keeping the first occurrence.
rs = rs.drop_duplicates('hot_title')
xs = xs.drop_duplicates('hot_title')

# Drop rows with missing values. The result was previously assigned to `ss`,
# which left the NaN rows in `xs`.
rs = rs.dropna(axis=0)
xs = xs.dropna(axis=0)
ss = xs  # alias kept so code that still refers to `ss` keeps working

# Sort by hot index, highest first.
xs = xs.sort_values(by="hot_zhishu", ascending=False)
rs = rs.sort_values(by="hot_zhishu", ascending=False)

python百度热搜可视化数据分析

 

 

 

 python百度热搜可视化数据分析

 

 

 

# Bar chart with a line overlaid, drawn from the first 20 rows of `rs`
# (titles on the x axis, hot index on the y axis).
import matplotlib.pyplot as plt
x = rs['hot_title'].head(20)
y = rs['hot_zhishu'].head(20)
plt.rcParams['font.sans-serif']=['SimHei']  # font used so that Chinese labels render correctly
plt.rcParams['axes.unicode_minus']=False  # NOTE(review): presumably avoids a missing minus glyph with this font - confirm
plt.xticks(rotation=90)  # rotate the tick labels by 90 degrees
plt.bar(x,y,alpha=0.2, width=0.4, color='b', lw=3,label="hot_zhishu")
plt.plot(x,y,'-',color = 'r',label="hot_zhishu")
plt.legend(loc = "best")  # legend
plt.title("热搜指数趋势图")
plt.xlabel("hot_title",)  # x-axis label
plt.ylabel("hot_zhishu")  # y-axis label
plt.show()

python百度热搜可视化数据分析

 

# Horizontal bar chart of the same data: titles on the y axis, hot index on
# the x axis. The legend label was "价格" (price), copied from another
# project; the series plotted here is the hot index.
plt.barh(x, y, alpha=0.2, height=0.4, color='pink', label="hot_zhishu", lw=3)
plt.title("热搜指数水平图")
plt.legend(loc="best")  # legend
plt.xlabel("hot_zhishu")  # x-axis label
plt.ylabel("hot_title")  # y-axis label
plt.show()

python百度热搜可视化数据分析

 

 

 

# Scatter plot of hot index (y) against title (x), using the `x` and `y`
# series defined for the bar chart.
plt.scatter(x,y,color='b',marker='o',s=40,alpha=0.5)
plt.xticks(rotation=90)  # rotate the tick labels by 90 degrees
plt.title("热搜指数散点图")
plt.xlabel("hot_title",)  # x-axis label
plt.ylabel("hot_zhishu")  # y-axis label
plt.show()

python百度热搜可视化数据分析

 

# Box plot of the hot-index values in `y`.
plt.boxplot(y)
plt.title("热搜指数盒图")
plt.show()

python百度热搜可视化数据分析

 

# Word cloud built from the titles in `rs`, shaped by a mask image.
import pandas as pd
import numpy as np
import wordcloud as wc
from PIL import Image
import matplotlib.pyplot as plt
import random

# Load the mask image as an array. The path is machine-specific and must be
# changed to an image that exists locally.
bk = np.array(Image.open(r"C:\Users\X0iaoyan\Downloads\111.jpg"))
mask = bk
# Configure the word cloud.
# NOTE(review): the wordcloud docs reportedly ignore width/height when a mask
# is supplied - TODO confirm.
word_cloud = wc.WordCloud(
                       width=1000,  # canvas width
                       height=1000,  # canvas height
                       mask = mask,
                       background_color='black',  # background colour of the image
                       font_path='msyhbd.ttc',  # font file; must be a font available on this machine that contains Chinese glyphs
                       max_font_size=400,  # largest font size used
                       random_state=50,  # NOTE(review): presumably a fixed seed for a reproducible layout - confirm
                       )
text = rs["hot_title"]
text = " ".join(text)  # one space-separated string of all titles
word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()

 

 

 python百度热搜可视化数据分析

 

 

 

 数据分析总代码:

 1 import pandas as pd
 2 import numpy as np
 3 # xs为销量排行的表格、zh为综合表排序
 4 xs = pd.read_csv('hot_novel.csv',error_bad_lines=False,encoding='gbk')
 5 rs = pd.read_csv('hot_realtime.csv',error_bad_lines=False,encoding='gbk')
 6 
 7 # 重复值处理
 8 rs = rs.drop_duplicates('hot_title')
 9 xs = xs.drop_duplicates('hot_title')
10 # Nan处理
11 rs = rs.dropna(axis = 0)
12 ss = xs.dropna(axis = 0)
13 
14 # 根据价格数降序排序
15 xs.sort_values(by=["hot_zhishu"],inplace=True,ascending=[False])
16 rs.sort_values(by=["hot_zhishu"],inplace=True,ascending=[False])
17 
18 import matplotlib.pyplot as plt
19 x = rs['hot_title'].head(20)
20 y = rs['hot_zhishu'].head(20)
21 plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
22 plt.rcParams['axes.unicode_minus']=False
23 plt.xticks(rotation=90)
24 plt.bar(x,y,alpha=0.2, width=0.4, color='b', lw=3,label="hot_zhishu")
25 plt.plot(x,y,'-',color = 'r',label="hot_zhishu")
26 plt.legend(loc = "best")#图例
27 plt.title("热搜指数趋势图")
28 plt.xlabel("hot_title",)#横坐标名字
29 plt.ylabel("hot_zhishu")#纵坐标名字
30 plt.show()
31 
32 plt.barh(x,y, alpha=0.2, height=0.4, color='pink',label="价格", lw=3)
33 plt.title("热搜指数水平图")
34 plt.legend(loc = "best")#图例
35 plt.xlabel("hot_zhishu",)#横坐标名字
36 plt.ylabel("hot_title")#纵坐标名字
37 plt.show()
38 
39 # 散点图
40 plt.scatter(x,y,color='b',marker='o',s=40,alpha=0.5)
41 plt.xticks(rotation=90)
42 plt.title("热搜指数散点图")
43 plt.xlabel("hot_title",)#横坐标名字
44 plt.ylabel("hot_zhishu")#纵坐标名字
45 plt.show()
46 
47 plt.boxplot(y)
48 plt.title("热搜指数盒图")
49 plt.show()
50 
51 import pandas as pd
52 import numpy as np
53 import wordcloud as wc
54 from PIL import Image
55 import matplotlib.pyplot as plt
56 import random
57 
58 bk = np.array(Image.open(r"C:\Users\X0iaoyan\Downloads\111.jpg"))
59 mask = bk
60 # 定义尺寸
61 word_cloud = wc.WordCloud(
62                        width=1000,  # 词云图宽
63                        height=1000,  # 词云图高
64                        mask = mask,
65                        background_color='black',  # 词云图背景颜色,默认为白色
66                        font_path='msyhbd.ttc',  # 词云图 字体(中文需要设定为本机有的中文字体)
67                        max_font_size=400,  # 最大字体,默认为200
68                        random_state=50,  # 为每个单词返回一个PIL颜色
69                        )
70 text = rs["hot_title"]
71 text = " ".join(text)
72 word_cloud.generate(text)
73 plt.imshow(word_cloud)
74 plt.show()

 五、总结

1.经过对主题数据的分析与可视化,可以得到哪些结论?是否达到预期的目标? 根据热度指数可以了解到实时热度情况,结果达到预期。 2.在完成此设计过程中,得到哪些收获?以及要改进的建议?

 在此次设计过程中,我对数据处理中的数据筛选有了很大的收获,说白了就是怎么进行类型转换,然后达到自己想要的效果。受益匪浅!需要改进的地方可能就是编写程序时反应过慢,编程经验比较欠缺。

 

上一篇:【推荐算法】推荐系统中的特征工程(五):特征样本构造工具的实现3


下一篇:PCIe扫盲——复位机制介绍(Fundamental & Hot)