一、选题背景
现如今,人们关注新闻、热点的方式有很多种。今日头条、QQ看点、bilibili、微博、网易新闻……此次选题通过百度热搜,根据热搜指数,进行可视化数据分析。
二、网络爬虫设计方案
爬虫名称:python百度热搜爬取
内容:通过爬虫程序爬取热搜头条、指数,然后进行数据可视化分析。
方案描述:
1、使用requests库发送请求,访问百度热搜页面
2、解析网页,爬取数据。这里采用lxml的etree配合xpath
3、数据保存,使用文件读写(open/write)保存为csv文件
三、结构特征分析
1、结构特征:内容导航型
节点查找:
# Node lookup: select the title node and the hot-index node of the
# count-th entry on the ranking page (count is 1-based).
hot_title = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div[1]'.format(count))
hot_zhishu = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[1]/div[2]'.format(count))
节点遍历:
# Node traversal: read the text of the count-th entry's title and hot index.
# xpath() returns a list of text nodes; keep the last one, and fall back to
# an empty string when the row does not exist so both names are always str.
title_nodes = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[2]/a/div[1]/text()'.format(count))
hot_title = title_nodes[-1] if title_nodes else ''
zhishu_nodes = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]/div[1]/div[2]/text()'.format(count))
hot_zhishu = zhishu_nodes[-1] if zhishu_nodes else ''
四、网络爬虫设计
1、数据爬取与采集
代码分析:
"""Scrape Baidu hot-search boards and append (title, hot index) rows to CSV files."""
import csv
import os
import random
import re
import sys
import time

import requests
from lxml import etree

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
]

headers = {
    'User-Agent': random.choice(USER_AGENTS),
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
}

# Keyword typed by the user -> value of the "tab" query parameter.
Baidu_hotdic = {
    "热搜": "realtime",
    "小说": "novel",
    "电影": "movie",
    "电视剧": "teleplay",
    "动漫": "cartoon",
    "综艺": "variety",
    "纪录片": "documentary",
    "汽车": "car",
    "游戏": "game",
}

BOARD_URL = 'https://top.baidu.com/board?tab={}'
# Container of the n-th ranking entry; the tails below are appended to it.
ROW_XPATH = '//*[@id="sanRoot"]/main/div[2]/div/div[2]/div[{}]'
REALTIME_TITLE_TAIL = '/div[2]/a/div[1]/text()'
TITLE_TAIL = '/div[2]/a/div/text()'
INDEX_TAIL = '/div[1]/div[2]/text()'
MAX_ROWS = 30
REQUEST_TIMEOUT = 10  # seconds
# The analysis script reads these files with encoding='gbk', so write them
# as gbk on every platform instead of relying on the locale default.
CSV_ENCODING = 'gbk'


def _last_text(html, xpath):
    """Return the last non-blank text node matched by *xpath*, or None."""
    texts = [node.strip() for node in html.xpath(xpath) if node.strip()]
    return texts[-1] if texts else None


def _append_rows(csv_name, rows):
    """Append *rows* to *csv_name*, writing the header only for a new file."""
    needs_header = not os.path.exists(csv_name) or os.path.getsize(csv_name) == 0
    with open(csv_name, "a", encoding=CSV_ENCODING, errors="replace", newline="") as f:
        # csv.writer quotes titles that contain commas, keeping two columns.
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(("hot_title", "hot_zhishu"))
        writer.writerows(rows)


def _scrape_board(board, csv_name, title_tail, banner):
    """Download one board, print its entries and append them to a CSV file.

    Args:
        board: Key of ``Baidu_hotdic`` naming the board.
        csv_name: Path of the CSV file to append to.
        title_tail: XPath suffix selecting the title text inside a row.
        banner: Column heading printed before the rows.

    Raises:
        requests.RequestException: On network errors or a non-2xx response.
    """
    page_url = BOARD_URL.format(Baidu_hotdic[board])
    # Pick a fresh User-Agent per request rather than once per process.
    request_headers = dict(headers)
    request_headers['User-Agent'] = random.choice(USER_AGENTS)
    req = requests.get(url=page_url, headers=request_headers, timeout=REQUEST_TIMEOUT)
    req.raise_for_status()

    print(banner, "指数")
    rows = []
    html = etree.HTML(req.text)
    if html is not None:  # etree.HTML returns None for an empty document
        for rank in range(1, MAX_ROWS + 1):
            row = ROW_XPATH.format(rank)
            hot_title = _last_text(html, row + title_tail)
            hot_zhishu = _last_text(html, row + INDEX_TAIL)
            # A board may list fewer than MAX_ROWS entries; skip missing rows.
            if hot_title is None or hot_zhishu is None:
                continue
            rows.append((hot_title, hot_zhishu))
            print(hot_title, hot_zhishu)
    _append_rows(csv_name, rows)


def realtime():
    """Scrape the real-time hot-search board into hot_realtime.csv."""
    _scrape_board("热搜", "hot_realtime.csv", REALTIME_TITLE_TAIL, "\t标题\t\t\t\t\t\t")


def novel():
    """Scrape the novel board into hot_novel.csv."""
    _scrape_board("小说", "hot_novel.csv", TITLE_TAIL, "\t标题\t\t")


def movie():
    """Scrape the movie board into hot_movie.csv."""
    _scrape_board("电影", "hot_movie.csv", TITLE_TAIL, "\t标题\t\t")


def teleplay():
    """Scrape the TV-series board into hot_teleplay.csv."""
    _scrape_board("电视剧", "hot_teleplay.csv", TITLE_TAIL, "\t标题\t\t")


def cartoon():
    """Scrape the animation board into hot_cartoon.csv."""
    _scrape_board("动漫", "hot_cartoon.csv", TITLE_TAIL, "\t标题\t\t")


def variety():
    """Scrape the variety-show board into variety.csv."""
    _scrape_board("综艺", "variety.csv", TITLE_TAIL, "\t标题\t\t")


def documentary():
    """Scrape the documentary board into documentary.csv."""
    _scrape_board("纪录片", "documentary.csv", TITLE_TAIL, "\t标题\t\t")


def car():
    """Scrape the car board into car.csv."""
    _scrape_board("汽车", "car.csv", TITLE_TAIL, "\t标题\t\t")


def game():
    """Scrape the game board into game.csv."""
    _scrape_board("游戏", "game.csv", TITLE_TAIL, "\t标题\t\t")


# Keyword typed by the user -> scraper for that board.
BOARD_HANDLERS = {
    "热搜": realtime,
    "小说": novel,
    "电影": movie,
    "电视剧": teleplay,
    "动漫": cartoon,
    "综艺": variety,
    "纪录片": documentary,
    "汽车": car,
    "游戏": game,
}


def _ask_continue():
    """Ask whether to go on; re-prompt until the answer is 继续 or 结束."""
    while True:
        choice = input("继续还是结束:").strip()
        if choice == "继续":
            return True
        if choice == "结束":
            return False
        print("输入有误!")


def main():
    """Prompt for a board keyword, scrape it, and repeat until told to stop."""
    while True:
        hot = input("输入热搜关键词:热搜、小说、电影、电视剧、动漫、综艺、纪录片、汽车、游戏。\n").strip()
        # Exact lookup: a substring test would accept "" or a single character.
        handler = BOARD_HANDLERS.get(hot)
        if handler is None:
            print("搜索词错误!!!")
        else:
            try:
                handler()
            except requests.RequestException as err:
                print("请求失败:", err)

        if not _ask_continue():
            break


if __name__ == '__main__':
    main()
执行:
2、数据的清洗与处理
import pandas as pd
import numpy as np


def _read_board(csv_path):
    """Load one scraped board CSV and return it cleaned and sorted.

    Rows whose hot index is not numeric (for example header lines repeated
    by successive scraper runs) and rows with missing values are dropped,
    duplicate titles are removed, and the result is sorted by hot index in
    descending order.
    """
    try:
        frame = pd.read_csv(csv_path, on_bad_lines='skip', encoding='gbk')
    except TypeError:
        # pandas < 1.3 does not know on_bad_lines; use the older spelling.
        frame = pd.read_csv(csv_path, error_bad_lines=False, encoding='gbk')
    # Convert the index to numbers so sorting and plotting are numeric
    # rather than lexicographic; unparsable values become NaN.
    frame['hot_zhishu'] = pd.to_numeric(
        frame['hot_zhishu'].astype(str).str.strip(), errors='coerce')
    frame = frame.dropna(axis=0).drop_duplicates('hot_title')
    return frame.sort_values(by=["hot_zhishu"], ascending=False)


# xs: novel board, rs: real-time hot-search board
xs = _read_board('hot_novel.csv')
rs = _read_board('hot_realtime.csv')
import matplotlib.pyplot as plt

# Trend chart: the 20 first rows of rs, bars with a line drawn over them.
top20 = rs.head(20)
x = top20['hot_title']
y = top20['hot_zhishu']

plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # font able to render Chinese labels
    'axes.unicode_minus': False,
})
plt.xticks(rotation=90)

bar_style = dict(alpha=0.2, width=0.4, color='b', lw=3, label="hot_zhishu")
plt.bar(x, y, **bar_style)
plt.plot(x, y, '-', color='r', label="hot_zhishu")

plt.legend(loc="best")
plt.title("热搜指数趋势图")
plt.xlabel("hot_title")  # x-axis name
plt.ylabel("hot_zhishu")  # y-axis name
plt.show()
# Horizontal bar chart of the same x (titles) / y (hot index) data.
# The legend entry names the plotted quantity, the hot index.
plt.barh(x, y, alpha=0.2, height=0.4, color='pink', label="hot_zhishu", lw=3)
plt.title("热搜指数水平图")
plt.legend(loc="best")
plt.xlabel("hot_zhishu")  # x-axis name
plt.ylabel("hot_title")  # y-axis name
plt.show()
# Scatter chart of hot index per title.
marker_style = {'color': 'b', 'marker': 'o', 's': 40, 'alpha': 0.5}
plt.scatter(x, y, **marker_style)
plt.xticks(rotation=90)
plt.title("热搜指数散点图")
plt.xlabel("hot_title")  # x-axis name
plt.ylabel("hot_zhishu")  # y-axis name
plt.show()
# Box plot of the hot-index values in y.
box_fig, box_ax = plt.subplots()
box_ax.boxplot(y)
box_ax.set_title("热搜指数盒图")
plt.show()
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wordcloud as wc
from PIL import Image

# Image whose shape the cloud is drawn in. When the file is not present the
# cloud falls back to a plain width x height rectangle instead of crashing.
MASK_PATH = r"C:\Users\X0iaoyan\Downloads\111.jpg"
bk = np.array(Image.open(MASK_PATH)) if os.path.exists(MASK_PATH) else None
mask = bk

word_cloud = wc.WordCloud(
    width=1000,  # canvas width (used when no mask is given)
    height=1000,  # canvas height (used when no mask is given)
    mask=mask,
    background_color='black',  # background colour of the cloud
    font_path='msyhbd.ttc',  # must be a locally installed font with Chinese glyphs
    max_font_size=400,  # upper bound for the largest word
    random_state=50,  # fixed seed so the layout is reproducible
)
# Join the titles into one text; astype(str) guards against non-string cells.
text = rs["hot_title"].astype(str)
text = " ".join(text)
word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()
数据分析总代码:
"""Clean the scraped board CSV files and draw charts of the hot index."""
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wordcloud as wc
from PIL import Image


def _read_board(csv_path):
    """Load one scraped board CSV and return it cleaned and sorted.

    Rows whose hot index is not numeric (for example header lines repeated
    by successive scraper runs) and rows with missing values are dropped,
    duplicate titles are removed, and the result is sorted by hot index in
    descending order.
    """
    try:
        frame = pd.read_csv(csv_path, on_bad_lines='skip', encoding='gbk')
    except TypeError:
        # pandas < 1.3 does not know on_bad_lines; use the older spelling.
        frame = pd.read_csv(csv_path, error_bad_lines=False, encoding='gbk')
    # Convert the index to numbers so sorting and plotting are numeric
    # rather than lexicographic; unparsable values become NaN.
    frame['hot_zhishu'] = pd.to_numeric(
        frame['hot_zhishu'].astype(str).str.strip(), errors='coerce')
    frame = frame.dropna(axis=0).drop_duplicates('hot_title')
    return frame.sort_values(by=["hot_zhishu"], ascending=False)


# xs: novel board, rs: real-time hot-search board
xs = _read_board('hot_novel.csv')
rs = _read_board('hot_realtime.csv')

# Charts use the 20 hottest real-time entries.
x = rs['hot_title'].head(20)
y = rs['hot_zhishu'].head(20)
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
plt.rcParams['axes.unicode_minus'] = False

# Trend chart: bars with a line drawn over them.
plt.xticks(rotation=90)
plt.bar(x, y, alpha=0.2, width=0.4, color='b', lw=3, label="hot_zhishu")
plt.plot(x, y, '-', color='r', label="hot_zhishu")
plt.legend(loc="best")
plt.title("热搜指数趋势图")
plt.xlabel("hot_title")  # x-axis name
plt.ylabel("hot_zhishu")  # y-axis name
plt.show()

# Horizontal bar chart; the legend entry names the plotted quantity.
plt.barh(x, y, alpha=0.2, height=0.4, color='pink', label="hot_zhishu", lw=3)
plt.title("热搜指数水平图")
plt.legend(loc="best")
plt.xlabel("hot_zhishu")  # x-axis name
plt.ylabel("hot_title")  # y-axis name
plt.show()

# Scatter chart.
plt.scatter(x, y, color='b', marker='o', s=40, alpha=0.5)
plt.xticks(rotation=90)
plt.title("热搜指数散点图")
plt.xlabel("hot_title")  # x-axis name
plt.ylabel("hot_zhishu")  # y-axis name
plt.show()

# Box plot.
plt.boxplot(y)
plt.title("热搜指数盒图")
plt.show()

# Word cloud of all real-time titles.
# Image whose shape the cloud is drawn in. When the file is not present the
# cloud falls back to a plain width x height rectangle instead of crashing.
MASK_PATH = r"C:\Users\X0iaoyan\Downloads\111.jpg"
bk = np.array(Image.open(MASK_PATH)) if os.path.exists(MASK_PATH) else None
mask = bk

word_cloud = wc.WordCloud(
    width=1000,  # canvas width (used when no mask is given)
    height=1000,  # canvas height (used when no mask is given)
    mask=mask,
    background_color='black',  # background colour of the cloud
    font_path='msyhbd.ttc',  # must be a locally installed font with Chinese glyphs
    max_font_size=400,  # upper bound for the largest word
    random_state=50,  # fixed seed so the layout is reproducible
)
# Join the titles into one text; astype(str) guards against non-string cells.
text = rs["hot_title"].astype(str)
text = " ".join(text)
word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()
五、总结
1.经过对主题数据的分析与可视化,可以得到哪些结论?是否达到预期的目标? 根据热度指数可以了解到实时热度情况,结果达到预期。 2.在完成此设计过程中,得到哪些收获?以及要改进的建议? 在此次设计过程中,我对数据处理中的数据筛选有了很大的收获,说白了就是怎么进行类型转换,然后达到自己想要的效果。受益匪浅!需要改进的地方可能就是编写程序时反应过慢,编程经验比较欠缺。