# 1. Fetch the raw page data.
import requests as rs

# Send a browser-like User-Agent so the request gets past Maoyan's basic
# anti-crawler check.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.162 Safari/537.36'
    ),
}
url = 'https://maoyan.com'
# A timeout keeps the script from hanging forever on an unresponsive server.
resp = rs.get(url, headers=headers, timeout=10)
print(resp.status_code)
print(type(resp))
print('----------------')
# Force UTF-8 decoding of the response body to avoid garbled characters.
resp.encoding = 'utf-8'
# Save the page source to disk. The encoding is explicit so the write does
# not depend on the platform default, and `with` guarantees the file is
# flushed and closed.
webText = resp.text
with open("source.txt", "w", encoding="utf-8") as source_file:
    source_file.write(webText)
print('----souce download------------')


# 2. Parse the downloaded page with BeautifulSoup.
from bs4 import BeautifulSoup

# 'html.parser' is the HTML parser that ships with Python, so no extra
# parser package is required.
soup = BeautifulSoup(resp.text, 'html.parser')
# Extract the page's text content and save it for the word-cloud step.
webContent = soup.text
# Explicit UTF-8 keeps the write independent of the platform default
# encoding; `with` guarantees the file is flushed and closed.
with open("webContent.txt", "w", encoding="utf-8") as content_file:
    # `file` holds what write() returns: the number of characters written.
    file = content_file.write(webContent)
print('----written---------')


# 3. Build a word cloud with WordCloud and visualise it with matplotlib.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the saved page text (as UTF-8, not the platform default encoding).
with open("webContent.txt", encoding="utf-8") as content_file:
    text = content_file.read()

# Strip site boilerplate terms so they do not dominate the cloud.
# The order is significant: the removals are applied one after another, so
# longer terms ("想看", "预告片") must be removed before their
# single-character prefixes ("想", "预").
for noise in (
    "票", "购", "想看", "人", "分", "预告片", "想", "预", "万", "售",
    "上映", "猫眼电影", "maoyan",
):
    text = text.replace(noise, "")
print(text)

# Font file used to render the words.
# NOTE(review): presumably SimHei is chosen so Chinese characters render;
# the path points at the filesystem root -- confirm the font really lives there.
font = r'/simhei.ttf'
# Generate the word cloud from the cleaned text.
wc = WordCloud(font_path=font, width=1400, height=1400, margin=2).generate(text)
# imshow() draws the cloud onto the current matplotlib axes.
plt.imshow(wc)
# plt.axis("off")  # uncomment to hide the axes around the image
# show() opens the figure window.
plt.show()
# Save the word cloud as an image file.
wc.to_file('webToWordCloud.png')
print('----pic saved---------')