Data Processing and Visualisation with Python
News
Introduction: US-China relations have grown increasingly complex in recent years. This project uses Python to scrape and organize news articles and then analyzes them, hoping to uncover interesting details, important information, and the main underlying trends.
Goal: by analyzing the text of these news articles, study US-China relations over the past few years (including how the topics and core issues change over time).
# First, install the extra packages this project uses
!pip install tqdm
!pip install nltk
## nltk may raise errors when first used; they can be resolved by following the error messages (a typical fix is sketched below)
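A typical fix, assuming the usual missing-resource LookupErrors (resource names may differ between nltk versions):
import nltk
nltk.download('punkt')                        # tokenizer data used by nltk.word_tokenize
nltk.download('stopwords')                    # stopword lists used by nltk.corpus.stopwords
nltk.download('averaged_perceptron_tagger')   # tagger model used by nltk.pos_tag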
Web scraping:
The scraper code is omitted; this project provides the scraped data directly.
Data cleaning:
import os
import re
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
strptime = datetime.datetime.strptime
def get_standard_time(file_name):
    # file names encode the article time as 'YYYY-MM-DD@HH#MM.txt';
    # '#' stands in for ':' because Windows file names cannot contain colons
    return strptime(file_name[:-4].replace('#', ':'), '%Y-%m-%d@%H:%M')
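A quick sanity check of the expected file-name format (this particular date is made up):
get_standard_time('2019-05-01@08#30.txt')
# datetime.datetime(2019, 5, 1, 8, 30)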
The scraped documents still contain special characters and HTML tags
series_file = pd.Series(os.listdir(r'.\all_news'))
#%% Remove irrelevant tags and the link/image content
# os.mkdir(r'.\deltag')  # create the output folder on the first run
os.chdir(r'.\deltag')
!del *.txt               # clear output from previous runs
os.chdir(r'..')
for each_file in tqdm(series_file):
    with open(f'.\\all_news\\{each_file}', 'r', encoding='utf-8') as f:
        content = f.read()
    # strip tables, emphasis, spans and centered blocks first, then any remaining tags
    content1 = re.sub(r'<table.*?</table>', '', content, flags=re.S)
    content1 = re.sub(r'<em>.*?</em>', '', content1, flags=re.S)
    content1 = re.sub(r'<span.*?</span>', '', content1, flags=re.S)
    content1 = re.sub(r'<center> .*?</center>', '', content1, flags=re.S)
    content1 = re.sub(r'<p.*?>', '', content1, flags=re.S)
    content1 = re.sub(r'<img.*?>', '', content1, flags=re.S)
    content1 = re.sub(r'<br />', '', content1, flags=re.S)
    content1 = re.sub(r'&nbsp;|\xa0', '', content1, flags=re.S)  # leftover HTML space entities / non-breaking spaces
    content1 = re.sub(r'&quot;|"', '', content1, flags=re.S)     # leftover quote entities and characters
    content1 = re.sub(r'<.*?>', '', content1, flags=re.S)
    with open(f'.\\deltag\\{each_file}', 'w', encoding='utf-8') as file:
        file.write(content1)
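A quick illustration, on a made-up snippet, of why the patterns are non-greedy and pass re.S (so '.' also matches newlines inside multi-line tags):
sample = '<span>ad</span> keep this <span>ad2</span>'
re.sub(r'<span.*?</span>', '', sample, flags=re.S)   # ' keep this '  (stops at the first </span>)
re.sub(r'<span.*</span>', '', sample, flags=re.S)    # ''             (greedy: removes everything up to the last </span>)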
Check the range of years covered by the scraped files
# Time range of the scraped articles: mostly 2017-2020, so the analysis focuses on that period
series_datetime = series_file.apply(get_standard_time)
series_datetime.apply(lambda x:x.year).value_counts()
Text processing:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.cluster.hierarchy import linkage,fcluster
import scipy.spatial.distance as dis
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import nltk
from nltk.corpus import stopwords as stp
from nltk import data
data.path.append(r".")  # also search the current directory for nltk data
Use nltk to tokenize, tag parts of speech, extract word stems, and filter:
########
ym_dict = {f'{y}-{m:0>2d}':{} for y in range(2017,2021) for m in range(1,13)}
########
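The dictionary keys are zero-padded year-month strings, one empty word-count dict per month from 2017 through 2020:
list(ym_dict)[:3]   # ['2017-01', '2017-02', '2017-03']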
stop_words = set(stp.words('english'))
stop_words.update([',', '.', '—', ' ', '(', ')', '``', "''", '’', '“', '”', '[', ']',
                   'US', 'China', 'United', 'States', 'Chinese',  # drop routine high-frequency words up front
                   ])
series_file = pd.Series(os.listdir(r".\deltag"))
poster = nltk.PorterStemmer()  # reduces English words to their stems, removing plural/tense and similar variation
for file in tqdm(series_file):
    ym_ = file[:7]  # 'YYYY-MM' prefix of the file name
    if ym_ in ym_dict.keys():
        dict_ = ym_dict[ym_]
        f = open(f'.\\deltag\\{file}', 'r', encoding='utf-8')
        t = f.read()
        for _ in [poster.stem(w) for w in nltk.word_tokenize(t) if w not in stop_words]:  # tokenize with nltk and drop stopwords
            dict_[_] = dict_.get(_, 0) + 1
        f.close()
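A quick check of the tokenize, stopword-filter, and stem pipeline on a made-up sentence (exact output depends on the nltk version and stopword list):
[poster.stem(w) for w in nltk.word_tokenize('Tariffs on Chinese goods were raised again') if w not in stop_words]
# e.g. ['tariff', 'good', 'rais']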
for ym in tqdm(ym_dict.keys()):
    # POS-tag the stems and keep only nouns, adjectives and foreign words seen more than 10 times
    tagged = nltk.pos_tag(list(ym_dict[ym].keys()))
    list_ = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'FW']
    filter_list = [k for k, v in filter(lambda x: x[1] in list_, tagged)]
    ym_dict[ym] = {k: v for k, v in ym_dict[ym].items() if k in filter_list and v > 10}
    ym_dict[ym] = dict(sorted(ym_dict[ym].items(), key=lambda x: x[1], reverse=True))
Help on the nltk (Penn Treebank) POS tag set:
nltk.help.upenn_tagset()
Example of nltk stemming:
poster.stem('countries') == poster.stem('country')
Drop empty rows and columns, then fill the remaining missing values with 0
# ym_dict.keys()
df = pd.DataFrame(ym_dict)
df.dropna(axis=1,how='all',inplace=True)
df.dropna(axis=0,how='all',inplace=True)
df = df.T  # transpose: rows are months, columns are words
df.fillna(value=0,inplace=True)
df.head()
df.iloc[:,:2].plot()
Word clustering
def cluster_words(df, cluster_param=50):
    # cluster words by the cosine similarity of their month-over-month frequency changes
    df_delta = df - df.shift()
    df_delta.dropna(axis=0, how='all', inplace=True)
    hcluster = linkage(dis.pdist(df_delta.T, 'cosine'), 'single')
    clustered_ans = fcluster(hcluster, t=cluster_param, criterion='maxclust')
    ans = pd.DataFrame([df.columns, clustered_ans]).T
    ans = ans.sort_values(by=1)
    group = ans.groupby(by=1).count()
    group.columns = ['num']
    group = group.sort_values(by='num', ascending=False)
    gid_to_info = {i: {'words': []} for i in set(clustered_ans)}
    for word, gid in dict(zip(df.columns, clustered_ans)).items():
        gid_to_info[gid]['words'].append(word)
        gid_to_info[gid]['weight'] = df[word][df[word] != 0].mean()
    df_temp = pd.DataFrame(gid_to_info).T
    df_temp['num'] = df_temp['words'].apply(lambda x: len(x))
    # df_temp.sort_values(by='weight').to_csv("F:\desktop\out.csv")
    df_temp.sort_values(by='num', ascending=False, inplace=True)
    return gid_to_info, df_temp
Why cosine distance? It compares only the shape of each word's month-to-month frequency changes and ignores absolute magnitude, so a frequent word and a rare word whose usage rises and falls in proportion are treated as similar.
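A toy illustration with made-up change vectors:
common_word = [10, 50, -30, 20]        # large month-to-month changes
rare_word = [1, 5, -3, 2]              # the same pattern at one tenth the scale
dis.cosine(common_word, rare_word)     # ~0.0: same trend, so clustered together
dis.euclidean(common_word, rare_word)  # ~56: would be treated as far apart
The cell below repeats the clustering from cluster_words step by step so the intermediate results can be inspected: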
df_delta = df - df.shift()
df_delta.dropna(axis=0,how='all',inplace=True)
hcluster = linkage(dis.pdist(df_delta.T,'cosine'),'single')
clustered_ans = fcluster(hcluster, t=50,criterion='maxclust',)
ans = pd.DataFrame([df.columns,clustered_ans]).T
ans = ans.sort_values(by=1)
group = ans.groupby(by=1).count()
group.columns = ['num']
group = group.sort_values(by='num',ascending=False)
gid_to_info = {i:{'words':[]} for i in set(clustered_ans)}
for word, gid in dict(zip(df.columns, clustered_ans)).items():
    gid_to_info[gid]['words'].append(word)
    gid_to_info[gid]['weight'] = df[word][df[word] != 0].mean()
df_temp = pd.DataFrame(gid_to_info).T
df_temp['num'] = df_temp['words'].apply(lambda x: len(x))
# df_temp.sort_values(by='weight').to_csv("F:\desktop\out.csv")
df_temp.sort_values(by='num',ascending=False,inplace=True)
df_temp
The clusters contain too many words to inspect directly, so first draw a word cloud to get an intuitive impression
import wordcloud
from os import path
from PIL import Image
def creat_cloud(set_id, gid_to_info):
    stopwords = ['baby', 'pacifier', 'and', 'that', 'the', 'it', 'this', 'in', 'am', 'is', 'are',
                 'for', 'with', 'we', 'to', 'of', 'they', 'he', 'them', 'have', ' ']
    wc = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\STFANGSO.TTF',
                             width=900,
                             height=700,
                             margin=2,
                             prefer_horizontal=1,
                             repeat=False,
                             contour_width=1,
                             contour_color='red',
                             min_font_size=4,
                             max_font_size=200,
                             background_color='white',
                             mode='RGB',
                             relative_scaling=0.4,
                             include_numbers=False,
                             stopwords=stopwords,
                             )
    # repeat each word in proportion to its total frequency across all months
    text = ''
    for w, t in df[gid_to_info[set_id]['words']].sum().to_dict().items():
        text += (w + ' ') * int(t)
    wc.generate(text)
    image = wc.to_image()
    return image
creat_cloud(df_temp.num.idxmax(),gid_to_info)
def show_plot(set_id, gid_to_info=gid_to_info, df=df, mode='mean', num=20):
    if mode == 'mean':
        # average frequency of all words in the cluster, month by month
        fig = px.line(df[gid_to_info[set_id]['words']].mean(axis=1))
        fig.show()
    else:
        # plot the `num` most frequent words of the cluster as separate lines
        temp = df[gid_to_info[set_id]['words']].sum()
        fig = px.line(df[temp.sort_values(ascending=False).index].iloc[:, :num])
        fig.show()
show_plot(df_temp.num.idxmax(),gid_to_info,mode='')
show_plot(4,gid_to_info,mode='')
show_plot(5,gid_to_info,mode='')
show_plot(36,gid_to_info,mode='')
Keep clustering within the largest word cluster to see whether more information can be extracted
# start from the largest cluster of the initial clustering
gid_to_info_sub, temp_df_sub = cluster_words(df[gid_to_info[df_temp.num.idxmax()]['words']],10)
temp_df_sub
Why not plot all the words directly, or draw a heatmap? Because there are far too many of them:
df.shape
show_plot(set_id=10,gid_to_info=gid_to_info_sub,df=df,mode='')
# keep clustering the largest word set from the previous sub-clustering (this step can be repeated; a loop sketch follows at the end)
gid_to_info_sub, temp_df_sub = cluster_words(df[gid_to_info_sub[temp_df_sub.num.idxmax()]['words']],10)
temp_df_sub
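A possible way to automate the repetition (the round cap of 5 and the size threshold of 30 are arbitrary choices for illustration):
gid_sub, df_sub = gid_to_info, df_temp
for _ in range(5):                      # cap the number of refinement rounds
    if df_sub.num.max() <= 30:          # stop once the largest cluster is small enough
        break
    largest_words = gid_sub[df_sub.num.idxmax()]['words']
    gid_sub, df_sub = cluster_words(df[largest_words], 10)
df_sub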