Project 4 : News

Data Processing and Visualisation with Python

News

Introduction: US-China relations have been complex and fast-moving in recent years. This project uses Python to scrape and organise news data and analyse it, hoping to uncover interesting details, important information, and the underlying trends.
Goal: by analysing the text of these news articles, study US-China relations in recent years, including how the main topics and core issues have changed over time.

# First install the extra packages used in this project
!pip install tqdm
!pip install nltk
## nltk may raise errors (usually missing data); they can be resolved by following the error messages
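If nltk raises a LookupError about missing data, the usual fix is to download the corpora needed by the later steps; a hedged sketch (resource names may differ across nltk versions, so follow the actual error message):

import nltk
nltk.download('punkt')                       # used by nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # used by nltk.pos_tag
nltk.download('stopwords')                   # used by nltk.corpus.stopwords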

Web scraping:

The scraping code is omitted here; this project provides the scraped data directly.

Data cleaning:

import os
import re
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm


strptime = datetime.datetime.strptime
def get_standard_time(file_name):
    # file names encode the publication time, with '#' standing in for ':'
    return strptime(file_name[:-4].replace('#',':'),'%Y-%m-%d@%H:%M')
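Judging from the parsing format, the scraped files are assumed to be named like 2019-05-02@13#45.txt; a quick check with a made-up name:

# made-up file name, only to illustrate the assumed '%Y-%m-%d@%H#%M.txt' pattern
get_standard_time('2019-05-02@13#45.txt')
# -> datetime.datetime(2019, 5, 2, 13, 45)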

The scraped documents still contain special characters and HTML tags.


series_file = pd.Series(os.listdir(r'.\all_news'))


#%% Remove irrelevant tags, links and image content

# os.mkdir(r'.\deltag')   # uncomment on the first run to create the output directory
os.chdir(r'.\deltag')
!del *.txt                # clear output files from previous runs
os.chdir(r'..')



for each_file in tqdm(series_file):
    with open(f'.\\all_news\\{each_file}','r',encoding='utf-8') as f:
        content = f.read()
        content1 = re.sub(r'<table.*?</table>','',content,flags=re.S)
        content1 = re.sub(r'<em>.*?</em>','',content1,flags=re.S)
        content1 = re.sub(r'<span.*?</span>','',content1,flags=re.S)  # non-greedy, so only individual <span> blocks are removed
        content1 = re.sub(r'<center> .*?</center>','',content1,flags=re.S)
        
        content1 = re.sub(r'<p.*?>','',content1,flags=re.S)
        content1 = re.sub(r'<img.*?>','',content1,flags=re.S)
        content1 = re.sub(r'<br />','',content1,flags=re.S)
        content1 = re.sub(r'&nbsp;','',content1,flags=re.S)
        content1 = re.sub(r'&quot;','',content1,flags=re.S)
        content1 = re.sub(r'<.*?>','',content1,flags=re.S)
        with open(f'.\\deltag\\{each_file}','w',encoding='utf-8') as file:
            file.write(content1)
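A minimal sketch of what the substitutions above do, applied to a made-up HTML snippet:

# made-up snippet; real articles are longer but are cleaned the same way
sample = '<p class="a">Trade talks resumed.&nbsp;<img src="x.jpg"><br /><span>ad</span></p>'
for pattern in [r'<table.*?</table>', r'<em>.*?</em>', r'<span.*?</span>',
                r'<center> .*?</center>', r'<p.*?>', r'<img.*?>', r'<br />',
                r'&nbsp;', r'&quot;', r'<.*?>']:
    sample = re.sub(pattern, '', sample, flags=re.S)
print(sample)  # -> 'Trade talks resumed.'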

Check which years the files cover

# Check the time range of the scraped articles: mostly 2017-2020, so the analysis focuses on this period
series_datetime = series_file.apply(get_standard_time)
series_datetime.apply(lambda x:x.year).value_counts()


Text processing:

import os
import numpy as np
import pandas as pd

from tqdm import tqdm

from scipy.cluster.hierarchy import linkage,fcluster
import scipy.spatial.distance as dis

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import nltk
from nltk.corpus import stopwords as stp

from nltk import data
data.path.append(r".")

Use nltk to tokenise, tag parts of speech, extract word stems, and filter:

# one word-count dictionary per month, 2017-01 through 2020-12
ym_dict = {f'{y}-{m:0>2d}':{} for y in range(2017,2021) for m in range(1,13)}


stop_words = set(stp.words('english'))
stop_words.update([',','.','—',' ','(',')','``','\'\'','’','“','”','[',']',
'US','China','United','States','Chinese',# drop these routine high-frequency terms up front
])

series_file = pd.Series(os.listdir(r".\deltag"))

poster = nltk.PorterStemmer() # extracts the stem of each English word, removing differences such as plural forms and tenses

for file in tqdm(series_file):
    ym_ = file[:7]
    if ym_ in ym_dict.keys():
        dict_ = ym_dict[ym_]
        f = open(f'.\\deltag\\{file}','r',encoding='utf-8')
        t = f.read()
        for _ in [poster.stem(w) for w in nltk.word_tokenize(t) if w not in stop_words]:  # tokenise with nltk and drop stop words
            dict_[_] = dict_.get(_,0)+1

        f.close()
for ym in tqdm(ym_dict.keys()):
    tagged = nltk.pos_tag(ym_dict[ym].keys())

    list_ = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','FW'] # keep only nouns, adjectives and foreign words
    filter_list = [k for k,v in filter(lambda x:x[1] in list_,tagged)]
    ym_dict[ym] = {k:v for k, v in ym_dict[ym].items() if k in filter_list if v > 10} # drop words appearing 10 times or fewer in the month
    ym_dict[ym] = dict(sorted(ym_dict[ym].items(),key=lambda x:x[1],reverse=True))
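A small illustration of the tokenise → stop-word filter → stem steps above, on a made-up sentence:

sample = 'Negotiators discussed new tariffs on imported goods.'
tokens = [w for w in nltk.word_tokenize(sample) if w not in stop_words]
stems = [poster.stem(w) for w in tokens]
print(stems)                 # e.g. ['negoti', 'discuss', 'new', 'tariff', 'import', 'good']
print(nltk.pos_tag(stems))   # the POS tags that the filtering loop above keys on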

View help for the nltk POS tag set

nltk.help.upenn_tagset()

An example of nltk stemming

poster.stem('countries') == poster.stem('country')

Drop all-empty rows and columns, then fill the remaining missing values with 0

# ym_dict.keys()
df = pd.DataFrame(ym_dict)
df.dropna(axis=1,how='all',inplace=True)
df.dropna(axis=0,how='all',inplace=True)

df = df.T  # rows: months, columns: stemmed words

df.fillna(value=0,inplace=True)
df.head()
df.iloc[:,:2].plot()

Word clustering

def cluster_words(df,cluster_param=50):
    # cluster words on their month-over-month changes rather than raw counts
    df_delta = df - df.shift()
    df_delta.dropna(axis=0,how='all',inplace=True)
    # hierarchical clustering with cosine distance between the words' change vectors
    hcluster = linkage(dis.pdist(df_delta.T,'cosine'),'single')
    clustered_ans = fcluster(hcluster, t=cluster_param,criterion='maxclust',)
    
    
    ans = pd.DataFrame([df.columns,clustered_ans]).T
    ans = ans.sort_values(by=1)
    group = ans.groupby(by=1).count()
    group.columns = ['num']
    group = group.sort_values(by='num',ascending=False)
    gid_to_info = {i:{'words':[]} for i in set(clustered_ans)}


    for word, gid in dict(zip(df.columns,clustered_ans)).items():
        gid_to_info[gid]['words'].append(word)
        gid_to_info[gid]['weight'] = df[word][df[word]!=0].mean().mean()
        
        
    df_temp = pd.DataFrame(gid_to_info).T
    df_temp['num'] = df_temp['words'].apply(lambda x: len(x))
    # df_temp.sort_values(by='weight').to_csv("F:\desktop\out.csv")
    df_temp = df_temp.sort_values(by='num',ascending=False)
    return gid_to_info,df_temp
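A minimal usage sketch of the helper above (the next cell runs the same steps inline so that gid_to_info and df_temp are available as globals):

# 50 clusters matches the value used in the inline run below
gid_to_info, df_temp = cluster_words(df, cluster_param=50)
df_temp.head()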

Why cosine distance is used: the clustering works on month-over-month changes, and cosine distance compares only the direction (shape) of each word's change vector rather than its magnitude, so words whose frequencies rise and fall together are grouped even when their absolute counts differ greatly.
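A quick numeric illustration with made-up vectors:

# made-up monthly-change vectors: cosine distance ignores scale,
# so a word and a 10x-more-frequent word with the same trend stay close
u = np.array([1, 3, 2, 5])
v = 10 * u                   # same shape of change, much larger counts
w = np.array([5, 2, 3, 1])   # different shape of change
print(dis.cosine(u, v))      # ~0.0  -> would be clustered together
print(dis.cosine(u, w))      # ~0.44 -> kept apart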

df_delta = df - df.shift()
df_delta.dropna(axis=0,how='all',inplace=True)
hcluster = linkage(dis.pdist(df_delta.T,'cosine'),'single')
clustered_ans = fcluster(hcluster, t=50,criterion='maxclust',)
ans = pd.DataFrame([df.columns,clustered_ans]).T
ans = ans.sort_values(by=1)
group = ans.groupby(by=1).count()
group.columns = ['num']
group = group.sort_values(by='num',ascending=False)
gid_to_info = {i:{'words':[]} for i in set(clustered_ans)}

for word, gid in dict(zip(df.columns,clustered_ans)).items():
    gid_to_info[gid]['words'].append(word)
    gid_to_info[gid]['weight'] = df[word][df[word]!=0].mean().mean()
df_temp = pd.DataFrame(gid_to_info).T
df_temp['num'] = df_temp['words'].apply(lambda x: len(x))
# df_temp.sort_values(by='weight').to_csv("F:\desktop\out.csv")
df_temp.sort_values(by='num',ascending=False,inplace=True)
df_temp

Since the clusters contain too many words, first draw a word cloud to get an intuitive impression.

import wordcloud
from os import path
from PIL import Image

def creat_cloud(set_id,gid_to_info):
    stopwords = ['baby', 'pacifier','and','that','the','it','this','in','am','is','are','for','with','we','to','of','they','he','them','have',' ']

    wc = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\STFANGSO.TTF',
                             width=900,
                             height=700,
                             margin=2,
                             prefer_horizontal=1,
                             repeat=False,
                             contour_width=1,
                             contour_color='red',
                             min_font_size=4,
                             max_font_size=200,
                             background_color='white',
                             mode='RGB',
                             relative_scaling=0.4,
                             include_numbers=False,
                             stopwords=stopwords,
                             )

    text = ''
    for w,t in df[gid_to_info[set_id]['words']].sum().to_dict().items():
        text += (w+' ')*int(t)
    wc.generate(text)

    image = wc.to_image()
    return image
creat_cloud(df_temp.num.idxmax(),gid_to_info)
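creat_cloud returns a PIL image, so in a notebook the call above renders it inline; outside a notebook the image can be saved or opened explicitly (the file name below is just an example):

img = creat_cloud(df_temp.num.idxmax(), gid_to_info)
img.save('largest_cluster_cloud.png')   # example output path
# img.show()                            # or open it in the system image viewer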
def show_plot(set_id,gid_to_info=gid_to_info,df=df,mode='mean',num=20):
    # mode='mean': plot the cluster's average monthly frequency;
    # any other value: plot the top `num` words of the cluster individually
    if mode == 'mean':
        fig = px.line(df[gid_to_info[set_id]['words']].mean(axis=1))
        fig.show()
    else:
        temp = df[gid_to_info[set_id]['words']].sum()

        fig = px.line(df[temp.sort_values(ascending=False).index].iloc[:,:num])
        
        fig.show()
show_plot(df_temp.num.idxmax(),gid_to_info,mode='')
show_plot(4,gid_to_info,mode='')
show_plot(5,gid_to_info,mode='')
show_plot(36,gid_to_info,mode='')

Continue clustering the largest word cluster, to see whether more information can be extracted

# start from the largest cluster of the initial clustering
gid_to_info_sub, temp_df_sub = cluster_words(df[gid_to_info[df_temp.num.idxmax()]['words']],10)
temp_df_sub

Why not plot everything directly, or use a heatmap? Because there are too many words.

df.shape
show_plot(set_id=10,gid_to_info=gid_to_info_sub,df=df,mode='')
# keep clustering the largest cluster from the previous sub-clustering (this step can be repeated to iterate further)
gid_to_info_sub, temp_df_sub = cluster_words(df[gid_to_info_sub[temp_df_sub.num.idxmax()]['words']],10)
temp_df_sub