import pandas as pd

# Load the novel so that each physical line of the file becomes one row in
# column 'txt'.  sep="aaa" is a separator string that never occurs in the
# text, so pandas keeps every line intact as a single field.
raw = pd.read_csv(
    'work/金庸-射雕英雄传txt精校版.txt',
    names=['txt'],
    sep="aaa",
    encoding="GBK",
)
print(len(raw))
raw
# Helper variables used later for chapter-heading detection.
def m_head(tmpstr):
    """Return the first character of *tmpstr* ('' if the string is empty)."""
    return tmpstr[0] if tmpstr else ""
def m_mid(tmpstr):
    """Return the index of the marker "回 " in *tmpstr*, or -1 if absent."""
    try:
        return tmpstr.index("回 ")
    except ValueError:
        return -1
# Derived helper columns consumed by the chapter-detection pass below.
raw['head'] = raw['txt'].apply(m_head)   # first character of each line
raw['mid'] = raw['txt'].apply(m_mid)     # position of "回 " marker, -1 if absent
raw['len'] = raw['txt'].apply(len)       # line length in characters
raw.head(50)
# Chapter numbering.
# A line is treated as a chapter heading when it starts with "第", contains
# "回 " past position 0, and is shorter than 30 characters.  Every line is
# tagged with the number of the chapter it belongs to (0 = front matter).
#
# NOTE: the original version assigned raw.loc[i, 'chap'] one row at a time
# inside a range(len(raw)) loop — quadratic-ish in pandas and it silently
# produces a float column (NaN back-fill).  Building the list once and
# assigning the whole column is both faster and gives a clean int column.
chapnum = 0
chap_ids = []
for head, mid, length, text in zip(raw['head'], raw['mid'], raw['len'], raw['txt']):
    if head == "第" and mid > 0 and length < 30:
        chapnum += 1
    # Everything from the appendix marker onward is not a real chapter:
    # reset its tag back to 0 so it is dropped later with the front matter.
    if chapnum >= 40 and text == "附录一:成吉思汗家族":
        chapnum = 0
    chap_ids.append(chapnum)
raw['chap'] = chap_ids

# Drop the temporary helper columns — they were only needed for detection.
del raw['head']
del raw['mid']
del raw['len']
raw.head(50)
# 提取所需要的章节
raw[raw.chap==1].head()
%matplotlib inline
raw.txt.agg(len).plot.box()
rawgrp =raw.groupby('chap')
chapter =rawgrp.agg(sum) #只有字符串列的情况下,sum函数自动转换为合并字符串
print(chapter)
chapter =chapter[chapter.index !=0]
chapter.txt[1]
import jieba

# Tokenise chapter 1 with jieba.
word_list = jieba.lcut(chapter.txt[1])
word_list[:10]

# Word-frequency statistics with pandas.
df = pd.DataFrame({'word': word_list})
df.head(30)
result = df.groupby(['word']).size()
print(type(result))
freqlist = result.sort_values(ascending=False)
freqlist[:20]
# FIX: nltk was used below but never imported anywhere in the script,
# which raises NameError — import it here at the top of the cell.
import nltk

# Frequency distribution of the tokens with NLTK.
fdist = nltk.FreqDist(word_list)
fdist
# With a specific word, this gives its number of occurrences in the chapter.
fdist["颜烈"]
# The vocabulary (distinct tokens).
fdist.keys()
# Tabulate the 10 most frequent tokens.
fdist.tabulate(10)
fdist.most_common(5)