jieba 分词-西游记

import jieba
 
def takeSecond(elem):
    """Return the item stored at index 1 of ``elem``.

    Used as a sort key so that (word, count) pairs are ordered by count.
    """
    second = elem[1]
    return second
 
def main():
    """Print the most frequent multi-character words found in 西游记.txt.

    Steps:
      1. Read the novel text and segment it with ``jieba.lcut``.
      2. Skip single-character tokens and fold known aliases of the main
         characters into one canonical name before counting.
      3. Drop every word listed (comma-separated) in excludes.txt.
      4. Print up to 20 (word, count) rows, highest count first.

    Raises:
        OSError: if 西游记.txt or excludes.txt cannot be opened or read.
    """
    # Alias -> canonical name. Every alias is longer than one character, so
    # the single-character filter below never hides an alias.
    aliases = {
        "大圣": "孙悟空",
        "老孙": "孙悟空",
        "行者": "孙悟空",
        "孙大圣": "孙悟空",
        "孙行者": "孙悟空",
        "猴王": "孙悟空",
        "悟空": "孙悟空",
        "齐天大圣": "孙悟空",
        "猴子": "孙悟空",
        "师父": "唐僧",
        "三藏": "唐僧",
        "圣僧": "唐僧",
        "呆子": "猪八戒",
        "八戒": "猪八戒",
        "老猪": "猪八戒",
        "沙和尚": "沙僧",
        "妖精": "妖怪",
        "妖魔": "妖怪",
        "妖道": "妖怪",
        "佛祖": "如来",
        "三太子": "白马",
    }

    path = "西游记.txt"
    # ``with`` guarantees the handle is closed even if read() raises.
    with open(path, "r", encoding="utf-8") as file:
        text = file.read()

    counts = {}
    for word in jieba.lcut(text):
        if len(word) == 1:
            continue
        rword = aliases.get(word, word)
        counts[rword] = counts.get(rword, 0) + 1

    # The original wrote ``file.close`` without parentheses here, so the
    # exclude file was never explicitly closed.
    with open("excludes.txt", "r", encoding="utf-8") as file:
        excludes = file.read().split(",")

    for delWord in excludes:
        # Remove the entry as written, and also its whitespace-trimmed form so
        # that a trailing newline or a space after a comma does not stop the
        # word from matching. Entries that were never counted are ignored.
        counts.pop(delWord, None)
        counts.pop(delWord.strip(), None)

    items = sorted(counts.items(), key=takeSecond, reverse=True)

    # Slicing caps the output at 20 rows without raising IndexError when
    # fewer than 20 distinct words remain.
    for keyWord, count in items[:20]:
        print("{0:<10}{1:>5}".format(keyWord, count))
# Runs the whole analysis at module load; there is no __main__ guard, so
# importing this file triggers it as well.
main()

jieba 分词-西游记

 

 jieba 分词-西游记

 

上一篇:jieba


下一篇:乒乓球比赛