用jieba统计《红楼梦》词频

from collections import Counter

import jieba
# Aliases that are tallied under a single canonical character name.
# NOTE(review): "奶奶" is counted as 贾母 and "辣妹子" as 王熙凤; these
# mappings are kept exactly as originally written -- confirm against the
# text that they identify the intended characters.
ALIASES = {
    "奶奶": "贾母",
    "老太太": "贾母",
    "老祖宗": "贾母",
    "史太君": "贾母",
    "黛玉": "林黛玉",
    "林姑娘": "林黛玉",
    "林妹妹": "林黛玉",
    "凤姐": "王熙凤",
    "辣妹子": "王熙凤",
    "熙凤": "王熙凤",
    "宝玉": "贾宝玉",
    "宝二爷": "贾宝玉",
}


def count_words(words, aliases=None):
    """Count token frequencies, merging aliases into canonical names.

    Args:
        words: Iterable of string tokens (e.g. the output of ``jieba.lcut``).
        aliases: Optional mapping from alias to canonical name; defaults to
            the module-level ``ALIASES`` table.

    Returns:
        A ``Counter`` mapping each (canonicalised) token to its number of
        occurrences. Tokens of length 1 are skipped, which drops single
        characters such as punctuation and line breaks.
    """
    if aliases is None:
        aliases = ALIASES
    return Counter(aliases.get(word, word) for word in words if len(word) != 1)


def format_top(counts, limit=20):
    """Format the ``limit`` most frequent entries as aligned text lines.

    Args:
        counts: A ``Counter`` as returned by ``count_words``.
        limit: Maximum number of entries to report.

    Returns:
        A list of strings, most frequent first; entries with equal counts
        keep their first-seen order. Fewer than ``limit`` lines are returned
        when there are fewer distinct words, instead of raising IndexError.
    """
    return [
        '{0:<10}{1:>5}'.format(word, count)
        for word, count in counts.most_common(limit)
    ]


def main():
    """Segment 红楼梦.txt with jieba and print the 20 most frequent words."""
    jieba.setLogLevel(jieba.logging.INFO)
    # The context manager closes the file even if reading/decoding fails.
    with open('红楼梦.txt', 'r', encoding='utf-8') as f:
        txt = f.read()
    counts = count_words(jieba.lcut(txt))
    for line in format_top(counts):
        print(line)


if __name__ == "__main__":
    main()

课堂作业,尚不完善,也不够严谨,仅供参考。

上一篇:jieba


下一篇:if语句