python对文章词频的统计

import os
import re

from nltk import ne_chunk, pos_tag, word_tokenize
import nltk
from docx import Document
import langid
import pandas as pd


def readWord():
    """Read every .docx file under *rootdir* and concatenate its English
    paragraphs into one string.

    Returns:
        str: all paragraph texts classified as English by langid, joined
        with newlines (empty string when none are found).
    """
    text = ""
    rootdir = 'C:\\Users\\Administrator\\Desktop\\一季度'
    # os.listdir returns bare names, so join each with the root directory.
    # (Iterate directly instead of indexing, and avoid shadowing `list`.)
    for filename in os.listdir(rootdir):
        path = os.path.join(rootdir, filename)
        print(path)
        document = Document(path)
        # Keep only paragraphs langid classifies as English; other
        # languages (e.g. Chinese) are skipped.
        for paragraph in document.paragraphs:
            if langid.classify(paragraph.text)[0] == 'en':
                text += paragraph.text + "\n"
    return text


# Tokens to ignore when counting: punctuation fragments, stray single
# letters and other tokenizer artifacts that pos_tag mislabels as NNP.
_EXCLUDED_TOKENS = {
    "’", "”", "—", "[", "]", "…", "@", "/", "s", "|",
    "P", "•", "II", "R", "A", "“", "‘", "–",
}


def get_entities():
    """Count proper nouns (NNP/NNPS) in the English text returned by
    readWord() and write the word-frequency table to ``c4i.csv``.

    Notes on the fix: the original first branch built its junk-token
    pattern from a NON-raw string (every ``\\b`` was a literal backspace
    character) and then counted a token only when it MATCHED the junk
    pattern — the opposite of the intent shown by the second branch and
    the commented-out condition. Both branches now share one exclusion
    set and count only non-junk proper nouns.
    """
    counts = {}
    sentence = readWord()

    # ne_chunk yields a mix of plain (word, tag) tuples and Tree nodes
    # whose children are (word, tag) tuples.
    tagged_sentence = ne_chunk(pos_tag(word_tokenize(sentence)))
    for tagged in tagged_sentence:
        if len(tagged) == 2:
            # Plain (word, tag) tuple.
            word, tag = tagged[0], tagged[1]
        elif len(tagged[0]) == 2:
            # Tree node: take its first (word, tag) leaf, as the
            # original code did.
            word, tag = tagged[0][0], tagged[0][1]
        else:
            continue
        if tag in ("NNP", "NNPS") and word not in _EXCLUDED_TOKENS:
            counts[word] = counts.get(word, 0) + 1

    # Convert the dict to a list of row dicts so pandas can build a
    # two-column DataFrame (word, num) and dump it to CSV.
    arr = [{"word": word, "num": num} for word, num in counts.items()]
    p = pd.DataFrame(arr)
    # utf_8_sig writes a BOM so Excel opens the CSV with the right encoding.
    p.to_csv('c4i.csv', encoding='utf_8_sig')


if __name__ == '__main__':
    get_entities()
    # readWord()

 使用的依赖库如下所示:

python-docx==0.8.11
nltk
langid
pandas

 

上一篇:一键翻译自动填表工具


下一篇:算法题:恢复空格(题目+思路+代码+注释)