Data Cleaning (1)
The data has already been extracted; the next step is to clean it. For Chinese text classification in particular, the text must be segmented into words, and stop words must also be removed so that they do not have an outsized effect on feature extraction. Segmentation splits the raw text into individual terms, which is what the later feature-extraction step operates on.
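As a quick illustration before the full script, here is a minimal sketch (not part of the original code) that segments a single sentence with jieba in precise mode and filters out stop words; the in-line stop-word set and the sample sentence are made up for demonstration only.

import jieba

# Hypothetical, tiny stop-word set for illustration only
stop_words = {'的', '了', '是'}

sentence = '数据清洗是文本分类的第一步'
# Precise-mode segmentation (cut_all=False), the same mode used in the script below
words = [w for w in jieba.cut(sentence, cut_all=False) if w not in stop_words]
print(' '.join(words))

The full script below applies exactly this idea to every file in a corpus directory.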
# encoding=utf-8
# Walk the corpus directory and clean every file with ProsessofWords
import jieba
import os
def EnumPathFiles(path, callback, stop_words_list):
    """Walk `path` recursively and apply `callback` to every file found."""
    if not os.path.isdir(path):
        print(f'Error: "{path}" is not a directory or does not exist.')
        return
    # os.walk already descends into sub-directories, so no explicit recursion is needed
    for root, dirs, files in os.walk(path):
        for d in dirs:
            print(d)
        for f in files:
            callback(root, f, stop_words_list)
def ProsessofWords(textpath, stop_words_list):
    """Segment one file with jieba, drop stop words, and write the result back in place."""
    with open(textpath, 'r', encoding='utf-8') as f:
        text = f.read()
    outstr = ''
    # Precise-mode segmentation
    seg_list = jieba.cut(text, cut_all=False)
    for word in seg_list:
        if word not in stop_words_list and word != '\t':
            outstr += word + ' '
    # Overwrite the original file with the space-separated, cleaned text
    with open(textpath, 'w', encoding='utf-8') as f:
        f.write(outstr)
def callback1(path, filename, stop_words_list):
    # os.path.join keeps the path separator portable across operating systems
    textpath = os.path.join(path, filename)
    print(textpath)
    ProsessofWords(textpath, stop_words_list)
if __name__ == '__main__':
    # Load the stop-word list, one word per line, skipping blank lines
    stopwords_file = "../stopword/stopword.txt"
    stop_words = []
    with open(stopwords_file, 'r', encoding='utf-8') as stop_f:
        for line in stop_f:
            line = line.strip()
            if line:
                stop_words.append(line)
    print(len(stop_words))
    # Clean every article under ../article in place
    EnumPathFiles(r'../article', callback1, stop_words)
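Since each cleaned file now holds space-separated tokens, it can be fed straight into a bag-of-words or TF-IDF vectorizer for the feature-extraction step mentioned above. The sketch below is only an illustration of that hand-off, assuming scikit-learn is available (it is not used in the original script); the two sample documents are made up.

from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical cleaned documents: space-separated tokens, as produced by ProsessofWords
docs = [
    '数据 清洗 文本 分类',
    '特征 抽取 依赖 分词 结果',
]

# The text is already tokenized, so splitting on whitespace is enough
vectorizer = TfidfVectorizer(tokenizer=str.split, token_pattern=None)
X = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())
print(X.shape)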