## 中文情感分类--关于疫情、微博、中文、文本
本次中文情感分析源于数据挖掘与分析课大作业,主要内容为:对疫情期间的微博文本进行情感分类,进而分析情感变化。
1. 数据集:训练集和待预测数据集,其中训练集为打好标签的微博疫情相关文本,待预测数据集为情感趋势分析的数据来源。
2. python库:主要使用 jieba、pandas,其余详见import
3. 主要涉及内容有:分词,去停用词,构建词向量模型,分词文本向量化,模型训练,预测等部分。
【说明:文件路径中的分隔符(\\ 和 /)尚未统一;部分代码不够简洁流畅,仅供步骤参考。相关文件与代码(同组成员的微博爬虫、清洗代码以及数据集链接)会视需要考虑上传。】
part 1:训练集文本
--分词,去停,构建词向量(这里没有用pandas,十分后悔)
1.import部分及main方法:
import jieba
import numpy as np
import pandas as pd
import os
import gensim
from gensim.test.utils import common_texts,get_tmpfile
from gensim.models import Word2Vec
import math
import csv
if __name__ == '__main__':
    # Load the labelled Weibo training set (python parsing engine).
    train_df = pd.read_csv(
        'D:\\documents\\data mining\\数据集\\情感分类-疫情微博\\nCoV_100k_train.labled.csv',
        engine="python",
    )

    # Column positions depend on the data set in use: the 100k set keeps the
    # text in positional column 3 and the label in positional column 6
    # (the alternative "情感训练集" set uses columns 0 and 1 instead).
    texts = list(train_df.iloc[:, 3])
    labels = list(train_df.iloc[:, 6])

    # Dimensionality of the word-vector model.
    vector_size = 100

    # Segment, remove stop words, and train/save the word2vec model.
    # word_cut returns the token lists together with the matching labels.
    tokenized, labels = word_cut(texts, labels, vector_size)
    print('分词成功')
    print(len(tokenized), len(labels))
2.分词,去停,词向量
def word_cut(data1, label, size,
             cut_path='D:/documents/data mining/数据集/代码/data_cut.txt',
             model_path="D:/documents/data mining/数据集/代码/word2vec.bin"):
    """Segment texts, drop stop words, persist the result and train word2vec.

    Args:
        data1: iterable of raw texts (each item is passed through ``str``).
        label: list of labels, parallel to ``data1``.  It is updated in
            place so that labels of texts that end up empty are removed.
        size: dimensionality of the word vectors.
        cut_path: text file receiving one ``token,token,... label`` line
            per kept text.
        model_path: file the trained Word2Vec model is saved to.

    Returns:
        ``(filelist, label)`` where ``filelist`` is a list of token lists
        and ``label`` is the (filtered) list passed in.

    Raises:
        ValueError: if ``data1`` and ``label`` have different lengths.
    """
    tokenized = []
    for raw in data1:
        text = str(raw).replace('展开全文c', '')
        # Use the jieba tokens directly.  Joining them with a sentinel and
        # splitting again corrupts any text that itself contains '$'.
        tokenized.append(list(jieba.cut(text, cut_all=False)))

    if len(tokenized) != len(label):
        raise ValueError(
            'data1 and label must have the same length '
            '({} != {})'.format(len(tokenized), len(label))
        )

    # Stop-word removal may leave some token lists empty.
    tokenized = removesw(tokenized)

    # Drop empty texts together with their labels.  ``label`` is rewritten
    # through a slice so the caller's list is updated in place, as before.
    kept = [(tokens, lab) for tokens, lab in zip(tokenized, label) if tokens]
    tokenized = [tokens for tokens, _ in kept]
    label[:] = [lab for _, lab in kept]

    # One line per text: comma-separated tokens, a space, then the label.
    # NOTE(review): a token that is itself a space or comma makes the line
    # ambiguous for readers that split on those characters.
    with open(cut_path, mode='w') as txtfile:
        for tokens, lab in zip(tokenized, label):
            line = ','.join(tok for tok in tokens if tok != '')
            # Drop characters GBK cannot represent -- presumably so the
            # platform-default codec on Chinese Windows can write the line.
            line = line.encode("gbk", 'ignore').decode("gbk", "ignore")
            txtfile.write(line + ' ' + str(lab) + '\n')
    print('cut_word写入txt')

    model = Word2Vec(tokenized, size=size, window=5, min_count=1, workers=4)
    model.save(model_path)
    print('cut_word加入词向量模型')
    return (tokenized, label)
本段主要为 利用结巴分词进行分词,分词结果使用$$$分隔,使用下方去停方法。
将去停用词后的分词文本用于训练词向量模型;其中传给 Word2Vec 的 filelist 只要是可迭代对象(每个元素为一条文本的词语列表)即可。后续向词向量模型追加语料、以及获取文本向量的语句见 part 2。
def removesw(filelist,
             stopwords_path='D:/documents/data mining/数据集/stopwords-master/cn_stopwords.txt'):
    """Remove stop words from every token list in ``filelist``.

    Args:
        filelist: list of token lists; it is modified in place.
        stopwords_path: UTF-8 text file with one stop word per line.

    Returns:
        The same ``filelist`` object, each entry replaced by a new list
        without stop words (entries may become empty).
    """
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        # A set gives O(1) membership tests instead of scanning a list for
        # every single token.
        stop_words = {line.replace('\n', '') for line in f}

    for idx, tokens in enumerate(filelist):
        filelist[idx] = [tok for tok in tokens if tok not in stop_words]
    return filelist
本段去停用词,txt为网络找的停用词表,中途会根据微博语境增删改。for循环里的代码比较核心。
part 2:预测集数据
--本部分主要使用pandas库,对预测集分词、去停,结果加入part1中构建的词向量模型。然后利用词向量模型、训练集&预测集分析结果,构建文本向量并写入.csv文件。
1.import部分+数据清洗、分词、去停
(清洗部分希望去掉部分无意义词段,防止分词后无法去除。)
import os
import pandas as pd
import jieba
import gensim
from gensim.test.utils import common_texts,get_tmpfile
from gensim.models import Word2Vec
import numpy as np
import csv
#---- data cleaning and word segmentation: shared setup ----
# Load the stop-word list once.  It is kept as a set because the code below
# only tests membership against it, and it does so for every token of every
# text; a list would be scanned linearly each time.
with open('D:/documents/data mining/数据集/stopwords-master/cn_stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = {line.replace('\n', '') for line in f}
# Also treat the zero-width space (U+200B) as a stop word.
stop_words.add('\u200b')

# Input directory with the files to clean, and the output directory.
origin_dir = 'D:\\documents\\data mining\\数据集\\代码\\cleaned_text\\'
files = os.listdir(origin_dir)
after_clean_dir = 'D:\\documents\\data mining\\数据集\\代码\\after_clean\\'
def clean_mix(s):
    """Return ``s`` with a fixed set of noise substrings removed.

    The fragments (expand/collapse markers, video-source suffixes, newlines
    and the link placeholder) are removed one after another, in this order.
    """
    noise_fragments = (
        '收起全文d',
        '展开全文d',
        '的秒拍视频',
        '的微博视频',
        '的快手视频',
        '\n',
        'O网页链接',
    )
    cleaned = s
    for fragment in noise_fragments:
        cleaned = cleaned.replace(fragment, '')
    return cleaned
def after_jieba_stopword(s):
    """Segment ``s`` with jieba and return the non-stop-word tokens.

    Args:
        s: text to segment (converted with ``str`` first).

    Returns:
        The tokens that are not in the module-level ``stop_words``
        collection, joined by single spaces.
    """
    # Iterate the jieba tokens directly.  Joining them with a '$$$' sentinel
    # and splitting again produced empty or merged tokens whenever the text
    # itself contained '$'.
    tokens = jieba.cut(str(s), cut_all=False)
    return ' '.join(tok for tok in tokens if tok not in stop_words)
# Clean and segment every input file, keep rows whose cleaned text is not
# empty, and write the result to ``after_clean_dir``.
N_origin = 0  # rows read across all files
N_filter = 0  # rows kept across all files
for file in files:
    data = pd.read_table(origin_dir + file, sep=',', encoding='utf-8')
    N_origin += len(data)

    for column in ('cleaned_text', 'removeWellSign'):
        # Non-string cells (e.g. NaN for missing text) become ''.
        data[column] = data[column].map(
            lambda x: clean_mix(x) if isinstance(x, str) else ''
        )
        # Segment and remove stop words.
        data[column] = data[column].map(after_jieba_stopword)

    # Copy the filtered rows so that adding the ``id`` column writes to an
    # independent frame rather than to a slice of ``data``.
    data_filter = data.loc[data['cleaned_text'] != '', :].copy()
    data_filter['id'] = np.arange(len(data_filter))
    N_filter += len(data_filter)

    data_filter[['id', 'original_text', 'cleaned_text', 'removeWellSign']].to_csv(
        after_clean_dir + file, sep=',', index=False, encoding='utf-8'
    )
    print(file, 'over')
print(N_origin)
print(N_filter)
2.词向量模型训练
--待预测数据集分词结果加入词向量模型
# Continue training the saved word2vec model on the prediction corpus.
after_clean_dir = 'D:\\documents\\data mining\\数据集\\代码\\after_clean\\'
files = os.listdir(after_clean_dir)
model = Word2Vec.load("D:/documents/data mining/数据集/代码/word2vec.bin")
for file in files:
    data = pd.read_table(after_clean_dir + file, sep=',', encoding='utf-8')
    # Cells that are not strings (e.g. NaN) carry no tokens and are skipped.
    filelist = [text.split(' ') for text in data['cleaned_text'] if isinstance(text, str)]
    if filelist:
        # Register the words of this file first: train() only updates
        # vectors of words already in the vocabulary, so without this step
        # words unseen in the training set would never get a vector.
        model.build_vocab(filelist, update=True)
        # total_examples must describe the corpus passed to this call.
        model.train(filelist, total_examples=len(filelist), epochs=model.iter)
    print(file, 'train over')
model.save("D:/documents/data mining/数据集/代码/word2vec.bin")
print('预测文本加入词向量模型-成功')
3.文本向量化
利用分词后的文本,从词向量模型中取出每个词语对应的向量(模型中不一定包含所有词,查不到的词会被跳过),按权重 1 加总后取平均,得到句子对应的文本向量。
# Vectorise the prediction texts (about 1.06 million according to the
# author): every text becomes the mean of the vectors of its known words.
after_clean_dir = 'D:\\documents\\data mining\\数据集\\代码\\after_clean\\'
vectors_dir = 'D:\\documents\\data mining\\数据集\\代码\\vectors\\'
files = os.listdir(after_clean_dir)
model = Word2Vec.load("D:/documents/data mining/数据集/代码/word2vec.bin")
word_vectors = model.wv
# Take the dimensionality from the model instead of hard-coding 100.
dim = model.vector_size
for file in files:
    data = pd.read_table(after_clean_dir + file, sep=',', encoding='utf-8')
    rows = []
    for text in data['cleaned_text']:
        # Keep one output row per input row, even for non-string cells
        # (e.g. NaN), so the vectors stay aligned with the text file.
        tokens = text.split(' ') if isinstance(text, str) else []
        text_vector = np.zeros(dim)
        count = 0
        for word in tokens:
            # Words missing from the model are skipped.
            if word in word_vectors:
                text_vector += word_vectors[word]
                count += 1
        if count != 0:
            text_vector /= count  # average over the ``count`` known words
        rows.append(text_vector)
    # Build the frame once; appending row by row copies the whole frame on
    # every iteration.
    df = pd.DataFrame(rows)
    df.to_csv(vectors_dir + file, sep=',', index=False, header=False)
    print(file, 'train over')
#--- vectorise the training texts ---
# Reads the "token,token,... label" lines from data_cut.txt and writes one
# CSV row per text: the averaged word vector followed by the integer label.
model = Word2Vec.load("D:/documents/data mining/数据集/代码/word2vec.bin")
word_vectors = model.wv
dim = model.vector_size  # taken from the model rather than hard-coded

data = []  # rows of [cut_words, label, vector]
with open('D:\\documents\\data mining\\数据集\\代码\\data_cut.txt', 'r') as txtfile:
    for line in txtfile:
        line = line.replace('\n', '')
        if not line.strip():
            continue  # ignore blank lines
        # The label is the last space-separated field.  Splitting from the
        # right keeps lines intact whose tokens contain spaces themselves.
        cut_words, _, label_text = line.rpartition(' ')
        data.append([cut_words, label_text])

for row in data:
    text_vector = np.zeros(dim)
    count = 0
    for word in row[0].split(','):
        # Words missing from the model are skipped.
        if word in word_vectors:
            text_vector += word_vectors[word]
            count += 1
    if count != 0:
        text_vector /= count  # average over the ``count`` known words
    row.append(list(text_vector))

if data:
    print(data[0])

with open('D:\\documents\\data mining\\数据集\\代码\\trainText_vector.csv', 'w', newline='') as tf:
    writer = csv.writer(tf, delimiter=',')
    # No header row is written: vector components first, label last.
    for _, label_text, vector in data:
        writer.writerow(vector + [int(label_text)])
print('训练文本向量化完成')
4.模型训练
--这里的模型为决策树模型,使用OneVsOne分类方式,是经过挑选的。训练过程中,将训练集向量9:1分为训练集和测试集,正确率较高,且在预测分类中效果较好。
from sklearn.multiclass import OneVsOneClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from joblib import dump, load
#--- model training and prediction: paths and training data ---
after_clean_dir = 'D:\\documents\\data mining\\数据集\\代码\\after_clean\\'
vectors_dir = 'D:\\documents\\data mining\\数据集\\代码\\vectors\\'
label_dir = 'D:\\documents\\data mining\\数据集\\代码\\text_label\\'
files = os.listdir(after_clean_dir)

# Load the labelled vectors.  trainText_vector.csv is written without a
# header row, so header=None is required; with header inference the first
# sample would be consumed as column names and lost.
labeled_path = 'D:\\documents\\data mining\\数据集\\代码\\trainText_vector.csv'
labeled = pd.read_table(labeled_path, sep=',', header=None)
n = len(labeled)
vectors = labeled.iloc[:, :-1]  # every column except the last: features
labels = labeled.iloc[:, -1]    # last column: label

# NOTE(review): the accompanying text describes a 9:1 split while the code
# holds out 20% for testing -- confirm which one is intended.
X_train, X_test, y_train, y_test = train_test_split(vectors, labels, test_size=0.2)

# Convenience copies of the split in list / ndarray form.
y_test_list = list(y_test)
y_train_list2 = np.array(list(y_train.map(lambda x: [x])))
X_train_list = np.array(X_train)
X_test_list = np.array(X_test)
n_train = len(y_train)
n_test = len(y_test)
def accuracy(a, b):
    """Return the fraction of positions at which ``a`` and ``b`` are equal.

    Args:
        a: sequence of predicted values.
        b: sequence of reference values, same length as ``a``.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when both sequences are empty.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(a) != len(b):
        raise ValueError(
            'a and b must have the same length ({} != {})'.format(len(a), len(b))
        )
    # len() is used instead of truthiness so numpy arrays are accepted too.
    if len(a) == 0:
        return 0.0
    matches = sum(1 for x, y in zip(a, b) if x == y)
    return matches / len(a)
# Fit a one-vs-one ensemble whose pairwise estimators are decision-tree
# regressors, score it on the held-out split and persist it with joblib.
model_tree_one = OneVsOneClassifier(DecisionTreeRegressor()).fit(X_train, y_train)

predict_tree_one = model_tree_one.predict(X_test)
print(predict_tree_one)

# The author recorded roughly 0.748 for one earlier run.
accuracy_tree_one = accuracy(predict_tree_one, y_test_list)
print(f"accuracy_tree_one:{accuracy_tree_one}")

# Saved relative to the current working directory.
dump(model_tree_one, 'model_tree_one.joblib')
print('预测模型建立并存储完成')
5.情感分类预测
# Prediction: label every vectorised file with the stored classifier and
# write the texts plus their predicted label to ``label_dir``.
# NOTE(review): this loads 'svc.joblib' although the variable is named
# model_tree_one; no visible code produces svc.joblib -- confirm which
# stored model is meant to be used here.
model_tree_one = load('D:\\documents\\data mining\\数据集\\代码\\svc.joblib')
for file in files:
    # Row i of the vector file is expected to belong to row i of the text
    # file of the same name.
    vectors_file = pd.read_table(f"{vectors_dir}{file}", sep=',', header=None)
    text_file = pd.read_table(f"{after_clean_dir}{file}", sep=',')
    text_file['label'] = model_tree_one.predict(vectors_file)
    text_file.to_csv(f"{label_dir}{file}", sep=',', index=False)
    print(file, 'predict over')
6.随便输出到.csv的分类结果(积极,消极,总数等)
# Summarise the predictions: per labelled file, count the rows labelled -1
# and 1 and the number of non-missing labels, then write one CSV row each.
from pandas import DataFrame

analysis_dir = 'D:\\documents\\data mining\\数据集\\代码\\text_label\\'
analysis_files = os.listdir(analysis_dir)
# NOTE(review): 'deta' looks like a typo for 'date'; it is kept unchanged
# because it is the header of the output file and consumers may rely on it.
summary_columns = ['deta', 'neg', 'pos', 'total']

summary_rows = []
for file in analysis_files:
    analysis_file = pd.read_table(analysis_dir + file, sep=',')
    vc = analysis_file['label'].value_counts(normalize=False, dropna=False)
    # .get() with a default keeps a file without any positive or negative
    # prediction from raising KeyError.
    pos = vc.get(1, 0)
    neg = vc.get(-1, 0)
    total = analysis_file['label'].count()
    print(file, neg, pos, total)
    # Row key: the file name without '.csv', remaining dots turned into '-'.
    summary_rows.append([file.replace('.csv', '').replace('.', '-'), neg, pos, total])

# Build the frame once instead of appending to it inside the loop.
analysis_df = DataFrame(summary_rows, columns=summary_columns)
analysis_df.to_csv('D:\\documents\\data mining\\数据集\\代码\\结果图.csv', sep=',', index=False)