Word2vec in Practice

1. Sentiment analysis on English text

import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk.data
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

def load_dataset(name,nrows=None):
    datasets={
        "unlabeled_train":"unlabeledTrainData.tsv",
        "labeled_train":"labeledTrainData.tsv",
        "test":"testData.tsv"
    }
    if name not in datasets:
        raise ValueError("unknown dataset name: {}".format(name))
    data_file=os.path.join("..","data",datasets[name])
    df=pd.read_csv(data_file,sep="\t",escapechar="\\",nrows=nrows)
    print("number of reviews:{}".format(len(df)))
    return df

# load the unlabeled training data
df = load_dataset("unlabeled_train")
print(df.head())  # 50000 reviews

# text preprocessing
stopword = set(stopwords.words("english"))  # English stop words (requires nltk.download("stopwords"))

def clean_text(text, remove_stopwords=False):
    # strip HTML tags, keep letters only, then lower-case and split into words
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[^a-zA-Z]", " ", text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in stopword]
    return words

# punkt sentence splitter (requires nltk.download("punkt"))
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

def split_sentences(review):
    # split a review into sentences, then clean and tokenize each sentence
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]
    return sentences

# preprocess the reviews in the dataframe: clean each review and split it into tokenized sentences

df["clean_review"] = df.review.apply(clean_text)
sentences = sum(df.review.apply(split_sentences), [])  # flatten the per-review sentence lists

# train the word-embedding model with gensim
num_features = 300     # embedding dimensionality
min_word_count = 40    # ignore words that occur fewer times than this
num_workers = 4        # number of training threads
context = 10           # context window size
downsampling = 1e-3    # downsampling rate for frequent words

# note: the dimensionality parameter is `vector_size` in gensim >= 4.0 (it was `size` in older versions)
model = Word2Vec(sentences, workers=num_workers, vector_size=num_features,
                 min_count=min_word_count, window=context, sample=downsampling)

# model.init_sims(replace=True) is deprecated in gensim >= 4.0; normalized vectors are computed on demand
model.save(os.path.join("..", "models", "model_name"))

# sanity-check the trained word vectors
print(model.wv.doesnt_match("man woman child kitchen".split()))  # kitchen
print(model.wv.most_similar("man"))


df=load_dataset("labeled_train")

def to_review_vector(review):
    # represent a review by the average of its word vectors
    words = clean_text(review, remove_stopwords=True)
    array = np.array([model.wv[w] for w in words if w in model.wv])
    return pd.Series(array.mean(axis=0))

train_data_feature=df.review.apply(to_review_vector)
print(train_data_feature.head())

# build a random-forest classifier on the averaged word vectors
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_feature, df["sentiment"])

# confusion matrix on the training data
print(confusion_matrix(df["sentiment"], forest.predict(train_data_feature)))
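# the confusion matrix above is computed on the training data itself, so it is an
# optimistic estimate; a minimal held-out evaluation sketch (not in the original code),
# assuming sklearn's train_test_split:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    train_data_feature, df["sentiment"], test_size=0.2, random_state=42)
val_forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
print(confusion_matrix(y_val, val_forest.predict(X_val)))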

del df
del train_data_feature


df=load_dataset("test")
test_data=df.review.apply(to_review_vector)

predict = forest.predict(test_data)
output = pd.DataFrame({"id": df["id"], "sentiment": predict})
# save the predictions to a CSV file
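# the output filename below is illustrative; any path works
output.to_csv(os.path.join("..", "data", "word2vec_rf_predictions.csv"), index=False)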
