import spacy
# Load the spaCy pipeline once at import time; dataPrepro reuses this module-level object on every call.
nlp = spacy.load("en_core_web_lg")
# Load the stopword list
from nltk.corpus import stopwords
stopword_list = list(stopwords.words('english'))
# Extra stopwords appended to NLTK's English list.
add_stopword_list = ["'s",'also','even']
stopword_list+=add_stopword_list
# Wider alternative tag set that also includes verb (VB*) and noun (NN*) tags; currently disabled:
# pos_tag = ['JJ','JJR','JJS','RB','RBR','RBS','VB','VBD','VBG','VBN','VBP','VBZ','NN','NNP','NNPS','NNS']
# Tags to keep; these look like Penn Treebank adjective (JJ*) and adverb (RB*) tags -- confirm against the model's tag set.
pos_tag = ['JJ','JJR','JJS','RB','RBR','RBS']
def dataPrepro(corpus, stopword_list, pos_tag):
    """Reduce a text to its non-stopword tokens that carry one of the given tags.

    The text is run through the module-level spaCy pipeline ``nlp``. A token is
    kept when its text is not in ``stopword_list`` and its ``tag_`` attribute is
    in ``pos_tag``.

    Args:
        corpus: The text to process, as a single ``str``.
        stopword_list: Collection of stopword strings to drop. Entries are
            compared to ``token.text`` exactly (case-sensitive).
        pos_tag: Collection of tag strings; only tokens whose ``tag_`` is one
            of these are kept.

    Returns:
        The kept token texts, in document order, joined by single spaces.
        An empty string when no token qualifies.
    """
    # NOTE(review): matching is case-sensitive, so with an all-lowercase
    # stopword list a capitalised token such as "Also" is not removed.
    # Left as-is to preserve existing behaviour -- confirm whether intended.

    # Build the lookup sets once so each membership test is O(1) instead of a
    # linear scan of the list for every token.
    stopword_set = set(stopword_list)
    allowed_tags = set(pos_tag)

    kept_tokens = [
        token.text
        for token in nlp(corpus)
        if token.text not in stopword_set and token.tag_ in allowed_tags
    ]
    return " ".join(kept_tokens)