import re
import numpy as np
# File-system locations used by the preprocessing steps in this file.
train_path = "data/train.txt"  # training corpus handed to the data loader
embedding_file = "model/token_vec_300.bin"  # NOTE(review): not referenced in the visible code
stop_words_path = "data/stop_words.txt"  # stop-word list handed to getStopWords
temporary_variable_path = "data/variable"  # NOTE(review): not referenced in the visible code
embedding_file_path = "data/embedding/token_vec_300.bin"  # presumably the pre-trained vectors; confirm
# Returns the collection of x and y lists.
def loadData(train_path):
    """Read a tab-separated training file into three parallel lists.

    Every non-blank line is expected to contain at least three
    tab-separated fields: left sentence, right sentence and label.
    Blank lines are skipped.

    Args:
        train_path: Path of the UTF-8 encoded training file.

    Returns:
        A tuple ``(x_left_list, x_right_list, y_list)`` of equally long
        lists of strings, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    y_list = []
    x_left_list = []
    x_right_list = []
    # The context manager closes the file even when a line fails to parse.
    with open(train_path, encoding='utf-8') as train_file:
        for raw_line in train_file:
            stripped = raw_line.rstrip()
            # ``''.split('\t')`` yields ``['']`` (truthy), so blank lines
            # have to be detected before splitting.
            if not stripped:
                continue
            fields = stripped.split('\t')
            seg_left = fields[0]
            seg_right = fields[1]
            label = fields[2]
            x_left_list.append(seg_left)
            # Store the right-hand sentence, not the left one a second time.
            x_right_list.append(seg_right)
            y_list.append(label)
    return x_left_list, x_right_list, y_list
# Load the training pairs; the loader defined in this file is named loadData.
x_left_list, x_right_list, y_list = loadData(train_path)
# Regex -- keep only Chinese characters.
def cleanData(data_list):
    """Remove every character outside U+4E00..U+9FA5 from each string.

    Args:
        data_list: Iterable of strings.

    Returns:
        A list holding one cleaned string per input, in input order.
    """
    non_chinese = re.compile("[^\u4e00-\u9fa5]+")
    return [non_chinese.sub("", text) for text in data_list]
# Strip non-Chinese characters from the left-hand sentences; the cleaner
# defined in this file is named cleanData.
x_left_clear = cleanData(x_left_list)
def getCharFromSentence(sentence_list):
    """Split every sentence into a list of its elements (characters).

    Falsy elements (e.g. empty strings inside an already tokenized
    sentence) are dropped. Progress messages are printed to stdout.

    Args:
        sentence_list: Iterable of sentences; each sentence is iterated
            element by element (a ``str`` yields its characters).

    Returns:
        A list with one list of elements per input sentence.
    """
    char_list = [
        [char for char in sentence if char]
        for sentence in sentence_list
    ]
    print('cleaning finished!!!')
    # Only preview the last sentence when there is one: indexing [-1] on an
    # empty result would raise IndexError.
    if char_list:
        print('last sentence:{}'.format(char_list[-1]))
    return char_list
# Split each cleaned left-hand sentence into a list of single characters.
x_left_char = getCharFromSentence(x_left_clear)
def getStopWords(stop_words_path):
    """Load the stop words stored one per line in a UTF-8 text file.

    Surrounding whitespace is stripped from every line and lines that are
    empty afterwards are ignored. A status message and the last ten
    entries are printed to stdout.

    Args:
        stop_words_path: Path of the stop-word file.

    Returns:
        A list of stop words in file order.
    """
    with open(stop_words_path, 'r', encoding='utf-8') as handle:
        stripped_lines = [raw.strip() for raw in handle]
    stop_words = [entry for entry in stripped_lines if entry]
    print('loading stopwords finished!!!')
    print(stop_words[-10:])
    return stop_words
# Load the stop-word list from stop_words_path when the module is executed.
stop_words_list = getStopWords(stop_words_path)
## This function can only be used after tokenization.
def removeStopWords(sentence_list,stop_words_list):
    """Drop stop words from already tokenized sentences.

    Args:
        sentence_list: Iterable of sentences, each an iterable of hashable
            tokens (words or characters).
        stop_words_list: Iterable of hashable tokens to remove.

    Returns:
        A list with one filtered token list per input sentence; token
        order is preserved.
    """
    # Build the lookup set once so each membership test is O(1) instead of
    # scanning the whole stop-word list for every token.
    stop_words = frozenset(stop_words_list)
    return [
        [word for word in sentence if word not in stop_words]
        for sentence in sentence_list
    ]
# Remove stop words from the character lists, rebinding x_left_char.
x_left_char = removeStopWords(x_left_char,stop_words_list)
# Get the maximum sentence length.
def getMaxLength(train_data, lower=10, upper=100, coverage=0.95):
    """Pick a sequence length that covers most of the sentences.

    Candidates ``lower .. upper - 1`` are tried in increasing order and the
    first one for which more than ``coverage`` of the sentences are
    strictly shorter than the candidate is returned. The chosen value is
    printed to stdout.

    Args:
        train_data: Sequence of sized items (e.g. token lists).
        lower: First candidate length (inclusive).
        upper: End of the candidate range (exclusive).
        coverage: Required fraction of sentences shorter than the result.

    Returns:
        The selected length as an ``int``. If no candidate reaches the
        requested coverage, the length of the longest sentence is returned;
        for empty input the result is 0.
    """
    lengths = np.array([len(item) for item in train_data])
    if lengths.size == 0:
        # Nothing to measure; avoid dividing by zero below.
        max_length = 0
    else:
        total = lengths.size
        # Fallback for corpora whose sentences exceed the candidate range;
        # without it the function would fail with an unbound local.
        max_length = int(lengths.max())
        for candidate in range(lower, upper):  # estimated search range
            if np.sum(lengths < candidate) / total > coverage:
                max_length = candidate
                break
    print("max_tokens={}".format(max_length))
    return max_length
# Takes the list of sentences after character/word segmentation.
def buildWordDict(all_text_list):
    """Assign a 1-based integer index to every distinct token.

    Indices follow first-occurrence order, so the mapping is identical on
    every run. (Going through ``set`` made the order depend on string hash
    randomization, which changes between interpreter runs.)

    Args:
        all_text_list: Iterable of tokenized sentences (iterables of
            hashable tokens).

    Returns:
        A ``collections.OrderedDict`` mapping token -> index, starting at 1.
        Index 0 is never assigned.
    """
    import collections
    word2index = collections.OrderedDict()
    for sentence in all_text_list:
        for word in sentence:
            if word not in word2index:
                # The next free index is the current size plus one.
                word2index[word] = len(word2index) + 1
    return word2index
def buildIndex2Word(word2index):
    """Invert a word -> index mapping.

    Args:
        word2index: Mapping from word to index.

    Returns:
        A plain ``dict`` mapping index -> word. When several words share an
        index, the one that comes last in iteration order wins.
    """
    return {index: word for word, index in word2index.items()}
# Build the vocabulary from the left-hand character lists and its inverse.
word2index = buildWordDict(x_left_char)
# The inverse mapping has to be built from word2index; `word` is not defined
# at module level.
index2word = buildIndex2Word(word2index)
# Convert sentences to numbers; the input is a list of sentences.
def tokenizeSentence(sentence_list,word2index):
    """Convert tokenized sentences into lists of vocabulary indices.

    Args:
        sentence_list: Iterable of sentences, each an iterable of hashable
            tokens.
        word2index: Dict mapping token -> index.

    Returns:
        A list with one index list per sentence; tokens missing from
        ``word2index`` are encoded as 0.
    """
    # dict.get replaces the former bare ``except``, which also swallowed
    # unrelated errors such as KeyboardInterrupt.
    return [
        [word2index.get(word, 0) for word in sentence]
        for sentence in sentence_list
    ]
# Convert numbers back to sentences.
def tokenizeNumber(number_list,index2word):
    """Convert lists of vocabulary indices back into tokens.

    Args:
        number_list: Iterable of index sequences (one per sentence).
        index2word: Dict mapping index -> token.

    Returns:
        A list with one token list per input sequence; indices missing
        from ``index2word`` are rendered as a single space ``' '``.
    """
    # dict.get replaces the former bare ``except``, which also swallowed
    # unrelated errors such as KeyboardInterrupt.
    return [
        [index2word.get(index, ' ') for index in number]
        for number in number_list
    ]
# Encode the left-hand character lists as vocabulary indices (0 = unknown).
tokens_List = tokenizeSentence(x_left_char,word2index)
def loadEmbeddingsFile(embedding_file_path, embedding_dim=300):
    """Parse a text-format embedding file into a word -> vector dict.

    Each usable line consists of a word followed by ``embedding_dim``
    space-separated numbers. Lines with any other number of fields (for
    example a header line or a truncated row) are skipped, so every stored
    vector has exactly ``embedding_dim`` components. The number of loaded
    vectors is printed to stdout.

    Args:
        embedding_file_path: Path of the UTF-8 encoded embedding file.
        embedding_dim: Number of components per vector (default 300).

    Returns:
        A dict mapping each word to a ``float32`` NumPy array of shape
        ``(embedding_dim,)``.

    Raises:
        ValueError: If a line with the expected number of fields contains
            a non-numeric component.
    """
    embeddings_dict = {}
    # One field for the word itself plus one per vector component. The old
    # check ``len(values) < 300`` let rows with only 299 components through.
    expected_fields = embedding_dim + 1
    with open(embedding_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split(' ')
            if len(values) != expected_fields:
                continue
            word = values[0]
            embeddings_dict[word] = np.asarray(values[1:], dtype='float32')
    print('Found %s word vectors.' % len(embeddings_dict))
    return embeddings_dict
# Build the word2vector mapping for this corpus.
def getEmbeddingsDict(embeddings_dict,word2index):
    """Select the pre-trained vectors for the words of this corpus.

    Words that have no entry in ``embeddings_dict`` are printed to stdout
    and left out of the result.

    Args:
        embeddings_dict: Mapping word -> vector supporting ``[]`` lookup
            that raises ``KeyError`` for unknown words.
        word2index: Mapping whose keys are the corpus vocabulary.

    Returns:
        A dict mapping each covered vocabulary word to its vector.
    """
    word2vector_dict = {}
    for word in word2index.keys():
        try:
            word2vector_dict[word] = embeddings_dict[word]
        except KeyError:
            # Report out-of-vocabulary words; only a missing key is an
            # expected failure here, anything else should propagate.
            print(word)
    return word2vector_dict
# `modelbin` was used here without being assigned anywhere in this file.
# NOTE(review): loading it with loadEmbeddingsFile from embedding_file_path is
# an assumption -- confirm the intended source (embedding_file is the other
# candidate path) and that the file is in text format.
modelbin = loadEmbeddingsFile(embedding_file_path)
word2vector_dict = getEmbeddingsDict(modelbin, word2index)
def getEmbeddingWeights(word2index,word2vector_dict,embedding_dim):
    """Assemble the embedding matrix for the vocabulary.

    Args:
        word2index: Mapping word -> row index.
        word2vector_dict: Mapping word -> vector of length ``embedding_dim``.
        embedding_dim: Number of columns of the matrix.

    Returns:
        A NumPy array of shape ``(len(word2index) + 1, embedding_dim)``.
        Rows of words without a vector stay all zeros.
    """
    # One extra row because index 0 is reserved for unknown words.
    row_count = len(word2index) + 1
    weights = np.zeros((row_count, embedding_dim))
    for token, row in word2index.items():
        vector = word2vector_dict.get(token)
        if vector is None:
            continue
        weights[row] = vector
    return weights
# Build the (vocabulary size + 1) x 300 embedding matrix for the corpus.
embedding_weights = getEmbeddingWeights(word2index,word2vector_dict,300)