预处理方法

2024-03-24 10:47:34

import re
import numpy as np
# Training data file; passed to the data-loading call further down.
train_path = 'data/train.txt'
# NOTE(review): not referenced in the visible source; looks like an older
# duplicate of `embedding_file_path` below -- confirm before removing.
embedding_file = 'model/token_vec_300.bin'
# Stop-word list file; passed to getStopWords.
stop_words_path = 'data/stop_words.txt'
# NOTE(review): not referenced in the visible source; purpose unconfirmed.
temporary_variable_path = 'data/variable'
# NOTE(review): shares its name with loadEmbeddingsFile's parameter, but no
# visible call passes this module-level value -- confirm intended usage.
embedding_file_path = 'data/embedding/token_vec_300.bin'

# 返回x，y的列表集合

def loadData(train_path):
    """Read tab-separated sentence pairs and their labels from a text file.

    Every usable line has the form ``left<TAB>right<TAB>label``. Lines with
    fewer than three tab-separated fields (for example blank lines) are
    skipped.

    Args:
        train_path: Path of the UTF-8 encoded training file.

    Returns:
        A tuple ``(x_left_list, x_right_list, y_list)`` of three parallel
        lists of strings: left sentences, right sentences and labels.
    """
    x_left_list = []
    x_right_list = []
    y_list = []
    # The context manager closes the file even if parsing fails midway.
    with open(train_path, encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip().split('\t')
            # str.split always returns a non-empty list, so check the field
            # count explicitly instead of relying on truthiness.
            if len(fields) < 3:
                continue
            seg_left, seg_right, label = fields[:3]
            x_left_list.append(seg_left)
            # The right-hand list must receive the right-hand sentence.
            x_right_list.append(seg_right)
            y_list.append(label)
    return x_left_list, x_right_list, y_list

# Load the raw sentence pairs and labels. The loader defined above is named
# `loadData`; calling `load_data` raised NameError.
x_left_list, x_right_list, y_list = loadData(train_path)

# 正则--只保留汉字

def cleanData(data_list):
    """Remove everything except Chinese characters from each sentence.

    Args:
        data_list: Iterable of sentence strings.

    Returns:
        A new list, in the same order, where every run of characters outside
        the range U+4E00..U+9FA5 has been deleted from each sentence.
    """
    non_chinese = re.compile("[^\u4e00-\u9fa5]+")
    return [non_chinese.sub("", text) for text in data_list]

# Keep only Chinese characters in the left-hand sentences. The cleaner defined
# above is named `cleanData`; calling `clean_data` raised NameError.
x_left_clear = cleanData(x_left_list)

def getCharFromSentence(sentence_list):
    """Split every sentence into a list of its elements (characters).

    Args:
        sentence_list: Iterable of sentences; each sentence is iterated and
            its falsy elements are dropped.

    Returns:
        A list with one list of elements per input sentence. An empty input
        yields an empty list.
    """
    char_list = [[char for char in sentence if char]
                 for sentence in sentence_list]
    print('cleaning finished!!!')
    # Only preview the last sentence when there is one; indexing [-1] on an
    # empty result would raise IndexError.
    if char_list:
        print('last sentence:{}'.format(char_list[-1]))
    return char_list

# Split each cleaned left-hand sentence into a list of single characters.
x_left_char = getCharFromSentence(x_left_clear)

def getStopWords(stop_words_path):
    """Load stop words from a UTF-8 text file, one word per line.

    Surrounding whitespace is stripped from every line and lines that end up
    empty are ignored. The last ten loaded words are printed as a preview.

    Args:
        stop_words_path: Path of the stop-word file.

    Returns:
        List of stop words in file order.
    """
    with open(stop_words_path, 'r', encoding='utf-8') as handle:
        stripped = (raw.strip() for raw in handle)
        stop_words = [entry for entry in stripped if entry]
    print('loading stopwords finished!!!')
    print(stop_words[-10:])
    return stop_words
# Load the stop-word list once at module level for the filtering step below.
stop_words_list = getStopWords(stop_words_path)
## This function can only be used after the sentences have been tokenized.
def removeStopWords(sentence_list,stop_words_list):
    """Drop stop words from every tokenized sentence.

    Args:
        sentence_list: Iterable of sentences, each an iterable of hashable
            tokens (characters or words).
        stop_words_list: Iterable of hashable tokens to remove.

    Returns:
        A list of lists containing, per sentence, the tokens that are not stop
        words, in their original order.
    """
    # Build the lookup set once so each membership test is O(1) instead of a
    # linear scan over the whole stop-word list.
    stop_words = frozenset(stop_words_list)
    return [[word for word in sentence if word not in stop_words]
            for sentence in sentence_list]
# Filter stop words out of the character lists (rebinds x_left_char).
x_left_char = removeStopWords(x_left_char,stop_words_list)

# 获取句子的最大长度

def getMaxLength(train_data):
    """Pick a sequence length that covers more than 95% of the samples.

    The candidate lengths 10..99 are tried in ascending order; the first
    candidate ``i`` for which more than 95% of the samples are strictly
    shorter than ``i`` is returned.

    Args:
        train_data: Sequence of tokenized samples (anything supporting
            ``len``).

    Returns:
        The chosen length as an int. If no candidate in 10..99 reaches the
        coverage threshold, the length of the longest sample is returned.
        Empty input returns 0.
    """
    # Nothing to measure; also avoids a division by zero below.
    if len(train_data) == 0:
        return 0
    lengths = np.array([len(sample) for sample in train_data])
    total = len(lengths)
    for i in range(10, 100):  # estimated search window
        rate = np.sum(lengths < i) / total
        if rate > 0.95:
            print("max_tokens={}".format(i))
            return i
    # No candidate covered 95% of the data. Fall back to the longest sample so
    # the result is always defined (previously this raised UnboundLocalError).
    max_length = int(lengths.max())
    print("max_tokens={}".format(max_length))
    return max_length

# Takes the list of sentences after they have been split into chars/words.
def buildWordDict(all_text_list):
    """Build a word -> index vocabulary from tokenized sentences.

    Indices start at 1 and are assigned in order of first appearance, so the
    mapping is identical on every run. (Going through ``set`` made the order,
    and therefore the indices, depend on string hash randomization.)

    Args:
        all_text_list: Iterable of sentences, each an iterable of hashable
            tokens.

    Returns:
        ``collections.OrderedDict`` mapping each distinct token to its index
        in ``1..N``.
    """
    import collections
    word2index = collections.OrderedDict()
    for sentence in all_text_list:
        for word in sentence:
            if word not in word2index:
                # Index 0 is left unassigned; the next free index is size + 1.
                word2index[word] = len(word2index) + 1
    return word2index
def buildIndex2Word(word2index):
    """Invert a word -> index mapping.

    Args:
        word2index: Mapping from word to index.

    Returns:
        A plain dict mapping each index back to its word.
    """
    return {position: token for token, position in word2index.items()}

# Build the vocabulary and its inverse. The inverse must be built from
# `word2index`; the previous argument `word` was an undefined name.
word2index = buildWordDict(x_left_char)
index2word = buildIndex2Word(word2index)

# 句子转数字，输入是句子的列表

def tokenizeSentence(sentence_list,word2index):
    """Convert tokenized sentences into lists of vocabulary indices.

    Args:
        sentence_list: Iterable of sentences, each an iterable of tokens.
        word2index: Dict-like mapping from token to integer index.

    Returns:
        A list with one list of indices per sentence. Tokens missing from
        ``word2index`` are encoded as 0.
    """
    # dict.get replaces the former bare ``except:``, which also swallowed
    # unrelated errors such as KeyboardInterrupt.
    return [[word2index.get(word, 0) for word in sentence]
            for sentence in sentence_list]

# 数字转句子

def tokenizeNumber(number_list,index2word):
    """Convert lists of vocabulary indices back into lists of tokens.

    Args:
        number_list: Iterable of index sequences, one per sentence.
        index2word: Dict-like mapping from integer index to token.

    Returns:
        A list with one list of tokens per input sequence. Indices missing
        from ``index2word`` are decoded as a single space ``' '``.
    """
    # dict.get replaces the former bare ``except:``, which also swallowed
    # unrelated errors such as KeyboardInterrupt.
    return [[index2word.get(index, ' ') for index in number]
            for number in number_list]

# Encode the left-hand character lists as vocabulary indices.
tokens_List = tokenizeSentence(x_left_char,word2index)

def loadEmbeddingsFile(embedding_file_path, embedding_dim=300):
    """Load word vectors from a space-separated text embedding file.

    Each usable line has the form ``word v1 v2 ... vN`` with exactly
    ``embedding_dim`` numeric components. Any other line (for example a
    header or a truncated row) is skipped, so every stored vector has the
    same length.

    Args:
        embedding_file_path: Path of the UTF-8 encoded embedding file.
        embedding_dim: Number of components per vector. Defaults to 300.

    Returns:
        Dict mapping each word to a ``float32`` NumPy array of length
        ``embedding_dim``.
    """
    embeddings_dict = {}
    # One field for the word plus embedding_dim components. The former
    # ``len(values) < 300`` check let a 300-field line through, which is only
    # a 299-dimensional vector.
    expected_fields = embedding_dim + 1
    with open(embedding_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split(' ')
            if len(values) != expected_fields:
                continue
            word = values[0]
            embeddings_dict[word] = np.asarray(values[1:], dtype='float32')
    print('Found %s word vectors.' % len(embeddings_dict))
    return embeddings_dict

# Get the word -> vector mapping restricted to this corpus' vocabulary.
def getEmbeddingsDict(embeddings_dict,word2index):
    """Select the embedding vectors for the words in the vocabulary.

    Args:
        embeddings_dict: Object supporting ``embeddings_dict[word]`` lookup
            that raises ``KeyError`` for unknown words.
        word2index: Mapping whose keys are the vocabulary words.

    Returns:
        Dict mapping each vocabulary word found in ``embeddings_dict`` to its
        vector. Words without a vector are printed and left out.
    """
    word2vector_dict = {}
    for word in word2index.keys():
        try:
            word2vector_dict[word] = embeddings_dict[word]
        except KeyError:
            # Only a missing word is expected here; a bare ``except:`` would
            # also hide unrelated failures.
            print(word)
    return word2vector_dict

# NOTE(review): `modelbin` is not defined anywhere in the visible source --
# presumably a pre-loaded word -> vector lookup; confirm it is created before
# this line runs, otherwise this raises NameError.
word2vector_dict = getEmbeddingsDict(modelbin,word2index)

def getEmbeddingWeights(word2index,word2vector_dict,embedding_dim):
    """Assemble the embedding matrix for the vocabulary.

    Args:
        word2index: Mapping from word to row index (indices start at 1).
        word2vector_dict: Dict mapping word to its embedding vector.
        embedding_dim: Number of columns of the matrix.

    Returns:
        NumPy array of shape ``(len(word2index) + 1, embedding_dim)``. Row 0
        and the rows of words without a vector stay all zeros.
    """
    # One extra row because index 0 is reserved for out-of-vocabulary words.
    row_count = len(word2index) + 1
    weights = np.zeros((row_count, embedding_dim))
    for token, row in word2index.items():
        vector = word2vector_dict.get(token)
        if vector is None:
            continue
        weights[row] = vector
    return weights
# Build the embedding matrix, passing 300 as the embedding dimension.
embedding_weights = getEmbeddingWeights(word2index,word2vector_dict,300)

码农公寓

返回x，y的列表集合

正则--只保留汉字

获取句子的最大长度

句子转数字 ，输入是句子的列表

数字转句子

相关文章

句子转数字，输入是句子的列表