nltk 获取 gutenberg 语料，gensim 生成词库和 onehot 编码

2022-05-04 06:56:34

nltk 获取 gutenberg 语料
gensim 生成词库和 onehot 编码

我正在尝试基于 TensorFlow 的 LSTM 模型开发另一个项目，需要用到自然语言处理的工具和语料。下面的代码用 nltk 读取 gutenberg 语料，再用 gensim 构建词典，并把每个句子转换成 one-hot 编码。

import nltk
import numpy as np
from nltk.corpus import gutenberg
from gensim import corpora, models, similarities


class Book2Array(object):
    """Turn tokenized sentences into fixed-size one-hot matrices.

    Each sentence (a sequence of token strings) becomes a matrix of shape
    ``(max_len, vocab_size)``: row ``i`` is the one-hot vector of the i-th
    token, sentences longer than ``max_len`` are truncated, and shorter
    ones are padded with all-zero rows.

    Attributes:
        sentences: the tokenized sentences passed to the constructor.
        token2id_dic: mapping from token to integer id, built with gensim.
        vocab_size: width of every one-hot vector.
        max_len: number of rows in every sentence matrix.
    """

    # Class-level defaults; __init__ overrides them on each instance.
    sentences = None
    token2id_dic = None
    vocab_size = 8192
    max_len = 20

    def __init__(self, sentences, vocab_size=8192, max_len=20):
        """Store the sentences and build the token -> id mapping.

        Args:
            sentences: iterable of sentences, each a sequence of tokens.
            vocab_size: length of each one-hot vector. Every token id must
                be smaller than this value.
            max_len: number of tokens kept per sentence.
        """
        self.sentences = sentences
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.token2id_dic = self.get_token2id_dic()

    def get_sentences(self):
        """Print the number of sentences and return them as a list."""
        # Use the instance's own sentences rather than a module-level
        # global, so the method works wherever the class is used.
        print(len(self.sentences))
        return list(self.sentences)

    def get_token2id_dic(self):
        """Build a gensim dictionary from the sentences.

        Prints the number of entries in the dictionary.

        Returns:
            The dictionary's ``token2id`` mapping.
        """
        # Collect statistics about all tokens.
        dictionary = corpora.Dictionary(self.sentences)
        # No tokens are filtered out here; compactify() only reassigns ids
        # so that the id sequence has no gaps.
        dictionary.compactify()
        print(len(dictionary))
        return dictionary.token2id

    def word2onehot(self, word):
        """Return the one-hot vector of ``word``.

        Args:
            word: a token present in ``token2id_dic``.

        Returns:
            A 1-D array of length ``vocab_size`` with a single 1.

        Raises:
            KeyError: if ``word`` is not in ``token2id_dic``.
            IndexError: if the token's id is not below ``vocab_size``.
        """
        onehot = np.zeros(self.vocab_size)
        onehot[self.token2id_dic[word]] = 1
        return onehot

    def sent2vec(self, sentence):
        """Encode one sentence as a ``(max_len, vocab_size)`` matrix.

        Args:
            sentence: sequence of tokens; only the first ``max_len`` are
                used.

        Returns:
            A 2-D array whose i-th row is the one-hot vector of the i-th
            token; rows past the end of the sentence stay all-zero.

        Raises:
            KeyError: if a token is not in ``token2id_dic``.
            IndexError: if a token's id is not below ``vocab_size``.
        """
        # Allocate the padded matrix once instead of building one vector
        # per token and per padding row.
        matrix = np.zeros((self.max_len, self.vocab_size))
        for row, word in enumerate(sentence[:self.max_len]):
            matrix[row, self.token2id_dic[word]] = 1
        return matrix

    def sentences2array(self):
        """Return a list with the ``sent2vec`` matrix of every sentence."""
        return [self.sent2vec(sentence) for sentence in self.sentences]

    def gen_batch(self):
        """Placeholder for batch generation; not implemented, returns None."""
        pass

if __name__ == '__main__':
    # Demo: encode the sentences of the NLTK Gutenberg file
    # 'shakespeare-macbeth.txt' and print the shape of the first matrix.
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    converter = Book2Array(macbeth_sentences)
    # Called for its printed sentence count; the returned list is unused.
    converter.get_sentences()
    matrices = converter.sentences2array()
    first_matrix = np.array(matrices[0])
    print(first_matrix.shape)

更多教程：http://www.tensorflownews.com/

码农公寓

相关文章