6.1 Working with text data
6.1.1 One-hot encoding of words and characters
(1) Word-level one-hot encoding:
# Word-level one-hot encoding
import numpy as np

# Initial data: each element of the list is one sample
# (here a sample is a sentence, but it could be an entire document)
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Build an index of all tokens in the data
token_index = {}
for sample in samples:
    # Tokenize the sample with the split method. In a real application you
    # would also strip punctuation and special characters from the samples.
    for word in sample.split():
        if word not in token_index:
            # Assign a unique index to each unique word
            token_index[word] = len(token_index) + 1
            # Note that index 0 is not assigned to any word

# Vectorize the samples. Only the first max_length words of each sample
# are considered.
max_length = 10

# Store the result in results, a 3D tensor: the first axis is the sample,
# the second axis is the position of the word within the sample, and the
# third axis is the one-hot vector representing that word.
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1.  # mark the word's entry in the 3D tensor
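A quick sanity check on the result (a minimal sketch reusing the variables from the listing above; the expected shape and word are assumptions based on the two sample sentences):

print(results.shape)  # (2, 10, 11): 2 samples, 10 positions, 10 unique tokens plus the unused index 0

# Invert token_index to map a one-hot position back to its word
index_to_word = {index: word for word, index in token_index.items()}
print(index_to_word[int(np.argmax(results[0, 1]))])  # expected: 'cat'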
(2) Character-level one-hot encoding:
import string
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable  # all printable ASCII characters
token_index = dict(zip(characters, range(1, len(characters) + 1)))

max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        results[i, j, index] = 1.
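As a sketch of how to decode this character-level encoding back to text (variable names from the listing above; not part of the original code):

print(results.shape)  # (2, 50, 101): string.printable contains 100 characters

# Invert token_index and skip the all-zero padding rows
index_to_char = {index: char for char, index in token_index.items()}
decoded = ''.join(index_to_char[int(np.argmax(vec))]
                  for vec in results[0] if vec.any())
print(decoded)  # 'The cat sat on the mat.'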
(3) Word-level one-hot encoding with Keras:
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Create a tokenizer, configured to only take into account
# the 1,000 most common words
tokenizer = Tokenizer(num_words=1000)
# Build the word index
tokenizer.fit_on_texts(samples)

# Turn strings into lists of integer indices
sequences = tokenizer.texts_to_sequences(samples)

# You could also directly get the one-hot binary representations.
# This tokenizer supports other vectorization modes besides one-hot.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

# Recover the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
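To inspect what the tokenizer produced (a small sketch; the values shown are what one would expect for these two samples, since the Tokenizer lowercases text and strips punctuation by default):

print(sequences)
# e.g. [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]] -- 'the' gets index 1 as the most frequent token
print(one_hot_results.shape)
# (2, 1000): one row per sample, num_words columns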
(4) Word-level one-hot encoding with the hashing trick:
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Store the words as vectors of size 1,000. If you have close to 1,000 words
# (or more), you will see many hash collisions, which will decrease the
# accuracy of this encoding method.
dimensionality = 1000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        # Hash the word into a "random" integer index between 0 and 1,000
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.
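One caveat: since Python 3.3, the built-in hash of strings is randomized per interpreter process (unless PYTHONHASHSEED is set), so the indices above change between runs. A sketch of a deterministic alternative using a stable digest (stable_index is a hypothetical helper, not part of the original listing):

import hashlib

def stable_index(word, dimensionality=1000):
    # MD5 produces the same digest in every process, so the derived
    # index is reproducible across runs and machines
    digest = hashlib.md5(word.encode('utf-8')).hexdigest()
    return int(digest, 16) % dimensionality

print(stable_index('cat'))  # identical in every run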
6.1.2 Using word embeddings
(1) Learning word embeddings with an Embedding layer:
from keras.layers import Embedding

# The Embedding layer takes at least two arguments: the number of possible
# tokens (here 1,000, i.e. maximum word index + 1) and the dimensionality
# of the embeddings (here 64)
embedding_layer = Embedding(1000, 64)

from keras.datasets import imdb
from keras import preprocessing

# Number of words to consider as features
max_features = 10000
# Cut texts after this number of words
# (among the max_features most common words)
maxlen = 20

# Load the data as lists of integers
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Turn the lists of integers into a 2D tensor of shape (samples, maxlen)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

from keras.models import Sequential
from keras.layers import Flatten, Dense

model = Sequential()
# Specify the maximum input length of the Embedding layer so that the
# embedded inputs can be flattened afterwards
model.add(Embedding(max_features, 8, input_length=maxlen))
# After this layer, the activations have shape (samples, maxlen, 8)

# Flatten the 3D tensor of embeddings into a 2D tensor of
# shape (samples, maxlen * 8)
model.add(Flatten())

# Add the classifier on top
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
# model.summary()

history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)
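After training, the learned embeddings can be read back out of the first layer; a minimal sketch:

# get_weights() on an Embedding layer returns a single matrix of shape
# (input_dim, output_dim), i.e. one vector per token
embedding_weights = model.layers[0].get_weights()[0]
print(embedding_weights.shape)  # (10000, 8)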
(2) Using pretrained word embeddings:
import os

imdb_dir = '/home/ubuntu/data/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100                 # cut reviews after 100 words
training_samples = 200       # train on 200 samples
validation_samples = 10000   # validate on 10,000 samples
max_words = 10000            # consider only the top 10,000 words in the dataset

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Split the data into a training set and a validation set.
# But first, shuffle the data, since the samples are ordered
# (all negative first, then all positive).
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

glove_dir = '/home/ubuntu/data/'

# Parse the GloVe word-embeddings file: a dict mapping words to their vectors
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            # Words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector

from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

# Load the pretrained GloVe embeddings into the Embedding layer and freeze it
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')

import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

# For comparison: the same model trained *without* the pretrained embeddings
# (the Embedding layer is learned from scratch)
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

# Tokenize the data of the test set
test_dir = os.path.join(imdb_dir, 'test')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)

# Reload the saved weights of the first model (the one with pretrained
# GloVe embeddings) and evaluate it on the test set
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test, y_test)
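As a quick sanity check that the GloVe vectors loaded into embeddings_index are meaningful, one can compare a few of them directly (cosine_similarity below is a hypothetical helper, not part of the original code):

def cosine_similarity(a, b):
    # Cosine of the angle between two embedding vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Related words should score noticeably higher than unrelated ones
print(cosine_similarity(embeddings_index['good'], embeddings_index['great']))
print(cosine_similarity(embeddings_index['good'], embeddings_index['mat']))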