Background
The previous article used the Transformer, which is currently one of the mainstream models. The Transformer is essentially built on the Encoder-Decoder framework, and its decoding procedure is basically the same as that of a seq2seq model. One major weakness of seq2seq is that it can only generate tokens from a fixed vocabulary and cannot copy out-of-vocabulary (OOV) words from the source text, whereas the PGN (pointer-generator network) handles this well by combining generation with a copy mechanism. This article therefore combines the Transformer's strong feature-extraction ability with the PGN's generation-and-copy ability, in the hope that the two complement each other. The principles of both models have been covered in detail earlier in this series; here the focus is on implementing the combined model.
Core Content
Overall Pipeline
The overall structure of the project, such as data loading, the training loop and the test procedure, is essentially the same as for the models introduced earlier. Since this article builds directly on the Transformer implementation from the previous post, most of the code is identical; the difference is the integration of the PGN mechanism.
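For orientation, here is a minimal sketch of what a training step looks like once the PGN mechanism is wired in to the PGN_TRANSFORMER model defined in the next section. Besides the usual encoder/decoder inputs and masks, each batch must also supply the encoder input mapped to the extended vocabulary (enc_extended_inp) and the number of in-batch OOV words (max_oov_len), and the targets dec_tar are expressed in the extended vocabulary so that copying OOV words can be rewarded. The function name and the [PAD] id of 0 are illustrative assumptions, not the project's exact API.

import tensorflow as tf

# The model's final distributions are already probabilities, hence from_logits=False.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')

def train_step(model, optimizer, enc_inp, enc_extended_inp, max_oov_len,
               dec_inp, dec_tar, enc_padding_mask, look_ahead_mask, dec_padding_mask):
    with tf.GradientTape() as tape:
        outputs = model(enc_inp, enc_extended_inp, max_oov_len, dec_inp,
                        training=True,
                        enc_padding_mask=enc_padding_mask,
                        look_ahead_mask=look_ahead_mask,
                        dec_padding_mask=dec_padding_mask)
        loss_ = loss_object(dec_tar, outputs["logits"])        # (batch_size, tar_seq_len)
        mask = tf.cast(tf.not_equal(dec_tar, 0), tf.float32)   # assumes id 0 is [PAD]
        loss = tf.reduce_sum(loss_ * mask) / tf.reduce_sum(mask)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss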
Overall Model
The Transformer architecture itself is largely the same as before: an Encoder, a Decoder, and a final output layer.
import tensorflow as tf

# Embedding, Encoder, Decoder and calc_final_dist are defined in the project's own
# modules (the Encoder/Decoder are the ones built in the previous Transformer article).


class PGN_TRANSFORMER(tf.keras.Model):

    def __init__(self, params):
        super(PGN_TRANSFORMER, self).__init__()

        self.num_blocks = params["num_blocks"]
        self.batch_size = params["batch_size"]
        self.vocab_size = params["vocab_size"]
        self.num_heads = params["num_heads"]

        self.embedding = Embedding(params["vocab_size"], params["d_model"])
        self.encoder = Encoder(params["num_blocks"],
                               params["d_model"],
                               params["num_heads"],
                               params["dff"],
                               params["vocab_size"],
                               params["dropout_rate"])
        self.decoder = Decoder(params["num_blocks"],
                               params["d_model"],
                               params["num_heads"],
                               params["dff"],
                               params["vocab_size"],
                               params["dropout_rate"])
        self.final_layer = tf.keras.layers.Dense(params["vocab_size"])

    def call(self, inp, extended_inp, max_oov_len, tar, training,
             enc_padding_mask, look_ahead_mask, dec_padding_mask):
        # Shared embedding for the encoder and decoder inputs
        embed_x = self.embedding(inp)
        embed_dec = self.embedding(tar)

        enc_output = self.encoder(embed_x, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights, p_gens = self.decoder(embed_dec,
                                                             enc_output,
                                                             training,
                                                             look_ahead_mask,
                                                             dec_padding_mask)

        # Vocabulary distribution: (batch_size, tar_seq_len, vocab_size)
        final_output = self.final_layer(dec_output)
        final_output = tf.nn.softmax(final_output)

        # Encoder-decoder attention of the last decoder block,
        # averaged over the heads: (batch_size, tar_seq_len, inp_seq_len)
        attn_dists = attention_weights['decoder_layer{}_block2'.format(self.num_blocks)]
        attn_dists = tf.reduce_sum(attn_dists, axis=1) / self.num_heads

        # Blend the vocabulary distribution and the copy (attention) distribution
        # into one distribution over the extended vocabulary, one decoder step at a time.
        final_dists = calc_final_dist(extended_inp,
                                      tf.unstack(final_output, axis=1),
                                      tf.unstack(attn_dists, axis=1),
                                      tf.unstack(p_gens, axis=1),
                                      max_oov_len,
                                      self.vocab_size,
                                      self.batch_size)

        outputs = dict(logits=tf.stack(final_dists, 1), attentions=attn_dists)
        return outputs
The difference from the plain Transformer architecture lies in the call function: during decoding, the decoder must additionally return the generation probability p_gen and the attention distribution it produced while decoding. When computing the final distribution (calc_final_dist), the extended vocabulary distribution and the attention scores are then combined.
def calc_final_dist(_enc_batch_extend_vocab, vocab_dists, attn_dists, p_gens, batch_oov_len, vocab_size, batch_size):
    """
    Calculate the final distribution, for the pointer-generator model
    Args:
        vocab_dists: The vocabulary distributions. List length max_dec_steps of (batch_size, vsize) arrays.
                     The words are in the order they appear in the vocabulary file.
        attn_dists: The attention distributions. List length max_dec_steps of (batch_size, attn_len) arrays.
    Returns:
        final_dists: The final distributions. List length max_dec_steps of (batch_size, extended_vsize) arrays.
    """
    # Multiply vocab dists by p_gen and attention dists by (1 - p_gen)
    vocab_dists = [p_gen * dist for (p_gen, dist) in zip(p_gens, vocab_dists)]
    attn_dists = [(1 - p_gen) * dist for (p_gen, dist) in zip(p_gens, attn_dists)]

    # Concatenate some zeros to each vocabulary dist, to hold the probabilities for in-article OOV words.
    # extended_size is the maximum (over the batch) size of the extended vocabulary.
    extended_size = vocab_size + batch_oov_len
    extra_zeros = tf.zeros((batch_size, batch_oov_len))
    # list length max_dec_steps of shape (batch_size, extended_size)
    vocab_dists_extended = [tf.concat(axis=1, values=[dist, extra_zeros]) for dist in vocab_dists]

    # Project the values in the attention distributions onto the appropriate entries in the final distributions.
    # This means that if a_i = 0.1 and the ith encoder word is w, and w has index 500 in the vocabulary,
    # then we add 0.1 onto the 500th entry of the final distribution.
    # This is done for each decoder timestep.
    # This is fiddly; we use tf.scatter_nd to do the projection.
    batch_nums = tf.range(0, limit=batch_size)        # shape (batch_size)
    batch_nums = tf.expand_dims(batch_nums, 1)        # shape (batch_size, 1)
    attn_len = tf.shape(_enc_batch_extend_vocab)[1]   # number of states we attend over
    batch_nums = tf.tile(batch_nums, [1, attn_len])   # shape (batch_size, attn_len)
    indices = tf.stack((batch_nums, _enc_batch_extend_vocab), axis=2)  # shape (batch_size, enc_t, 2)
    shape = [batch_size, extended_size]
    # list length max_dec_steps of shape (batch_size, extended_size)
    attn_dists_projected = [tf.scatter_nd(indices, copy_dist, shape) for copy_dist in attn_dists]

    # Add the vocab distributions and the copy distributions together to get the final distributions.
    # final_dists is a list length max_dec_steps; each entry is a tensor shape (batch_size, extended_size)
    # giving the final distribution for that decoder timestep.
    # Note that for decoder timesteps and examples corresponding to a [PAD] token, this is junk - ignore.
    final_dists = [vocab_dist + copy_dist for (vocab_dist, copy_dist) in zip(vocab_dists_extended, attn_dists_projected)]

    return final_dists
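To make the scatter_nd projection concrete, here is a toy run of calc_final_dist with made-up numbers: a fixed vocabulary of size 5, one in-article OOV word (which gets the temporary id 5), a batch of one and a single decoder step.

import tensorflow as tf

# Toy inputs: batch_size=1, one decoder step, encoder input of length 3.
enc_extended = tf.constant([[2, 5, 3]])                   # extended-vocab ids; 5 is the in-article OOV
vocab_dist = [tf.constant([[0.1, 0.2, 0.3, 0.2, 0.2]])]   # (1, 5) distribution over the fixed vocab
attn_dist = [tf.constant([[0.5, 0.3, 0.2]])]              # (1, 3) attention over encoder positions
p_gen = [tf.constant([[0.8]])]                            # (1, 1)

final = calc_final_dist(enc_extended, vocab_dist, attn_dist, p_gen,
                        batch_oov_len=1, vocab_size=5, batch_size=1)
print(final[0].numpy())
# p_gen * vocab_dist is padded with one zero for the OOV slot, and (1 - p_gen) * attn_dist
# is scattered onto ids [2, 5, 3], so id 5 receives probability mass (0.06) even though
# the word is not in the fixed vocabulary:
# [[0.08 0.16 0.34 0.2  0.16 0.06]]

The result still sums to one, and at inference time an id greater than or equal to vocab_size can be mapped back to the corresponding OOV word of the source article.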
Decoder
The Decoder differs from the plain Transformer's Decoder in the computation of the context vector and the generation probability p_gen.
class Decoder(tf.keras.layers.Layer):

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.depth = self.d_model // self.num_heads

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

        # Linear projections used to compute the generation probability p_gen
        self.Wh = tf.keras.layers.Dense(1)
        self.Ws = tf.keras.layers.Dense(1)
        self.Wx = tf.keras.layers.Dense(1)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        attention_weights = {}
        out = self.dropout(x, training=training)

        for i in range(self.num_layers):
            out, block1, block2 = self.dec_layers[i](out, enc_output, training, look_ahead_mask, padding_mask)
            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # out.shape == (batch_size, target_seq_len, d_model)

        # Context vectors: weight the encoder outputs by the last block's
        # encoder-decoder attention, head by head, then merge the heads back.
        enc_out_shape = tf.shape(enc_output)
        context = tf.reshape(enc_output, (enc_out_shape[0], enc_out_shape[1], self.num_heads, self.depth))  # (batch_size, input_seq_len, num_heads, depth)
        context = tf.transpose(context, [0, 2, 1, 3])  # (batch_size, num_heads, input_seq_len, depth)
        context = tf.expand_dims(context, axis=2)      # (batch_size, num_heads, 1, input_seq_len, depth)

        attn = tf.expand_dims(block2, axis=-1)         # (batch_size, num_heads, target_seq_len, input_seq_len, 1)
        context = context * attn                       # (batch_size, num_heads, target_seq_len, input_seq_len, depth)
        context = tf.reduce_sum(context, axis=3)       # (batch_size, num_heads, target_seq_len, depth)
        context = tf.transpose(context, [0, 2, 1, 3])  # (batch_size, target_seq_len, num_heads, depth)
        context = tf.reshape(context, (tf.shape(context)[0], tf.shape(context)[1], self.d_model))  # (batch_size, target_seq_len, d_model)

        # p_gen = sigmoid(V(Wx·x + Ws·s + Wh·h)), following the PGN formulation:
        # x is the decoder input embedding, out the decoder state, context the context vector.
        a = self.Wx(x)
        b = self.Ws(out)
        c = self.Wh(context)
        p_gens = tf.sigmoid(self.V(a + b + c))

        return out, attention_weights, p_gens
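Finally, a quick smoke test of the forward pass. This sketch assumes the Embedding, Encoder and DecoderLayer modules as well as a create_masks helper from the previous Transformer article are importable; the hyper-parameters and sequence lengths are purely illustrative.

params = {"num_blocks": 2, "d_model": 128, "num_heads": 8, "dff": 512,
          "vocab_size": 30000, "dropout_rate": 0.1, "batch_size": 4}
model = PGN_TRANSFORMER(params)

enc_inp = tf.ones((4, 50), dtype=tf.int32)           # ids clipped to the fixed vocab ([UNK] for OOVs)
enc_extended_inp = tf.ones((4, 50), dtype=tf.int32)  # ids in the extended vocab (OOVs get temporary ids)
dec_inp = tf.ones((4, 40), dtype=tf.int32)
enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(enc_inp, dec_inp)

outputs = model(enc_inp, enc_extended_inp, 10, dec_inp,
                training=False,
                enc_padding_mask=enc_padding_mask,
                look_ahead_mask=look_ahead_mask,
                dec_padding_mask=dec_padding_mask)
print(outputs["logits"].shape)      # (4, 40, 30010): vocab_size + max_oov_len
print(outputs["attentions"].shape)  # (4, 40, 50)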