从bert之中读取出来的权重矩阵内容如下:
{
'cls/seq_relationship/output_bias': [2], Unused weights
'cls/predictions/transform/dense/kernel': [768, 768], Unused weights
'cls/predictions/transform/dense/bias': [768], Unused weights
'cls/predictions/transform/LayerNorm/gamma': [768], Unused weights
'cls/predictions/transform/LayerNorm/beta': [768], Unused weights
'cls/predictions/output_bias': [30522], Unused weights
'cls/seq_relationship/output_weights': [2, 768], Unused weights
'bert/embeddings/token_type_embeddings': [2, 768], Unused weights
'bert/embeddings/word_embeddings': [30522, 768], 1
'bert/embeddings/LayerNorm/beta': [768], 4
'bert/pooler/dense/bias': [768], Unused weights
'bert/embeddings/position_embeddings': [512, 768], 2
'bert/embeddings/LayerNorm/gamma': [768], 3
'bert/pooler/dense/kernel': [768, 768], Unused weights
(1.bert/embeddings/word_embeddings:[30522,768]
2.bert/embeddings/position_embeddings:[512,768]
3.bert/embeddings/LayerNorm/gamma:[768]
4.bert/embeddings/LayerNorm/beta: [768])
'bert/encoder/layer_0/attention/self/query/kernel': [768, 768],
'bert/encoder/layer_0/attention/self/query/bias': [768],
'bert/encoder/layer_0/attention/self/key/kernel': [768, 768],
'bert/encoder/layer_0/attention/self/key/bias': [768],
'bert/encoder/layer_0/attention/self/value/kernel': [768, 768],
'bert/encoder/layer_0/attention/self/value/bias': [768],
'bert/encoder/layer_0/attention/output/dense/kernel': [768, 768],
'bert/encoder/layer_0/attention/output/dense/bias': [768],
'bert/encoder/layer_0/intermediate/dense/kernel': [768,3072],
'bert/encoder/layer_0/intermediate/dense/bias': [3072,],
'bert/encoder/layer_0/output/dense/kernel': [3072,768],
'bert/encoder/layer_0/output/dense/bias': [768,],
'bert/encoder/layer_0/output/LayerNorm/gamma': [768,],
'bert/encoder/layer_0/output/LayerNorm/beta': [768,],
'bert/encoder/layer_0/attention/output/LayerNorm/gamma': [768],
'bert/encoder/layer_0/attention/output/LayerNorm/beta': [768],
'bert/encoder/layer_1/attention/self/value/bias': [768],
'bert/encoder/layer_1/output/dense/kernel': [3072, 768],
'bert/encoder/layer_1/attention/self/value/kernel': [768, 768],
'bert/encoder/layer_1/output/LayerNorm/beta': [768],
'bert/encoder/layer_1/intermediate/dense/bias': [3072],
'bert/encoder/layer_1/attention/output/dense/bias': [768],
'bert/encoder/layer_1/attention/output/LayerNorm/beta': [768],
'bert/encoder/layer_1/attention/self/query/kernel': [768, 768],
'bert/encoder/layer_1/attention/output/dense/kernel': [768, 768],
'bert/encoder/layer_1/attention/self/key/bias': [768],
'bert/encoder/layer_1/attention/output/LayerNorm/gamma': [768],
'bert/encoder/layer_1/attention/self/key/kernel': [768, 768],
'bert/encoder/layer_1/intermediate/dense/kernel': [768, 3072],
'bert/encoder/layer_1/output/dense/bias': [768],
'bert/encoder/layer_1/attention/self/query/bias': [768],
'bert/encoder/layer_1/output/LayerNorm/gamma': [768],
'bert/encoder/layer_2/attention/self/value/bias': [768],
'bert/encoder/layer_2/attention/self/query/bias': [768],
'bert/encoder/layer_2/attention/output/LayerNorm/gamma': [768],
'bert/encoder/layer_2/attention/self/key/bias': [768],
'bert/encoder/layer_2/attention/output/dense/kernel': [768, 768],
'bert/encoder/layer_2/attention/self/key/kernel': [768, 768],
'bert/encoder/layer_2/attention/output/dense/bias': [768],
'bert/encoder/layer_2/attention/self/query/kernel': [768, 768],
'bert/encoder/layer_2/attention/self/value/kernel': [768, 768],
'bert/encoder/layer_2/intermediate/dense/bias': [3072],
'bert/encoder/layer_2/intermediate/dense/kernel': [768, 3072],
'bert/encoder/layer_2/output/LayerNorm/beta': [768],
'bert/encoder/layer_2/output/LayerNorm/gamma': [768],
'bert/encoder/layer_2/output/dense/bias': [768],
'bert/encoder/layer_2/output/dense/kernel': [3072, 768],
'bert/encoder/layer_2/attention/output/LayerNorm/beta': [768],
'bert/encoder/layer_3/attention/self/key/bias': [768],
'bert/encoder/layer_3/output/LayerNorm/gamma': [768],
'bert/encoder/layer_3/output/LayerNorm/beta': [768],
'bert/encoder/layer_3/attention/self/key/kernel': [768, 768],
'bert/encoder/layer_3/attention/output/LayerNorm/gamma': [768],
'bert/encoder/layer_3/output/dense/kernel': [3072, 768],
'bert/encoder/layer_3/attention/output/LayerNorm/beta': [768],
'bert/encoder/layer_3/attention/output/dense/bias': [768],
'bert/encoder/layer_3/attention/self/query/bias': [768],
'bert/encoder/layer_3/attention/self/query/kernel': [768, 768],
'bert/encoder/layer_3/attention/self/value/kernel': [768, 768],
'bert/encoder/layer_3/intermediate/dense/kernel': [768, 3072],
'bert/encoder/layer_3/output/dense/bias': [768],
'bert/encoder/layer_3/attention/self/value/bias': [768],
'bert/encoder/layer_3/attention/output/dense/kernel': [768, 768],
'bert/encoder/layer_3/intermediate/dense/bias': [3072],
'bert/encoder/layer_4/attention/output/LayerNorm/beta': [768],
'bert/encoder/layer_4/intermediate/dense/bias': [3072],
'bert/encoder/layer_4/attention/output/dense/kernel': [768, 768],
'bert/encoder/layer_4/output/dense/bias': [768],
'bert/encoder/layer_4/attention/output/LayerNorm/gamma': [768],
'bert/encoder/layer_4/attention/output/dense/bias': [768],
'bert/encoder/layer_4/attention/self/key/bias': [768],
'bert/encoder/layer_4/attention/self/key/kernel': [768, 768],
'bert/encoder/layer_4/attention/self/query/bias': [768],
'bert/encoder/layer_4/attention/self/query/kernel': [768, 768],
'bert/encoder/layer_4/attention/self/value/bias': [768],
'bert/encoder/layer_4/attention/self/value/kernel': [768, 768],
'bert/encoder/layer_4/output/LayerNorm/beta': [768],
'bert/encoder/layer_4/output/LayerNorm/gamma': [768],
'bert/encoder/layer_4/output/dense/kernel': [3072, 768],
'bert/encoder/layer_4/intermediate/dense/kernel': [768, 3072],
'bert/encoder/layer_5/output/dense/bias': [768],
'bert/encoder/layer_5/attention/output/dense/kernel': [768, 768],
'bert/encoder/layer_5/attention/output/dense/bias': [768],
'bert/encoder/layer_5/attention/output/LayerNorm/gamma': [768],
'bert/encoder/layer_5/attention/self/value/bias': [768],
'bert/encoder/layer_5/attention/self/query/bias': [768],
'bert/encoder/layer_5/output/dense/kernel': [3072, 768],
'bert/encoder/layer_5/intermediate/dense/bias': [3072],
'bert/encoder/layer_5/attention/output/LayerNorm/beta': [768],
'bert/encoder/layer_5/attention/self/key/bias': [768],
'bert/encoder/layer_5/output/LayerNorm/beta': [768],
'bert/encoder/layer_5/attention/self/key/kernel': [768, 768],
'bert/encoder/layer_5/attention/self/query/kernel': [768, 768],
'bert/encoder/layer_5/attention/self/value/kernel': [768, 768],
'bert/encoder/layer_5/intermediate/dense/kernel': [768, 3072],
'bert/encoder/layer_5/output/LayerNorm/gamma': [768],
'bert/encoder/layer_6/attention/self/value/bias': [768],
'bert/encoder/layer_6/output/dense/kernel': [3072, 768],
'bert/encoder/layer_6/attention/output/dense/kernel': [768, 768],
'bert/encoder/layer_6/intermediate/dense/bias': [3072],
'bert/encoder/layer_6/attention/output/dense/bias': [768],
'bert/encoder/layer_6/output/dense/bias': [768],
'bert/encoder/layer_6/attention/output/LayerNorm/beta': [768],
'bert/encoder/layer_6/attention/output/LayerNorm/gamma': [768],
'bert/encoder/layer_6/attention/self/key/bias': [768],
'bert/encoder/layer_6/attention/self/key/kernel': [768, 768],
'bert/encoder/layer_6/attention/self/query/bias': [768],
'bert/encoder/layer_6/attention/self/query/kernel': [768, 768],
'bert/encoder/layer_6/attention/self/value/kernel': [768, 768],
'bert/encoder/layer_6/intermediate/dense/kernel': [768, 3072],
'bert/encoder/layer_6/output/LayerNorm/beta': [768],
'bert/encoder/layer_6/output/LayerNorm/gamma': [768],
'bert/encoder/layer_7/output/LayerNorm/beta': [768],
'bert/encoder/layer_7/output/dense/bias': [768],
'bert/encoder/layer_7/output/dense/kernel': [3072, 768],
'bert/encoder/layer_7/attention/self/query/kernel': [768, 768],
'bert/encoder/layer_7/attention/output/LayerNorm/beta': [768],
'bert/encoder/layer_7/output/LayerNorm/gamma': [768],
'bert/encoder/layer_7/attention/self/query/bias': [768],
'bert/encoder/layer_7/attention/output/dense/bias': [768],
'bert/encoder/layer_7/attention/self/value/kernel': [768, 768],
'bert/encoder/layer_7/intermediate/dense/bias': [3072],
'bert/encoder/layer_7/attention/output/LayerNorm/gamma': [768],
'bert/encoder/layer_7/attention/output/dense/kernel': [768, 768],
'bert/encoder/layer_7/attention/self/key/bias': [768],
'bert/encoder/layer_7/attention/self/key/kernel': [768, 768],
'bert/encoder/layer_7/attention/self/value/bias': [768],
'bert/encoder/layer_7/intermediate/dense/kernel': [768, 3072],
'bert/encoder/layer_8/output/LayerNorm/beta': [768],
'bert/encoder/layer_8/intermediate/dense/kernel': [768, 3072],
'bert/encoder/layer_8/intermediate/dense/bias': [3072],
'bert/encoder/layer_8/attention/self/value/kernel': [768, 768],
'bert/encoder/layer_8/attention/self/value/bias': [768],
'bert/encoder/layer_8/attention/self/query/bias': [768],
'bert/encoder/layer_8/attention/self/query/kernel': [768, 768],
'bert/encoder/layer_8/attention/self/key/kernel': [768, 768],
'bert/encoder/layer_8/output/dense/kernel': [3072, 768],
'bert/encoder/layer_8/attention/output/LayerNorm/gamma': [768],
'bert/encoder/layer_8/attention/self/key/bias': [768],
'bert/encoder/layer_8/output/LayerNorm/gamma': [768],
'bert/encoder/layer_8/attention/output/dense/bias': [768],
'bert/encoder/layer_8/attention/output/LayerNorm/beta': [768],
'bert/encoder/layer_8/attention/output/dense/kernel': [768, 768],
'bert/encoder/layer_8/output/dense/bias': [768],
'bert/encoder/layer_9/output/dense/kernel': [3072, 768],
'bert/encoder/layer_9/output/dense/bias': [768],
'bert/encoder/layer_9/output/LayerNorm/gamma': [768],
'bert/encoder/layer_9/intermediate/dense/kernel': [768, 3072],
'bert/encoder/layer_9/intermediate/dense/bias': [3072],
'bert/encoder/layer_9/attention/self/query/bias': [768],
'bert/encoder/layer_9/attention/self/key/bias': [768],
'bert/encoder/layer_9/attention/output/LayerNorm/beta': [768],
'bert/encoder/layer_9/attention/self/value/bias': [768],
'bert/encoder/layer_9/attention/self/query/kernel': [768, 768],
'bert/encoder/layer_9/output/LayerNorm/beta': [768],
'bert/encoder/layer_9/attention/output/dense/bias': [768],
'bert/encoder/layer_9/attention/output/LayerNorm/gamma': [768],
'bert/encoder/layer_9/attention/output/dense/kernel': [768, 768],
'bert/encoder/layer_9/attention/self/key/kernel': [768, 768],
'bert/encoder/layer_9/attention/self/value/kernel': [768, 768],
'bert/encoder/layer_10/output/dense/bias': [768],
'bert/encoder/layer_10/intermediate/dense/kernel': [768, 3072],
'bert/encoder/layer_10/attention/self/query/bias': [768],
'bert/encoder/layer_10/attention/self/key/bias': [768],
'bert/encoder/layer_10/attention/output/dense/bias': [768],
'bert/encoder/layer_10/output/LayerNorm/beta': [768],
'bert/encoder/layer_10/output/LayerNorm/gamma': [768],
'bert/encoder/layer_10/attention/output/LayerNorm/gamma': [768],
'bert/encoder/layer_10/output/dense/kernel': [3072, 768],
'bert/encoder/layer_10/attention/self/query/kernel': [768, 768],
'bert/encoder/layer_10/attention/self/key/kernel': [768, 768],
'bert/encoder/layer_10/intermediate/dense/bias': [3072],
'bert/encoder/layer_10/attention/self/value/bias': [768],
'bert/encoder/layer_10/attention/output/LayerNorm/beta': [768],
'bert/encoder/layer_10/attention/self/value/kernel': [768, 768],
'bert/encoder/layer_10/attention/output/dense/kernel': [768, 768],
'bert/encoder/layer_11/attention/output/dense/kernel': [768, 768], 1
'bert/encoder/layer_11/attention/self/value/kernel': [768, 768], 2
'bert/encoder/layer_11/attention/self/query/bias': [768], 3
'bert/encoder/layer_11/attention/self/value/bias': [768], 4
'bert/encoder/layer_11/attention/self/key/kernel': [768, 768], 5
'bert/encoder/layer_11/output/LayerNorm/gamma': [768], 6
'bert/encoder/layer_11/attention/self/query/kernel': [768, 768], 7
'bert/encoder/layer_11/intermediate/dense/kernel': [768, 3072], 8
'bert/encoder/layer_11/attention/output/dense/bias': [768], 9
'bert/encoder/layer_11/output/dense/kernel': [3072, 768], 10
'bert/encoder/layer_11/output/LayerNorm/beta': [768], 11
'bert/encoder/layer_11/attention/output/LayerNorm/gamma': [768], 12
'bert/encoder/layer_11/attention/output/LayerNorm/beta': [768], 13
'bert/encoder/layer_11/attention/self/key/bias': [768], 14
'bert/encoder/layer_11/output/dense/bias': [768], 15
'bert/encoder/layer_11/intermediate/dense/bias': [3072], 16
}
这里面所有的权重值均为从 bert_model.ckpt.data-00000-of-00001、bert_model.ckpt.index、bert_model.ckpt.meta 三个文件中读取出来的参数。其中标注 Unused weights 的内容为微调之后并没有使用到的对应参数,其余的参数为微调之后使用到的参数。对应的总体参数结构图如下:
对应的整个bert模型参数的结构图为: