【推荐系统】TensorFlow复现论文NeuralCF网络结构

文章目录

下图为NeuralCF的模型结构图,总共两个分支:第一个分支为GMF(广义矩阵分解,下文代码中记作GML),第二个分支为MLP。GMF通路将两个特征的Embedding向量逐元素相乘(element-wise product);MLP通路将两个特征的Embedding向量进行拼接,再送入多层感知机。最后将两个通路输出的向量进行拼接,输入全连接层(输出层),输出Score。

【推荐系统】TensorFlow复现论文NeuralCF网络结构

一、导包

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *

from tensorflow.keras.utils import plot_model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

import itertools
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import namedtuple

import warnings
warnings.filterwarnings("ignore")

二、读取数据

# Load the ratings file from the ml-1m data directory. The NCF model built
# below only consumes the user id and movie id columns as features.
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
data = pd.read_csv(
    './data/ml-1m/ratings.dat',
    names=rnames,
    sep='::',
    engine='python',  # the multi-character separator needs the python parser
)

【推荐系统】TensorFlow复现论文NeuralCF网络结构

三、特征编码处理

# Re-encode the raw ids as integer class indices so they can be used as
# Embedding lookups. The encoder is re-fitted for each column, so after these
# two lines `lbe` only remembers the movie_id classes.
lbe = LabelEncoder()
data['user_id'] = lbe.fit_transform(data['user_id'])
data['movie_id'] = lbe.fit_transform(data['movie_id'])

# Take an explicit copy before adding the label column: assigning into a
# column slice of `data` is a chained assignment, which pandas flags with
# SettingWithCopyWarning (hidden here by the global warnings filter).
train_data = data[['user_id', 'movie_id']].copy()
train_data['label'] = data['rating']

四、使用具名元组为特征进行处理


# Lightweight feature descriptors.
#   SparseFeat: a categorical feature -> (name, vocabulary_size, embedding_dim)
#   DenseFeat:  a numeric feature     -> (name, dimension)
SparseFeat = namedtuple('SparseFeat', 'name vocabulary_size embedding_dim')
DenseFeat = namedtuple('DenseFeat', 'name dimension')

# Describe the two categorical model inputs. The vocabulary size is the number
# of distinct encoded ids in each column; both use an embedding dimension of 8.
dnn_features_columns = [
    SparseFeat(column, train_data[column].nunique(), 8)
    for column in ('user_id', 'movie_id')
]

【推荐系统】TensorFlow复现论文NeuralCF网络结构

五、构建模型

5.1 输入层

def build_input_layers(dnn_features_columns):
    """Create one Keras ``Input`` per feature column, keyed by feature name.

    Args:
        dnn_features_columns: iterable of ``SparseFeat`` / ``DenseFeat``
            descriptors. Entries of any other type are ignored.

    Returns:
        A ``(dense_input_dict, sparse_input_dict)`` tuple. Each dict maps a
        feature name to its ``Input`` layer; sparse inputs hold a single id
        per sample, dense inputs hold ``dimension`` values per sample.
    """
    dense_input_dict, sparse_input_dict = {}, {}

    for f in dnn_features_columns:
        if isinstance(f, SparseFeat):
            # ``shape`` must be a tuple: ``(1)`` is just the int 1, whereas
            # ``(1,)`` is the one-element shape tuple Keras expects.
            sparse_input_dict[f.name] = Input(shape=(1,), name=f.name)
        elif isinstance(f, DenseFeat):
            dense_input_dict[f.name] = Input(shape=(f.dimension,), name=f.name)

    return dense_input_dict, sparse_input_dict

5.2 Embedding层

def build_embedding_layers(dnn_features_columns, sparse_input_dict, prefix="", is_linear=True):
    """Create one ``Embedding`` layer per sparse feature, keyed by feature name.

    Args:
        dnn_features_columns: iterable of feature descriptors; only
            ``SparseFeat`` entries get an embedding. ``None`` or an empty
            iterable yields an empty dict.
        sparse_input_dict: not used by this function; kept so the signature
            stays compatible with existing callers.
        prefix: string prepended to every layer name, so several embedding
            sets for the same features can coexist in one model.
        is_linear: if true, build 1-dimensional embeddings named
            ``<prefix>_1d_emb_<feature>``; otherwise build embeddings of size
            ``embedding_dim`` named ``<prefix>_kd_emb_<feature>``.

    Returns:
        Dict mapping feature name to its (not yet called) ``Embedding`` layer.
    """
    columns = dnn_features_columns if dnn_features_columns else []
    sparse_feature_columns = [f for f in columns if isinstance(f, SparseFeat)]

    embedding_layers_dict = {}
    for f in sparse_feature_columns:
        if is_linear:
            output_dim, tag = 1, '_1d_emb_'
        else:
            output_dim, tag = f.embedding_dim, '_kd_emb_'
        # The table is allocated with vocabulary_size + 1 rows.
        # The original 1-d branch had a stray unary ``+`` in front of
        # ``f.name`` which raised TypeError; the name is now plain
        # string concatenation in both branches.
        embedding_layers_dict[f.name] = Embedding(
            f.vocabulary_size + 1,
            output_dim,
            name=prefix + tag + f.name,
        )

    return embedding_layers_dict

5.3 GML

def build_gml_layers(gml_user_embedding, gml_movie_embedding):
    """Combine the user and movie embeddings by element-wise multiplication.

    Args:
        gml_user_embedding: user embedding tensor for this branch.
        gml_movie_embedding: movie embedding tensor for this branch.

    Returns:
        The output of a ``Multiply`` layer applied to the two tensors.
    """
    elementwise_product = Multiply()
    return elementwise_product([gml_user_embedding, gml_movie_embedding])

5.4 MLP

def build_mlp_layers(mlp_input, units=(32, 16), activation="relu"):
    """Pass ``mlp_input`` through a stack of fully connected layers.

    Args:
        mlp_input: input tensor for the first layer.
        units: output width of each successive ``Dense`` layer. An empty
            sequence returns ``mlp_input`` unchanged.
        activation: activation applied after every ``Dense`` layer. Without a
            non-linearity the stacked layers collapse into a single linear
            map, so the default is ``"relu"``; pass ``None`` for purely
            linear layers.

    Returns:
        The output tensor of the last layer.
    """
    hidden = mlp_input
    for out_dim in units:
        hidden = Dense(out_dim, activation=activation)(hidden)

    return hidden

5.5 输出层

def bulid_output_layers(concat_output):
    """Project the concatenated branch outputs to a single score per sample.

    NOTE: the function name keeps its original spelling ("bulid") because
    NCF() calls it under that name.

    Args:
        concat_output: tensor holding the concatenated branch outputs.

    Returns:
        The output of a one-unit ``Dense`` layer (no activation argument given).
    """
    score_layer = Dense(1)
    return score_layer(concat_output)

5.6 构建模型

def NCF(dnn_features_columns):
    """Assemble the two-branch NeuralCF model and return it as a Keras ``Model``.

    One branch ("GML") multiplies the user and movie embeddings element-wise;
    the other ("MLP") concatenates a separate pair of embeddings and feeds them
    through build_mlp_layers with units (32, 16). Both branch outputs are
    concatenated and passed to bulid_output_layers to produce the score.

    Args:
        dnn_features_columns: feature descriptors; must contain sparse
            features named 'user_id' and 'movie_id'.

    Returns:
        An uncompiled ``Model`` whose inputs are the sparse ``Input`` layers.
    """
    # Only the sparse inputs are used; the dense dict is discarded.
    _, sparse_input_dict = build_input_layers(dnn_features_columns)
    input_layers = [*sparse_input_dict.values()]

    # Each branch gets its own embedding tables, told apart by name prefix.
    embedding_gml_dict = build_embedding_layers(
        dnn_features_columns, sparse_input_dict, prefix="GML", is_linear=False)
    embedding_mlp_dict = build_embedding_layers(
        dnn_features_columns, sparse_input_dict, prefix="MLP", is_linear=False)

    def _flat_embedding(embedding_dict, feature_name):
        # The lookup of a length-1 input is (batch, 1, embedding_dim);
        # flatten it to (batch, embedding_dim).
        return Flatten()(embedding_dict[feature_name](sparse_input_dict[feature_name]))

    gml_user_embedding = _flat_embedding(embedding_gml_dict, 'user_id')
    gml_movie_embedding = _flat_embedding(embedding_gml_dict, 'movie_id')
    mlp_user_embedding = _flat_embedding(embedding_mlp_dict, 'user_id')
    mlp_movie_embedding = _flat_embedding(embedding_mlp_dict, 'movie_id')

    # GML branch: element-wise product of the two embeddings.
    gml_output = build_gml_layers(gml_user_embedding, gml_movie_embedding)

    # MLP branch: concatenate the two embeddings, then the dense stack.
    mlp_input = Concatenate(axis=1)([mlp_user_embedding, mlp_movie_embedding])
    mlp_output = build_mlp_layers(mlp_input, (32, 16))

    # Merge both branches and map them to the final score.
    concat_output = Concatenate(axis=1)([gml_output, mlp_output])
    output_layers = bulid_output_layers(concat_output)

    return Model(input_layers, output_layers)

六、训练模型

# Build the network. NOTE: despite its name, `history` holds the Keras model
# returned by NCF(), not a training-history object.
history = NCF(dnn_features_columns)

# Compile for rating regression: MSE loss, MAE reported as a metric.
history.compile(loss="mse", metrics=['mae'], optimizer="adam")

# Feed the features as a dict whose keys match the names of the Input layers.
train_model_input = {
    'user_id': train_data['user_id'],
    'movie_id': train_data['movie_id'],
}

history.fit(
    train_model_input,
    train_data['label'].values,
    epochs=2,
    batch_size=128,
    validation_split=0.2,
)

【推荐系统】TensorFlow复现论文NeuralCF网络结构

# Draw the network architecture diagram, including each layer's shapes.
plot_model(history, show_shapes=True)
上一篇:python使用反向传播(backpropagation)算法在sklearn库自带的缩小版MNIST手写数据集(1797张图片)上进行训练并测试


下一篇:【神经网络】(5) 卷积神经网络(ResNet50),案例:艺术画作10分类,附数据集