数据科学技术与应用第五章机器学习建模分析

基于Keras建立深度神经网络模型,在bankpep数据集上训练神经网络分类模型,将训练模型的耗时以及模型性能,与XGBoost、SVM、朴素贝叶斯等方法进行比较。

 

import pandas,datetime,xgboost,numpy
from sklearn import model_selection,preprocessing,metrics,tree,naive_bayes,svm
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense,Activation
from keras.utils import np_utils
from graphviz import Source
from IPython.display import Image

# Load the bank PEP data set, indexed by customer id.
# Adjust the path to wherever bankpep.csv is stored.
df = pandas.read_csv('data/bankpep.csv', index_col='id')

# Encode the YES/NO columns (including the target column 'pep') as 1/0.
seq = ['married', 'car', 'save_act', 'current_act', 'mortgage', 'pep']
for feature in seq:
    df.loc[df[feature] == 'YES', feature] = 1
    df.loc[df[feature] == 'NO', feature] = 0

# Encode sex as 1 (FEMALE) / 0 (MALE).
df.loc[df['sex'] == 'FEMALE', 'sex'] = 1
df.loc[df['sex'] == 'MALE', 'sex'] = 0

# One-hot encode the categorical 'region' and 'children' columns and swap
# the originals for the indicator columns.
dumm_region = pandas.get_dummies(df['region'], prefix='region')
dumm_child = pandas.get_dummies(df['children'], prefix='children')
df = df.drop(['region', 'children'], axis=1)
df = df.join([dumm_region, dumm_child], how='outer')

# Feature matrix (everything except the target) and integer target vector.
x = df.drop(['pep'], axis=1).values.astype(float)
# Feature scaling is intentionally left disabled here:
# x = preprocessing.scale(x)
# NOTE(review): standardised features can be negative, which MultinomialNB
# further down presumably rejects -- confirm before enabling.
y = df['pep'].values.astype(int)

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.2, random_state=1)

# Names used when rendering the decision tree.
featureName = df.drop(['pep'], axis=1).columns.values
# Class names must follow ascending label order: 0 encodes 'NO' and 1
# encodes 'YES' above, so "no pep" comes first. The original list was
# reversed, which would have mislabelled the leaves of the exported tree.
className = ['no pep', 'pep']

# Decision tree: fit, report train/test metrics, record the elapsed time.
print('Tree')
start_time = datetime.datetime.now()
clf_tree = tree.DecisionTreeClassifier()
clf_tree.fit(x_train, y_train)
pre_y_train_tree = clf_tree.predict(x_train)
pre_y_test_tree = clf_tree.predict(x_test)

# Training-set metrics: score, per-class report, confusion matrix.
print('train_tree')
print(
    clf_tree.score(x_train, y_train),
    metrics.classification_report(y_train, pre_y_train_tree),
    metrics.confusion_matrix(y_train, pre_y_train_tree),
    sep='\n',
)

# Test-set metrics; the score is kept for the final comparison table.
print('test_tree')
tree_score = clf_tree.score(x_test, y_test)
print(
    tree_score,
    metrics.classification_report(y_test, pre_y_test_tree),
    metrics.confusion_matrix(y_test, pre_y_test_tree),
    sep='\n',
)

# The string literal below is a disabled snippet that would render the
# fitted tree to a PNG through graphviz; as a bare string it does nothing.
'''
graph_tree=Source(tree.export_graphviz(clf_tree,out_file=None,feature_names=featureName,class_names=className))
png_bytes=graph_tree.pipe(format='png')
with open('mooc_5.2_tree.png','wb') as f:
    f.write(png_bytes)
'''

# The timed span covers fitting, prediction and the metric output above.
end_time = datetime.datetime.now()
time_tree = end_time - start_time
print("time:", time_tree)

# Multinomial naive Bayes: fit, report train/test metrics, record the time.
print('MultinomialNB')
start_time = datetime.datetime.now()
clf_MultinomialNB = naive_bayes.MultinomialNB()
clf_MultinomialNB.fit(x_train, y_train)
pre_y_train_MultinomialNB = clf_MultinomialNB.predict(x_train)
pre_y_test_MultinomialNB = clf_MultinomialNB.predict(x_test)

# Training-set metrics: score, per-class report, confusion matrix.
print('train_MultinomialNB')
print(
    clf_MultinomialNB.score(x_train, y_train),
    metrics.classification_report(y_train, pre_y_train_MultinomialNB),
    metrics.confusion_matrix(y_train, pre_y_train_MultinomialNB),
    sep='\n',
)

# Test-set metrics; the score is kept for the final comparison table.
print('test_MultinomialNB')
MultinomialNB_score = clf_MultinomialNB.score(x_test, y_test)
print(
    MultinomialNB_score,
    metrics.classification_report(y_test, pre_y_test_MultinomialNB),
    metrics.confusion_matrix(y_test, pre_y_test_MultinomialNB),
    sep='\n',
)

# The timed span covers fitting, prediction and the metric output above.
end_time = datetime.datetime.now()
time_MultinomialNB = end_time - start_time
print("time:", time_MultinomialNB)

# Gaussian naive Bayes: fit, report train/test metrics, record the time.
print('GaussianNB')
start_time = datetime.datetime.now()
clf_GaussianNB = naive_bayes.GaussianNB()
clf_GaussianNB.fit(x_train, y_train)
pre_y_train_GaussianNB = clf_GaussianNB.predict(x_train)
pre_y_test_GaussianNB = clf_GaussianNB.predict(x_test)

# Training-set metrics: score, per-class report, confusion matrix.
print('train_GaussianNB')
print(
    clf_GaussianNB.score(x_train, y_train),
    metrics.classification_report(y_train, pre_y_train_GaussianNB),
    metrics.confusion_matrix(y_train, pre_y_train_GaussianNB),
    sep='\n',
)

# Test-set metrics; the score is kept for the final comparison table.
print('test_GaussianNB')
GaussianNB_score = clf_GaussianNB.score(x_test, y_test)
print(
    GaussianNB_score,
    metrics.classification_report(y_test, pre_y_test_GaussianNB),
    metrics.confusion_matrix(y_test, pre_y_test_GaussianNB),
    sep='\n',
)

# The timed span covers fitting, prediction and the metric output above.
end_time = datetime.datetime.now()
time_GaussianNB = end_time - start_time
print("time:", time_GaussianNB)

# Bernoulli naive Bayes: fit, report train/test metrics, record the time.
print('BernoulliNB')
start_time = datetime.datetime.now()
clf_BernoulliNB = naive_bayes.BernoulliNB()
clf_BernoulliNB.fit(x_train, y_train)
pre_y_train_BernoulliNB = clf_BernoulliNB.predict(x_train)
pre_y_test_BernoulliNB = clf_BernoulliNB.predict(x_test)

# Training-set metrics: score, per-class report, confusion matrix.
print('train_BernoulliNB')
print(
    clf_BernoulliNB.score(x_train, y_train),
    metrics.classification_report(y_train, pre_y_train_BernoulliNB),
    metrics.confusion_matrix(y_train, pre_y_train_BernoulliNB),
    sep='\n',
)

# Test-set metrics; the score is kept for the final comparison table.
print('test_BernoulliNB')
BernoulliNB_score = clf_BernoulliNB.score(x_test, y_test)
print(
    BernoulliNB_score,
    metrics.classification_report(y_test, pre_y_test_BernoulliNB),
    metrics.confusion_matrix(y_test, pre_y_test_BernoulliNB),
    sep='\n',
)

# The timed span covers fitting, prediction and the metric output above.
end_time = datetime.datetime.now()
time_BernoulliNB = end_time - start_time
print("time:", time_BernoulliNB)

# Support vector classifier (library defaults): fit, report, record the time.
print('SVM')
start_time = datetime.datetime.now()
clf_SVM = svm.SVC()
clf_SVM.fit(x_train, y_train)
pre_y_train_SVM = clf_SVM.predict(x_train)
pre_y_test_SVM = clf_SVM.predict(x_test)

# Training-set metrics: score, per-class report, confusion matrix.
print('train_SVM')
print(
    clf_SVM.score(x_train, y_train),
    metrics.classification_report(y_train, pre_y_train_SVM),
    metrics.confusion_matrix(y_train, pre_y_train_SVM),
    sep='\n',
)

# Test-set metrics; the score is kept for the final comparison table.
print('test_SVM')
SVM_score = clf_SVM.score(x_test, y_test)
print(
    SVM_score,
    metrics.classification_report(y_test, pre_y_test_SVM),
    metrics.confusion_matrix(y_test, pre_y_test_SVM),
    sep='\n',
)

# The timed span covers fitting, prediction and the metric output above.
end_time = datetime.datetime.now()
time_SVM = end_time - start_time
print("time:", time_SVM)

# Gradient boosting (scikit-learn): fit, report metrics, record the time.
print('GBM')
start_time = datetime.datetime.now()
clf_GBM = GradientBoostingClassifier()
clf_GBM.fit(x_train, y_train)
pre_y_train_GBM = clf_GBM.predict(x_train)
pre_y_test_GBM = clf_GBM.predict(x_test)

# Training-set metrics: score, per-class report, confusion matrix.
print('train_GBM')
print(
    clf_GBM.score(x_train, y_train),
    metrics.classification_report(y_train, pre_y_train_GBM),
    metrics.confusion_matrix(y_train, pre_y_train_GBM),
    sep='\n',
)

# Test-set metrics; the score is kept for the final comparison table.
print('test_GBM')
GBM_score = clf_GBM.score(x_test, y_test)
print(
    GBM_score,
    metrics.classification_report(y_test, pre_y_test_GBM),
    metrics.confusion_matrix(y_test, pre_y_test_GBM),
    sep='\n',
)

# The timed span covers fitting, prediction and the metric output above.
end_time = datetime.datetime.now()
time_GBM = end_time - start_time
print("time:", time_GBM)

# XGBoost classifier (library defaults): fit, report, record the time.
print('XGBoost')
start_time = datetime.datetime.now()
clf_XGBoost = xgboost.XGBClassifier()
clf_XGBoost.fit(x_train, y_train)
pre_y_train_XGBoost = clf_XGBoost.predict(x_train)
pre_y_test_XGBoost = clf_XGBoost.predict(x_test)

# Training-set metrics: score, per-class report, confusion matrix.
print('train_XGBoost')
print(
    clf_XGBoost.score(x_train, y_train),
    metrics.classification_report(y_train, pre_y_train_XGBoost),
    metrics.confusion_matrix(y_train, pre_y_train_XGBoost),
    sep='\n',
)

# Test-set metrics; the score is kept for the final comparison table.
print('test_XGBoost')
XGBoost_score = clf_XGBoost.score(x_test, y_test)
print(
    XGBoost_score,
    metrics.classification_report(y_test, pre_y_test_XGBoost),
    metrics.confusion_matrix(y_test, pre_y_test_XGBoost),
    sep='\n',
)

# The timed span covers fitting, prediction and the metric output above.
end_time = datetime.datetime.now()
time_XGBoost = end_time - start_time
print("time:", time_XGBoost)

# Random forest (library defaults): fit, report metrics, record the time.
print('RFC')
start_time = datetime.datetime.now()
clf_RFC = RandomForestClassifier()
clf_RFC.fit(x_train, y_train)
pre_y_train_RFC = clf_RFC.predict(x_train)
pre_y_test_RFC = clf_RFC.predict(x_test)

# Training-set metrics: score, per-class report, confusion matrix.
print('train_RFC')
print(
    clf_RFC.score(x_train, y_train),
    metrics.classification_report(y_train, pre_y_train_RFC),
    metrics.confusion_matrix(y_train, pre_y_train_RFC),
    sep='\n',
)

# Test-set metrics; the score is kept for the final comparison table.
print('test_RFC')
RFC_score = clf_RFC.score(x_test, y_test)
print(
    RFC_score,
    metrics.classification_report(y_test, pre_y_test_RFC),
    metrics.confusion_matrix(y_test, pre_y_test_RFC),
    sep='\n',
)

# The timed span covers fitting, prediction and the metric output above.
end_time = datetime.datetime.now()
time_RFC = end_time - start_time
print("time:", time_RFC)

# Keras: a small fully connected network with a two-way softmax output,
# trained on one-hot encoded labels. Reports test loss/accuracy and the
# elapsed time.
print('Keras')
start_time = datetime.datetime.now()

# Two hidden ReLU layers (16 and 100 units) followed by a 2-unit softmax.
# The input width is taken from the training matrix rather than being
# hard-coded, so the network keeps matching the features if the
# preprocessing changes.
model = Sequential()
model.add(Dense(units=16, input_shape=(x_train.shape[1],)))
model.add(Activation('relu'))
model.add(Dense(100))
model.add(Activation('relu'))
model.add(Dense(2))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['binary_accuracy'])

# The softmax/categorical cross-entropy head expects one-hot targets.
y_train_ohe = np_utils.to_categorical(y_train, 2)
y_test_ohe = np_utils.to_categorical(y_test, 2)

model.fit(x_train, y_train_ohe, epochs=25, batch_size=1, verbose=2,
          validation_data=(x_test, y_test_ohe))

# evaluate() yields the loss followed by the single compiled metric.
loss, accuracy = model.evaluate(x_test, y_test_ohe)
print(loss, accuracy)
classes = model.predict(x_test, batch_size=1, verbose=2)

# Every other *_score in this script is a test-set accuracy, so store the
# accuracy here as well (the original stored the loss by mistake).
Keras_score = accuracy

# The timed span covers model construction, training, evaluation and
# prediction above.
end_time = datetime.datetime.now()
time_Keras = end_time - start_time
print("time:", time_Keras)

# Summary: tabulate test score and elapsed seconds for the scikit-learn /
# XGBoost models, print the Keras figures alongside, and plot the table.
#
# The results are referenced directly instead of being looked up with
# eval() on built-up variable names, and the list of labels no longer
# reuses the name `model`, which would overwrite the trained Keras network.
results = [
    ('tree', tree_score, time_tree),
    ('MultinomialNB', MultinomialNB_score, time_MultinomialNB),
    ('GaussianNB', GaussianNB_score, time_GaussianNB),
    ('BernoulliNB', BernoulliNB_score, time_BernoulliNB),
    ('SVM', SVM_score, time_SVM),
    ('GBM', GBM_score, time_GBM),
    ('XGBoost', XGBoost_score, time_XGBoost),
    ('RFC', RFC_score, time_RFC),
]
column = ['Score', 'Time']
df_Matplotlib = pandas.DataFrame(
    [[score, elapsed.total_seconds()] for _, score, elapsed in results],
    columns=column,
    index=[name for name, _, _ in results],
)
print(df_Matplotlib)
# The Keras run is reported on its own line: loss, accuracy, seconds.
print('Keras', loss, accuracy, time_Keras.total_seconds())

# Line plot of both columns over the model names, with a grid.
df_Matplotlib.plot()
plt.grid()
plt.show()

输出结果:

                  Score      Time
tree           0.775000  0.081810
MultinomialNB  0.666667  0.009974
GaussianNB     0.700000  0.008011
BernoulliNB    0.741667  0.009941
SVM            0.566667  0.027959
GBM            0.825000  0.100698
XGBoost        0.816667  0.153870
RFC            0.833333  0.282304
Keras 0.6881586909294128 0.550000011920929 13.049028

数据科学技术与应用第五章机器学习建模分析

 

上一篇:笔记 task1


下一篇:Python网页分析,分析网站的日志数据