python数据分析之分类模型与回归模型-第七次笔记
1.分类模型
– *1.1KNN 算法
– *1.2朴素贝叶斯 算法
– *1.3支持向量机SVM 算法
– *1.4集成方法—随机森林算法
– *1.5集成方法—Adaboost 算法
– *1.6决策树
2.回归模型
– *2.1线性回归
– *2.2岭回归
– *2.3Lasso回归
– *2.4逻辑回归
– *2.5人工神经网络
– *2.6GBDT,回归树和提升树
### 提取数据
```python
# Carve the data into training / validation / test partitions (ratio 6:2:2).
from sklearn.model_selection import train_test_split

f_names = features.columns.values
f_v, l_v = features.values, label.values
# Hold out 20% for validation first, then 25% of the remaining 80%
# (20% of the total) for testing; the rest (60%) is the training set.
X_tt, X_validation, Y_tt, Y_validation = train_test_split(
    f_v, l_v, test_size=0.2
)
X_train, X_test, Y_train, Y_test = train_test_split(
    X_tt, Y_tt, test_size=0.25
)
```
### 1.分类模型
#### 1.1KNN 算法
```python
# Bring in the nearest-neighbour estimators.
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

# Register a KNN classifier (n_neighbors=3) under the display name "KNN".
models.append(
    (
        "KNN",
        KNeighborsClassifier(n_neighbors=3),
    )
)
```
#### 1.2朴素贝叶斯 算法
```python
from sklearn.naive_bayes import GaussianNB, BernoulliNB

# Naive Bayes: register the Gaussian and the Bernoulli variants, in that order.
models.extend(
    [
        ("GaussianNB", GaussianNB()),
        ("BernoulliNB", BernoulliNB()),
    ]
)
```
#### 1.3支持向量机SVM 算法
```python
from sklearn.svm import SVC

# Support vector machine classifier. C is the penalty parameter
# (author's note: "C controls the precision").
models.append(
    (
        "SVM Classifier",
        SVC(C=1000),
    )
)
```
#### 1.4集成方法—随机森林算法
```python
from sklearn.ensemble import RandomForestClassifier

# Two random forests: the first with library defaults ("original" forest),
# the second with 11 trees and max_features=None.
models.extend(
    [
        ("OriginalRandomForest", RandomForestClassifier()),
        (
            "RandomForest",
            RandomForestClassifier(max_features=None, n_estimators=11),
        ),
    ]
)
```
#### 1.5集成方法—Adaboost 算法
```python
from sklearn.ensemble import AdaBoostClassifier

# Boosted ensemble classifier with 100 estimators.
# Alternative noted by the author:
#   base_estimator=SVC(), n_estimators=100, algorithm="SAMME"
models.append(
    (
        "Adaboost",
        AdaBoostClassifier(n_estimators=100),
    )
)
```
#### 1.6决策树
```python
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Author's note: min_impurity_split=0.1 sets the minimum impurity for a
# split and can be used as a pruning method.
# Register two trees: the default criterion (Gini impurity) and entropy.
models.extend(
    [
        ("DecisionTreeGini", DecisionTreeClassifier()),
        ("DecisionTreeEntropy", DecisionTreeClassifier(criterion="entropy")),
    ]
)
```
### 2.回归模型
#### 2.1线性回归
```python
# Linear models used in the regression sections below.
from sklearn.linear_model import (
    LinearRegression,
    Ridge,
    Lasso,
)

# Ordinary least-squares regression (left disabled by the author):
# regr = LinearRegression()
```
#### 2.2岭回归
```python
# Ridge regression: linear regression with an L2 penalty; alpha=1 sets the
# penalty strength.
regr=Ridge(alpha=1)
```
#### 2.3Lasso回归
```python
# Lasso regression: linear regression with an L1 penalty; alpha=0.001 sets
# the penalty strength.
regr=Lasso(alpha=0.001)
```
#### 2.4逻辑回归
```python
# Logistic regression is also a linear model; here it is used as a classifier.
# The import is included so the snippet is self-contained like its siblings.
from sklearn.linear_model import LogisticRegression

models.append(
    (
        "LogisticRegression",
        LogisticRegression(C=1000, tol=1e-10, solver="sag", max_iter=10000),
    )
)
```
#### 2.5人工神经网络
```python
# Artificial neural network built with Keras.
import matplotlib.pyplot as plt
import numpy as np
# Dense is a fully connected layer, Activation applies an activation function.
# Importing from keras.layers works on both old and current Keras releases,
# whereas keras.layers.core is no longer available in current ones.
from keras.layers import Activation, Dense
# Sequential is the container that stacks the layers.
from keras.models import Sequential
from sklearn.metrics import auc, roc_auc_score, roc_curve

mdl = Sequential()
# Hidden layer with 50 units; input_dim is the number of input features.
mdl.add(Dense(50, input_dim=len(f_v[0])))
mdl.add(Activation("sigmoid"))
# Output layer: two units because the label has two classes.
mdl.add(Dense(2))
mdl.add(Activation("softmax"))
# loss is the objective to minimise, optimizer the update rule. The original
# also built an SGD optimizer but never passed it to compile(); to try it,
# create one from keras.optimizers and pass it as `optimizer`.
mdl.compile(loss="mean_squared_error", optimizer="adam")

# One-hot encode the labels: 1 -> [0, 1], anything else -> [1, 0].
one_hot_train = np.array([[0, 1] if y == 1 else [1, 0] for y in Y_train])
# epochs = number of passes over the data (named nb_epoch before Keras 2);
# batch_size = samples per gradient step. 8999 is hard-coded; presumably it
# was chosen to match the training-set size (TODO confirm).
mdl.fit(X_train, one_hot_train, epochs=1000, batch_size=8999)

xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]
f = plt.figure()
for i, (X_part, Y_part) in enumerate(xy_lst):
    # predict() returns one score per class; predict_classes() would return
    # hard labels instead (needed for accuracy / recall / F1).
    Y_pred = mdl.predict(X_part)
    print(Y_pred)
    # Keep only the score of class 1 for the ROC computation.
    Y_pred = Y_pred[:, 1]
    f.add_subplot(1, 3, i + 1)
    fpr, tpr, threshold = roc_curve(Y_part, Y_pred)
    plt.plot(fpr, tpr)
    print("NN", "AUC", auc(fpr, tpr))
    print("NN", "AUC_Score", roc_auc_score(Y_part, Y_pred))
plt.show()
```
#### 2.6GBDT,回归树和提升树
```python
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted decision trees (regression trees combined by boosting).
# Author's note: a depth of 6 is typical; n_estimators is the number of trees.
models.append(
    (
        "GBDT",
        GradientBoostingClassifier(n_estimators=100, max_depth=6),
    )
)
```
### 模型的评估
```python
# Accuracy, recall and F-score are used to judge how good each model is.
from sklearn.metrics import accuracy_score, recall_score, f1_score

# The three partitions never change, so build the list once, outside the loop.
xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]
for clf_name, clf in models:
    clf.fit(X_train, Y_train)
    # i is 0 for training, 1 for validation, 2 for test.
    for i, (X_part, Y_part) in enumerate(xy_lst):
        Y_pred = clf.predict(X_part)
        print(i)
        print(clf_name, "-ACC", accuracy_score(Y_part, Y_pred))
        print(clf_name, "-REC", recall_score(Y_part, Y_pred))
        print(clf_name, "-Fl", f1_score(Y_part, Y_pred))
```
### 完整的程序:
```python
#encoding utf-8
# time: 2018/08/08
# name: py粉
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
import os
import pydotplus
os.environ["PATH"]+=os.pathsep+"E:/Program/Graphviz/bin/"
def hr_preprocessing(sl=False, le=False, npr=False, amh=False, tsc=False,
                     wa=False, pl5=False, dp=False, slr=False,
                     lower_d=False, ld_n=1,
                     path=r"D:\Python\python'数据分析与建模实现\data\HR.csv"):
    """Load the HR data set, clean it, and scale or encode every feature.

    Numeric switches (False -> MinMaxScaler, True -> StandardScaler):
        sl:  satisfaction_level
        le:  last_evaluation
        npr: number_project
        amh: average_monthly_hours
        tsc: time_spend_company
        wa:  Work_accident
        pl5: promotion_last_5years

    Categorical switches (False -> integer codes rescaled with MinMaxScaler,
    True -> one-hot columns via pandas.get_dummies):
        dp:  department
        slr: salary

    Args:
        lower_d: when true, reduce the processed features with PCA.
        ld_n: number of PCA components kept when ``lower_d`` is true.
        path: location of the CSV file; defaults to the path that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        A ``(features, label)`` pair. ``features`` is a DataFrame, or the
        ndarray produced by PCA when ``lower_d`` is true. ``label`` is the
        "left" column of the cleaned data.
    """
    # The file is opened here and the handle is given to pandas, as before;
    # the with-block makes sure the handle is closed afterwards.
    with open(path) as f:
        df = pd.read_csv(f)

    # 1. Clean the data: drop rows with missing key values, then keep rows
    #    whose satisfaction level is at most 1 and whose salary is not the
    #    "nme" marker. Both conditions are combined into a single mask built
    #    on the same frame.
    df = df.dropna(subset=["satisfaction_level", "last_evaluation"])
    keep = (df["satisfaction_level"] <= 1) & (df["salary"] != "nme")
    df = df[keep]

    # 2. Separate the label from the features.
    label = df["left"]
    df = df.drop("left", axis=1)

    # 3./4. Feature processing: scale each numeric column in place.
    numeric_switches = [sl, le, npr, amh, tsc, wa, pl5]
    numeric_columns = ["satisfaction_level", "last_evaluation",
                       "number_project", "average_monthly_hours",
                       "time_spend_company", "Work_accident",
                       "promotion_last_5years"]
    for use_standard, column in zip(numeric_switches, numeric_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        # The scalers expect a 2-D column vector; flatten the result again.
        df[column] = scaler.fit_transform(
            df[column].values.reshape(-1, 1)).ravel()

    # Encode the categorical columns, salary first and then department.
    for one_hot, column in ((slr, "salary"), (dp, "department")):
        if one_hot:
            # pandas provides one-hot encoding directly.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == "salary":
            # Salary has a natural order, so use the explicit mapping.
            df[column] = [map_salary(s) for s in df["salary"].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Bring the integer codes into the same range as the other features.
        df[column] = MinMaxScaler().fit_transform(
            df[column].values.reshape(-1, 1)).ravel()

    if lower_d:
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label
# Ordinal codes for the values of the "salary" column.
d = {"low": 0, "medium": 1, "high": 2}


def map_salary(s):
    """Return the ordinal code for salary level *s*.

    Values that are not keys of ``d`` fall back to 0, the same code as "low".
    """
    return d.get(s, 0)
def hr_modeling(features, label, run_nn=True, run_classifiers=False):
    """Split the data 6:2:2 and evaluate models on every partition.

    Args:
        features: DataFrame of preprocessed features (its ``.values`` is used).
        label: Series holding the binary target (its ``.values`` is used).
        run_nn: train the Keras network and plot its ROC curves. The default
            of True keeps the behaviour of the original function.
        run_classifiers: also fit and score the scikit-learn classifiers.
            The default is False because in the original this code sat after
            an unconditional ``return`` and never ran.

    Returns:
        None. Results are printed and, for the network, plotted.
    """
    from sklearn.model_selection import train_test_split

    f_v = features.values
    l_v = label.values
    # Hold out 20% for validation, then 25% of the remaining 80% (20% of the
    # total) for testing; 60% is left for training. No random_state is set,
    # so the split differs between runs.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(
        f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_tt, Y_tt, test_size=0.25)
    xy_lst = [(X_train, Y_train), (X_validation, Y_validation),
              (X_test, Y_test)]

    if run_nn:
        _evaluate_neural_network(xy_lst)
    if run_classifiers:
        _evaluate_classifiers(xy_lst)


def _evaluate_neural_network(xy_lst):
    """Train a one-hidden-layer network on xy_lst[0]; plot ROC per partition."""
    import matplotlib.pyplot as plt
    # keras.layers exposes Dense/Activation on old and current Keras alike;
    # the former keras.layers.core path is no longer available in current ones.
    from keras.layers import Activation, Dense
    from keras.models import Sequential
    from sklearn.metrics import auc, roc_auc_score, roc_curve

    X_train, Y_train = xy_lst[0]

    # Sequential is the container that stacks the layers.
    mdl = Sequential()
    # Hidden layer with 50 units; input_dim is the number of input features.
    mdl.add(Dense(50, input_dim=X_train.shape[1]))
    mdl.add(Activation("sigmoid"))
    # Output layer: two units because the label has two classes.
    mdl.add(Dense(2))
    mdl.add(Activation("softmax"))
    # loss is the objective to minimise, optimizer the update rule. The
    # original also built an SGD optimizer that was never passed to compile().
    mdl.compile(loss="mean_squared_error", optimizer="adam")

    # One-hot encode the labels: 1 -> [0, 1], anything else -> [1, 0].
    one_hot_train = np.array([[0, 1] if y == 1 else [1, 0] for y in Y_train])
    # epochs = number of passes over the data (named nb_epoch before Keras 2);
    # batch_size = samples per gradient step. 8999 is hard-coded; presumably
    # it was chosen to match the training-set size (TODO confirm).
    mdl.fit(X_train, one_hot_train, epochs=1000, batch_size=8999)

    f = plt.figure()
    for i, (X_part, Y_part) in enumerate(xy_lst):
        # predict() returns one score per class; predict_classes() would
        # return hard labels instead (needed for accuracy / recall / F1).
        Y_pred = mdl.predict(X_part)
        print(Y_pred)
        # Keep only the score of class 1 for the ROC computation.
        Y_pred = Y_pred[:, 1]
        f.add_subplot(1, 3, i + 1)
        fpr, tpr, threshold = roc_curve(Y_part, Y_pred)
        plt.plot(fpr, tpr)
        print("NN", "AUC", auc(fpr, tpr))
        print("NN", "AUC_Score", roc_auc_score(Y_part, Y_pred))
    plt.show()


def _evaluate_classifiers(xy_lst):
    """Fit each scikit-learn classifier on xy_lst[0] and print its scores."""
    from sklearn.ensemble import (AdaBoostClassifier,
                                  GradientBoostingClassifier,
                                  RandomForestClassifier)
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, f1_score, recall_score
    from sklearn.naive_bayes import BernoulliNB, GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    X_train, Y_train = xy_lst[0]

    models = [
        ("KNN", KNeighborsClassifier(n_neighbors=3)),
        # Naive Bayes.
        ("GaussianNB", GaussianNB()),
        ("BernoulliNB", BernoulliNB()),
        # Decision trees: default (Gini) impurity, then entropy. Author's
        # note: min_impurity_split=0.1 can be used as a pruning method.
        ("DecisionTreeGini", DecisionTreeClassifier()),
        ("DecisionTreeEntropy", DecisionTreeClassifier(criterion="entropy")),
        # Support vector machine; C is the penalty parameter.
        ("SVM Classifier", SVC(C=1000)),
        # Random forests: library defaults, then 11 trees using all features.
        ("OriginalRandomForest", RandomForestClassifier()),
        ("RandomForest",
         RandomForestClassifier(n_estimators=11, max_features=None)),
        # AdaBoost. Alternative noted by the author:
        # base_estimator=SVC(), n_estimators=100, algorithm="SAMME".
        ("Adaboost", AdaBoostClassifier(n_estimators=100)),
        # Logistic regression is also a linear model.
        ("LogisticRegression",
         LogisticRegression(C=1000, tol=1e-10, solver="sag", max_iter=10000)),
        # Gradient-boosted trees; author's note: a depth of 6 is typical.
        ("GBDT", GradientBoostingClassifier(max_depth=6, n_estimators=100)),
    ]

    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        # i is 0 for training, 1 for validation, 2 for test.
        for i, (X_part, Y_part) in enumerate(xy_lst):
            Y_pred = clf.predict(X_part)
            print(i)
            print(clf_name, "-ACC", accuracy_score(Y_part, Y_pred))
            print(clf_name, "-REC", recall_score(Y_part, Y_pred))
            print(clf_name, "-Fl", f1_score(Y_part, Y_pred))
    # The author's disabled snippet rendered a fitted decision tree with
    # sklearn.tree.export_graphviz and pydotplus.graph_from_dot_data(...),
    # writing it to "dt_tree_2.pdf".
def regr_test(features, label, regr=None):
    """Fit a regressor on the given data and print in-sample error metrics.

    The model is fitted and scored on the same data, so the metrics describe
    the fit, not the generalisation error.

    Args:
        features: DataFrame of predictor columns (its ``.values`` is used).
        label: Series with the regression target (its ``.values`` is used).
        regr: unfitted estimator offering ``fit``, ``predict`` and ``coef_``.
            Defaults to ``Ridge(alpha=1)``, the model that was previously
            hard-coded. Pass e.g. ``LinearRegression()`` or
            ``Lasso(alpha=0.001)`` to compare other linear models.

    Returns:
        None. Inputs, coefficients and metrics are printed.
    """
    from sklearn.linear_model import Ridge
    from sklearn.metrics import (mean_absolute_error, mean_squared_error,
                                 r2_score)

    print("X", features)
    print("Y", label)

    if regr is None:
        regr = Ridge(alpha=1)

    X, y = features.values, label.values
    regr.fit(X, y)
    Y_pred = regr.predict(X)
    print("Coef:", regr.coef_)
    print("MSE:", mean_squared_error(y, Y_pred))
    print("MAE:", mean_absolute_error(y, Y_pred))
    print("R2:", r2_score(y, Y_pred))
def main():
    """Preprocess the HR data with default switches and run the regression demo."""
    # Data cleaning and feature processing.
    features, label = hr_preprocessing()
    # Linear-model regression: predict last_evaluation from two other columns.
    predictors = features[["number_project", "average_monthly_hours"]]
    target = features["last_evaluation"]
    regr_test(predictors, target)
    # Classification and ensemble models (disabled by the author):
    # hr_modeling(features, label)


if __name__ == '__main__':
    main()