python数据分析之分类模型与回归模型-第七次笔记

python数据分析之分类模型与回归模型-第七次笔记


1.分类模型

– *1.1KNN 算法
– *1.2朴素贝叶斯 算法
– *1.3支持向量机SVM 算法
– *1.4集成方法—随机森林算法
– *1.5集成方法—Adaboost 算法
– *1.6决策树

2.回归模型

– *2.1线性回归
– *2.2岭回归
– *2.3Lasso回归
– *2.4逻辑回归
– *2.5人工神经网络
– *2.6GBDT,回归树和提升树


###  提取数据

        # Build the training / validation / test sets at a 6:2:2 ratio:
        # 20% is held out for validation first, then 25% of the remaining 80%
        # (20% of the total) becomes the test set.
        from sklearn.model_selection import train_test_split

        f_v = features.values
        f_names = features.columns.values
        l_v = label.values
        X_tt, X_validation, Y_tt, Y_validation = train_test_split(
            f_v, l_v, test_size=0.2
        )
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_tt, Y_tt, test_size=0.25
        )
    

###  1.分类模型

####  1.1KNN 算法

```python
    # Import the nearest-neighbour estimators
    from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors

    # Register a classifier that votes among the 3 nearest neighbours
    models.append(("KNN", KNeighborsClassifier(n_neighbors=3)))
```

####  1.2朴素贝叶斯 算法

```python
        from sklearn.naive_bayes import BernoulliNB, GaussianNB

        # Naive Bayes: register the Gaussian and the Bernoulli variants
        models.extend([
            ("GaussianNB", GaussianNB()),
            ("BernoulliNB", BernoulliNB()),
        ])
```

####  1.3支持向量机SVM 算法

```python
        from sklearn.svm import SVC

        # SVM (support vector machine); the C parameter controls the fitting precision
        svm_entry = ("SVM Classifier", SVC(C=1000))
        models.append(svm_entry)
```

####  1.4集成方法—随机森林算法

```python
        from sklearn.ensemble import RandomForestClassifier

        # "Original" forest: scikit-learn's default settings
        models.append(("OriginalRandomForest", RandomForestClassifier()))
        # Random forest with 11 trees; max_features=None makes every split
        # consider all features
        models.append(("RandomForest", RandomForestClassifier(n_estimators=11, max_features=None)))
```

####  1.5集成方法—Adaboost 算法

```python
        from sklearn.ensemble import AdaBoostClassifier

        # Ensemble classifier (AdaBoost). A variant noted by the author:
        # AdaBoostClassifier(base_estimator=SVC(), n_estimators=100, algorithm="SAMME")
        adaboost_entry = ("Adaboost", AdaBoostClassifier(n_estimators=100))
        models.append(adaboost_entry)
```

####  1.6决策树

```python
        from sklearn.tree import DecisionTreeClassifier, export_graphviz

        # Author's note: min_impurity_split=0.1 sets the minimum impurity for a
        # split and acts as a pruning method.
        models.extend([
            # Decision tree using Gini impurity (the default criterion)
            ("DecisionTreeGini", DecisionTreeClassifier()),
            # Decision tree using entropy
            ("DecisionTreeEntropy", DecisionTreeClassifier(criterion="entropy")),
        ])
```

###  2.回归模型

####  2.1线性回归

```python
        # Linear models used in the regression section
        from sklearn.linear_model import Lasso, LinearRegression, Ridge

        # Plain linear regression (left disabled in the original notes):
        # regr = LinearRegression()
```

####  2.2岭回归

```python
        # Ridge regression with regularisation strength alpha=1
        regr = Ridge(alpha=1)
```

####  2.3Lasso回归

```python
        # Lasso regression with regularisation strength alpha=0.001
        regr = Lasso(alpha=0.001)

```

####  2.4逻辑回归

```python
        # Logistic regression is also a kind of linear model
        from sklearn.linear_model import LogisticRegression

        models.append(("LogisticRegression", LogisticRegression(C=1000, tol=1e-10, solver="sag", max_iter=10000)))

```

####  2.5人工神经网络

```python
        # Artificial neural network: Sequential is a container for stacked layers
        from keras.models import Sequential
        # Dense is a fully connected layer; Activation applies an activation function
        from keras.layers import Dense, Activation
        import matplotlib.pyplot as plt
        from sklearn.metrics import roc_curve, auc, roc_auc_score

        # Create the container
        mdl = Sequential()
        # First layer: 50 units; input_dim is the number of input features
        mdl.add(Dense(50, input_dim=len(f_v[0])))
        mdl.add(Activation("sigmoid"))
        # Output layer: 2 units because there are two class labels
        mdl.add(Dense(2))
        mdl.add(Activation("softmax"))
        # loss is the function being minimised; optimizer picks the training
        # algorithm (Adam here; "sgd" would select stochastic gradient descent)
        mdl.compile(loss="mean_squared_error", optimizer="adam")
        # One-hot encode the labels: 1 -> [0, 1], anything else -> [1, 0]
        Y_train_onehot = np.array([[0, 1] if i == 1 else [1, 0] for i in Y_train])
        # epochs = number of training iterations; batch_size = number of samples
        # drawn for each gradient step
        mdl.fit(X_train, Y_train_onehot, epochs=1000, batch_size=8999)

        xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]
        f = plt.figure()

        for i, (X_part, Y_part) in enumerate(xy_lst):
            # predict() returns one score per class for every sample
            # (predict_classes() would return the class labels instead)
            Y_pred = mdl.predict(X_part)
            print(Y_pred)
            # Keep column 1, the score of label 1, as a flat array for the ROC curve
            Y_pred = np.array(Y_pred[:, 1]).reshape((1, -1))[0]

            # One subplot per split: training, validation, test
            f.add_subplot(1, 3, i + 1)
            fpr, tpr, threshold = roc_curve(Y_part, Y_pred)
            plt.plot(fpr, tpr)
            print("NN", "AUC", auc(fpr, tpr))
            print("NN", "AUC_Score", roc_auc_score(Y_part, Y_pred))
        plt.show()

```

####  2.6GBDT,回归树和提升树

```python
        from sklearn.ensemble import GradientBoostingClassifier

        # GBDT (regression trees and boosted trees): max_depth=6 is the depth
        # the author normally uses; n_estimators is the number of trees
        gbdt_entry = ("GBDT", GradientBoostingClassifier(max_depth=6, n_estimators=100))
        models.append(gbdt_entry)
```

###  模型的评估

```python
    #准确度,召回度, F-score度,为了评价模型的好坏。
        from sklearn.metrics import accuracy_score, recall_score, f1_score
        for clf_name ,clf in models:
            clf.fit(X_train,Y_train)
            xy_lst=[(X_train,Y_train),(X_validation,Y_validation),(X_test,Y_test)]
            for i in  range(len(xy_lst)):
                X_part=xy_lst[i][0]
                Y_part=xy_lst[i][1]
                Y_pred=clf.predict(X_part)
                print(i)
                print(clf_name,"-ACC",accuracy_score(Y_part,Y_pred))
                print(clf_name,"-REC",recall_score(Y_part,Y_pred))
                print(clf_name,"-Fl",f1_score(Y_part,Y_pred))
```

###  完整的程序:

```python
    #encoding utf-8
    # time: 2018/08/08
    # name: py粉
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import MinMaxScaler,StandardScaler
    from sklearn.preprocessing import LabelEncoder,OneHotEncoder
    from sklearn.preprocessing import Normalizer
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.decomposition import PCA
    import os
    import pydotplus
    os.environ["PATH"]+=os.pathsep+"E:/Program/Graphviz/bin/"
    
    
    #sl:satisfaction_level---False:MinMaxScaler;True:StandardScaler
    #le:last_evaluation---False:MinMaxScaler;True:StandardScaler
    #npr:number_project---False:MinMaxScaler;True:StandardScaler
    #amh:average_monthly_hours---False:MinMaxScaler;True:StandardScaler
    #tsc:time_spend_company---False:MinMaxScaler;True:StandardScaler
    #wa:Work_accident---False:MinMaxScaler;True:StandardScaler
    #pl5:promotion_last_5years---False:MinMaxScaler;True:StandardScaler
    #dp:department---False:LabelEncoding;True:OneHotEncoding
    #slr:salary---False:LabelEncoding;True:OneHotEncoding
    
    def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=False,slr=False,lower_d=False,ld_n=1,
                         csv_path=r"D:\Python\python'数据分析与建模实现\data\HR.csv"):
        """Load the HR CSV, clean it, and scale or encode every feature column.

        Args:
            sl, le, npr, amh, tsc, wa, pl5: one flag per numeric column
                (satisfaction_level, last_evaluation, number_project,
                average_monthly_hours, time_spend_company, Work_accident,
                promotion_last_5years). False applies MinMaxScaler, True
                applies StandardScaler.
            dp, slr: flags for "department" and "salary". False label-encodes
                the column and then min-max scales it; True one-hot encodes it
                with pandas.get_dummies.
            lower_d: when True, reduce the features with PCA before returning.
            ld_n: number of PCA components used when lower_d is True.
            csv_path: location of the HR data file.

        Returns:
            (features, label). features is a DataFrame, or the array returned
            by PCA.fit_transform when lower_d is True; label is the "left"
            column.
        """
        # The path is a raw string so its backslashes are never read as escape
        # sequences; the with-block closes the file handle after parsing.
        with open(csv_path) as f:
            df = pd.read_csv(f)

        # 1. Clean the data. Columns: satisfaction_level, last_evaluation,
        #    number_project, average_monthly_hours, time_spend_company,
        #    Work_accident, left, promotion_last_5years, department, salary
        df = df.dropna(subset=["satisfaction_level", "last_evaluation"])
        # Both conditions go into one mask; chaining two boolean indexers
        # makes pandas re-align the second mask and emit a warning.
        df = df[(df["satisfaction_level"] <= 1) & (df["salary"] != "nme")]

        # 2. Separate the label
        label = df["left"]
        df = df.drop("left", axis=1)

        # 3./4. Feature selection and processing: scale the numeric columns
        scaler_lst = [sl, le, npr, amh, tsc, wa, pl5]
        column_lst = ["satisfaction_level", "last_evaluation", "number_project",
                      "average_monthly_hours", "time_spend_company", "Work_accident",
                      "promotion_last_5years"]
        for use_standard, column in zip(scaler_lst, column_lst):
            scaler = StandardScaler() if use_standard else MinMaxScaler()
            # Scalers expect a 2-D column vector; flatten the result back to 1-D
            df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

        # Encode the categorical columns
        for one_hot, column in zip([slr, dp], ["salary", "department"]):
            if one_hot:
                # pandas provides one-hot encoding through get_dummies
                df = pd.get_dummies(df, columns=[column])
                continue
            if column == "salary":
                # Salary has a natural order, so it uses the explicit mapping
                df[column] = [map_salary(s) for s in df["salary"].values]
            else:
                df[column] = LabelEncoder().fit_transform(df[column])
            df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

        if lower_d:
            return PCA(n_components=ld_n).fit_transform(df.values), label

        return df, label
    # Ordinal codes used to turn the "salary" values into numeric labels
    d = {"low": 0, "medium": 1, "high": 2}


    def map_salary(s):
        """Return the numeric code for salary level s, or 0 when s is unknown."""
        return d.get(s, 0)
    def hr_modeling(features, label, run_neural_net=True, run_classifiers=False):
        """Split the data 6:2:2 and evaluate models on every split.

        Args:
            features: DataFrame of preprocessed features.
            label: Series holding the class labels.
            run_neural_net: train the Keras network and plot its ROC curves.
            run_classifiers: fit the scikit-learn classifiers and print their
                accuracy, recall and F1. Off by default: this code used to sit
                behind an unconditional ``return`` and never ran, so the
                default keeps the previous behaviour.

        Returns:
            None. Results are printed and plotted.
        """
        # Training / validation / test sets at a 6:2:2 ratio: hold out 20%
        # for validation, then 25% of the remaining 80% for testing.
        from sklearn.model_selection import train_test_split
        f_v = features.values
        l_v = label.values
        X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
        X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)
        xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]

        if run_neural_net:
            _fit_and_plot_neural_net(xy_lst)
        if run_classifiers:
            _evaluate_classifiers(xy_lst)


    def _fit_and_plot_neural_net(xy_lst):
        """Train a small Keras network on xy_lst[0] and plot one ROC curve per split."""
        # Sequential is a container for stacked layers
        from keras.models import Sequential
        # Dense is a fully connected layer; Activation applies an activation function
        from keras.layers import Dense, Activation
        import matplotlib.pyplot as plt
        from sklearn.metrics import roc_curve, auc, roc_auc_score

        X_train, Y_train = xy_lst[0]
        mdl = Sequential()
        # First layer: 50 units; input_dim is the number of input features
        mdl.add(Dense(50, input_dim=len(X_train[0])))
        mdl.add(Activation("sigmoid"))
        # Output layer: 2 units because there are two class labels
        mdl.add(Dense(2))
        mdl.add(Activation("softmax"))
        # loss is the function being minimised; optimizer picks the training
        # algorithm (Adam here; "sgd" would select stochastic gradient descent)
        mdl.compile(loss="mean_squared_error", optimizer="adam")
        # One-hot encode the labels: 1 -> [0, 1], anything else -> [1, 0]
        Y_train_onehot = np.array([[0, 1] if i == 1 else [1, 0] for i in Y_train])
        # epochs = number of training iterations; batch_size = number of samples
        # drawn for each gradient step
        mdl.fit(X_train, Y_train_onehot, epochs=1000, batch_size=8999)

        f = plt.figure()
        for i, (X_part, Y_part) in enumerate(xy_lst):
            # predict() returns one score per class for every sample
            # (predict_classes() would return the class labels instead)
            Y_pred = mdl.predict(X_part)
            print(Y_pred)
            # Keep column 1, the score of label 1, as a flat array for the ROC curve
            Y_pred = np.array(Y_pred[:, 1]).reshape((1, -1))[0]

            # One subplot per split: training, validation, test
            f.add_subplot(1, 3, i + 1)
            fpr, tpr, threshold = roc_curve(Y_part, Y_pred)
            plt.plot(fpr, tpr)
            print("NN", "AUC", auc(fpr, tpr))
            print("NN", "AUC_Score", roc_auc_score(Y_part, Y_pred))
        plt.show()


    def _evaluate_classifiers(xy_lst):
        """Fit each scikit-learn classifier on xy_lst[0] and print ACC/REC/F1 per split."""
        from sklearn.metrics import accuracy_score, recall_score, f1_score
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.naive_bayes import GaussianNB, BernoulliNB
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.svm import SVC
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.ensemble import AdaBoostClassifier
        from sklearn.linear_model import LogisticRegression
        from sklearn.ensemble import GradientBoostingClassifier

        models = [
            ("KNN", KNeighborsClassifier(n_neighbors=3)),
            # Naive Bayes
            ("GaussianNB", GaussianNB()),
            ("BernoulliNB", BernoulliNB()),
            # Decision trees: Gini impurity (default) and entropy
            ("DecisionTreeGini", DecisionTreeClassifier()),
            ("DecisionTreeEntropy", DecisionTreeClassifier(criterion="entropy")),
            # SVM (support vector machine); the C parameter controls the fitting precision
            ("SVM Classifier", SVC(C=1000)),
            # Random forest with default settings, then with 11 trees
            ("OriginalRandomForest", RandomForestClassifier()),
            ("RandomForest", RandomForestClassifier(n_estimators=11, max_features=None)),
            # Ensemble classifier (AdaBoost)
            ("Adaboost", AdaBoostClassifier(n_estimators=100)),
            # Logistic regression is also a kind of linear model
            ("LogisticRegression", LogisticRegression(C=1000, tol=1e-10, solver="sag", max_iter=10000)),
            # GBDT: max_depth=6 is the depth the author normally uses;
            # n_estimators is the number of trees
            ("GBDT", GradientBoostingClassifier(max_depth=6, n_estimators=100)),
        ]

        X_train, Y_train = xy_lst[0]
        for clf_name, clf in models:
            clf.fit(X_train, Y_train)
            # Split index printed below: 0 = training, 1 = validation, 2 = test
            for i, (X_part, Y_part) in enumerate(xy_lst):
                Y_pred = clf.predict(X_part)
                print(i)
                print(clf_name, "-ACC", accuracy_score(Y_part, Y_pred))
                print(clf_name, "-REC", recall_score(Y_part, Y_pred))
                print(clf_name, "-Fl", f1_score(Y_part, Y_pred))
            # Optional: draw a fitted decision tree (disabled in the original
            # notes). Needs `from io import StringIO` and
            # `from sklearn.tree import export_graphviz`:
            # dot_data = StringIO()
            # export_graphviz(clf, out_file=dot_data,
            #                 feature_names=f_names,  # features.columns.values
            #                 class_names=["NL", "L"],
            #                 filled=True,
            #                 rounded=True,
            #                 special_characters=True)
            # graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
            # graph.write_pdf("dt_tree_2.pdf")
    
    def regr_test(features,label):
        """Fit a ridge regression on the given data and print its fit metrics.

        The model is fitted and scored on the same features/label, so the
        printed MSE, MAE and R2 describe the fit to the data it was trained on.
        """
        print("X",features)
        print("Y",label)

        from sklearn.linear_model import LinearRegression,Ridge,Lasso
        from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

        x_values = features.values
        y_values = label.values

        # Ridge regression is the active choice; the alternatives kept in the
        # notes are LinearRegression() and Lasso(alpha=0.001).
        model = Ridge(alpha=1)
        model.fit(x_values, y_values)
        y_hat = model.predict(x_values)
        print("Coef:", model.coef_)

        report = (
            ("MSE:", mean_squared_error),
            ("MAE:", mean_absolute_error),
            ("R2:", r2_score),
        )
        for tag, metric in report:
            print(tag, metric(y_values, y_hat))
    
    def main():
        """Preprocess the HR data and run the regression demo."""
        # Data cleaning and feature processing, all options at their defaults
        features, label = hr_preprocessing()

        # Linear regression: predict last_evaluation from the project count
        # and the average monthly hours
        predictors = features[["number_project", "average_monthly_hours"]]
        target = features["last_evaluation"]
        regr_test(predictors, target)

        # Classification and ensemble models (disabled):
        # hr_modeling(features, label)


    if __name__ == "__main__":
        main()

```

python数据分析之分类模型与回归模型-第七次笔记

上一篇:一线互联网架构师360°全方面性能调优,深度集成!


下一篇:2021最后一次Java面试,dockerexec原理