基于逐步法思想的多元线性回归(改进)

学期结束,稍微完善一下之前的程序:新增对原始数据的归一化处理,以及模型预测值与真实值的对比图示。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import f as F
from sklearn.model_selection import train_test_split

# training design matrix (rows = samples; column 0 becomes the intercept)
Train_Data = []
# held-out test design matrix
Test_Data = []
# response vectors: training targets, test targets, model predictions
Y_Train = []
Y_Test = []
Y_Pred = []
# pseudo-inverse of Train_Data.T.dot(Train_Data); diagonal used in F-tests
C = []
# forward-step partial F-test statistics
F_Exam = []
# backward-step partial F-test statistics
F_Exam_Back = []
# right-tail F quantile; F_in is taken equal to F_out
quant = 0
# column indices of variables judged irrelevant (their weights are zeroed)
irrel_Train_Data = []
# fitted regression coefficients
weight = []
# sample count / feature count of the training matrix
n_samples,n_features =0,0


def Data_Process(Source_Data_Path):
    """Load the dataset, split it, standardize features, and precompute
    the pseudo-inverse of X^T X for the stepwise regression.

    The last column of the sheet is the response Y; every other column is
    a feature. A leading column of ones is inserted as the intercept.

    Parameters
    ----------
    Source_Data_Path : str
        Path to the Excel file holding the raw data (all numeric columns).

    Side effects: sets the module globals Train_Data, Test_Data, Y_Train,
    Y_Test, C, n_samples, n_features.
    """
    global Train_Data,Y_Train,C,n_samples,n_features,Test_Data,Y_Test
    raw = np.array(pd.read_excel(Source_Data_Path))
    Y_All = raw[:,-1]
    X_All = raw[:,0:-1]
    print('original data:\n',X_All.shape,Y_All.shape)
    # Split BEFORE normalizing: the original code standardized with
    # statistics of the full dataset, leaking test-set information into
    # the training transform. The row partition is unchanged (same
    # train_size and random_state), only the scaling statistics differ.
    Train_Data,Test_Data,Y_Train,Y_Test = train_test_split(X_All,Y_All,train_size=0.95,random_state=1)
    # Standardize with *training* mean and sample std (ddof=1), then
    # apply the identical transform to the test set.
    Col_Mean = np.mean(Train_Data,axis=0)
    print(Col_Mean.shape)
    col_std = np.std(Train_Data,axis=0,ddof=1)
    Train_Data = (Train_Data - Col_Mean) / col_std
    Test_Data = (Test_Data - Col_Mean) / col_std
    print('train set:\n',Train_Data.shape)
    print('test set:\n',Test_Data.shape)
    # Prepend the intercept column of ones to both matrices.
    Train_Data = np.array(np.insert(Train_Data,0,values=1.0,axis=1),dtype=np.float64)
    Test_Data = np.array(np.insert(Test_Data,0,values=1.0,axis=1),dtype=np.float64)
    n_samples,n_features = Train_Data.shape
    # pinv (not inv) tolerates a singular / ill-conditioned X^T X.
    C = np.linalg.pinv(Train_Data.T.dot(Train_Data))


def Linear_Regress():
    """Stepwise multiple linear regression over the global training data.

    Repeatedly fits OLS on the first ``index + 1`` columns of Train_Data
    (column 0 is the intercept). A forward partial F-test decides whether
    the newly entered variable is significant; once two or more variables
    are in the model, a backward pass re-tests each included variable and
    drops the least significant one when its statistic falls below the
    quantile ``quant``. Any deletion restarts the procedure from the first
    variable. Columns recorded in ``irrel_Train_Data`` receive weight 0 in
    the final coefficient vector.
    """
    global n_features,Train_Data,n_samples,C,irrel_Train_Data,weight
    index = 1
    # stepwise-regression main loop
    while index < n_features:
        print('Train_Data variations:\n',Train_Data.shape)
        num_Train_Data = index +1
        Train_Data_default = Train_Data[:,0:num_Train_Data]
        print('Train_Data_default',Train_Data_default.shape)
        # least-squares fit on the currently included columns
        weight = np.linalg.pinv(Train_Data_default.T.dot(Train_Data_default)).dot(Train_Data_default.T).dot(Y_Train)
        # report the current coefficient vector
        print('the ',index,' weight:\n',weight)
        # forward step: partial F-test of the newly entered variable
        print('******************************************向前迭代******************************************')
        if 1-F_Examination(Get_Q(weight,num_Train_Data),weight,index,'Forward'):
             # variable rejected: remove its column and restart from index 1
             index = 1
             n_features -= 1
             Train_Data = np.delete(Train_Data,num_Train_Data-1,1)
             if n_features>1:
                 # reset the shared state for the shrunken matrix
                 n_samples,n_features = Train_Data.shape
                 C = np.linalg.pinv(Train_Data.T.dot(Train_Data))
                 irrel_Train_Data.append(num_Train_Data-1)
                 continue
             else:
                 print('因变量与所有自变量线性无关!')
                 break
        # backward step: re-test every variable already in the model
        elif len(F_Exam)>=2:
            print('******************************************向后迭代******************************************')
            F_Exam_Back.clear()
            for num in range (1,len(F_Exam)):
                print('向后法对第 ',num,' 个变量做偏F检验')
                # partial F-test for the num-th included variable
                F_Examination(Get_Q(weight,num_Train_Data),weight,num,'Back')
            min_index = F_Exam_Back.index(min(F_Exam_Back))
            if min(F_Exam_Back) <= quant:
                print('the MIN index of F_Exam_Back:\n',min_index)
                # restart the scan from the first variable
                index = 1
                # delete the least significant variable's column
                n_features -= 1
                Train_Data = np.delete(Train_Data,min_index+1,1)
                irrel_Train_Data.append(min_index+1)
                if n_features>1:
                    # reset shared state and start the stepwise pass over
                    n_samples,n_features = Train_Data.shape
                    C = np.linalg.pinv(Train_Data.T.dot(Train_Data))
                    F_Exam.clear()
                    print('删除变量后从第一个变量重新开始逐步回归算法!')
                    continue
                else:
                    print('因变量与所有自变量线性无关!')
                    break
            else:
                print('F_Exam_Back最小值索引:\n',min_index)
                print('所有变量对Y_Train的影响效果显著!')
            index +=1
        # forward test passed, too few variables for a backward pass
        else:
            index +=1
    # set the weights of the discarded variables to zero
    for j in range(len(irrel_Train_Data)):
        weight = np.insert(weight,irrel_Train_Data[j],0)
    print('最终的拟合系数为:\n',weight)


# partial F-test of a single regression coefficient
def F_Examination(Q,weight,index,mark):
    """Partial F-test of coefficient ``index`` of the current model.

    Q      : residual sum of squares of the current fit (from Get_Q).
    weight : current OLS coefficient vector.
    index  : position of the tested variable (0 is the intercept).
    mark   : 'Forward' appends the statistic to F_Exam and returns
             True (significant) / False (rejected); any other value
             appends to F_Exam_Back (backward pass) and returns None.

    NOTE(review): dfd is n_samples - n_features - 2; the classical
    residual df would be n - p - 1 — confirm this offset is intended.
    """
    global quant,F_Exam
    # type-I error (rejection) level
    alpha = 0.05
    n = n_samples-n_features-2
    print('样本*度:\n',n)
    # right-tail quantile of F(1, n); also used as the F_out threshold
    quant = F.isf(q=alpha, dfn=1, dfd=n)
    print('分位数:\n',quant)
    # residual variance estimate
    sigma2 = Q/(n)
    print( 'sigma2',sigma2)
    # squared coefficient of the tested variable
    β2 = (weight[index]) ** 2
    print('对角线元素:\n',C[index][index])
    if mark == 'Forward':
        # F statistic: beta_j^2 / (sigma^2 * c_jj), c_jj from pinv(X^T X)
        F_Exam.append(β2/(sigma2 * C[index][index]))
        print('F_Exam:\n',F_Exam)
        if max(F_Exam) < quant :
            # not significant: discard the recorded statistic and reject
            del F_Exam[index-1]
            return False
        else:
            print('F_Exam大小:\n',len(F_Exam))
            return True
    else:
        F_Exam_Back.append(β2/(sigma2 * C[index][index]))
        print('F_Exam_Back:\n',F_Exam_Back)


def Get_Y():
    """Evaluate the fitted model on the test matrix and store the
    predictions in the global Y_Pred."""
    global Y_Pred
    prediction = np.zeros(len(Y_Test),dtype=np.float64)
    # accumulate each column's contribution, same order as the fit
    for col, coef in enumerate(weight):
        prediction += coef * Test_Data[:,col]
    Y_Pred = prediction


# residual sum of squares of the current fit
def Get_Q(weight,num_param):
    """Return Q, the residual sum of squares of a fit that uses the
    first ``num_param`` columns of the global training matrix."""
    fitted = np.zeros(n_samples,dtype=np.float64)
    # column-by-column accumulation, in the same order as the fit
    for col, coef in enumerate(weight[:num_param]):
        fitted += coef * Train_Data[:,col]
    residual = Y_Train - fitted
    return np.inner(residual,residual)


def figure2D(image_path='D:/File/project_pca/exp_image/'):
    """Plot predicted vs. real test responses and save the figure.

    Parameters
    ----------
    image_path : str, optional
        Directory the PNG is written into. Generalized from the original
        hard-coded Windows path; the default preserves old behavior.
    """
    plt.figure(1)
    plt.plot(range(len(Y_Pred)),Y_Pred,'b',label="Predict")
    plt.plot(range(len(Y_Test)),Y_Test,'r',label="Real")
    plt.xlabel("Number")
    plt.ylabel("Value")
    plt.title('Predict vs Real')
    image_name='_Predict_vs_Real.png'
    whole_image_path=image_path+image_name
    plt.legend()  # show the Predict / Real labels
    # save before show(): show() may clear the current figure
    plt.savefig(whole_image_path,dpi=800,bbox_inches='tight')
    plt.show()
    plt.clf()


# process control: load data, fit, predict, plot
if __name__ == '__main__':
    Data_Process('D:\\File\\project_pca\\data_segmentation\\abalon.xlsx')
    # stepwise regression
    Linear_Regress()
    # evaluate the fitted model on the test set
    Get_Y()
    # plot predicted vs. real values
    figure2D()

数据集:

http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data
上一篇:Python 笔记


下一篇:Leetcode 2024. Maximize the Confusion of an Exam [Python]