学期结束,稍微完善下之前的程序,新增,对原始数据进行归一化处理,以及模型预测值和真实值的比对图示。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import f as F
from sklearn.model_selection import train_test_split
#train data
Train_Data = []
#test data
Test_Data = []
#Y
Y_Train = []
Y_Test = []
Y_Pred = []
#Train_Data.T.dot(Train_Data)
C = []
#向前法偏F检验结果
F_Exam = []
#向后法偏F检验结果
F_Exam_Back = []
#右分位数,令Fin=Fout
quant = 0
#无关变量索引
irrel_Train_Data = []
#权重
weight = []
#样本尺寸
n_samples,n_features =0,0
def Data_Process(Source_Data_Path):
global Train_Data,Y_Train,C,n_samples,n_features,Test_Data,Y_Test
Train_Data = np.array(pd.read_excel(Source_Data_Path))
Y_Train = Train_Data[:,-1]
Train_Data = Train_Data[:,0:-1]
print('original data:\n',Train_Data.shape,Y_Train.shape)
#Normalization
Col_Mean=np.mean(Train_Data,axis=0)
Col_Mean=np.array(Col_Mean)
print(Col_Mean.shape)
col_var=np.std(Train_Data,axis=0,ddof=1);
col_var=np.array(col_var)
Train_Data = Train_Data-Col_Mean
Train_Data /= col_var
Train_Data,Test_Data,Y_Train,Y_Test = train_test_split(Train_Data,Y_Train,train_size=0.95,random_state=1);
print('train set:\n',Train_Data.shape)
print('test set:\n',Test_Data.shape)
T_Train_Data = np.array([1] * Train_Data.shape[0])
T_Test_Data = np.array([1] * Test_Data.shape[0])
Train_Data = np.array(np.insert(Train_Data,0,values = T_Train_Data, axis=1),dtype=np.float64)
Test_Data = np.array(np.insert(Test_Data,0,values = T_Test_Data, axis=1),dtype=np.float64)
n_samples,n_features = Train_Data.shape
C = np.linalg.pinv(Train_Data.T.dot(Train_Data))
def Linear_Regress():
global n_features,Train_Data,n_samples,C,irrel_Train_Data,weight
index = 1
#逐步回归
while index < n_features:
print('Train_Data variations:\n',Train_Data.shape)
num_Train_Data = index +1
Train_Data_default = Train_Data[:,0:num_Train_Data]
print('Train_Data_default',Train_Data_default.shape)
#LSM, get weight
weight = np.linalg.pinv(Train_Data_default.T.dot(Train_Data_default)).dot(Train_Data_default.T).dot(Y_Train)
#output weight
print('the ',index,' weight:\n',weight)
#forward
print('******************************************向前迭代******************************************')
if 1-F_Examination(Get_Q(weight,num_Train_Data),weight,index,'Forward'):
#重置
index = 1
n_features -= 1
Train_Data = np.delete(Train_Data,num_Train_Data-1,1)
if n_features>1:
#reset
n_samples,n_features = Train_Data.shape
C = np.linalg.pinv(Train_Data.T.dot(Train_Data))
irrel_Train_Data.append(num_Train_Data-1)
continue
else:
print('因变量与所有自变量线性无关!')
break
#back
elif len(F_Exam)>=2:
print('******************************************向后迭代******************************************')
F_Exam_Back.clear()
for num in range (1,len(F_Exam)):
print('向后法对第 ',num,' 个变量做偏F检验')
# F examination for the i parameter
F_Examination(Get_Q(weight,num_Train_Data),weight,num,'Back')
min_index = F_Exam_Back.index(min(F_Exam_Back))
if min(F_Exam_Back) <= quant:
print('the MIN index of F_Exam_Back:\n',min_index)
#reset
index = 1
#delete unrelated varibles
n_features -= 1
Train_Data = np.delete(Train_Data,min_index+1,1)
irrel_Train_Data.append(min_index+1)
if n_features>1:
#reset
n_samples,n_features = Train_Data.shape
C = np.linalg.pinv(Train_Data.T.dot(Train_Data))
F_Exam.clear()
print('删除变量后从第一个变量重新开始逐步回归算法!')
continue
else:
print('因变量与所有自变量线性无关!')
break
else:
print('F_Exam_Back最小值索引:\n',min_index)
print('所有变量对Y_Train的影响效果显著!')
index +=1
#iteration
else:
index +=1
#舍去的变量权重置零
for j in range(len(irrel_Train_Data)):
weight = np.insert(weight,irrel_Train_Data[j],0)
print('最终的拟合系数为:\n',weight)
#计算偏F检验
def F_Examination(Q,weight,index,mark):
global quant,F_Exam
#弃真
alpha = 0.05
n = n_samples-n_features-2
print('样本*度:\n',n)
quant = F.isf(q=alpha, dfn=1, dfd=n)
print('分位数:\n',quant)
sigma2 = Q/(n)
print( 'sigma2',sigma2)
β2 = (weight[index]) ** 2
print('对角线元素:\n',C[index][index])
if mark == 'Forward':
F_Exam.append(β2/(sigma2 * C[index][index]))
print('F_Exam:\n',F_Exam)
if max(F_Exam) < quant :
del F_Exam[index-1]
return False
else:
print('F_Exam大小:\n',len(F_Exam))
return True
else:
F_Exam_Back.append(β2/(sigma2 * C[index][index]))
print('F_Exam_Back:\n',F_Exam_Back)
def Get_Y():
global Y_Pred
Y_Pred_Temp = np.zeros(len(Y_Test),dtype=np.float64)
for i in range(len(weight)):
Y_Pred_Temp += weight[i] * Test_Data[:,i]
Y_Pred = Y_Pred_Temp
#获取Q
def Get_Q(weight,num_param):
predict_Y_Train = np.zeros(n_samples,dtype=np.float64)
for index in range(num_param):
predict_Y_Train += weight[index] * Train_Data[:,index]
Q_Para = Y_Train - predict_Y_Train
Q = np.inner(Q_Para,Q_Para)
return Q
def figure2D():
#image path
image_path='D:/File/project_pca/exp_image/'
plt.figure(1)
plt.plot(range(len(Y_Pred)),Y_Pred,'b',label="Predict");
plt.plot(range(len(Y_Test)),Y_Test,'r',label="Real");
plt.xlabel("Number");
plt.ylabel("Value");
plt.title('Predict vs Real')
image_name='_Predict_vs_Real.png'
whole_image_path=image_path+image_name
plt.legend() # 用于显示plot函数里面 label标签
plt.savefig(whole_image_path,dpi=800,bbox_inches='tight')
plt.show()
plt.clf()
#process control
if __name__ == '__main__':
Data_Process('D:\\File\\project_pca\\data_segmentation\\abalon.xlsx')
#逐步回归
Linear_Regress()
#根据模型获取预测数据
Get_Y()
#绘制
figure2D()
数据集:
http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data