本代码参考自:https://github.com/lawlite19/MachineLearning_Python#%E4%B8%80%E7%BA%BF%E6%80%A7%E5%9B%9E%E5%BD%92
首先,线性回归公式:y = X*W + b。其中X是m行n列的数据集,m代表样本的个数,n代表每个样本的数据维度;W是n行1列的权重,b是偏置,y是m行1列的预测值。实际实现中通过在X左侧添加一列1,把偏置b并入W中一起训练。
损失函数采用MSE,采用梯度下降法进行训练
1. 加载数据集并进行读取
def load_csvdata(filename, split, dataType):
    """Load a delimited text dataset with numpy.

    Args:
        filename: path of the data file.
        split: field delimiter (e.g. ",").
        dataType: numpy dtype used to parse the values.

    Returns:
        2-D ndarray with one row per sample.
    """
    return np.loadtxt(filename, delimiter=split, dtype=dataType)


def read_data(filename="data.txt"):
    """Read the training set and split it into features and labels.

    Args:
        filename: CSV file to read; defaults to "data.txt" so the original
            zero-argument call keeps working.

    Returns:
        (X, y): X holds every column except the last one (the features);
        y is the last column (the labels).
    """
    data = load_csvdata(filename, split=",", dataType=np.float64)
    X = data[:, 0:-1]  # every column but the last is a feature
    y = data[:, -1]    # last column is the label
    return X, y
2. 对数据进行标准化
def feature_normalization(X):
    """Standardize each feature column to zero mean and unit variance.

    Args:
        X: (m, n) array-like of raw features.

    Returns:
        X_norm: float ndarray, (X - mu) / std applied column-wise.
        mu: per-column means, shape (n,).
        std: per-column standard deviations, shape (n,).
    """
    # Force float so the division is exact even when X arrives as ints
    # (the original wrote normalized values back into an int array,
    # which would truncate them to 0).
    X_norm = np.asarray(X, dtype=np.float64)
    mu = np.mean(X_norm, axis=0)
    std = np.std(X_norm, axis=0)
    # Single broadcast replaces the original per-column loop; the dead
    # np.zeros pre-initializations of mu/std are dropped.
    X_norm = (X_norm - mu) / std
    return X_norm, mu, std
3. 损失值的计算
def loss(X, y, w):
    """Mean-squared-error cost for linear regression.

    J = (Xw - y)^T (Xw - y) / (2m)

    Args:
        X: (m, n) design matrix.
        y: (m, 1) target column vector.
        w: (n, 1) weight column vector.

    Returns:
        Scalar cost value.
    """
    m = len(y)
    # The original used elementwise X*w, which is not the matrix product
    # for ndarrays (and is inconsistent with np.dot(X, w) used in
    # gradientDescent). The per-call debug print is also removed.
    residual = np.dot(X, w) - y
    J = np.sum(residual ** 2) / (2 * m)
    return J
4. 梯度下降算法的python实现
def gradientDescent(X, y, w, lr, num_iters):
    """Batch gradient descent for linear regression with MSE cost.

    Args:
        X: (m, n) design matrix (bias column added by the caller).
        y: (m, 1) target column vector.
        w: (n, 1) initial weights; not mutated in place.
        lr: learning rate.
        num_iters: number of update steps.

    Returns:
        w: (n, 1) learned weights.
        J_history: (num_iters, 1) cost recorded after each update.
    """
    m = len(y)  # number of samples
    J_history = np.zeros((num_iters, 1))
    for i in range(num_iters):
        h = np.dot(X, w)               # current predictions X @ w
        grad = np.dot(X.T, h - y) / m  # gradient of the MSE cost
        # Plain ndarray update. The original round-tripped w through a
        # deprecated np.matrix buffer holding every iteration's weights,
        # silently converting w to np.matrix along the way.
        w = w - lr * grad
        # Cost J = ||Xw - y||^2 / (2m), computed inline (the original
        # loss() used elementwise X*w and printed on every call).
        residual = np.dot(X, w) - y
        J_history[i] = np.sum(residual ** 2) / (2 * m)
    return w, J_history
5. 绘制损失值随迭代次数变化的曲线图
def plotLoss(J_history, num_iters):
    """Display the loss-vs-iteration curve for a finished training run.

    Args:
        J_history: per-iteration cost values (length num_iters).
        num_iters: number of training iterations that were run.
    """
    plt.plot(np.arange(1, num_iters + 1), J_history)
    plt.xlabel("num_iters")
    plt.ylabel("loss")
    plt.title("Loss value changes with the number of iterations")
    plt.show()
6. 主函数
if __name__ == "__main__": X,y = read_data() X,mu,sigma = feature_normalization(X) m = len(y) #样本的总个数 X = np.hstack((np.ones((m,1)),X)) #在x上加上1列是为了计算偏移b X=[x0,x1,x2,......xm] 其中x0=1 y = x*w y = y.reshape((-1,1)) lr = 0.01 num_iters = 400 w = np.random.normal(scale=0.01, size=((X.shape[1],1))) theta,J_history = gradientDescent(X,y,w,lr,num_iters) plotLoss(J_history,num_iters)
7.结果