In this exercise, you will implement regularized linear regression and polynomial regression, and use them to study models with different bias-variance properties.
In the first half of the exercise, you will implement regularized linear regression to predict the amount of water flowing out of a dam from the change in a reservoir's water level. In the second half, you will go through some diagnostics for debugging learning algorithms and examine the effects of bias vs. variance.
1 Prepare dataset
1.1 Visualizing the dataset
The dataset is split into:
- **training set:** used to train the model
- **cross validation set:** used to select the regularization parameter
- **test set:** used to evaluate performance, on examples never used during training
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import scipy.optimize as opt
'''
1.Prepare dataset
'''
data = loadmat('ex5data1.mat')
print(data)
X, y = data['X'], data['y']
Xval, yval = data['Xval'], data['yval']
Xtest, ytest = data['Xtest'], data['ytest']
# Insert a column of 1's to all of the X's, as usual
X = np.insert(X, 0, 1, axis=1)
Xval = np.insert(Xval, 0, 1, axis=1)
Xtest = np.insert(Xtest, 0, 1, axis=1)
print(X.shape, y.shape) # (12, 2) (12, 1)
print(Xval.shape, yval.shape) # (21, 2) (21, 1)
print(Xtest.shape, ytest.shape) # (21, 2) (21, 1)
# Visualizing the data
def plotData(X, y):
    '''Plot the change in water level (x) against the water flowing out of the dam (y)'''
fig, ax = plt.subplots(figsize=(6, 4))
X = X[:, 1:]
ax.scatter(X, y, c='r')
ax.set_xlabel('Change in water level (x)')
ax.set_ylabel('Water flowing out of the dam (y)')
ax.grid(True)
# plt.show()
pass
1.2 Regularized linear regression cost function
$$J(\theta)=\frac{1}{2 m}\left(\sum_{i=1}^{m}\left(h_{\theta}\left(x^{(i)}\right)-y^{(i)}\right)^{2}\right)+\frac{\lambda}{2 m}\left(\sum_{j=1}^{n} \theta_{j}^{2}\right)$$
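Equivalently, in vectorized form (this is the expression `costReg` below implements; note that $\theta_0$ is excluded from the penalty):

$$J(\theta)=\frac{1}{2m}\left(\lVert X\theta-y\rVert^{2}+\lambda\lVert\theta_{1:n}\rVert^{2}\right)$$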
def costReg(theta, X, y, lam):
    '''
    Regularized linear regression cost. Do not regularize theta0.
    theta is a 1-d array with shape (n+1,)
    X is a matrix with shape (m, n+1)
    y is a matrix with shape (m, 1)
    :param theta: weights
    :param X: input matrix
    :param y: output vector
    :param lam: regularization parameter lambda
    :return: regularized cost (a scalar)
    '''
cost = np.nansum((X @ theta - y.flatten()) ** 2)
reg = lam * theta[1:] @ theta[1:]
return (cost + reg) * (1 / (2 * len(X)))
theta = np.ones(X.shape[1])
a = costReg(theta, X, y, 1) # 303.9931922202643
1.3 Regularized linear regression gradient
$$\begin{array}{ll} \frac{\partial J(\theta)}{\partial \theta_{0}}=\frac{1}{m} \sum_{i=1}^{m}\left(h_{\theta}\left(x^{(i)}\right)-y^{(i)}\right) x_{j}^{(i)} & \text{ for } j=0 \\ \frac{\partial J(\theta)}{\partial \theta_{j}}=\left(\frac{1}{m} \sum_{i=1}^{m}\left(h_{\theta}\left(x^{(i)}\right)-y^{(i)}\right) x_{j}^{(i)}\right)+\frac{\lambda}{m} \theta_{j} & \text{ for } j \geq 1 \end{array}$$
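Equivalently, in vectorized form (matching the `(X @ theta - y.flatten()) @ X` expression in `gradientReg` below):

$$\nabla J(\theta)=\frac{1}{m}\left(X^{T}(X\theta-y)+\lambda\begin{bmatrix}0\\ \theta_{1}\\ \vdots\\ \theta_{n}\end{bmatrix}\right)$$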
def gradientReg(theta, X, y, lam):
    '''
    Regularized linear regression gradient. Do not regularize theta0.
    theta: 1-d array with shape (2,)
    X: 2-d array with shape (12, 2)
    y: 2-d array with shape (12, 1)
    grad has the same shape as theta (2,)
    :param theta: weights
    :param X: input matrix
    :param y: output vector
    :param lam: regularization parameter lambda
    :return: regularized gradient (same shape as theta)
    '''
grad = (X @ theta - y.flatten()) @ X
reg = lam * theta
reg[0] = 0
return (grad + reg) / (len(X))
b = gradientReg(theta, X, y, 1) # [-15.30301567 598.25074417]
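Before handing the gradient to an optimizer, it is worth verifying it numerically. A minimal finite-difference check (a debugging sketch of my own, not part of the exercise code): perturb each component of theta and compare the numerical slope of costReg against gradientReg.

# Central-difference approximation of the gradient at theta
eps = 1e-4
num_grad = np.zeros_like(theta)
for j in range(len(theta)):
    e = np.zeros_like(theta)
    e[j] = eps
    num_grad[j] = (costReg(theta + e, X, y, 1) - costReg(theta - e, X, y, 1)) / (2 * eps)
print(np.allclose(num_grad, gradientReg(theta, X, y, 1)))  # True if the analytic gradient matches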
1.4 Fitting linear regression
def trainLinearReg(X, y, lam):
theta = np.zeros(X.shape[1])
res = opt.minimize(fun=costReg, jac=gradientReg, method='TNC', args=(X, y, lam), x0=theta)
return res.x
fit_theta = trainLinearReg(X, y, 0)
# plotData(X, y)
# plt.plot(X[:, 1], X @ fit_theta)
# plt.show()
Here we set λ = 0: the linear regression we are fitting has only two parameters, and at such a low dimension regularization does not help.
The plot shows that even the best-fitting straight line is a poor model for this data.
In the next section, you will implement a function to generate learning curves, which can help you debug a learning algorithm even when visualizing the data is not as easy.
2 Bias-variance
An important concept in machine learning is the trade-off between bias and variance. High bias means underfitting; high variance means overfitting.
In this part of the exercise, you will plot training error and validation error as learning curves to diagnose bias-variance problems.
2.1 Learning curves
$$J_{\mathrm{train}}(\theta)=\frac{1}{2 m}\left[\sum_{i=1}^{m}\left(h_{\theta}\left(x^{(i)}\right)-y^{(i)}\right)^{2}\right]$$
Train the model on subsets of the training set X of increasing size, starting from a single example, obtaining a different parameter vector θ for each subset. Then compute the validation error on the cross validation set Xval.
When computing the training cost and cross validation cost from each θ, remember not to use regularization: set λ = 0.
When computing the cross validation cost, use the entire cross validation set; there is no need to split it into subsets.
def plot_learning_curve(X, y, Xval, yval, lam):
    '''Plot the learning curves: training error and cross validation error as functions of the number of training examples'''
xx = range(1, len(X) + 1)
training_cost, cv_cost = [], []
for i in xx:
res = trainLinearReg(X[:i], y[:i], lam)
        training_cost_i = costReg(res, X[:i], y[:i], 0)  # compute the errors without regularization (lambda = 0)
        cv_cost_i = costReg(res, Xval, yval, 0)
training_cost.append(training_cost_i)
cv_cost.append(cv_cost_i)
pass
plt.figure(figsize=(8, 5))
plt.plot(xx, training_cost, label='training cost')
plt.plot(xx, cv_cost, label='cross validation cost')
plt.legend()
plt.xlabel('Number of Training samples')
plt.ylabel('Error')
plt.title('Learning curve for linear regression')
plt.grid(True)
plt.show()
pass
# plot_learning_curve(X, y, Xval, yval, 0)
The plot shows that as the number of training examples increases, both the training error and the cross validation error remain high. This is high bias: the model underfits.
3 Polynomial regression
Data preprocessing:
- X, Xval, and Xtest all need polynomial features added. Here we go up to the 6th power: with the 8th power the plots in the exercise PDF cannot be reproduced, because scipy's and Octave's optimization algorithms differ.
- Don't forget to normalize the features.
# Data preprocessing
def genPolyFeatures(X, power):
    '''
    Add polynomial features.
    On each iteration, append the i-th power of column 1 to the end of the array
    (column 0 is the bias term). Start from the 2nd power, since the array
    already contains the 1st power.
    :param X: input matrix with a bias column
    :param power: highest power to add
    :return: X with the polynomial feature columns appended
    '''
Xpoly = X.copy()
for i in range(2, power+1):
Xpoly = np.insert(Xpoly, Xpoly.shape[1], np.power(Xpoly[:, 1], i), axis=1)
pass
return Xpoly
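A quick sanity check of the output shape (my own sketch; the expected numbers follow from X being (12, 2) after the bias column, so powers 2 through 6 add five more columns):

print(genPolyFeatures(X, 6).shape)  # (12, 7): bias, x, x^2, ..., x^6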
For normalization, every dataset must be processed with the training set's mean and sample standard deviation. Remember this: store the training set's mean and sample standard deviation, and use them to normalize the other sets later.
def get_mean_std(X):
    '''Get the mean and sample standard deviation, used to normalize all of the datasets'''
    means = np.nanmean(X, axis=0)
    # column-wise mean over all examples
    stds = np.nanstd(X, axis=0, ddof=1)  # ddof=1 gives the sample standard deviation
return means, stds
def featureNormalize(myX, means, stds):
    '''Normalize the features, leaving the bias column untouched'''
X_norm = myX.copy()
    X_norm[:, 1:] = X_norm[:, 1:] - means[1:]
    # subtract the training means from the corresponding columns
    X_norm[:, 1:] = X_norm[:, 1:] / stds[1:]
    # divide the corresponding columns by the training standard deviations
return X_norm
Also note that this is the sample standard deviation, not the population standard deviation: with np.std(), ddof=1 gives the sample standard deviation, while the default ddof=0 gives the population standard deviation. pandas computes the sample standard deviation by default.
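A quick illustration of the ddof behavior (pandas is imported here only for this comparison):

import pandas as pd
a = np.array([1., 2., 3., 4.])
print(np.std(a))           # 1.118..., population standard deviation (default ddof=0)
print(np.std(a, ddof=1))   # 1.290..., sample standard deviation
print(pd.Series(a).std())  # 1.290..., pandas uses ddof=1 by default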
Generate the datasets with polynomial features added and normalization applied.
power = 6
X_poly = genPolyFeatures(X, power)
train_means, train_stds = get_mean_std(X_poly)
X_norm = featureNormalize(X_poly, train_means, train_stds)
Xval_norm = featureNormalize(genPolyFeatures(Xval, power), train_means, train_stds)
Xtest_norm = featureNormalize(genPolyFeatures(Xtest, power), train_means, train_stds)
def plot_fit(means, stds, lam, power):
    '''Plot the polynomial fit over the training data'''
theta = trainLinearReg(X_norm, y, lam)
x = np.linspace(-75, 55, 50)
xmat = x.reshape(-1, 1)
    # with -1, NumPy infers that dimension from the array's size and the remaining dimensions
xmat = np.insert(xmat, 0, 1, axis=1)
Xmat = genPolyFeatures(xmat, power)
Xmat_norm = featureNormalize(Xmat, means, stds)
plotData(X, y)
plt.plot(x, Xmat_norm @ theta, 'b--')
plt.show()
pass
# lambda = 0
plot_fit(train_means, train_stds, 0, 6)
plot_learning_curve(X_norm, y, Xval_norm, yval, 0)
# lambda = 1
plot_fit(train_means, train_stds, 1, 6)
plot_learning_curve(X_norm, y, Xval_norm, yval, 1)
# lambda = 100
plot_fit(train_means, train_stds, 100, 6)
plot_learning_curve(X_norm, y, Xval_norm, yval, 100)
With λ = 100 the penalty is too strong, and the model is back to underfitting.
3.1 Selecting λ using a cross validation set
lambdas = [0, 0.01, 0.02, 0.04, 0.08, 0.15, 0.32, 0.64, 1.28, 2.56, 3, 5.12, 10]
errors_train, errors_val = [], []
for l in lambdas:
theta = trainLinearReg(X_norm, y, l)
    errors_train.append(costReg(theta, X_norm, y, 0))  # remember to set lambda = 0 when computing the errors
errors_val.append(costReg(theta, Xval_norm, yval, 0))
pass
plt.figure(figsize=(8, 6))
plt.plot(lambdas, errors_train, c='b', label='train')
plt.plot(lambdas, errors_val, c='r', label='cv')
plt.legend()
plt.xlabel('Regularization parameter: λ')
plt.ylabel('Error')
plt.grid(True)
# plt.show()
# The cross validation cost is minimized at lambda = 2.56
var = lambdas[np.nanargmin(errors_val)] # 2.56
# print(var)
3.2 Computing test set error
'''
6.Computing test set error
'''
theta = trainLinearReg(X_norm, y, 2.56)
print('test cost(l={}) = {}'.format(2.56, costReg(theta, Xtest_norm, ytest, 0)))
# for l in lambdas:
# theta = trainLinearReg(X_norm, y, l)
#     print('test cost(l={}) = {}'.format(l, costReg(theta, Xtest_norm, ytest, 0)))
Reference: https://blog.csdn.net/Cowry5/article/details/80421712