在统计学中,线性回归(Linear Regression)是利用称为线性回归方程的最小平方函数对一个或多个自变量和因变量之间关系进行建模的一种回归分析。
基本函数形式:
一元线性函数:
y=w*x+b
# Simple (one-variable) linear regression implementation.
# numpy provides the numeric helpers; matplotlib is used for visualisation.
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.font_manager import FontProperties
# 引入本地字体文件,否则中文会有乱码
# font_set = FontProperties(fname=r"./work/ simsun.ttc", size=12)
# Training set for the one-variable example: inputs and their observed targets.
x_train = [4, 8, 5, 10, 12]
y_train = [20, 50, 30, 70, 60]
def draw(x_train, y_train):
    """Scatter-plot the training samples with matplotlib.

    This function only adds the points; it does not call ``plt.show()``.

    Args:
        x_train: Input values (horizontal axis).
        y_train: Target values (vertical axis).
    """
    plt.scatter(x_train, y_train)
def fit(x_train, y_train):
    """Fit ``y = w * x + b`` to the samples by ordinary least squares.

    Setting the derivatives of the squared error with respect to the slope
    and the intercept to zero yields the closed form used here::

        w = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x)) ** 2)
        b = mean(y) - w * mean(x)

    Args:
        x_train: Sequence of input values.
        y_train: Sequence of target values, same length as ``x_train``.

    Returns:
        A tuple ``(w, b)`` holding the slope and the intercept.

    Raises:
        ValueError: If the inputs are empty, differ in length, or every x
            value is identical (the slope is undefined in that case).
    """
    xs = np.asarray(x_train, dtype=float)
    ys = np.asarray(y_train, dtype=float)
    if xs.shape != ys.shape:
        raise ValueError("x_train and y_train must have the same length")
    if xs.size == 0:
        raise ValueError("x_train and y_train must not be empty")

    # Compute the means once instead of on every loop iteration.
    x_mean = xs.mean()
    y_mean = ys.mean()
    x_dev = xs - x_mean

    numerator = np.sum(x_dev * (ys - y_mean))
    denominator = np.sum(x_dev ** 2)
    if denominator == 0:
        # All x values coincide: a vertical line has no finite slope.
        raise ValueError("x_train must contain at least two distinct values")

    w = numerator / denominator
    b = y_mean - w * x_mean
    return w, b
def predict(x, w, b):
    """Return the model output ``w * x + b`` for the input ``x``.

    Args:
        x: Input value.
        w: Slope.
        b: Intercept.
    """
    return w * x + b
def fit_line(w, b):
    """Plot the line ``y = w * x + b`` and display the figure.

    Args:
        w: Slope of the line.
        b: Intercept of the line.
    """
    # np.linspace(start, stop, num) returns `num` evenly spaced values from
    # start to stop (endpoint included by default): 9 points from 4 to 15.
    x = np.linspace(4, 15, 9)
    y = w * x + b
    plt.plot(x, y)
    plt.show()
if __name__ == "__main__":
    draw(x_train, y_train)  # scatter the training samples
    w, b = fit(x_train, y_train)
    print(w, b)  # slope and intercept
    fit_line(w, b)  # draw the fitted line and show the figure
多元线性回归函数,有多个影响y的x值,可以理解为:
$y = w_1 x_1 + w_2 x_2 + \cdots + w_n x_n + b$
教科书表达为:
$Y_i = \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \cdots + \beta_k X_{ki} + \mu_i, \quad i = 1, 2, \ldots, n$
# Multiple linear regression using scikit-learn's LinearRegression.
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

# Training set: five samples with two features each, and their targets.
x_train = np.array([[2, 4], [5, 8], [5, 9], [7, 10], [9, 12]])
y_train = np.array([20, 50, 30, 70, 60])

# fit_intercept defaults to True; with fit_intercept=False no intercept is
# fitted (the original author notes the score is then slightly lower).
model = LinearRegression(fit_intercept=True)

# fit(X, y, sample_weight=None): X is the training data, y the target values,
# sample_weight optional individual weights for each sample.
model.fit(x_train, y_train)
print(model.coef_)  # coefficients w
print(model.intercept_)  # intercept b
print(model.score(x_train, y_train))  # R^2 score on the training data

# The `normalize` argument was removed from LinearRegression in
# scikit-learn 1.2, so passing normalize=False (its former default) now raises
# TypeError. To experiment with feature scaling, put a
# sklearn.preprocessing.StandardScaler in front of the estimator instead.
model = LinearRegression()

# Show how the two input features relate to each other.
plt.scatter(x_train[:, 0], x_train[:, 1])
plt.show()

model.fit(x_train, y_train)
print(model.coef_)  # coefficients w
w = model.coef_
print(model.intercept_)  # intercept b
b = model.intercept_
print(model.score(x_train, y_train))  # R^2 score on the training data

# With two features a prediction is the dot product of a sample with the
# coefficient vector plus the intercept; an elementwise `x * w` is not a
# prediction. Plot predicted against actual targets, by sample index.
x = np.arange(len(y_train))
y = x_train @ w + b
plt.scatter(x, y_train)  # actual
plt.plot(x, y)  # predicted
plt.show()

# Predict a single sample.
print(model.predict([[5, 9]]))