1. Multiple linear regression
states <- as.data.frame(state.x77[,c("Murder", "Population","Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
summary(fit)
coef(fit)       # regression coefficients
confint(fit)    # confidence intervals for the model parameters (95% by default)
fitted(fit)     # fitted (predicted) values of the model
residuals(fit)  # residuals of the fitted model
rstandard(fit)  # standardized residuals
rstudent(fit)   # studentized residuals (SREi)
AIC(fit)        # Akaike Information Criterion
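As a quick overall check of the fit, base R's plot() method for lm objects draws the four standard diagnostic plots (residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage); a minimal sketch supplementing the step-by-step checks below, not part of the original workflow:
par(mfrow = c(2, 2))  # 2x2 plotting layout
plot(fit)             # four diagnostic plots for the fitted model
par(mfrow = c(1, 1))  # restore the default layout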
2. Testing the basic assumptions
library(car)
library(ggpubr)
res = rstudent(fit)  # studentized residuals
ggqqplot(res, color = '#E7B800')  # Q-Q plot to check normality of the residuals
durbinWatsonTest(fit) # p > 0.05 suggests the errors are independent (no autocorrelation)
ncvTest(fit) # p > 0.05 suggests constant error variance (homoscedasticity)
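Besides the Q-Q plot, the normality and linearity assumptions can also be checked directly; a minimal sketch using the Shapiro-Wilk test from base R and component + residual plots from car (these checks are additions to the original notes):
shapiro.test(res)  # p > 0.05: studentized residuals are consistent with normality
crPlots(fit)       # component + residual plots (car) to check linearity of each predictor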
3. Unusual observations
outlierTest(fit) # if not significant, the data set contains no outliers; detected outliers are usually removed
High-leverage points: hat values hii greater than 2 or 3 times the average hat value (p+1)/n
hii = hatvalues(fit)  # hat values (leverage) for each observation
high_leverage_point = hii[hii >= 2 * (4+1)/length(hii)]  # here p = 4 predictors, n = length(hii)
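To see where the high-leverage states sit, the hat values can be plotted against the 2x and 3x cutoffs; a minimal sketch in base R (this plotting step is an addition to the original notes):
plot(hii, ylab = "Hat values")            # index plot of leverage
abline(h = c(2, 3) * mean(hii), lty = 2)  # cutoff lines at 2x and 3x the average hat value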
Influential points: Cook's distance Di > 4/(n-p-1), or Di > 1; observations with strong influence are removed
Di = cooks.distance(fit)
Di[Di > 1]  # observations with Cook's distance greater than 1
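The stricter 4/(n-p-1) cutoff mentioned above can be checked as well; a minimal sketch (the cutoff computation and plot are additions to the original code):
k <- length(coef(fit)) - 1            # number of predictors p
cutoff <- 4 / (nrow(states) - k - 1)  # 4/(n-p-1)
Di[Di > cutoff]                       # influential observations by this rule
plot(fit, which = 4)                  # Cook's distance plot (labels the three largest)
abline(h = cutoff, lty = 2)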
4. Multicollinearity
vif(fit) # VIF > 4: multicollinearity; VIF > 10: severe multicollinearity
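Another commonly used rule of thumb flags predictors whose square root of VIF exceeds 2; a minimal sketch (this check is an addition to the original notes):
sqrt(vif(fit)) > 2  # TRUE indicates a multicollinearity problem for that predictor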
5. Variable selection: stepwise regression
library(MASS)
AIC(fit) # AIC of the full model, as a baseline
stepAIC(fit, direction = 'both') # stepwise selection in both directions, minimizing AIC
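All-subsets regression is an alternative to stepwise selection; a minimal sketch assuming the leaps package is installed (not part of the original notes):
library(leaps)
leaps_fit <- regsubsets(Murder ~ Population + Illiteracy + Income + Frost,
                        data = states, nbest = 4)  # best subsets of each size
plot(leaps_fit, scale = "adjr2")                   # compare subsets by adjusted R-squared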
6. The final model
states = states[-grep('Nevada', rownames(states)),]  # drop Nevada, the outlier flagged in step 3
final_fit = lm(Murder~Population+Illiteracy, data = states)
summary(final_fit)
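To confirm that dropping Income and Frost does not significantly worsen the fit, the full model can be refit on the reduced data and compared with a nested-model F test; a minimal sketch (this comparison is an addition to the original notes):
full_fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
anova(final_fit, full_fit)  # non-significant F supports the two-predictor model
AIC(final_fit, full_fit)    # the model with the lower AIC is preferred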
7. Prediction on new data
df = data.frame(Population = c(360, 2200),
                Illiteracy = c(1.4, 1.9))
myfunction = function(x1){
  sum(coef(final_fit)[2:length(coef(final_fit))] * x1) + coef(final_fit)[1]
}
apply(df, 1, myfunction) # predictions for the new data set
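The same predictions can be obtained more idiomatically with predict(); a minimal sketch (this call is an addition to the original code):
predict(final_fit, newdata = df)                           # point predictions
predict(final_fit, newdata = df, interval = "prediction")  # with prediction intervals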