多元线性回归 回归诊断 方差分析 功效分析 广义线性模型 logistics回归 主成分分析 因子分析 购物篮分析
多元线性回归
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
#将矩阵转换为数据框
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
summary(fit)
coef(fit)
library(car)
qqPlot(fit, labels=row.names(states), id.method="identify",
simulate=TRUE, main="Q-Q Plot")
#Mutiple linear regression with a significant interaction term
fit <- lm(mpg ~ hp + wt + hp:wt, data=mtcars)
summary(fit)#查看拟合结果
#说明马力和车重的交互项显著
library(effects)
plot(effect("hp:wt", fit,, list(wt=c(2.2, 3.2, 4.2))), multiline=TRUE)
#AIC
fit1 <- lm (Murder ~ Population+Illiteracy+Income+Frost,data=states)
fit2 <- lm (Murder ~ Population+Illiteracy,data=states)
AIC(fit1,fit2)
#使用AIC来检测模型,第一个模型包含四个自变量,第二个模型包含两个自变量
#Backward stepwise selection
library(MASS)
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost,
data=states)
stepAIC(fit, direction="backward")
#All subsets regression
library(leaps)
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
leaps <-regsubsets(Murder ~ Population + Illiteracy + Income +
Frost, data=states, nbest=4)
plot(leaps, scale="adjr2")
回归诊断
opar <- par(no.readonly=TRUE)
fit <- lm(weight ~ height, data=women)
par(mfrow=c(2,2))
plot(fit)#生成评价拟合模型的四幅图
par(opar)#对绘图选项进行限制
fit2 <- lm(weight ~ height + I(height^2), data=women)
opar <- par(no.readonly=TRUE)
par(mfrow=c(2,2))#横排两张竖排两张
plot(fit2)
#第一幅图是残差与拟合的图,用来表示因变量与自变量是否呈线性关系
#图中的点是残差分布,曲线是拟合曲线
#第二幅图用来描述正态性,正态分布情况下,应该是一条直线
#第三幅图是位置与尺寸图,用来描述同方差性,如果满足方差不变,图中的点
#随机分布
#第四幅图是用来判断离群点,高杠杆点
par(opar)
方差分析
?aov
#ANOVA :Analysis of Variance
#研究对结果有影响的变量
library(multcomp)
attach(cholesterol)
?cholesterol
table(trt)
aggregate(response, by=list(trt), FUN=mean)
#分组统计,可以看出药物E的效果最好
#aggregate(response, by=list(trt), FUN=sd)
fit <- aov(response ~ trt,data =cholesterol)
summary(fit)
fit.lm <- lm(response~trt,data=cholesterol)
plot(fit)
#One-way ANCOVA
data(litter, package="multcomp")
attach(litter)
table(dose)
aggregate(weight, by=list(dose), FUN=mean)
#分组统计平均数
fit <- aov(weight ~ gesttime + dose)
#使用aov分析因变量weight与协变量gesttime与自变量dose的关系
summary(fit)
#Two way ANOVA
attach(ToothGrowth)
table(supp,dose)
aggregate(len, by=list(supp,dose), FUN=mean)
aggregate(len, by=list(supp,dose), FUN=sd)
class(ToothGrowth$dose)
dose <- factor(dose)
fit <- aov(len~ supp*dose,data=ToothGrowth)
summary(fit)
install.packages("HH")
library(HH)
interaction.plot(dose, supp, len, type="b",
col=c("red","blue"), pch=c(16, 18),
main = "Interaction between Dose and Supplement Type")
#使用HH包中的interaction函数对结果可视化
#One-way MANOVA
library(MASS)
attach(UScereal)
shelf <- factor(shelf)#将shelf列转换为因子
y <- cbind(calories, fat, sugars)
aggregate(y, by=list(shelf), FUN=mean)
cov(y)
fit <- manova(y ~ shelf)
summary(fit)
summary.aov(fit)
功效分析
install.packages("pwr")
library(pwr)
#使用pwr包进行功效分析
#Linear Models
pwr.f2.test(u=3, f2=0.0769, sig.level=0.05, power=0.90)
#0.05显著性水平,0.9的功效
#ANOVA
pwr.anova.test(k=2,f=.25,sig.level=.05,power=.9)
#每一组需要86个样本
#t tests
pwr.t.test(d=.8, sig.level=.05,power=.9, type="two.sample",
alternative="two.sided")
pwr.t.test(n=20, d=.5, sig.level=.01, type="two.sample",
alternative="two.sided")
#Correlations
pwr.r.test(r=.25, sig.level=.05, power=.90, alternative="greater")
#Tests of proportions
pwr.2p.test(h=ES.h(.65, .6), sig.level=.05, power=.9,
alternative="greater")
#Chi-square tests
prob <- matrix(c(.42, .28, .03, .07, .10, .10), byrow=TRUE, nrow=3)
ES.w2(prob)
pwr.chisq.test(w=.1853, df=3 , sig.level=.05, power=.9)
广义线性模型
?glm
#library(glm)
#glm
#getwd()
#setwd("D:\\backup\\Ln\\RIA\\RData")
#使用glm进行广义线性模型分析
data(breslow.dat, package="robust")
names(breslow.dat)
summary(breslow.dat[c(6, 7, 8, 10)])
attach(breslow.dat)
#fit regression
fit <- glm(sumY ~ Base + Age + Trt, data=breslow.dat, family=poisson(link="log"))
summary(fit)
#interpret model parametersR
coef(fit)
exp(coef(fit))
Logistics回归
#使用连续型或类别型变量来预测二值型变量
data(Affairs, package="AER")
#install.packages("AER")
summary(Affairs)
table(Affairs$affairs)
#使用summary和table简单统计分析affair
prop.table(table(Affairs$affairs))#只统计affair这一项
prop.table(table(Affairs$gender))#统计gender项
#create binary outcome variable
Affairs$ynaffair[Affairs$affairs > 0] <- 1
#将年度出轨次数中大于0的数据赋值为1
Affairs$ynaffair[Affairs$affairs == 0] <- 0
#将年度出轨次数中大于0的数据赋值为0
head(Affairs)
Affairs$ynaffair <- factor(Affairs$ynaffair,
levels=c(0,1),
labels=c("No","Yes"))
#将affair转换为因子,其中只取年度出轨数据,数据的水平为0和1,其中
#0是no,1是yes
table(Affairs$ynaffair)
#fit full model
attach(Affairs)
fit <- glm(ynaffair ~ gender + age + yearsmarried + children +
religiousness + education + occupation +rating,
data=Affairs,family=binomial())
#Affairs$ynaffair
#对年度出轨数据进行拟合,ynaffair为因变量,后面的为自变量
summary(fit)
#结果可以看到P值后面的*,*表示这个变量对affair影响是否显著
#fit reduced model
fit1 <- glm(ynaffair ~ age + yearsmarried + religiousness +
rating, data=Affairs, family=binomial())
summary(fit1)
#在新模型中,每一个因素的影响都非常显著
#compare models
anova(fit, fit1, test="Chisq")
#使用卡方检验,证明了两次检验结果差别不大,那么我们就可以精简变量了
#interpret coefficients
coef(fit1)
#使用coef算出回归系数,其他预测变量不变时,1单位预测变量的变化可能引起
#响应变化对数优势比的变化
exp(coef(fit1))#由于对其使用了对数优势比,那么我们这里将其指数运算
#回到了正常模式
#结果可以看到,婚龄增加一年婚外情优势比增加1.11,年龄增加一年
#婚外情优势比增加0.97
#calculate probability of extramariatal affair by marital ratings
testdata <- data.frame(rating = c(1, 2, 3, 4, 5),
age = mean(Affairs$age),
yearsmarried = mean(Affairs$yearsmarried),
religiousness = mean(Affairs$religiousness))
#使用测试数据集预测相应概率
testdata$prob <- predict(fit1, newdata=testdata, type="response")
testdata
#calculate probabilites of extramariatal affair by age
testdata <- data.frame(rating = mean(Affairs$rating),
age = seq(17, 57, 10),
yearsmarried = mean(Affairs$yearsmarried),
religiousness = mean(Affairs$religiousness))
testdata$prob <- predict(fit1, newdata=testdata, type="response")
testdata
主成分分析
#Principal components analysis of US Judge Ratings
library(psych)
#主成分分析,将大量相关变量转换为一组很少的不相关变量
head(USJudgeRatings)
fa.parallel(USJudgeRatings,fa="pc",n.iter = 100)
#作pc分析,循环100次,绘制碎石图
#碎石图用来确定使用几个因子比较恰当
pc <- principal(USJudgeRatings, nfactors=1)
pc
#使用principal进行分析,nfactors指定主成分数
#Principal components analysis Score
pc <- principal(USJudgeRatings,nfactors = 1,scores = TRUE)
pc$scores#获得每个变量的得分
#Principal components analysis Harman23.cor data
fa.parallel(Harman23.cor$cov, n.obs=302, fa="pc", n.iter=100,
show.legend=FALSE, main="Scree plot with parallel analysis")
#继续绘制碎石图,n.obs表示样本大小
#Principal components analysis of body measurements
library(psych)
PC <- principal(Harman23.cor$cov, nfactors=2, rotate="none")
PC#nfactors=2表示有两个主成分
#Principal components analysis with varimax rotation
rc <- principal(Harman23.cor$cov, nfactors=2, rotate="varimax")
rc#主成分的旋转
因子分析
#因子分析法,本质上用来降维
options(digits=2)
library(psych)
covariances <- ability.cov$cov
#convert covariances to correlations
correlations <- cov2cor(covariances)
correlations
#determine number of factors to extract
fa.parallel(correlations, n.obs=112, fa="both", n.iter=100,
main="Scree plots with parallel analysis")
#判断提取因子数
#Principal axis factoring without rotation
fa <- fa(correlations, nfactors=2, rotate="none", fm="pa")
fa
#Factor extraction with orthogonal rotation
fa.varimax <- fa(correlations, nfactors=2, rotate="varimax", fm="pa")
fa.varimax
#Listing Factor extraction with oblique rotation
fa.promax <- fa(correlations, nfactors=2, rotate="promax", fm="pa")
fa.promax
#plot factor solution
factor.plot(fa.promax, labels=rownames(fa.promax$loadings))
fa.diagram(fa.promax, simple=FALSE)
#factor scores
fa <- fa(correlations,nfactors=2,rotate="none",fm="pa",score=TRUE)
fa.promax$weights
购物篮分析
install.packages("arules")
library(arules)
data(Groceries)
Groceries#内置数据集
inspect(Groceries)#查看数据集内容
fit <- apriori(Groceries,parameter = list(support=0.01,confidence=0.5))
#使用apriori进行建模,最小支持度support为0.01,最小置信度confidence为0.5
summary(fit)
inspect(fit)