文章目录
GSE37815数据下载到表达矩阵
GSE37815数据下载
library(GEOquery)
gset = getGEO('GSE37815',destdir = '.',getGPL = F,
AnnotGPL = T)
gset = gset[[1]]
expr = exprs(gset) # 表达矩阵
pdata = pData(gset) # 样本信息
gset@annotation # 查看芯片平台
getGEO包下载的探针注释文件不全,需要在GEO网站下载
probe = read.table(file = 'GPL6102-11574.txt',
sep = '\t',
quote = '',
comment.char = '#', # 过滤掉'GPL6102-11574.txt'文件中以‘#’开头的注释信息
header = T,
fill = T, # 如果文件中某行的数据少于其他行,则自动添加空白域。
stringsAsFactors = F) # 字符串不改为因子
ids = probe[probe$Symbol != '',
c(1,13)] # 提取探针和geneID
筛选探针
library(dplyr)
colnames(ids)
expr = as.data.frame(expr)
expr$ID = rownames(expr)
#ids = ids[-grep('///',ids$Gene.Symbol),] # 一个探针对应多个基因,去除
exprSet = inner_join(ids,expr,by = 'ID') # 探针没有对应基因,去除
library(limma)
exprSet= avereps(exprSet[,-c(1,2)], # 多个探针对应一个基因,取均值
ID = exprSet$Symbol)
exprSet = as.data.frame(exprSet)
pdf(file = 'rowbox.pdf')
p <- boxplot(exprSet,outline=FALSE,las=2,col = 'blue',xaxt = 'n',ann = F)
title(main = list('Before normalization',cex = 2 ,font = 2),
xlab = list('Sample list',cex = 1.5,font = 2),
ylab = '',line = 0.7)
mtext('Expression value',side = 2,padj = -3,font = 2,cex = 1.5)
dev.off()
分位数标准化预处理
library(limma)
normalized_expr = normalizeBetweenArrays(exprSet) # 分位数标准化 method默认为quantile
#rt=log2(rt) 有时还需log2变换
pdf(file = 'normalized_box.pdf')
p1 <- boxplot(normalized_expr,outline=FALSE,las=2,col = 'red',xaxt = 'n',ann = F)
title(main = list('Normalization',cex = 2 ,font = 2),
xlab = list('Sample list',cex = 1.5,font = 2),
ylab = '',line = 0.7)
mtext('Expression value',side = 2,padj = -3,font = 2,cex = 1.5)
dev.off()
分组
group_list = pdata$title
#table(pdata$title) # 查看样品信息
control = normalized_expr[,grep('Control',group_list)]
tumor = normalized_expr[,grep('cancer',group_list)]
exprSet1 = cbind(tumor,control)
group_list = c(rep('tumor',ncol(tumor)),
rep('normal',ncol(control)))
差异表达
表达矩阵
data = exprSet1
分组矩阵
group_list = factor(group_list)
design <- model.matrix( ~0 + group_list)
colnames( design ) = levels(group_list)
rownames( design ) = colnames(data)
contrast.matrix <- makeContrasts( "tumor-normal", levels = design)
差异表达矩阵
fit <- lmFit( data, design )
fit2 <- contrasts.fit( fit, contrast.matrix )
fit2 <- eBayes( fit2 )
allDiff=topTable(fit2,adjust='fdr',number=200000)
write.table(allDiff,file="alldiff.xls",sep="\t",quote=F)
按照logFC排序
allLimma=allDiff
allLimma=allLimma[order(allLimma$logFC),]
allLimma=rbind(Gene=colnames(allLimma),allLimma)
write.table(allLimma,file="GSE37815_limmaTab.txt",sep="\t",quote=F,col.names=F)
保存差异表达矩阵
logFoldChange=1
adjustP=0.05
diffSig <- allDiff[with(allDiff, (abs(logFC)>logFoldChange & adj.P.Val < adjustP )), ]
write.table(diffSig,file="diff.xls",sep="\t",quote=F)
diffUp <- allDiff[with(allDiff, (logFC>logFoldChange & adj.P.Val < adjustP )), ]
write.table(diffUp,file="up.xls",sep="\t",quote=F)
diffDown <- allDiff[with(allDiff, (logFC<(-logFoldChange) & adj.P.Val < adjustP )), ]
write.table(diffDown,file="down.xls",sep="\t",quote=F)
hmExp=data[rownames(diffSig),]
diffExp=rbind(id=colnames(hmExp),hmExp)
write.table(diffExp,file="diffExp.txt",sep="\t",quote=F,col.names=F
火山图
xMax=max(-log10(allDiff$adj.P.Val))
yMax=max(abs(allDiff$logFC))
library(ggplot2)
allDiff$change <- ifelse(allDiff$adj.P.Val < 0.05 & abs(allDiff$logFC) > 1,
ifelse(allDiff$logFC > 1,'UP','DOWN'),
'NOT')
table(allDiff$change)
pdf(file = 'volcano.pdf')
ggplot(data= allDiff, aes(x = -log10(adj.P.Val), y = logFC, color = change)) +
geom_point(alpha=0.8, size = 1) +
theme_bw(base_size = 15) +
theme(plot.title=element_text(hjust=0.5), # 标题居中
panel.grid.minor = element_blank(),
panel.grid.major = element_blank()) + # 网格线设置为空白
geom_hline(yintercept= 0 ,linetype= 2 ) +
scale_color_manual(name = "",
values = c("red", "green", "black"),
limits = c("UP", "DOWN", "NOT")) +
xlim(0,xMax) +
ylim(-yMax,yMax) +
labs(title = 'Volcano', x = '-Log10(adj.P.Val)', y = 'LogFC')
dev.off()
热图,按p值从小到大筛选前100个差异基因(|logFC| > 1)
top100exp = exprSet1[rownames(diffSig)[1:100],] # 按P值大小排序,选择前100个
annotation_col = data.frame(group = group_list)
rownames(annotation_col) = colnames(exprSet1)
library(pheatmap)
pdf(file = 'heatmap.pdf')
pheatmap(top100exp,annotation_col = annotation_col,
color = colorRampPalette(c("green", "black", "red"))(50),
fontsize = 5)
dev.off()