2018年SCI论文--整合GEO数据挖掘完整复现 三 :差异表达(GSE37815)

文章目录

论文地址

GSE37815数据下载到表达矩阵

GSE37815数据下载

library(GEOquery)
  gset = getGEO('GSE37815',destdir = '.',getGPL = F,
                AnnotGPL = T)
  gset = gset[[1]]  
  expr = exprs(gset)        # 表达矩阵
  pdata = pData(gset)       # 样本信息
  gset@annotation           # 查看芯片平台

getGEO包下载的探针注释文件不全,需要在GEO网站下载

probe = read.table(file = 'GPL6102-11574.txt',
                     sep = '\t',
                     quote = '',
                     comment.char = '#', # 过滤掉'GPL6102-11574.txt'文件中以‘#’开头的注释信息
                     header = T,
                     fill = T,           #  如果文件中某行的数据少于其他行,则自动添加空白域。     
                     stringsAsFactors = F) # 字符串不改为因子

ids = probe[probe$Symbol != '',
              c(1,13)] # 提取探针和geneID

筛选探针

library(dplyr)
  colnames(ids)
  expr = as.data.frame(expr)
  expr$ID = rownames(expr)
  #ids = ids[-grep('///',ids$Gene.Symbol),]      # 一个探针对应多个基因,去除
  exprSet = inner_join(ids,expr,by = 'ID')      #  探针没有对应基因,去除

library(limma)
  exprSet= avereps(exprSet[,-c(1,2)],           # 多个探针对应一个基因,取均值
                   ID = exprSet$Symbol)
  exprSet = as.data.frame(exprSet)

pdf(file = 'rowbox.pdf')
  p <- boxplot(exprSet,outline=FALSE,las=2,col = 'blue',xaxt = 'n',ann = F)
  title(main = list('Before normalization',cex = 2 ,font = 2),
        xlab = list('Sample list',cex = 1.5,font = 2),
        ylab = '',line = 0.7)
  mtext('Expression value',side = 2,padj = -3,font = 2,cex = 1.5)
  dev.off()

2018年SCI论文--整合GEO数据挖掘完整复现 三 :差异表达(GSE37815)

分位数标准化预处理

library(limma)
  normalized_expr = normalizeBetweenArrays(exprSet) # 分位数标准化 method默认为quantile
  #rt=log2(rt) 有时还需log2变换

pdf(file = 'normalized_box.pdf')
  p1 <- boxplot(normalized_expr,outline=FALSE,las=2,col = 'red',xaxt = 'n',ann = F)
  title(main = list('Normalization',cex = 2 ,font = 2),
        xlab = list('Sample list',cex = 1.5,font = 2),
        ylab = '',line = 0.7)
  mtext('Expression value',side = 2,padj = -3,font = 2,cex = 1.5)
  dev.off()

2018年SCI论文--整合GEO数据挖掘完整复现 三 :差异表达(GSE37815)

分组

group_list = pdata$title
  #table(pdata$title) # 查看样品信息
  control = normalized_expr[,grep('Control',group_list)]
  tumor   = normalized_expr[,grep('cancer',group_list)]
  exprSet1 = cbind(tumor,control)
  group_list = c(rep('tumor',ncol(tumor)),
                 rep('normal',ncol(control)))

差异表达

表达矩阵

data = exprSet1

分组矩阵

group_list = factor(group_list)
  design <- model.matrix( ~0 + group_list)
  colnames( design ) = levels(group_list)
  rownames( design ) = colnames(data)
  contrast.matrix <- makeContrasts( "tumor-normal", levels = design)

差异表达矩阵

fit <- lmFit( data, design )
  fit2 <- contrasts.fit( fit, contrast.matrix ) 
  fit2 <- eBayes( fit2 )
  allDiff=topTable(fit2,adjust='fdr',number=200000)
  write.table(allDiff,file="alldiff.xls",sep="\t",quote=F)

按照logFC排序

allLimma=allDiff
  allLimma=allLimma[order(allLimma$logFC),]
  allLimma=rbind(Gene=colnames(allLimma),allLimma)
  write.table(allLimma,file="GSE37815_limmaTab.txt",sep="\t",quote=F,col.names=F)

保存差异表达矩阵

logFoldChange=1
  adjustP=0.05
  diffSig <- allDiff[with(allDiff, (abs(logFC)>logFoldChange & adj.P.Val < adjustP )), ]
  write.table(diffSig,file="diff.xls",sep="\t",quote=F)
  diffUp <- allDiff[with(allDiff, (logFC>logFoldChange & adj.P.Val < adjustP )), ]
  write.table(diffUp,file="up.xls",sep="\t",quote=F)
  diffDown <- allDiff[with(allDiff, (logFC<(-logFoldChange) & adj.P.Val < adjustP )), ]
  write.table(diffDown,file="down.xls",sep="\t",quote=F)

hmExp=data[rownames(diffSig),]
  diffExp=rbind(id=colnames(hmExp),hmExp)
  write.table(diffExp,file="diffExp.txt",sep="\t",quote=F,col.names=F

火山图

xMax=max(-log10(allDiff$adj.P.Val))   
  yMax=max(abs(allDiff$logFC))
  library(ggplot2)
  allDiff$change <- ifelse(allDiff$adj.P.Val < 0.05 & abs(allDiff$logFC) > 1,
                           ifelse(allDiff$logFC > 1,'UP','DOWN'),
                           'NOT')
  table(allDiff$change)
  pdf(file = 'volcano.pdf')
  ggplot(data= allDiff, aes(x = -log10(adj.P.Val), y = logFC, color = change)) +
    geom_point(alpha=0.8, size = 1) +
    theme_bw(base_size = 15) +
    theme(plot.title=element_text(hjust=0.5),   #  标题居中
          panel.grid.minor = element_blank(),
          panel.grid.major = element_blank()) + # 网格线设置为空白
    geom_hline(yintercept= 0 ,linetype= 2 ) +
    scale_color_manual(name = "", 
                       values = c("red", "green", "black"),
                       limits = c("UP", "DOWN", "NOT")) +
    xlim(0,xMax) + 
    ylim(-yMax,yMax) +
    labs(title = 'Volcano', x = '-Log10(adj.P.Val)', y = 'LogFC')
  dev.off()

2018年SCI论文--整合GEO数据挖掘完整复现 三 :差异表达(GSE37815)

热图,按p值从小到大筛选前100个差异基因(|logFC| > 1)

top100exp = exprSet1[rownames(diffSig)[1:100],] # 按P值大小排序,选择前100个
  annotation_col = data.frame(group = group_list)
  rownames(annotation_col) = colnames(exprSet1)
  library(pheatmap)
  pdf(file = 'heatmap.pdf')
  pheatmap(top100exp,annotation_col = annotation_col,
           color = colorRampPalette(c("green", "black", "red"))(50),
           fontsize  = 5)
  dev.off()

2018年SCI论文--整合GEO数据挖掘完整复现 三 :差异表达(GSE37815)

上一篇:20201222笔记


下一篇:批处理,修改环境变量path的方法(加环境变量)