目的
- 对不同处理的基因的表达量进行差异分析
- 使用python的scipy包进行t检验
导入统计包
from scipy import stats
单样本的T检验(ttest_1samp)
stats.ttest_1samp(data,1)
两独立样本T检验(ttest_ind)
当两个总体方差相等时,即具有方差齐性,可以直接检验
stats.ttest_ind(data1,data2)
不确定两总体方差是否相等时,应先用levene检验,检验两总体是否具有方差齐性
stats.levene(data1,data2)
如果返回的P值远大于0.05,那我们认为两总体具有方差齐性
如果两总体不具有方差齐性,需要加上参数equal_val并设定为False
stats.ttest_ind(data1,data2,equal_var=False)
配对样本T检验(ttest_rel)
stats.ttest_rel(data1,data2)
表达量进行T检验
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
from scipy import stats
import re
import statsmodels.stats.multitest
import click
import numpy as np
@click.group()
def stat_tools():
'''
This tool is for deal with GSE data,Protein data or Metabolism data '''
def get_pvalue(df,control_list,treat_list):
p_result = stats.ttest_ind(df[control_list],df[treat_list],equal_var=False)
pvalue = p_result.pvalue
return pvalue
@click.command()
@click.option("-i","--input", help="a gene expression matrix file.",type=click.Path(exists=True))
@click.option("-o","--output", help="a gene expression matrix file with pvalue, qvalue and Foldchange.")
@click.option("-c","--control", help="control group name, sample name is like Normal_1 or Normal-1.")
@click.option("-t","--treat", help="Treat group name, sample name is like Tumor_1 or Tumor-1.")
@click.option("-f","--foldchange", default=1.5,help="foldchange to filter DEG result, default 1.5.")
@click.option("-p","--pvalue", default=0.05,help="pvalue to filter DEG result,default 0.05.")
@click.option("--filtered/--nofiltered", default=False,help="if create a gene expression matrix file filter by threshold.")
def Ttest(input,output,control,treat,foldchange,pvalue,filtered):
'''
use T-test to get DEG from GSE data,Protein data or Metabolism data ''' df = pd.read_csv(input,sep="\t",header=0)
control_list = [n for n in df.columns if re.match(control,n)]
treat_list = [n for n in df.columns if re.match(treat,n)]
df["pvalue"] = df.apply(get_pvalue,axis=1,args = (control_list,treat_list))
df["log2FoldChange"] = df[treat_list].mean(axis=1) - df[control_list].mean(axis=1)
df["qvalue"]=statsmodels.stats.multitest.multipletests(df["pvalue"],method='fdr_bh')[1]
df.to_csv(output,sep="\t",index=False,header=True)
if filtered:
filter_name="DEGs_"+output.replace(".xls","")+".FC"+str(foldchange)+"_pvalue"+str(pvalue)+".xls"
df_filter = df.loc[(df["pvalue"] <= pvalue ) & (np.abs(df["log2FoldChange"]) >=np.log2(foldchange))]
df_filter.to_csv(filter_name,sep="\t",index=False,header=True)
stat_tools.add_command(Ttest)
if __name__ == '__main__':
stat_tools()