大数据高级筛选与合并--C++实现与Python实现

2021-09-07 02:12:51

需求描述：

从大量相同格式的源文件中筛选出符合同一个条件的数据，整合到一张工作表中。

具体源文件格式包括.csv，.txt，拥有相同的前缀名。

如下所示，.txt源文件中的目标数据包含非数字元素，.csv的同类数据存放在单元格中，因此需要判断数据类型：

python脚本实现方式：

判断非数字函数：

 1 def is_number(s):
 2     try:
 3         float(s)
 4         return True
 5     except ValueError:
 6         pass
 7  
 8     try:
 9         import unicodedata
10         unicodedata.numeric(s)
11         return True
12     except (TypeError, ValueError):
13         pass
14  
15     return False

遍历目标路径文件列表，对文件后缀正则分类：

1 from pathlib import Path
2 folder_path=r'D:\visual studio\Code\test'
3 path=Path(folder_path)
4 txt_file_list = [os.path.join(folder_path, file) for file in os.listdir(folder_path) ]

对于单一文件类型列表可以在listdir后加endswith限制条件：

1 txt_file_list = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if os.path.join(folder_path, file).endswith('.*')]

对txt_file_list中的对象进行遍历，并提取后缀：

1     OriginName=os.path.split(txt_file)[1]
2     TransName=OriginName.split('.')
3     L_Trans=len(TransName)
4     TypeName=TransName[L_Trans-1]
5     print(OriginName)
6     print(TypeName)
7 
8     work_book=xlwt.Workbook(encoding='utf-8') #统一编码方式，否则后续可能无法识别
9     sheet=work_book.add_sheet('temp_'+TransName[0])  #临时sheet命名

对.txt和.csv分别进行处理，统一生成temp_name.csv临时文件：

 1     if TypeName=='txt':
 2         workbook=xlwt.Workbook()
 3         sheet1=workbook.add_sheet('name_'+TransName[0],cell_overwrite_ok=True)
 4         row0=[u"Gene.refGene",u"gnomAD_genome_ALL"] #筛选目标列
 5         for i in range(0,len(row0)):
 6             sheet1.write(0,i,row0[i])
 7     
 8         index=1
 9         f=open(txt_file)
10         for line in f:
11             data=line.strip('\n').split('\t')
12             sheet1.write(index,0,data[6])
13             sheet1.write(index,1,data[38])
14             index=index+1
15         csv_file_name=TransName[0]+'.csv'
16         workbook.save(path.joinpath(csv_file_name))
17         sheet=pd.read_excel(path.joinpath(csv_file_name),sheet_name=0)
18         os.remove(os.path.join(path,csv_file_name))
19     elif TypeName=='csv':
20         sheet=pd.read_excel(path.joinpath(OriginName),sheet_name=0)
21     else:
22         ... # continue

继续使用loc函数和lambda表达式进一步按条件筛选出DataFrame：

1     DataFrame1 = sheet.copy()
2     DataFrame1 = DataFrame1[['Gene.refGene', 'gnomAD_genome_ALL']]
3     DataFrame1=DataFrame1.loc[DataFrame1['gnomAD_genome_ALL'].apply(lambda x:is_number(x))]
4     DataFrame1=DataFrame1.loc[DataFrame1['gnomAD_genome_ALL'].apply(lambda x:0<=float(x)<=0.05)]
5     DataTemp=DataFrame1.reset_index(drop=True)  #忽略表头，重置索引
6 #   os.remove(os.path.join(path,csv_file_name))  #删除临时文件

进行一些后续的频数统计之后使用concat或merge聚合DataFrame：

1     DataTemp=pd.concat([GenNameRes,freq],axis=1,join_axes=[freq.index])
2     Res=pd.concat([Res,DataTemp],axis=1, join='outer')

数据拼接方法详见：

python数据拼接: pd.concat - boobo - 博客园 (cnblogs.com)

最后生成结果表格：

1 Res.to_excel(path.joinpath('result.xlsx'),index=False,encoding='utf-8')

完整代码：

 1 import xlwt
 2 import csv
 3 import os
 4 import pandas as pd
 5 from pandas.core.frame import DataFrame
 6 import numpy as np
 7 from pathlib import Path
 8 
 9 def is_number(s):
10     try:
11         float(s)
12         return True
13     except ValueError:
14         pass
15  
16     try:
17         import unicodedata
18         unicodedata.numeric(s)
19         return True
20     except (TypeError, ValueError):
21         pass
22  
23     return False
24 
25 Res=DataFrame([])
26 
27 folder_path=r'D:\visual studio\Code\test'
28 path=Path(folder_path)
29 txt_file_list = [os.path.join(folder_path, file) for file in os.listdir(folder_path) ]
30 
31 for txt_file in txt_file_list:
32     OriginName=os.path.split(txt_file)[1]
33     TransName=OriginName.split('.')
34     L_Trans=len(TransName)
35     TypeName=TransName[L_Trans-1]
36     print(OriginName)
37     print(TypeName)
38     
39     work_book=xlwt.Workbook(encoding='utf-8')
40     sheet=work_book.add_sheet('temp_'+TransName[0])
41     
42     if TypeName=='txt':
43         workbook=xlwt.Workbook()
44         sheet1=workbook.add_sheet('name_'+TransName[0],cell_overwrite_ok=True)
45         row0=[u"Gene.refGene",u"gnomAD_genome_ALL"]
46         for i in range(0,len(row0)):
47             sheet1.write(0,i,row0[i])
48     
49         index=1
50         f=open(txt_file)
51         for line in f:
52             data=line.strip('\n').split('\t')
53             sheet1.write(index,0,data[6])
54             sheet1.write(index,1,data[38])
55             index=index+1
56         csv_file_name=TransName[0]+'.csv'
57         workbook.save(path.joinpath(csv_file_name))
58         sheet=pd.read_excel(path.joinpath(csv_file_name),sheet_name=0)
59         os.remove(os.path.join(path,csv_file_name))
60     elif TypeName=='csv':
61         sheet=pd.read_excel(path.joinpath(OriginName),sheet_name=0)
62     else:
63         continue
64 
65     DataFrame1 = sheet.copy()
66     DataFrame1 = DataFrame1[['Gene.refGene', 'gnomAD_genome_ALL']]
67     DataFrame1=DataFrame1.loc[DataFrame1['gnomAD_genome_ALL'].apply(lambda x:is_number(x))]
68     DataFrame1=DataFrame1.loc[DataFrame1['gnomAD_genome_ALL'].apply(lambda x:0<=float(x)<=0.05)]
69     DataTemp=DataFrame1.reset_index(drop=True)
70 #   os.remove(os.path.join(path,csv_file_name))  
71     
72     freq=[]
73     GenNameRes=[]
74     colName=DataFrame1['Gene.refGene']
75     colName=colName.reset_index(drop=True)
76 
77     dict={}
78     for key in colName:
79         dict[key]=dict.get(key, 0)+1
80     
81     for key in colName:
82         if key not in GenNameRes:
83             GenNameRes.append(key)
84             freq.append(str(dict[key]))
85         else:
86             continue
87 
88     GenNameRes=DataFrame(GenNameRes)
89     freq=DataFrame(freq)
90     GenNameRes.columns=["Gene.refGene_"+TransName[0]]
91     freq.columns=["Frequency_"+TransName[0]] 
92 
93     DataTemp=pd.concat([GenNameRes,freq],axis=1,join_axes=[freq.index])
94     Res=pd.concat([Res,DataTemp],axis=1, join='outer')
95 Res.to_excel(path.joinpath('result.xlsx'),index=False,encoding='utf-8')
96 Res

View Code

C++实现：

#include <iostream> 
#include <string>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <vector>
#include <cstring>
#include <algorithm>
#include <unordered_map>

using namespace std;

struct GeneData{
    string GeneRef;
    float gnom[8];
    int Times;
};
struct Testgen{
    int gen1;
    int gen2;
    string genref;
};
int main(){
    ofstream outfile;

    for(int i=1;i<=3;i++){
        string path="./Test";
        path+=('0'+i);
        path+=".csv";
        cout<<path<<endl;
        
    //string path="./Test.csv";
            ifstream ifs;
            ifs.open(path,ios::in);
            if(!ifs.is_open()){
                cout<<"open failed";
                system("pause");
            }

            string line;

            vector<GeneData>GenVec;//1
            vector<Testgen>TestgenVec;//2
            vector<string>Vstr;
            unordered_map<string,int>M;

            int CNT=1;
            getline(ifs,line);
            while(getline(ifs,line)){
                if(CNT==1){
                    CNT++;
                    continue;
                }
                //cout<<line<<endl;
                stringstream ss(line);
                string str;
                GeneData Gen;
                Testgen TestGen;

                bool flag=true;
                int cnt=1;
                // while(cnt<=190){
                //     getline(ss,str,',');
                //     if(cnt==7){
                //         Gen.GeneRef=stod(str);
                //     }
                //     if(cnt<=46 || cnt>=39){
                //         Gen.gnom[cnt-39]=stof(str);
                //         if(stof(str)>0.05){
                //             flag=false;
                //         }
                //     }
                //     cnt++;
                // }

                // if(flag==true){
                //     GenVec.push_back(Gen);
                // }
                while(cnt<=4){
                    getline(ss,str,','); //.txt文件为getline(ss,str,' ')
                    if(cnt==2){
                        TestGen.gen1=stoi(str);
                        if(TestGen.gen1<0)flag=false;
                    }
                    if(cnt==3){
                        TestGen.gen2=stoi(str);
                        if(TestGen.gen2<0)flag=false;
                    }
                    if(cnt==4)TestGen.genref=str;
                    cnt++;
                }
                
                if(flag==true){
                    TestgenVec.push_back(TestGen);
                }
                CNT++;
            }

            for(auto x:TestgenVec){
                M[x.genref]++;
            }

            string res_path="./Res";
            res_path+=('0'+i);
            res_path+=".csv";
            cout<<res_path<<endl;
            
            if(i==1){
                outfile.open("./Res.csv",ios::out);
            }
            outfile<<"chart"<<i<<endl;
            outfile<<"genref"<<','<<"Frequency"<<endl;//<<"gen1"<<','<<"gen2"<<','
            for(auto x:TestgenVec){
                outfile<<x.genref<<','<<M[x.genref]<<endl;
            }//<<x.gen1<<','<<x.gen2<<','
            // for(auto x:Vstr){
            //     outfile<<x<<endl;
            //     cout<<x<<endl;
            // }
            if(i==3){
                outfile.close();
            }
    }
    //ifstream infile("./Test.csv",ios::in);
    
    // for(auto x:GenVec){
    //     x.Times=M[x.GeneRef];
    //     cout<<x.Times<<endl;
    // }


    system("pause");
    return 0;    
}

非注释代码为测试代码，测试用csv文件仅使用简化数据类型：

运行结果为：

C++实现方法有待优化文件编码方式；

其他实现方式请参考：

C++实现读取CSV文件数据将进行计算。_Liuxm-CSDN博客_c++读取csv文件

【C++】读取 .csv / .xlsx 文件中的指定数据(非常实用)_小朋友-CSDN博客_c++读取csv文件的某一列数据

码农公寓

相关文章