大数据处理

import re
from pathlib import Path

import pandas as pd
import sys
import os
def clean(paths):
    """Extract ``[name, value, line_no]`` triples from a DataName/DataValue file.

    The file is read whole and scanned line by line.  Every line containing
    ``"DataName"`` opens a new section and becomes its header row; every line
    containing ``"DataValue"`` is appended to the section currently being
    collected.  Spaces in those lines are turned into commas, the zero-based
    line number is prepended, and the text is split on commas.  In each
    resulting row, cell 0 is therefore the line number, cell 1 the
    ``DataName``/``DataValue`` tag and cells 2 onwards the payload.

    Args:
        paths: Path of the single text file to parse.

    Returns:
        A list of ``[data_name, data_value, line_no]`` lists of strings, one
        per non-empty payload cell, where ``data_name`` is the header cell in
        the same column and ``line_no`` is the zero-based line number of the
        value row.  Returns an empty list when nothing matches.
    """
    with open(paths, "r") as f:
        text = f.read()

    # Pass 1: group the interesting lines into sections (header + value rows).
    sections = []
    current = []
    for line_no, line in enumerate(text.split("\n")):
        if "DataName" in line:
            if current:
                sections.append(current)
            # Double spaces collapse to one comma first, then single spaces.
            line = str(line_no) + "," + line.replace("  ", ",").replace(" ", ",")
            current = [line]
        # Deliberately not ``elif``: the check runs on the (possibly rewritten)
        # line, so a line carrying both tags is recorded twice.
        if "DataValue" in line:
            current.append(str(line_no) + "," + line.replace(" ", ","))
    if current:
        sections.append(current)

    # Pass 2: pair every non-empty value cell with the header cell above it.
    return_data = []
    for section in sections:
        rows = [entry.split(",") for entry in section]
        # Rows may be ragged; missing cells count as empty strings, and the
        # header is padded so over-long value rows yield an empty name.
        width = max(len(row) for row in rows)
        header = rows[0] + [""] * (width - len(rows[0]))
        # NOTE(review): the row directly after the header (normally the first
        # DataValue line of the section) is never emitted.  This mirrors the
        # long-standing behaviour; confirm it is intended and not an
        # off-by-one before changing it.
        for row in rows[2:]:
            for col in range(2, width):
                if col < len(row) and row[col]:
                    return_data.append([header[col], row[col], row[0]])

    return return_data

def fileclean(inputdirs, outputfile):
    """Parse every ``.csv`` under *inputdirs* and write one combined CSV.

    Each ``.csv`` file found by walking *inputdirs* is parsed with ``clean``.
    Every triple it returns becomes one output row, enriched with metadata
    taken from the containing folder name and from the file name.

    The folder name is split on ``"_"`` and its first seven fields are used
    as PROJECT_TYPE, PRODUCT, PRODUCT_VERSION, LOT, TEST_ITEM, TEST_NODE and
    FACTORY.  File names are expected to look like
    ``ICES5V [A213911020-T1D1(17) ; 11_7_2021 10_31_26 PM].csv``:

    * TEST_NAME is the text before the first space,
    * WAFER_ID is the text between ``[`` and the first ``-``,
    * TEST_PIPELINE is the text between that ``-`` and ``(``,
    * DEVICES_TIME is the text between ``;`` and ``]``, with the first two
      underscores read as ``/`` and the remaining ones as ``:``.

    Args:
        inputdirs: Root directory to walk recursively.
        outputfile: Path of the CSV file to write.  Nothing is written when
            no data rows were collected.

    Raises:
        IndexError: If a ``.csv`` file name does not match the expected
            pattern, or a folder holding data has fewer than seven
            ``_``-separated fields in its name.
    """
    # Raw strings: "\[" and "\(" are invalid escapes in ordinary literals.
    time_pattern = re.compile(r";(.*?)]")
    wafer_pattern = re.compile(r"\[(.*?)-")
    pipeline_pattern = re.compile(r"-(.*?)\(")

    datas = []
    for folderName, _subfolders, filenames in os.walk(inputdirs):
        # Last path component, independent of the platform's separator and
        # of a trailing separator on the input path.
        parent = os.path.basename(os.path.normpath(folderName))
        parent_s = parent.split("_")
        for filename in filenames:
            if not filename.endswith(".csv"):
                continue
            paths = os.path.join(folderName, filename)
            # e.g. " 11_7_2021 10_31_26 PM" -> "11/7/2021 10:31:26 PM"
            Devices_time = (
                time_pattern.findall(filename)[0]
                .replace("_", "/", 2)
                .replace("_", ":")
                .strip()
            )
            test_name = filename.split(" ")[0]
            Wafer_id = wafer_pattern.findall(filename)[0]
            test_pipeline = pipeline_pattern.findall(filename)[0]
            for data_name, data_value, line_no in clean(paths=paths):
                datas.append({
                    "PATH": parent,
                    "FILE_NAME": filename,
                    "DATANAME": data_name,
                    "DATAVALUE": data_value,
                    "PROJECT_TYPE": parent_s[0],
                    "PRODUCT": parent_s[1],
                    "PRODUCT_VERSION": parent_s[2],
                    "LOT": parent_s[3],
                    "TEST_ITEM": parent_s[4],
                    "TEST_NODE": parent_s[5],
                    "FACTORY": parent_s[6],
                    "DEVICES_TIME": Devices_time,
                    "TEST_NAME": test_name,
                    "WAFER_ID": Wafer_id,
                    "TEST_PIPELINE": test_pipeline,
                    "LINE_NO": line_no,
                })

    # Write once, after the whole tree has been walked.  Writing inside the
    # walk rewrote the file per directory and could feed a freshly written
    # output file back into the scan when it lives under the input tree.
    if datas:
        table = pd.DataFrame(datas)
        # Convert the timestamp text into real datetimes before export.
        table['DEVICES_TIME'] = pd.to_datetime(table['DEVICES_TIME'])
        table.to_csv(outputfile, index=False)
if __name__ == '__main__':
    # Command-line entry point.
    # Usage: python clean_data.py <input folder> <output file>.csv
    # Programmatic equivalent: fileclean("data", "data2.csv")
    argv = sys.argv
    print(argv)
    if len(argv) < 3:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: python %s <input_dir> <output_file.csv>" % argv[0])
    fileclean(argv[1], argv[2])

上一篇:python数据分析实战项目—10000条北京二手房多维度可视化分析(附源码)


下一篇:pandas 的[ ]搜索法,只支持columns的标签和series的标签和数字