import re
from pathlib import Path
import pandas as pd
import sys
import os
def clean(paths):
    """Extract the non-empty values from one DataName/DataValue text file.

    The file is scanned line by line. A line containing ``DataName`` starts a
    new block and becomes that block's header row; every line containing
    ``DataValue`` is added to the current block as a data row. Each kept line
    has its spaces replaced by commas and is prefixed with its 0-based line
    index, so after splitting on ``,`` field 0 is the line index and the
    fields from position 2 onward are paired column-wise between the header
    row and the data rows.

    Args:
        paths: Path of the text file to read (text mode, platform default
            encoding).

    Returns:
        A list of ``[name, value, line_index]`` lists of strings, one entry
        per non-empty field (position 2 and up) of every processed data row.
        ``name`` is the header field at the same position, or ``""`` when the
        header row is shorter than the data row.
    """
    with open(paths, "r") as f:
        text = f.read()

    # Group the relevant lines into blocks: [header_line, data_line, ...].
    blocks = []
    current = []
    for line_no, line in enumerate(text.split("\n")):
        if "DataName" in line:
            # A new header closes the block collected so far.
            if current:
                blocks.append(current)
                current = []
            # Only one space->comma replace is needed; chaining the same
            # replace a second time has no effect.
            line = str(line_no) + "," + line.replace(" ", ",")
            current.append(line)
        if "DataValue" in line:
            line = str(line_no) + "," + line.replace(" ", ",")
            current.append(line)
    if current:
        blocks.append(current)

    return_data = []
    for block in blocks:
        rows = [entry.split(",") for entry in block]
        # Pad the header to the widest row so that surplus data fields are
        # still reported (with an empty name) instead of raising IndexError.
        width = max(len(row) for row in rows)
        header = rows[0] + [""] * (width - len(rows[0]))
        # NOTE(review): rows[0] is the header and rows[1] -- the first row
        # after it -- is skipped as well, matching the long-standing
        # ``index > 1`` condition. Confirm whether dropping that first data
        # row is intended before changing it.
        for row in rows[2:]:
            line_index = row[0]
            for col in range(2, len(row)):
                # Empty fields carry no value and are ignored.
                if row[col]:
                    return_data.append([header[col], row[col], line_index])
    return return_data
def fileclean(inputdirs,outputfile):
    """Flatten every ``.csv`` file under a directory tree into one CSV file.

    Each ``.csv`` file found by walking ``inputdirs`` is parsed with
    ``clean``; every ``[name, value, line_index]`` entry it returns becomes
    one output row, enriched with metadata taken from the containing folder
    name and from the file name.

    * Folder name: split on ``_``; parts 0-6 fill ``PROJECT_TYPE``,
      ``PRODUCT``, ``PRODUCT_VERSION``, ``LOT``, ``TEST_ITEM``, ``TEST_NODE``
      and ``FACTORY``.
    * File name, e.g. ``ICES5V [A213911020-T1D1(17) ; 11_7_2021 10_31_26 PM]``:
      the text before the first space is ``TEST_NAME``, the text between
      ``[`` and ``-`` is ``WAFER_ID``, the text between ``-`` and ``(`` is
      ``TEST_PIPELINE`` and the text between ``;`` and ``]`` is the timestamp.

    Args:
        inputdirs: Root directory that is walked recursively.
        outputfile: Path of the CSV file to write. Nothing is written when no
            rows were collected.

    Raises:
        IndexError: If a ``.csv`` file name does not contain the expected
            markers, or if the folder name of a file that produced rows has
            fewer than seven ``_``-separated parts.
    """
    # Raw strings: "\[" and "\(" are invalid escapes in ordinary literals.
    time_re = re.compile(r";(.*?)]")
    wafer_re = re.compile(r"\[(.*?)-")
    pipeline_re = re.compile(r"-(.*?)\(")

    datas = []
    for folderName, _subfolders, filenames in os.walk(inputdirs):
        # Last path component, independent of the platform's separator
        # (splitting on "\\" only worked for Windows-style paths). normpath
        # drops a trailing separator so the name is never empty.
        parent = os.path.basename(os.path.normpath(folderName))
        parent_s = parent.split("_")
        for filename in filenames:
            if not filename.endswith(".csv"):
                continue
            paths = os.path.join(folderName, filename)
            # "11_7_2021 10_31_26 PM" -> "11/7/2021 10:31:26 PM": the first
            # two underscores belong to the date, the remaining to the time.
            Devices_time = (
                time_re.findall(filename)[0].replace("_", '/', 2).replace("_", ":")
            )
            test_name = filename.split(" ")[0]
            Wafer_id = wafer_re.findall(filename)[0]
            test_pipeline = pipeline_re.findall(filename)[0]
            for name, value, line_no in clean(paths=paths):
                # Key order defines the column order of the output file.
                datas.append({
                    "PATH": parent,
                    "FILE_NAME": filename,
                    "DATANAME": name,
                    "DATAVALUE": value,
                    "PROJECT_TYPE": parent_s[0],
                    "PRODUCT": parent_s[1],
                    "PRODUCT_VERSION": parent_s[2],
                    "LOT": parent_s[3],
                    "TEST_ITEM": parent_s[4],
                    "TEST_NODE": parent_s[5],
                    "FACTORY": parent_s[6],
                    "DEVICES_TIME": Devices_time,
                    "TEST_NAME": test_name,
                    "WAFER_ID": Wafer_id,
                    "TEST_PIPELINE": test_pipeline,
                    "LINE_NO": line_no,
                })
    if datas:
        table = pd.DataFrame(datas)
        # Convert the timestamp text into real datetimes before writing.
        table['DEVICES_TIME'] = pd.to_datetime(table['DEVICES_TIME'])
        table.to_csv(outputfile, index=False)
if __name__ == '__main__':
    # Usage: python clean_data.py <input folder> <output file>.csv
    # Equivalent direct call: fileclean("data", "data2.csv")
    cli_args = sys.argv
    # Echo the raw command line before the positional arguments are read.
    print(cli_args)
    input_dir, output_csv = cli_args[1], cli_args[2]
    fileclean(input_dir, output_csv)