引子
有100多封eml格式的本地邮件,需要从每封邮件中提取出特定的内容,并插入数据库中。难处在于如何使用正则提取内容。想试验正则表达式的结果,可以下载RegexBuddy,方便调试。
代码
import os from email import message_from_file import re import hashlib import json import pymssql def parseEmail(file): with open(file) as f: name = f.name msg = message_from_file(f) target_text = msg.get_payload()[0].get_payload() # <span>Timestamp: 2020-07-30 00:00:00 (UTC) </span> # <strong>Koality_AutoDQ4_OnePipeline_OfficeForms</strong> timestamp = re.findall("Timestamp: (\d{4}-\d{1,2}-\d{1,2})",target_text)[0] dataset = re.findall("Dataset: .+_.+_.+_(.+_*\w+) <http",target_text)[0] unique_dataset_name = re.findall("Dataset: (.+_.+_.+_.+) <http",target_text)[0] metrics = re.findall(" Metrics[:=]\n*(.+\n*.+)\n*<=*\n*h", target_text)[0] metrics = metrics.replace(‘\n‘, ‘‘).replace(‘=‘,‘‘).replace(‘: ‘, ‘‘) # Koality_AutoDQ4_OA-OXO_Teams_DoD, whose dataset is Teams_DoD instaed of DoD if ‘Teams_‘ in unique_dataset_name: dataset = ‘Teams_‘ + dataset incident_source_text = target_text[target_text.find(‘Incident List‘): target_text.find(‘RootCause‘)].replace(‘=‘, ‘‘).replace(‘\n‘, ‘‘) incidents_str = incident_source_text[:incident_source_text.find(‘<https‘)] incdient_groups = re.findall("(\w+:\w+),", incidents_str) incident_dict = {} for incident in incdient_groups: k, v = incident.split(‘:‘) incident_dict[k] = v incident_json = json.dumps(incident_dict) hashkey = toHash(incident_json) return timestamp, dataset, metrics, incident_json, hashkey, unique_dataset_name def toHash(text): hs = hashlib.md5() hs.update(text.encode()) return hs.hexdigest().upper() def insertIntoDB(conn,table, results): cur = conn.cursor() count = 0 for row in results: rowDict = {‘table‘: table, ‘timestamp‘:row[0], ‘dataset‘:row[1], ‘metric‘:row[2], ‘json‘:row[3], ‘hashkey‘:row[4], ‘unique_dataset_name‘: row[5]} sql = """ insert into [{table}] ([WorkloadName], [MetricName], [DimensionCombination_JsonForDb_Hash], [Timestamp], [DimensionCombination_JsonForDb], [UniqueDatasetName], [MetricValue], [ExpectedMetricValue]) VALUES (‘{dataset}‘, ‘{metric}‘,‘{hashkey}‘,‘{timestamp}‘, ‘{json}‘, ‘{unique_dataset_name}‘, 0, 0); """.format(**rowDict) try: cur.execute(sql) print(‘Inserted a row......‘) except: print(row[1], row[5]) count += 1 print("Successfully inserted {:d} rows....".format(count)) cur.close() conn.close() return True def main(): print("Starting script......") emailFolder = r"C:\Work\IncidentEmails" results = [] for filePath in os.listdir(emailFolder): fullPath = emailFolder + "\\" + filePath timestamp, dataset, metrics, incident_json, hashkey, unique_dataset_name = parseEmail(fullPath) results.append([timestamp, dataset, metrics, incident_json, hashkey, unique_dataset_name]) conn = pymssql.connect(‘server‘, ‘usr@server‘, ‘pwd‘, ‘db‘, autocommit=True) insertIntoDB(conn, ‘KenshoAutoDQAlert‘, results) if __name__ == "__main__": main()