Python-从邮件中提取内容并插入数据库

引子

有100多封eml格式的本地邮件,需要从每封邮件中提取出特定的内容,并插入数据库中。难处在于如何使用正则提取内容。想试验正则表达式的结果,可以下载RegexBuddy,方便调试。

 

代码

Python-从邮件中提取内容并插入数据库
import os
from email import message_from_file
import re
import hashlib
import json
import pymssql

def parseEmail(file):
    with open(file) as f:
        name = f.name
        msg = message_from_file(f)
        target_text = msg.get_payload()[0].get_payload()
    # <span>Timestamp: 2020-07-30 00:00:00 (UTC) </span>
    # <strong>Koality_AutoDQ4_OnePipeline_OfficeForms</strong>
    timestamp = re.findall("Timestamp: (\d{4}-\d{1,2}-\d{1,2})",target_text)[0]
    dataset = re.findall("Dataset: .+_.+_.+_(.+_*\w+) <http",target_text)[0]
    unique_dataset_name = re.findall("Dataset: (.+_.+_.+_.+) <http",target_text)[0]
    metrics = re.findall(" Metrics[:=]\n*(.+\n*.+)\n*<=*\n*h", target_text)[0]
    metrics = metrics.replace(\n, ‘‘).replace(=,‘‘).replace(: , ‘‘)

    # Koality_AutoDQ4_OA-OXO_Teams_DoD, whose dataset is  Teams_DoD instaed of DoD
    if Teams_ in unique_dataset_name:
        dataset = Teams_ + dataset


    incident_source_text = target_text[target_text.find(Incident List): target_text.find(RootCause)].replace(=, ‘‘).replace(\n, ‘‘)
    incidents_str = incident_source_text[:incident_source_text.find(<https)]
    incdient_groups = re.findall("(\w+:\w+),", incidents_str)
    incident_dict = {}
    for incident in incdient_groups:
        k, v = incident.split(:)
        incident_dict[k] = v
    incident_json = json.dumps(incident_dict)
    hashkey = toHash(incident_json)
    return timestamp, dataset, metrics, incident_json, hashkey, unique_dataset_name

def toHash(text):
    hs = hashlib.md5()
    hs.update(text.encode())
    return hs.hexdigest().upper()


def insertIntoDB(conn,table, results):
    cur = conn.cursor()
    count = 0
    for row in results:
        rowDict = {table: table,
                   timestamp:row[0], 
                   dataset:row[1],
                   metric:row[2], 
                   json:row[3],
                   hashkey:row[4],
                   unique_dataset_name: row[5]}
        sql = """
                insert into [{table}] 
                ([WorkloadName], [MetricName], [DimensionCombination_JsonForDb_Hash], [Timestamp], [DimensionCombination_JsonForDb], [UniqueDatasetName], [MetricValue], [ExpectedMetricValue])
                VALUES (‘{dataset}‘, ‘{metric}‘,‘{hashkey}‘,‘{timestamp}‘, ‘{json}‘, ‘{unique_dataset_name}‘, 0, 0); 
        """.format(**rowDict)
        try:
            cur.execute(sql)
            print(Inserted a row......)
        except:
            print(row[1], row[5])
        count += 1


    print("Successfully inserted {:d} rows....".format(count))
    cur.close()
    conn.close()
    return True


def main():
    print("Starting script......")

    emailFolder = r"C:\Work\IncidentEmails"

    results = []
    for filePath in os.listdir(emailFolder):
        fullPath = emailFolder + "\\" + filePath
        timestamp, dataset, metrics, incident_json, hashkey, unique_dataset_name = parseEmail(fullPath)
        results.append([timestamp, dataset, metrics, incident_json, hashkey, unique_dataset_name])

    conn = pymssql.connect(server, usr@server, pwd, db, autocommit=True)
    insertIntoDB(conn, KenshoAutoDQAlert, results)

if __name__ == "__main__":
    main()
View Code

 

Python-从邮件中提取内容并插入数据库

上一篇:数据库5和护具库8的区别


下一篇:Oracle创建用户并给用户授权查询指定表或视图的权限