python3 中英文标点转换

2023-11-26 23:50:28

工作中遇到需要把中文标点转换成英文标点的需求, 下面是用 Python 3 实现的脚本:

#coding=utf-8
import unicodedata
import os
import re

def punctuation_mend(string):
    """Convert Chinese/full-width punctuation and digits to ASCII.

    The text is first NFKC-normalized, then the remaining characters listed
    in the translation table below are mapped to their ASCII counterparts.

    ``string`` is either the text to convert or the path of an existing
    UTF-8 text file:

    * Path of an existing file: the file is rewritten in place with the
      converted text and ``None`` is returned. No spacing is inserted in
      this mode.
    * Anything else: the converted text is returned, with a single space
      inserted after every ``.`` or ``,`` that is immediately followed by
      a non-whitespace character.
    """
    # Other custom symbols that need converting can be appended to these two
    # strings; str.maketrans raises ValueError if their lengths ever differ.
    table = str.maketrans(
        u'，、。！？【】（）％＃＠＆１２３４５６７８９０“”‘’',
        u',..!?[]()%#@&1234567890""\'\'')

    if os.path.isfile(string):
        # Read everything before reopening for writing, so the original
        # content is fully in memory when the file gets truncated.
        with open(string, 'r', encoding='utf-8') as f:
            res = unicodedata.normalize('NFKC', f.read()).translate(table)
        with open(string, 'w', encoding='utf-8') as f:
            f.write(res)
        return None

    res = unicodedata.normalize('NFKC', string).translate(table)
    # re.sub returns a new string; the result has to be returned, otherwise
    # the spacing step has no effect.
    return re.sub(r'(?<=[.,])(?=[^\s])', r' ', res)


def add_space(string):
    """Insert a space after punctuation that is glued to the next character.

    ``string`` is either the text to process or the path of an existing
    UTF-8 text file:

    * Path of an existing file: a space is inserted after every ``:``,
      ``.`` or ``,`` that is immediately followed by a non-whitespace
      character, lines that are empty or contain only whitespace are
      dropped, and the file is replaced with the result. Returns ``None``.
    * Anything else: returns the text with a space inserted after every
      ``.`` or ``,`` that is immediately followed by a non-whitespace
      character.
    """
    # Local import: tempfile is not part of this file's import header.
    import tempfile

    if not os.path.isfile(string):
        # NOTE: unlike the file branch, this pattern does not include ':'.
        # re.sub returns a new string, so its result must be returned.
        return re.sub(r'(?<=[.,])(?=[^\s])', r' ', string)

    # (?<=[:.,]) positive lookbehind for a colon, dot or comma
    # (?=[^\s])  positive lookahead for any non-whitespace character
    spacing = re.compile(r'(?<=[:.,])(?=[^\s])')
    blank = re.compile(r'^\s*$')

    # Create the temp file next to the target so the final os.replace stays
    # on one filesystem and swaps the file in a single step.
    target_dir = os.path.dirname(os.path.abspath(string))
    fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix='.tmp')
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as dst, \
                open(string, 'r', encoding='utf-8') as src:
            for line in src:
                spaced = spacing.sub(' ', line)
                # Skip empty lines, with or without whitespace in them.
                if not blank.match(spaced):
                    dst.write(spaced)
        # mkstemp creates the file readable by the owner only; carry over
        # the permission bits of the file being replaced.
        os.chmod(tmp_path, os.stat(string).st_mode & 0o777)
        os.replace(tmp_path, string)
    except BaseException:
        # Do not leave a stray temp file behind when anything fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return None

#print(punctuation_mend('【】（）％＃＠＆“”'))
# Paths used when this module is run as a script. '~' is expanded explicitly
# because open() and os.path.isfile() do not expand it themselves; with the
# raw '~/...' strings the file check fails and no file is ever modified.
convert_file = os.path.expanduser('~/xxx.txt')
# Scratch path, kept as a module-level name because code in this file may
# reference it.
tmp_file = os.path.expanduser('~/tmp.txt')
if __name__ == '__main__':
    punctuation_mend(convert_file)
    add_space(convert_file)

以上代码可以转换常用的中文标点, 并在英文逗号、句点和冒号后面补上空格, 最后去掉文件中的空行 (包括只含空白字符的行).

码农公寓

相关文章