工作中遇到了需要把中文标点转换成英文标点的需求, 代码如下:
#coding=utf-8
import unicodedata
import os
import re
def punctuation_mend(string):
    """Convert Chinese / full-width punctuation to ASCII punctuation.

    ``string`` is either the text to convert or the path of an existing
    UTF-8 text file.

    * File path: the file content is converted and written back in place.
      No spaces are inserted in this mode.
    * Plain text: the text is converted and a single space is inserted
      after every ``.`` or ``,`` that is directly followed by a
      non-whitespace character.

    Returns the converted text (for a file path, the converted content).
    """
    # Extra symbol pairs can be appended to both strings. zip() pairs them
    # positionally, so the two strings must stay the same length.
    table = {ord(f): ord(t) for f, t in zip(
        u',、。!?【】()%#@&1234567890“”‘’',
        u',..!?[]()%#@&1234567890""\'\'')}

    if os.path.isfile(string):
        with open(string, 'r', encoding='utf-8') as f:
            # NFKC runs first, then the table maps the remaining symbols.
            res = unicodedata.normalize('NFKC', f.read()).translate(table)
        with open(string, 'w', encoding='utf-8') as f:
            f.write(res)
        return res

    res = unicodedata.normalize('NFKC', string).translate(table)
    # re.sub returns a new string; its result must be used, otherwise the
    # spacing step silently does nothing.
    #   (?<=[.,])  lookbehind: the previous character is a dot or a comma
    #   (?=[^\s])  lookahead: the next character is not whitespace
    return re.sub(r'(?<=[.,])(?=[^\s])', r' ', res)
def add_space(string):
    """Insert a space after ``:``, ``.`` and ``,`` when none follows.

    ``string`` is either the text to process or the path of an existing
    UTF-8 text file.

    * Plain text: returns the text with the spaces inserted.
    * File path: rewrites the file in place, inserting the spaces on every
      line and dropping lines that contain only whitespace. Returns the
      path unchanged.
    """
    #   (?<=[:.,])  lookbehind: the previous character is a colon, dot or comma
    #   (?=[^\s])   lookahead: the next character is not whitespace
    pattern = re.compile(r'(?<=[:.,])(?=[^\s])')

    if not os.path.isfile(string):
        # re.sub returns a new string; return it rather than the input.
        return pattern.sub(' ', string)

    # Write to a sibling temp file so the final swap stays on one
    # filesystem and needs no module-level temp path.
    tmp_path = string + '.tmp'
    try:
        with open(string, 'r', encoding='utf-8') as src, \
                open(tmp_path, 'w', encoding='utf-8') as dst:
            for line in src:
                spaced = pattern.sub(' ', line)
                # Skip empty lines, with or without spaces in them.
                if spaced.strip():
                    dst.write(spaced)
        # os.replace overwrites an existing target on every platform.
        os.replace(tmp_path, string)
    finally:
        # Only still present if something failed before the replace.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return string
#print(punctuation_mend('【】()%#@&“”'))
# open() and os.path.isfile() do not expand "~", so the home-relative
# paths are expanded here; otherwise the functions below would treat the
# path literal as text to convert and leave the file untouched.
convert_file = os.path.expanduser('~/xxx.txt')  # file to convert in place
tmp_file = os.path.expanduser('~/tmp.txt')  # scratch file path

if __name__ == '__main__':
    # First normalise the punctuation, then fix spacing and blank lines.
    punctuation_mend(convert_file)
    add_space(convert_file)
以上代码可以转换常用的中文标点, 并在逗号、句号和冒号后面加上空格,
最后去掉文件中的空行 (包括只含空白字符的行).