python 检测文件编码等

参考:http://my.oschina.net/waterbear/blog/149852

chardet 模块可以用来检测文本(字节串)的编码。

核心代码:

import chardet
# detect() inspects the text passed in as `content` and returns a dict-like
# result; its 'encoding' entry is the name of the guessed encoding.
# NOTE(review): per the chardet docs the input should be raw (undecoded)
# bytes and 'encoding' may be None when no guess can be made -- confirm.
chardet.detect(content)['encoding']

示例:递归地把当前目录下的所有 .java 文件转码为 UTF-8(转码前会生成 .bak 备份):

#-*- coding: utf-8 -*-

import codecs
import os
import shutil
import re
import chardet


def convert_encoding(filename, target_encoding):
    """Re-encode the file ``filename`` in place using ``target_encoding``.

    The original bytes are first copied to ``filename + '.bak'`` (an existing
    backup of that name is replaced).  The source encoding is then guessed
    with chardet, and the decoded text is written back to ``filename``
    encoded as ``target_encoding``.

    Args:
        filename: path of the file to convert.
        target_encoding: name of the codec to write the file with.

    Raises:
        ValueError: if chardet cannot guess the source encoding (it reports
            ``None``), in which case the file is left unmodified.
        Exception: I/O and codec errors from reading, decoding or writing
            are propagated to the caller.
    """
    # Back up the original file before touching it.
    shutil.copyfile(filename, filename + '.bak')

    # Read raw bytes: chardet analyses undecoded data, and binary mode avoids
    # any newline or implicit-decoding translation by the platform.
    with open(filename, 'rb') as source:
        content = source.read()

    source_encoding = chardet.detect(content)['encoding']
    print('%s %s' % (source_encoding, filename))
    if source_encoding is None:
        # Fail with a clear message instead of the TypeError decode(None)
        # would raise.
        raise ValueError('cannot detect the encoding of %s' % filename)

    # Convert the file from the source encoding to the target encoding.
    text = content.decode(source_encoding)
    with codecs.open(filename, 'w', encoding=target_encoding) as target:
        target.write(text)


def main():
    """Convert every ``.java`` file below the current directory to UTF-8.

    The walk is recursive and the extension match is case-insensitive.  A
    file that fails to convert is reported together with the error, and the
    walk continues with the next file.
    """
    for root, dirs, files in os.walk(os.getcwd()):
        for f in files:
            if not f.lower().endswith('.java'):
                continue
            filename = os.path.join(root, f)
            try:
                convert_encoding(filename, 'utf-8')
            except Exception as e:
                # Best effort: report the failure and keep going.  %r keeps
                # the report printable even for non-ASCII error messages.
                print('%s: %r' % (filename, e))


def process_bak_files(action='restore'):
    """Restore or delete the ``.java.bak`` backups below the current directory.

    Args:
        action: ``'restore'`` moves each backup back over its ``.java`` file;
            ``'clear'`` deletes the backup.  Any other value leaves the
            backups untouched.

    A backup that cannot be processed is reported together with the error,
    and the walk continues with the next file.
    """
    for root, dirs, files in os.walk(os.getcwd()):
        for f in files:
            if not f.lower().endswith('.java.bak'):
                continue
            source = os.path.join(root, f)
            # Strip the trailing ".bak" (matched case-insensitively) to get
            # the name of the file the backup was taken from.
            target = os.path.join(
                root, re.sub(r'\.java\.bak$', '.java', f, flags=re.IGNORECASE))
            try:
                if action == 'restore':
                    shutil.move(source, target)
                elif action == 'clear':
                    os.remove(source)
            except Exception as e:
                # Best effort: report the failure and keep going.
                print('%s: %r' % (source, e))


if __name__ == '__main__':
    # To delete the backups after a successful run, call
    # process_bak_files(action='clear') here instead of main().
    main()

另,参考:Python 的中文编码处理

http://in355hz.iteye.com/blog/1860787

  # Check the encoding of the standard output stream.
  print(sys.stdout.encoding)
  # Whatever happens, write output using the character set of the current
  # Linux locale:
  if sys.stdout.encoding is None:
      # LANG has the form language[_territory][.codeset][@modifier], e.g.
      # "zh_CN.UTF-8".  Fall back to UTF-8 when it is unset or carries no
      # codeset (e.g. "C") instead of failing with KeyError/IndexError.
      lang = os.environ.get('LANG', '')
      enc = lang.split('.')[1].split('@')[0] if '.' in lang else 'utf-8'
      sys.stdout = codecs.getwriter(enc)(sys.stdout)  # replace sys.stdout
  # Make sys.getdefaultencoding() return 'utf-8'.  This is only needed (and
  # only possible) on Python 2: Python 3 has neither the reload() builtin nor
  # sys.setdefaultencoding().
  if sys.version_info[0] < 3:
      # reload() is what makes setdefaultencoding callable again, but it also
      # resets the standard streams, so keep any replacement that is already
      # installed (such as a wrapped sys.stdout) and put it back afterwards.
      saved_streams = sys.stdin, sys.stdout, sys.stderr
      reload(sys)
      sys.setdefaultencoding('utf-8')  # set 'utf-8'
      sys.stdin, sys.stdout, sys.stderr = saved_streams
上一篇:WPF读书笔记 x名称空间详解(第二天)


下一篇:Xcode + Swift 制作动态原型