import os
import magic
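# Chosen detection approach: the textual description is fairly comprehensive, so match it
# against known keywords; when a keyword matches, that keyword is taken as the file type.
# If nothing matches, keep the previously determined file format.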
# a = magic.from_file(path)
# b = magic.from_buffer(open(path).read(1024))
# c = magic.from_file(path, mime=True)
# print(a)
# Keyword table: category name -> keywords searched for in the textual file
# description. identify_file_type() only iterates the values and keeps the
# last keyword that matches, so the order of categories and of keywords within
# each list is significant (e.g. '7-zip' must come after 'zip').
FILE_TYPE = {
    'Executables': [
        'exe', 'mz', 'msi', 'coff', 'elf',
        'krnl', 'rpm', 'linux', 'macho',
    ],
    'Documents': [
        'ps', 'rtf', 'odp', 'ods', 'odt',
        'hwp', 'gul', 'ebook', 'latex',
    ],
    'Code': [
        'php', 'python', 'perl', 'ruby', 'cpp', 'java',
        'shell', 'pascal', 'awk', 'dyalog',
        'fortran', 'java-bytecode',
    ],
    'Bundles': [
        'zip', 'gzip', 'bzip', 'rzip', 'dzip', '7-zip',
        'cab', 'jar', 'rar', 'mscompress', 'ace',
        'arj', 'asd', 'blackhole', 'kgb',
    ],
    'Other': [
        'bat', 'cmd',
    ],
}
def identify_file_type(path="D:/scripts/file/CmdHelperService.7z"):
    """
    Identify the type of the file at ``path``.

    The file extension (without the dot) is the initial guess. If that
    extension is one of a few formats that are trusted as-is, it is returned
    immediately. Otherwise the description from ``magic.from_file`` is
    inspected: special-case markers (DLL, VMware disk image) are checked
    first, then the keywords in ``FILE_TYPE``. If nothing matches, the
    extension-based guess is returned.

    :param path: path of the file to inspect; defaults to the sample archive
        this script was originally hard-wired to.
    :return: the detected file type as a string; an empty string when the
        path has no extension and the description matches no keyword.
    """
    # Extensions in this list are trusted and skip content-based detection.
    exclude_file_type = ['txt', 'pdf', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'com', 'mf']

    # Always reduce to a plain string so a path without an extension yields ''
    # instead of leaking the (root, ext) tuple from splitext to the caller.
    file_type = os.path.splitext(path)[1].strip('.')
    if file_type in exclude_file_type:
        return file_type

    file_type_info = magic.from_file(path)
    print(file_type_info)

    # TODO: special file types that the magic description cannot be matched
    # against by keyword; extend this list as new cases turn up.
    # These take priority and must not be overwritten by the keyword scan.
    if 'DLL' in file_type_info:
        return 'dll'
    if 'VMware4 disk image' in file_type_info:
        return 'vmdk'

    # Deliberately no early exit: the last matching keyword wins, so a more
    # specific keyword listed later (e.g. '7-zip' after 'zip') is preferred.
    for file_types in FILE_TYPE.values():
        for f_type in file_types:
            variants = (f_type.upper(), f_type.capitalize(), f_type)
            if any(variant in file_type_info for variant in variants):
                file_type = f_type
    return file_type
# Script driver: run the detection on the function's built-in sample path and
# print the result. NOTE: there is no __main__ guard, so this also runs when
# the module is imported.
file_type = identify_file_type()
print(file_type)
# Presumably the output observed for the sample .7z archive
# (the description line first, then the detected type):
# 7-zip archive data, version 0.4
# 7-zip