# -*- coding: utf-8 -*-
import bisect
import time
# Default dictionary file; used as the default path of readDic().
dictionaryfilename = "../data/dic.txt"
# Default input text; used as the default input path of FMM() and BMM().
inputfilename = "../data/199801_sent.txt"
# Default output file written by BMM().
BMMfilename = "../data/seg_BMM.txt"
# Default output file written by FMM().
FMMfilename = "../data/seg_FMM.txt"
def readDic(dicpath=dictionaryfilename):
    """Load the tab-separated dictionary file at *dicpath*.

    Each non-blank line must contain at least two tab-separated fields:
    the word, followed by an integer (presumably its frequency count --
    confirm against the dictionary builder).

    Args:
        dicpath: Path of the GBK-encoded dictionary file.

    Returns:
        A ``(singleDic, tripleDic)`` tuple: ``singleDic`` is the list of
        words in file order and ``tripleDic`` is the parallel list of
        ``(word, int_value)`` pairs.

    Raises:
        IndexError: If a non-blank line has no second field.
        ValueError: If the second field is not an integer.
    """
    tripleDic = []
    singleDic = []
    # The context manager closes the file even when a malformed line raises.
    with open(dicpath, 'r', encoding='gbk') as fd:
        for line in fd:
            # Tolerate blank lines (e.g. a trailing empty line at EOF).
            if not line.strip():
                continue
            words = line.split("\t")
            # int() ignores the trailing newline left on the last field.
            triple = (words[0], int(words[1]))
            singleDic.append(words[0])
            tripleDic.append(triple)
    return singleDic, tripleDic
def isExist(dic, word):
    """Return True if *word* occurs in the ascending-sorted sequence *dic*.

    The lookup is a binary search, so *dic* must already be sorted in
    ascending order; on an unsorted sequence the result is unreliable.

    Args:
        dic: Sorted sequence of mutually comparable items.
        word: Item to look for.

    Returns:
        ``True`` when an element equal to *word* is present, else ``False``.
    """
    # bisect_left yields the leftmost position where word could be inserted;
    # word is present exactly when that slot already holds an equal element.
    pos = bisect.bisect_left(dic, word)
    return pos < len(dic) and dic[pos] == word
def FMM(dic, fmmpath=FMMfilename, inputpath=inputfilename, prefixlen=19):
    """Segment *inputpath* by forward maximum matching and write *fmmpath*.

    For every non-empty input line the first *prefixlen* characters are
    emitted as one token (presumably the sentence identifier of the corpus
    -- confirm), and the remainder is scanned left to right: at each
    position the longest dictionary word is taken, falling back to a single
    character when nothing matches.  Every token is followed by ``'/'`` and
    every input line produces exactly one output line.

    Args:
        dic: List of dictionary words.  It is sorted in place because
            ``isExist`` performs a binary search on it.
        fmmpath: Output file path.
        inputpath: GBK-encoded input file path.
        prefixlen: Number of leading characters of each line that are kept
            as a single unsegmented token (non-negative).

    Side effects:
        Sorts *dic* in place, writes *fmmpath*, and prints two CPU-time
        measurements.
    """
    start = time.process_time()
    # Keep the window at least 1 wide so the scan always advances, even when
    # the dictionary is empty.
    maxlen = max(1, max(map(len, dic), default=0))
    dic.sort()
    end = time.process_time()
    print("建立词典耗时", end - start)
    # NOTE(review): the output file uses the platform default encoding, as
    # before -- confirm whether it should be pinned to match the input.
    with open(inputpath, 'r', encoding='gbk') as fi, open(fmmpath, 'w') as fo:
        start = time.process_time()
        for line in fi:
            # Drop the line terminator up front so it never becomes a token
            # and a final line without '\n' is handled like any other.
            text = line.rstrip('\n')
            segs = []
            if text:
                segs.append(text[:prefixlen])
                i = prefixlen
                n = len(text)
                while i < n:
                    # Try the longest candidate first and shrink from the
                    # right; a single character is accepted without lookup.
                    for j in range(min(i + maxlen, n), i + 1, -1):
                        if isExist(dic, text[i:j]):
                            break
                    else:
                        j = i + 1
                    segs.append(text[i:j])
                    i = j
            fo.write(''.join(seg + '/' for seg in segs) + '\n')
        end = time.process_time()
    print("正向最大匹配耗时", end - start)
def BMM(dic, bmmpath=BMMfilename, inputpath=inputfilename, prefixlen=19):
    """Segment *inputpath* by backward maximum matching and write *bmmpath*.

    For every non-empty input line the first *prefixlen* characters are
    emitted as one token (presumably the sentence identifier of the corpus
    -- confirm), and the remainder is scanned right to left: ending at each
    position the longest dictionary word is taken, falling back to a single
    character when nothing matches.  Tokens are written in reading order,
    each followed by ``'/'``; every input line produces one output line.

    Args:
        dic: List of dictionary words.  It is sorted in place because
            ``isExist`` performs a binary search on it.
        bmmpath: Output file path.
        inputpath: GBK-encoded input file path.
        prefixlen: Number of leading characters of each line that are kept
            as a single unsegmented token (non-negative).

    Side effects:
        Sorts *dic* in place, writes *bmmpath*, and prints two CPU-time
        measurements.
    """
    start = time.process_time()
    # Keep the window at least 1 wide so the scan always advances, even when
    # the dictionary is empty.
    maxlen = max(1, max(map(len, dic), default=0))
    dic.sort()
    end = time.process_time()
    print("建立词典耗时", end - start)
    # NOTE(review): the output file uses the platform default encoding, as
    # before -- confirm whether it should be pinned to match the input.
    with open(inputpath, 'r', encoding='gbk') as fi, open(bmmpath, 'w') as fo:
        start = time.process_time()
        for line in fi:
            # rstrip removes only a real terminator, so a final line without
            # '\n' keeps its last character.
            text = line.rstrip('\n')
            if not text:
                fo.write('\n')
                continue
            found = []  # tokens collected right to left
            i = len(text)
            while i > prefixlen:
                # The candidate window must not reach into the prefix;
                # otherwise a match could overlap it and duplicate text.
                for j in range(max(i - maxlen, prefixlen), i - 1):
                    if isExist(dic, text[j:i]):
                        break
                else:
                    # Nothing longer matched: take a single character.
                    j = i - 1
                found.append(text[j:i])
                i = j
            # The prefix is the leftmost token, so it goes last before the
            # list is flipped into reading order.
            found.append(text[:prefixlen])
            found.reverse()
            fo.write(''.join(seg + '/' for seg in found) + '\n')
        end = time.process_time()
    print("逆向最大匹配耗时", end - start)
if __name__ == '__main__':
    # Only the plain word list (first item returned by readDic) is needed;
    # run the forward segmenter first, then the backward one.
    wordlist = readDic()[0]
    for segment in (FMM, BMM):
        segment(wordlist)