中文分词:逆向最大匹配算法(BMM)

中文分词:逆向最大匹配算法

  • 逆向最大匹配法的基本原理与正向最大匹配法类似,只是分词顺序变为从右至左。一般从字符串的末尾位置开始,选择一个长度为最大词长的片段;如果序列不足最大词长,则选择全部序列。若该片段在词典中,则将其切分为一个词;否则去掉片段最左边的一个字后继续匹配,直到匹配成功或只剩一个字为止。

代码实现

#使用逆向最大匹配算法实现中文分词
# Module-level dictionary of known words; extended in place by init().
words_dic = []


def init(dic_path=r"C:\Users\lenovo\PycharmProjects\fenci\venv\dic\dic.txt"):
    """Load the word dictionary file into the module-level ``words_dic`` list.

    Each line of the file is treated as one word. Surrounding whitespace is
    stripped and blank lines are skipped, so no empty-string entries end up
    in the dictionary.

    :param dic_path: path of the dictionary file (one word per line).
        Defaults to the location that was originally hard-coded here.
    :return: None; ``words_dic`` is extended in place.
    :raises OSError: if the dictionary file cannot be opened.
    """
    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise stay attached to the first word (strip() does not remove it).
    with open(dic_path, "r", encoding="utf-8-sig") as dic_input:
        for line in dic_input:
            word = line.strip()
            if word:
                words_dic.append(word)

# 实现逆向最大匹配算法的切词方法
def cut_words(raw_sentence, words_dic):
    """Segment a sentence with the backward maximum matching (BMM) algorithm.

    Starting from the right end of the sentence, the longest candidate (at
    most as long as the longest dictionary word) is looked up in the
    dictionary. If it is not a known word, its leftmost character is dropped
    and the lookup is repeated. A single remaining character is always
    emitted as its own token. The process then continues on the text to the
    left of the emitted token.

    :param raw_sentence: text to segment; leading/trailing whitespace is
        stripped before segmentation.
    :param words_dic: iterable of known words (strings).
    :return: the tokens in reading order, joined by ``"/"``; an empty string
        if the stripped sentence is empty.
    """
    # Build the lookup set once: membership tests become O(1) instead of a
    # linear scan of the dictionary list on every probe.
    vocabulary = set(words_dic)
    # The longest dictionary word bounds the matching window. Clamp it to at
    # least 1 so that an empty dictionary (or one holding only empty strings)
    # still consumes one character per step instead of raising or never
    # making progress.
    max_length = max(max((len(word) for word in vocabulary), default=1), 1)
    sentence = raw_sentence.strip()
    cut_words_list = []
    end = len(sentence)  # exclusive right edge of the unsegmented prefix
    while end > 0:
        start = max(end - max_length, 0)
        # Shrink the candidate from the left until it is a dictionary word
        # or only one character is left.
        while end - start > 1 and sentence[start:end] not in vocabulary:
            start += 1
        cut_words_list.append(sentence[start:end])
        end = start
    # Tokens were collected right-to-left; restore reading order.
    cut_words_list.reverse()
    return "/".join(cut_words_list)

def main():
    """Run the interactive command-line loop for word segmentation.

    Loads the dictionary via ``init()``, then repeatedly prompts for a
    sentence, segments it with ``cut_words`` and prints the result. The loop
    ends when an empty line is entered or when standard input is exhausted.

    :return: None
    """
    init()
    while True:
        print("请输入您要分词的序列")
        try:
            input_str = input()
        except EOFError:
            # stdin was closed (piped input ran out, Ctrl-D / Ctrl-Z): leave
            # the loop cleanly instead of ending with a traceback.
            break
        if not input_str:
            break
        result = cut_words(input_str, words_dic)
        print("分词结果:")
        print(result)

# Start the interactive segmenter only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
上一篇:Linux中cut提取命令


下一篇:Linux常用命令[002]:cut