知识图谱:【数据清洗工具flashtext(五)】——flashtext使用示例

文章目录

关键字提取

from flashtext import KeywordProcessor
# 'Big Apple' is registered with the clean name 'New York'; 'Bay Area' is
# registered without one, so it is reported as-is.
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Big Apple', 'New York')
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
print(keywords_found)
# output ['New York', 'Bay Area']
## 区分大小写
from flashtext import KeywordProcessor
# With case_sensitive=True the lowercase 'big Apple' in the sentence does not
# match the registered 'Big Apple', so only 'Bay Area' is found.
keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.add_keyword('Big Apple', 'New York')
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
print(keywords_found)
# output ['Bay Area']

同时添加多个关键词
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()

# Mapping layout: {'clean_name': ['list of unclean names']}
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"],
}
keyword_processor.add_keywords_from_dict(keyword_dict)

# Keywords can also be registered from a plain list.
keyword_processor.add_keywords_from_list(["java", "python"])

keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
# output ['product management', 'java']

删除关键字

from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
# Mapping layout: {'clean_name': ['list of unclean names']}
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"],
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output ['product management', 'java']

# Remove a single keyword ...
keyword_processor.remove_keyword('java_2e')
# ... or remove several at once from a dictionary / list.
keyword_processor.remove_keywords_from_dict({"product management": ["PM"]})
keyword_processor.remove_keywords_from_list(["java programing"])

# 'java_2e' is no longer registered, so only 'product manager' still matches.
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output ['product management']

函数封装示例

from flashtext import KeywordProcessor

def build_actree(wordlist):
    """Build a flashtext ``KeywordProcessor`` holding every word in ``wordlist``.

    Args:
        wordlist: Iterable of keywords to register.

    Returns:
        The populated ``KeywordProcessor`` instance.
    """
    processor = KeywordProcessor()
    for term in wordlist:
        # Registered without an explicit clean name.
        processor.add_keyword(term)
    return processor

def ac_detect(actree, text, span_info=True):
    """Return the keywords that ``actree`` finds in ``text``, in match order.

    Args:
        actree: Object exposing ``extract_keywords(text, span_info=...)``,
            e.g. the ``KeywordProcessor`` returned by ``build_actree``.
        text: The string to scan.
        span_info: Forwarded to ``extract_keywords``. When true, each match is
            a ``(keyword, start, end)`` tuple and only the keyword is kept.
            When false, the matches are already the keywords themselves.

    Returns:
        A list of matched keywords (one entry per occurrence).
    """
    matches = actree.extract_keywords(text, span_info=span_info)
    if span_info:
        # Each match is a tuple whose first element is the keyword.
        return [match[0] for match in matches if len(match) > 0]
    # Without span info the matches are the keywords; indexing them with [0]
    # would wrongly return only their first character.
    return list(matches)

# Demo: register two keywords, then scan a sentence that contains each twice.
wordlist = ['健康', '减肥']
text = '今天你减肥了吗,今天你健康了吗,减肥 = 健康!'

actree = build_actree(wordlist)
ac_detect(actree, text)

>>> CPU times: user 41 µs, sys: 0 ns, total: 41 µs
>>> Wall time: 47.2 µs
>>> ['减肥', '健康', '减肥', '健康']

pyahocorasick版

import ahocorasick

def build_actree(wordlist):
    """Build a pyahocorasick automaton from ``wordlist``.

    Every word is stored with the value ``(index, word)``, where ``index`` is
    the word's position in ``wordlist``.

    Args:
        wordlist: Iterable of keyword strings to register.

    Returns:
        The ``ahocorasick.Automaton`` after ``make_automaton()`` was called.
    """
    automaton = ahocorasick.Automaton()  # starts out as an empty trie
    for position, term in enumerate(wordlist):
        automaton.add_word(term, (position, term))
    # Convert the trie into an Aho-Corasick automaton so it can be searched.
    automaton.make_automaton()
    return automaton

def ac_detect(actree, text):
    """Return the words that ``actree`` matches in ``text``, in match order.

    Args:
        actree: Object exposing ``iter(text)``, e.g. the automaton returned
            by ``build_actree``.
        text: The string to scan.

    Returns:
        A list with ``hit[1][1]`` for every non-empty item yielded by
        ``actree.iter(text)``; with ``build_actree`` storing ``(index, word)``
        values, that element is the matched word.
    """
    return [hit[1][1] for hit in actree.iter(text) if len(hit) != 0]

# Demo: same keywords and sentence as before, this time using pyahocorasick.
wordlist = ['健康', '减肥']
text = '今天你减肥了吗,今天你健康了吗,减肥 = 健康!'

actree = build_actree(wordlist)
ac_detect(actree, text)

>>> CPU times: user 10 µs, sys: 3 µs, total: 13 µs
>>> Wall time: 17.4 µs
>>> ['减肥', '健康', '减肥', '健康']
上一篇:python中的标识符和保留字


下一篇:test