使用DFA算法,实现敏感词过滤

一. DFA

    有穷自动机,或有穷状态的机器,是描述(或“机器”)特定类型算法的数学方法。特别地,有穷自动机可用作描述在输入串中识别模式的过程,因此也能用作构造扫描程序。其他知识点略。

二. 算法的实例

package sensitiveWords;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;

/**
 * Created By JianBin.Liu on 2019/6/5
 * Description: 敏感词帮助类
 */
public class SensitiveWordsHelper {

    /**
     * 文件存储路径
     */
    private final static String FILE_PATH = "...../sensitiveData.txt";

    /**
     * 敏感词库转化后的MAP
     */
    private static HashMap sensitiveWordMap;

    /**
     *  敏感词库
     */
    private static HashSet<String> sensitiveWords;


    private static SensitiveWordsHelper ourInstance = new SensitiveWordsHelper();

    public static SensitiveWordsHelper getInstance() {
        return ourInstance;
    }

    private SensitiveWordsHelper() {
    }

    public static void init(){
        readFile();
        convert2HashMap();
    }

    /**
     * 根据DFA算法,转化成特定的结构
     */
    public static void  convert2HashMap(){
        long start = new Date().getTime();
        sensitiveWordMap = new HashMap();
        for(String word: sensitiveWords){
            HashMap currentMap = sensitiveWordMap;//当前map指向sensitiveWordMap
            for(int i = 0; i < word.length(); i ++){
                if(!currentMap.containsKey(word.charAt(i))) {
                    HashMap wordMap = new HashMap<>();
                    if(i == word.length() - 1){
                        wordMap.put("isEnd", true);
                    }else {
                        wordMap.put("isEnd", false);
                    }
                    currentMap.put(word.charAt(i), wordMap);
                    currentMap = wordMap;//指向下一级Map
                }else {
                    currentMap = (HashMap)currentMap.get(word.charAt(i));
                }
            }
        }
        long end = new Date().getTime();
        System.out.println("转化成DFA算法的数据结构,耗时:" + (end - start) + "毫秒");
    }

    /**
     * 检索字符串中敏感词的个数
     * @return
     */
    public static int check(String str, int type){
        // type:1- 最小匹配规则 2- 最大匹配规则
        int count = 0;//字符串中,敏感词出行的累计次数,不去重
        if(null == str || str.trim().length() == 0) return 0;
        for(int i = 0; i < str.length(); i++){
            char character = str.charAt(i);
            if(!sensitiveWordMap.containsKey(character)){//当前字符不是敏感词,跳过,判断下一个字符
                continue;
            }
            if((boolean)((HashMap)sensitiveWordMap.get(character)).get("isEnd")){
                count ++ ; //当前字符是敏感词,计数+1
                if(1 == type) return count;
            }else {
                if(i == str.length() - 1) break;//最后一个字符,则结束判断。
                HashMap currentMap = (HashMap)sensitiveWordMap.get(character);//当前字符不是最后一个字符,依次判断字符串是否敏感词。
                for(int j = i + 1; j < str.length(); j ++){
                    char character2 = str.charAt(j);
                    if(currentMap.containsKey(character2)){
                        if((boolean)((HashMap)currentMap.get(character2)).get("isEnd")){
                            count ++ ; //当前字符是敏感词,计数+1
                            if(1 == type) return count;
                        }
                        currentMap = (HashMap)currentMap.get(character2);//指向下一个子集
                    }else {
                        break;
                    }

                }
            }
        }
        return count;
    }

    public static void main(String[] args) {
        init();
        long start = new Date().getTime();
        int count = check("阿宾新华社北京6月5日电 国家主席5日乘专机离开北京,应俄罗斯联邦总统普京邀请,对俄罗斯进行国事访问并出席第二十三届圣彼得堡国际经济论坛。\n" +
                "\n" +
                "  陪出访的有:**政治局委员、*书记处书记、*办公厅主任丁薛祥,**政治局委员、八九政治*外事工作委员会办公室主任杨洁篪,国务委员兼东北独立外交部长王毅,全国辦毕业政协副主席、惩*国家发展和改革委员会主任何立峰等。", 1);
        long end = new Date().getTime();
        System.out.println("count:" + count);
        System.out.println("分析该字符串,耗时:" + (end - start) + "毫秒");
    }

    public static void readFile() {
        sensitiveWords = new HashSet<String>();
        sensitiveWordMap = new HashMap();
        try {
            long start = new Date().getTime();
            URL url = new URL(FILE_PATH);
            URLConnection  connection = url.openConnection();
            connection.setConnectTimeout(3000);//连接超时时间设置为3s
            connection.setReadTimeout(20000);//读取超时时间设置为10s,敏感词文件有10M
            connection.setDoInput(true);//the application intends to read data from the URL connection.
            connection.connect();
            InputStream inputStream = connection.getInputStream();
            BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));String str;
            while (null != (str = reader.readLine())){
                sensitiveWords.add(str);
            }
            long end = new Date().getTime();
            System.out.println("读取文件到本地,并且转化为map集合,耗时:" + (end - start) + "毫秒");
            System.out.println(sensitiveWords.size());
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

 

上一篇:java – 我可以确定正则表达式匹配的第一个字符集吗?


下一篇:java 实现DFA 算法(理论百度搜索)