一. DFA
有穷自动机,或有穷状态的机器,是描述(或“机器”)特定类型算法的数学方法。特别地,有穷自动机可用作描述在输入串中识别模式的过程,因此也能用作构造扫描程序。其他知识点略。
二. 算法的实例
package sensitiveWords; import java.io.*; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.*; /** * Created By JianBin.Liu on 2019/6/5 * Description: 敏感词帮助类 */ public class SensitiveWordsHelper { /** * 文件存储路径 */ private final static String FILE_PATH = "...../sensitiveData.txt"; /** * 敏感词库转化后的MAP */ private static HashMap sensitiveWordMap; /** * 敏感词库 */ private static HashSet<String> sensitiveWords; private static SensitiveWordsHelper ourInstance = new SensitiveWordsHelper(); public static SensitiveWordsHelper getInstance() { return ourInstance; } private SensitiveWordsHelper() { } public static void init(){ readFile(); convert2HashMap(); } /** * 根据DFA算法,转化成特定的结构 */ public static void convert2HashMap(){ long start = new Date().getTime(); sensitiveWordMap = new HashMap(); for(String word: sensitiveWords){ HashMap currentMap = sensitiveWordMap;//当前map指向sensitiveWordMap for(int i = 0; i < word.length(); i ++){ if(!currentMap.containsKey(word.charAt(i))) { HashMap wordMap = new HashMap<>(); if(i == word.length() - 1){ wordMap.put("isEnd", true); }else { wordMap.put("isEnd", false); } currentMap.put(word.charAt(i), wordMap); currentMap = wordMap;//指向下一级Map }else { currentMap = (HashMap)currentMap.get(word.charAt(i)); } } } long end = new Date().getTime(); System.out.println("转化成DFA算法的数据结构,耗时:" + (end - start) + "毫秒"); } /** * 检索字符串中敏感词的个数 * @return */ public static int check(String str, int type){ // type:1- 最小匹配规则 2- 最大匹配规则 int count = 0;//字符串中,敏感词出行的累计次数,不去重 if(null == str || str.trim().length() == 0) return 0; for(int i = 0; i < str.length(); i++){ char character = str.charAt(i); if(!sensitiveWordMap.containsKey(character)){//当前字符不是敏感词,跳过,判断下一个字符 continue; } if((boolean)((HashMap)sensitiveWordMap.get(character)).get("isEnd")){ count ++ ; //当前字符是敏感词,计数+1 if(1 == type) return count; }else { if(i == str.length() - 1) break;//最后一个字符,则结束判断。 HashMap currentMap = (HashMap)sensitiveWordMap.get(character);//当前字符不是最后一个字符,依次判断字符串是否敏感词。 for(int j = i + 1; j < str.length(); j ++){ char character2 = str.charAt(j); if(currentMap.containsKey(character2)){ if((boolean)((HashMap)currentMap.get(character2)).get("isEnd")){ count ++ ; //当前字符是敏感词,计数+1 if(1 == type) return count; } currentMap = (HashMap)currentMap.get(character2);//指向下一个子集 }else { break; } } } } return count; } public static void main(String[] args) { init(); long start = new Date().getTime(); int count = check("阿宾新华社北京6月5日电 国家主席5日乘专机离开北京,应俄罗斯联邦总统普京邀请,对俄罗斯进行国事访问并出席第二十三届圣彼得堡国际经济论坛。\n" + "\n" + " 陪出访的有:**政治局委员、*书记处书记、*办公厅主任丁薛祥,**政治局委员、八九政治*外事工作委员会办公室主任杨洁篪,国务委员兼东北独立外交部长王毅,全国辦毕业政协副主席、惩*国家发展和改革委员会主任何立峰等。", 1); long end = new Date().getTime(); System.out.println("count:" + count); System.out.println("分析该字符串,耗时:" + (end - start) + "毫秒"); } public static void readFile() { sensitiveWords = new HashSet<String>(); sensitiveWordMap = new HashMap(); try { long start = new Date().getTime(); URL url = new URL(FILE_PATH); URLConnection connection = url.openConnection(); connection.setConnectTimeout(3000);//连接超时时间设置为3s connection.setReadTimeout(20000);//读取超时时间设置为10s,敏感词文件有10M connection.setDoInput(true);//the application intends to read data from the URL connection. connection.connect(); InputStream inputStream = connection.getInputStream(); BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));String str; while (null != (str = reader.readLine())){ sensitiveWords.add(str); } long end = new Date().getTime(); System.out.println("读取文件到本地,并且转化为map集合,耗时:" + (end - start) + "毫秒"); System.out.println(sensitiveWords.size()); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } }