算法描述(正向):
给定最大词长n,待分词文本str,指针f=0,词典dic文档
1 取子串sub=str(f,f+n)
2 如果(遍历dic,有匹配sub) f++;
3 否则 n--;
4 注意:边界判定、没有找到词的情况
算法举例分析(正向):
你有个要分词的文本“你毁了我容忍傻逼的能力”,你给出能最大接受的词长为6
(注意,6为6字节(byte),而一个汉字为2字节,你可能注意到下面的程序里我把6除以2了,因为在java里,char也是两字节的,所以它能装一个汉字,也可以装一个字母或符号。)
现在指针指向第一个字,第一次取到的子串是“你毁了”,在词典中找有没有这个词,没有,那我们把词长变成4,即两个字,第二次取到的子串是“你毁”,再找,也没有,第三次取到的子串是“你”,在词典中能找到了,这时指针加一,指向“毁”字,词长恢复6,这就是一轮查找了。
。。。
跳过几步,假如现在指针指向了“容”字,第一次取到“容忍傻”,第二次取到“容忍”,“容忍”在词典能找到,这个时候,指针需要移动两位指向“傻”字,同理,若你取到三个字的词而且这个词能在词典中找到时,指针也要移三位,即 f+n
反向最大匹配跟正向原理一样,只是指针从最后一位开始向前移动。
1 package fc; 2 3 import java.io.BufferedReader; 4 import java.io.FileInputStream; 5 import java.io.FileNotFoundException; 6 import java.io.IOException; 7 import java.io.InputStreamReader; 8 import java.util.Stack; 9 10 /** 11 * 12 * @author 小振xzh 20140317 13 */ 14 public final class Fc { 15 public String m_sResult; 16 private int m_nPosIndex; 17 private int m_MaxLen; 18 private final int pre_m_MaxLen; 19 private int i = 0; // 默认正向最大匹配,i=-1时为反向最大匹配 20 21 // 正向最大匹配,构造函数1/2 22 public Fc(int maxLen, String str) throws IOException{ 23 m_MaxLen = maxLen/2; // Change in half of it 24 pre_m_MaxLen = m_MaxLen; 25 m_nPosIndex = 0; 26 m_sResult = ""; // Without initialization, the begining of result would have one null value 27 MM(str, pre_m_MaxLen, 0); 28 } 29 30 // 反向最大匹配,构造函数2/2,i为标识符 31 public Fc(int maxLen, String str, int i) throws IOException{ 32 this.i = -1; // 无论传什么数过来都是i=-1,即变成反向最大匹配 33 m_MaxLen = maxLen/2; 34 pre_m_MaxLen = m_MaxLen; 35 m_nPosIndex = 0; 36 m_sResult = ""; 37 MM(reverse(str), pre_m_MaxLen, 0); 38 } 39 40 /** 41 * Description: -Reverse/Forward- Maximum matching algorithm 42 * @param source Source string 43 * @param frompos Start position of source 44 * @param len The substring‘s length if no accident 45 * @throws java.io.IOException 46 */ 47 public void MM(String source,int len,int frompos) throws IOException{ 48 if(i==-1){ // 反向 49 // <editor-fold desc="反向最大匹配时"> 50 String tmp = subString(source, frompos, len); 51 if(m_nPosIndex>source.length()-1){ 52 return; 53 } 54 tmp = reverse(tmp); // A_1/5 for Reverse Directional Maximum Matching method 55 Stack<String> sk = new Stack<>(); // A_2/5 --‘A‘ marks the changes I have done for ‘Reverse‘ 56 if(match(tmp)){ 57 //m_sResult += tmp+"/ "; 58 sk.push(tmp+"/ "); // A_3/5 59 m_nPosIndex += m_MaxLen; 60 m_MaxLen = pre_m_MaxLen; // Resume the m_MaxLen 61 MM(source, m_MaxLen, m_nPosIndex); 62 }else{ 63 if(m_MaxLen>1){ 64 m_MaxLen -= 1; 65 MM(source, m_MaxLen, m_nPosIndex); 66 }else{ 67 //m_sResult += "字典中没有‘"+tmp+"‘字"; 68 sk.push("字典中没有’"+tmp+"‘字"); // A_4/5 69 m_nPosIndex += 1; 70 m_MaxLen = pre_m_MaxLen; // Resume the m_MaxLen 71 MM(source, m_MaxLen, m_nPosIndex); 72 } 73 } 74 while(!sk.isEmpty()){ // A_5/5 75 m_sResult += sk.pop(); 76 } 77 // </editor-fold> 78 }else{ // 正向 79 // <editor-fold desc="正向最大匹配时"> 80 String tmp = subString(source, frompos, len); 81 if(m_nPosIndex>source.length()-1){ // Warning: Different from Li‘s c++ example, it should be ‘>‘ here 82 return; 83 } 84 if(match(tmp)){ 85 m_sResult += tmp+"/ "; 86 m_nPosIndex += m_MaxLen; 87 m_MaxLen = pre_m_MaxLen; // Resume the m_MaxLen 88 MM(source, m_MaxLen, m_nPosIndex); 89 }else{ 90 if(m_MaxLen>1){ 91 m_MaxLen -= 1; 92 MM(source, m_MaxLen, m_nPosIndex); 93 }else{ 94 m_sResult += "字典中没有‘"+tmp+"‘字"; 95 m_nPosIndex += 1; 96 m_MaxLen = pre_m_MaxLen; // Resume the m_MaxLen 97 MM(source, m_MaxLen, m_nPosIndex); 98 } 99 } 100 // </editor-fold> 101 } 102 } 103 104 /** 105 * Description: Get the substring 106 * @param source Source string 107 * @param frompos Start position of source 108 * @param len The substring‘s length if no accident 109 * @return String 110 */ 111 private static String subString(String source, int frompos, int len){ 112 char sChar; 113 String result = ""; 114 115 if(source.length()-frompos<len){ 116 len = source.length()-frompos; 117 } 118 len += frompos; 119 while(frompos<len){ 120 sChar = source.charAt(frompos); // Get a char. A char contains two bytes. 121 frompos += 1; 122 result += sChar; 123 } 124 return result; 125 } 126 127 /** 128 * Description: Match with the dic 129 * @param str The word you want to find in dic 130 * @return boolean 131 */ 132 private static boolean match(String str) throws FileNotFoundException, IOException{ 133 InputStreamReader insr; 134 BufferedReader bufr; 135 try (FileInputStream fins = new FileInputStream("chineseDic.txt")) { // In project root file 136 insr = new InputStreamReader(fins,"gb2312"); 137 bufr = new BufferedReader(insr); 138 int len = str.length(); 139 String tmp; 140 while((tmp=bufr.readLine())!=null){ 141 tmp = tmp.split(",", 2)[0]; // I dont know how that example get the substring split with ‘,‘ 142 if(tmp.equals(str)){ 143 fins.close(); insr.close(); bufr.close(); 144 return true; 145 } 146 } 147 } 148 return false; 149 } 150 151 /** 152 * Description: Reverse the string 153 * @param str The string you want to reverse 154 * @return String 155 */ 156 private static String reverse(String str){ 157 StringBuilder sb = new StringBuilder(str); 158 str = sb.reverse().toString(); 159 return str; 160 } 161 }
调用分词:
1 package fc; 2 3 import java.io.IOException; 4 5 /** 6 * 7 * @author xzh 8 */ 9 public class main { 10 public static void main(String[] args) throws IOException{ 11 //String str = "中国家;" 12 String str = "在这一年中,中国的改革开放和现代化建设继续向前迈进。
国民经济保持了“高增长、低通胀”的良好发展态势。农业生产再次获得好的收成,企业改革继续深化,人民生活进一步改善。对外经济技术合作与交流不断扩大。"; 13 Fc fc = new Fc(6, str); 14 //System.out.println(fc.m_sResult+"\n"); 15 Fc fcs = new Fc(6, str, -1); 16 //System.out.println(fcs.m_sResult); 17 } 18 }