今天看到一个工具类使用正则表达式将一大段字符串中的中文和英文都分离出来了,在此记录一下,读者可以收藏!
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Splits a string into runs of Chinese (more precisely: non-ASCII, non-punctuation) text and
 * English words (ASCII letters and digits), using punctuation as separators.
 *
 * <p>Use {@link #segWord(String)} to build an instance; the resulting segments are available
 * through {@link #getSegs()}.
 *
 * @author ouyangpeng
 * @see <a href="http://blog.csdn.net/ouyang_peng/">http://blog.csdn.net/ouyang_peng/</a>
 */
public class WordSeg {

    /** One segment of the input: the text itself plus whether it is an English word. */
    public static class WordSegEntry {
        /** The text of this segment. */
        public String word;
        /** {@code true} for an ASCII letter/digit word, {@code false} for a Chinese run. */
        public boolean isEnglish;

        @Override
        public String toString() {
            return "[word-->" + word + "\tisEnglish-->" + isEnglish + "]";
        }
    }

    /** Scanner state: no word in progress (start of input or just after punctuation). */
    private static final int STATE_NONE = 0;
    /** Scanner state: the last word character seen was an ASCII letter or digit. */
    private static final int STATE_EN = 1;
    /** Scanner state: the last word character seen was a non-ASCII (Chinese) character. */
    private static final int STATE_CN = 2;

    /** Whitespace removed from Chinese segments after scanning; compiled once. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** The 32 ASCII characters matched by the regex class {@code \p{Punct}}. */
    private static final String ASCII_PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";

    /** Distance between a fullwidth form (U+FF01..U+FF5E) and its ASCII counterpart. */
    private static final int FULLWIDTH_OFFSET = 0xFEE0;

    /**
     * Additional CJK punctuation, symbols and box-drawing characters treated as separators.
     * The double quote and the backslash are escaped here; left unescaped they are not valid
     * inside a Java string literal.
     */
    private static final String EXTRA_PUNCTUATION =
            "、,:;。!?\n"
            + "{}()〔〕<>〈〉《》[]「」『』〖〗【】\n"
            + "@#%*&+=±×÷~-\u2014\u2015_—─━ \u0304\u2025…┈┄┅┉┆┇┊┋|\ufe31│┃∥\\/\u2215\n"
            + "‘’“”\"'\u2035′\u301d″\u02ca\u02cb\n"
            + "$£¥‰§№°℃\u2109\u2105\n"
            + "^ˇ¨`°¤〃\n"
            + " ¢¤※\u2573\n"
            + "\u221f\u2252\u2266\u2267\u22bf∧∨∑∏∪∩∈∷√⊥∥∠⌒⊙∫∮≡≌≈∽∝≠≮≯≤≥∞∵∴\n"
            + "○●◎◇◆□■△▲\u25bd\u25bc\u2609\n"
            + "〓\u25e2\u25e3\u25e4\u25e5\u2594\u2581\u2582\u2583\u2585\u2587\u2588\u2589\u2593\u258a\u258b\u258c\u258d\u258e\u258f\u2595\n"
            + "→←↑↓\u2196\u2197\u2198\u2199\n"
            + "\u256d\u256e\u2570\u256f\n"
            + "\ufe35\ufe36\ufe39\ufe3a\ufe3f\ufe40\ufe3d\ufe3e\ufe41\ufe42\ufe43\ufe44\ufe3b\ufe3c\ufe37\ufe38\n"
            + "\u2550\u2551\u2552\u2553\u2554\u2555\u2556\u2557\u2559\u255a\u255b\u255c\u255d\u255e\u255f\u2560\u2561\u2562\u2563\u2564\u2565\u2566\u2567\u2568\u2569\u256a\u256b\u256c\u3012\n"
            + "┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋"
            + "\n";

    /** The original, unsegmented input. */
    private String raw;

    /** Segments in input order. */
    List<WordSegEntry> segs = new ArrayList<WordSegEntry>();

    /** @return the original input string this instance was built from */
    public String getRaw() {
        return raw;
    }

    /** @param raw the original input string to remember */
    public void setRaw(String raw) {
        this.raw = raw;
    }

    /** @return the live, mutable list of segments */
    public List<WordSegEntry> getSegs() {
        return segs;
    }

    /** @return the summed length of all segments, English and Chinese alike */
    public int getLength() {
        int total = 0;
        for (WordSegEntry seg : segs) {
            total += seg.word.length();
        }
        return total;
    }

    /** @return the summed length of the non-English (Chinese) segments only */
    public int getCNLength() {
        int total = 0;
        for (WordSegEntry seg : segs) {
            if (!seg.isEnglish) {
                total += seg.word.length();
            }
        }
        return total;
    }

    /**
     * Renders every segment on its own line followed by two length figures.
     * Note: the value printed after the {@code lenOfEnglish=} label is {@link #getLength()},
     * i.e. the total over all segments; the label text is kept for output compatibility.
     */
    @Override
    public String toString() {
        return join(segs, "\r\n") + "\n\n" + "lenOfEnglish=" + getLength()
                + "\n 中文长度=" + getCNLength();
    }

    /**
     * Joins the string forms of the elements of a collection.
     *
     * @param arr elements to join, in iteration order
     * @param sep separator placed between elements; {@code null} means no separator
     * @return the joined string
     */
    public static String join(Collection<?> arr, String sep) {
        return join(arr.toArray(new Object[arr.size()]), sep);
    }

    /**
     * Joins the string forms of the elements of an array.
     *
     * @param arr elements to join
     * @param sep separator placed between elements; {@code null} means no separator
     * @return the joined string
     */
    public static String join(Object[] arr, String sep) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < arr.length; i++) {
            sb.append(arr[i]);
            if (sep != null && i < arr.length - 1) {
                sb.append(sep);
            }
        }
        return sb.toString();
    }

    /**
     * Tells whether a character is treated as a separator between words.
     *
     * @param c the character to test
     * @return {@code true} for ASCII punctuation, for any character in the extra CJK/symbol
     *         table, and for the fullwidth form of an ASCII punctuation character
     */
    public static boolean isPunctuation(char c) {
        if (ASCII_PUNCTUATION.indexOf(c) >= 0 || EXTRA_PUNCTUATION.indexOf(c) >= 0) {
            return true;
        }
        // Chinese text normally uses the fullwidth variants (U+FF01..U+FF5E) of ASCII
        // punctuation; map them back to ASCII so they separate words as well.
        if (c >= 0xFF01 && c <= 0xFF5E) {
            return ASCII_PUNCTUATION.indexOf((char) (c - FULLWIDTH_OFFSET)) >= 0;
        }
        return false;
    }

    /**
     * Segments {@code input} into Chinese runs and English words.
     *
     * <p>Characters with code {@code <= 32} end an English word and are never part of one;
     * inside a Chinese run they are tolerated and stripped from the finished segment.
     * Punctuation (see {@link #isPunctuation(char)}) ends any word. ASCII letters and digits
     * form English words; every other character is treated as Chinese.
     *
     * @param input the text to segment; {@code null} yields a result with no segments
     * @return a new {@code WordSeg} holding the raw input and its segments
     */
    public static WordSeg segWord(String input) {
        WordSeg result = new WordSeg();
        result.setRaw(input);
        if (input == null) {
            return result;
        }
        List<WordSegEntry> segs = result.getSegs();
        int state = STATE_NONE;
        int start = 0; // index where the segment currently being scanned begins
        for (int i = 0; i < input.length(); i++) {
            char c = input.charAt(i);
            if (c <= 32) { // whitespace and control characters
                if (state != STATE_CN) {
                    if (state == STATE_EN && i > start) {
                        addSeg(segs, input, start, i, true);
                    }
                    // Always step past the blank so that it can never become the leading
                    // character (or the whole content) of the next English segment.
                    start = i + 1;
                }
            } else if (isPunctuation(c)) {
                if (i > start && state != STATE_NONE) {
                    addSeg(segs, input, start, i, state == STATE_EN);
                }
                start = i + 1;
                state = STATE_NONE;
            } else if ((Character.isLetter(c) || Character.isDigit(c)) && c < 127) {
                // English character: close a pending Chinese run first.
                if (state == STATE_CN && i > start) {
                    addSeg(segs, input, start, i, false);
                    start = i;
                }
                state = STATE_EN;
            } else {
                // Chinese character: close a pending English word first.
                if (state == STATE_EN && i > start) {
                    addSeg(segs, input, start, i, true);
                    start = i;
                }
                state = STATE_CN;
            }
        }
        if (start < input.length() && state != STATE_NONE) {
            addSeg(segs, input, start, input.length(), state == STATE_EN);
        }
        // Blanks are tolerated inside Chinese runs while scanning; drop them now.
        for (WordSegEntry seg : segs) {
            if (!seg.isEnglish) {
                seg.word = WHITESPACE.matcher(seg.word).replaceAll("");
            }
        }
        return result;
    }

    /** Appends {@code input[from, to)} to {@code segs} as one segment of the given kind. */
    private static void addSeg(List<WordSegEntry> segs, String input, int from, int to,
            boolean english) {
        WordSegEntry seg = new WordSegEntry();
        seg.word = input.substring(from, to);
        seg.isEnglish = english;
        segs.add(seg);
    }
}
对工具类进行测试:
/** Demo driver: segments one mixed Chinese/English sample string and prints the result. */
class WordSegTest {
    public static void main(String[] args) {
        // Sample mixing Chinese text, an English word, a URL and many separator symbols.
        String sample = " (欧阳鹏)我的Android进阶之旅{}()〔〕<>〈〉《》博客地址:http://blog.csdn.net/ouyang_peng/article/details/17224229「」『』〖〗【】○●◎◇◆□■△▲@#%*&欢迎大家![]∏∪∩∈∷√⊥∥∠⌒⊙∫∮≡≌≈∽∝≠≮≯≤≥∞∵∴. ";
        WordSeg segmented = WordSeg.segWord(sample);
        System.out.println(segmented);
    }
}
得到结果如下:
[word-->欧阳鹏 isEnglish-->false] [word-->我的 isEnglish-->false] [word-->Android isEnglish-->true] [word-->进阶之旅 isEnglish-->false] [word-->博客地址 isEnglish-->false] [word-->http isEnglish-->true] [word-->blog isEnglish-->true] [word-->csdn isEnglish-->true] [word-->net isEnglish-->true] [word-->ouyang isEnglish-->true] [word-->peng isEnglish-->true] [word-->article isEnglish-->true] [word-->details isEnglish-->true] [word-->17224229 isEnglish-->true] [word-->欢迎大家 isEnglish-->false] lenOfEnglish=71 中文长度=17
====================================================================================
作者:欧阳鹏 欢迎转载,与人分享是进步的源泉!
转载请保留原文地址:http://blog.csdn.net/ouyang_peng
===================================================================================