今天在做一个新浪微博的抓取测试,发现抓取到的内容中,汉字都被转成了 Unicode 转义序列(形如反斜杠加 u 再加四位十六进制数),完全看不到熟悉的汉字了。下面是搜索到的一种解码方法,经测试完全可行,只是不知道 Java 内部是否提供了相关的类库。
实现方法如下:
1 public static String fromEncodedUnicode(char[] in, int off, int len) { 2 3 char aChar; 4 5 char[] out = new char[len]; 6 7 int outLen = 0; 8 9 int end = off + len; 10 11 while (off < end) { 12 13 aChar = in[off++]; 14 15 if (aChar == ‘\\‘) { 16 17 aChar = in[off++]; 18 19 if (aChar == ‘u‘) { 20 21 // Read the xxxx 22 23 int value = 0; 24 25 for (int i = 0; i < 4; i++) { 26 27 aChar = in[off++]; 28 29 switch (aChar) { 30 31 case ‘0‘: 32 33 case ‘1‘: 34 35 case ‘2‘: 36 37 case ‘3‘: 38 39 case ‘4‘: 40 41 case ‘5‘: 42 43 case ‘6‘: 44 45 case ‘7‘: 46 47 case ‘8‘: 48 49 case ‘9‘: 50 51 value = (value << 4) + aChar - ‘0‘; 52 53 break; 54 55 case ‘a‘: 56 57 case ‘b‘: 58 59 case ‘c‘: 60 61 case ‘d‘: 62 63 case ‘e‘: 64 65 case ‘f‘: 66 67 value = (value << 4) + 10 + aChar - ‘a‘; 68 69 break; 70 71 case ‘A‘: 72 73 case ‘B‘: 74 75 case ‘C‘: 76 77 case ‘D‘: 78 79 case ‘E‘: 80 81 case ‘F‘: 82 83 value = (value << 4) + 10 + aChar - ‘A‘; 84 85 break; 86 87 default: 88 89 throw new IllegalArgumentException("Malformed \\uxxxx encoding."); 90 91 } 92 93 } 94 95 out[outLen++] = (char) value; 96 97 } else { 98 99 if (aChar == ‘t‘) { 100 101 aChar = ‘\t‘; 102 103 } else if (aChar == ‘r‘) { 104 105 aChar = ‘\r‘; 106 107 } else if (aChar == ‘n‘) { 108 109 aChar = ‘\n‘; 110 111 } else if (aChar == ‘f‘) { 112 113 aChar = ‘\f‘; 114 115 } 116 117 out[outLen++] = aChar; 118 119 } 120 121 } else { 122 123 out[outLen++] = (char) aChar; 124 125 } 126 127 } 128 129 return new String(out, 0, outLen); 130 131 }