字符串按照字节截取

2024-01-21 17:12:28

把字符串截取后存入数据库时要注意⚠️：很多数据库字段的长度限制是按字节计算的，而不是按字符计算的。
substring() 是按字符个数（char）截取的，而一个汉字在 GBK 下占 2 个字节、在 UTF-8 下占 3 个字节，所以即使截取后的字符数没有超限，入库时仍然可能报字段超长的错误。

首先附上的是中英文在不同的编码下的字节长度：

import java.io.UnsupportedEncodingException;   
  
public class EncodeTest {

    /** Encodings compared by {@link #main(String[])}, in the order they are printed. */
    private static final String[] ENCODINGS = {
        "GB2312", "GBK", "GB18030", "ISO-8859-1", "UTF-8", "UTF-16", "UTF-16BE", "UTF-16LE"
    };

    /**
     * Prints one console line with the number of bytes {@code s} occupies in the given
     * encoding, followed by the encoding name.
     *
     * <p>If the encoding is not supported, the stack trace is printed and the line is still
     * completed with the encoding name (without a byte count).
     *
     * @param s            the string to measure
     * @param encodingName the charset name used to encode {@code s}
     */
    public static void printByteLength(String s, String encodingName) {
        System.out.print("字节数：");
        try {
            final byte[] encoded = s.getBytes(encodingName);
            System.out.print(encoded.length);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        System.out.println(";编码：" + encodingName);
    }

    /**
     * Prints the byte length of one Latin letter and one Chinese character in every encoding
     * listed in {@link #ENCODINGS}, separating the two groups with a blank line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final String latinLetter = "A";
        final String hanCharacter = "人";

        // Byte length of a single Latin letter in each encoding.
        System.out.println("英文字母：" + latinLetter);
        for (String encoding : ENCODINGS) {
            printByteLength(latinLetter, encoding);
        }

        System.out.println();

        // Byte length of a single Chinese character in each encoding.
        System.out.println("中文汉字：" + hanCharacter);
        for (String encoding : ENCODINGS) {
            printByteLength(hanCharacter, encoding);
        }
    }
}
//运行结果（说明：UTF-16 编码结果包含 2 字节的 BOM，所以是 4 字节；ISO-8859-1 无法表示汉字，汉字会被替换成 1 字节的 '?'）

英文字母：A
字节数：1;编码：GB2312
字节数：1;编码：GBK
字节数：1;编码：GB18030
字节数：1;编码：ISO-8859-1
字节数：1;编码：UTF-8
字节数：4;编码：UTF-16
字节数：2;编码：UTF-16BE
字节数：2;编码：UTF-16LE

中文汉字：人
字节数：2;编码：GB2312
字节数：2;编码：GBK
字节数：2;编码：GB18030
字节数：1;编码：ISO-8859-1
字节数：3;编码：UTF-8
字节数：4;编码：UTF-16
字节数：2;编码：UTF-16BE
字节数：2;编码：UTF-16LE

下面开始处理字符串，这里有两种方式：个人建议采用第一种，第二种比较鸡肋采用倒减方式（和同事一起扯皮来的(o^^o)）

// 入参都是 str：要处理的字符串 count：截取的字节数量
//第一种方式
   /**
    * Truncates {@code str} so that its encoding in the JVM default charset occupies at most
    * {@code count} bytes, cutting only on character boundaries.
    *
    * @param str   text to truncate; may be {@code null}
    * @param count maximum number of encoded bytes to keep
    * @return {@code str} itself when it already fits; otherwise its longest prefix that fits;
    *         {@code null} when {@code str} is null or blank, when {@code count <= 0}, or when
    *         not even the first character fits into {@code count} bytes
    */
   public String test(String str, int count) {
        return test(str, count, java.nio.charset.Charset.defaultCharset());
    }

    /**
     * Truncates {@code str} so that its encoding in {@code charset} occupies at most
     * {@code count} bytes, cutting only on character boundaries. Pass the charset of the
     * target column so the limit is measured the same way the database measures it.
     *
     * <p>The text is encoded straight into a {@code count}-byte buffer; the encoder stops
     * before the first character that no longer fits, so a multi-byte character or a
     * surrogate pair is never split and the result never has to be decoded and compared.
     *
     * @param str     text to truncate; may be {@code null}
     * @param count   maximum number of encoded bytes to keep
     * @param charset charset used to measure the byte length; must not be {@code null}
     * @return {@code str} itself when it already fits; otherwise its longest prefix that fits;
     *         {@code null} when {@code str} is null or blank, when {@code count <= 0}, or when
     *         not even the first character fits into {@code count} bytes
     * @throws NullPointerException          if {@code charset} is {@code null}
     * @throws UnsupportedOperationException if {@code charset} does not support encoding
     */
    public String test(String str, int count, java.nio.charset.Charset charset) {
        if (count <= 0 || isBlankText(str)) {
            return null;
        }
        // REPLACE mirrors String.getBytes(): unmappable characters and lone surrogates are
        // substituted with the charset's replacement bytes instead of aborting the encode.
        java.nio.charset.CharsetEncoder encoder = charset.newEncoder()
                .onMalformedInput(java.nio.charset.CodingErrorAction.REPLACE)
                .onUnmappableCharacter(java.nio.charset.CodingErrorAction.REPLACE);
        java.nio.CharBuffer input = java.nio.CharBuffer.wrap(str);
        java.nio.ByteBuffer output = java.nio.ByteBuffer.allocate(count);

        java.nio.charset.CoderResult outcome = encoder.encode(input, output, true);
        if (outcome.isUnderflow()) {
            // All input consumed; flush any trailing bytes a stateful encoder still holds.
            outcome = encoder.flush(output);
        }
        if (!outcome.isOverflow()) {
            return str; // the whole string fits into count bytes
        }

        // The input position is the number of chars that were encoded completely.
        int keptChars = input.position();
        return keptChars == 0 ? null : str.substring(0, keptChars);
    }

    /**
     * Returns {@code true} when {@code text} is {@code null}, empty, or consists solely of
     * characters for which {@link Character#isWhitespace(char)} is true (the same rule as
     * commons-lang {@code StringUtils.isBlank}, inlined so this code depends only on the JDK).
     */
    private static boolean isBlankText(String text) {
        if (text == null) {
            return true;
        }
        for (int i = 0; i < text.length(); i++) {
            if (!Character.isWhitespace(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }
    //第二种方式
        /**
         * Returns the longest prefix of {@code str} whose encoding in the JVM default charset
         * occupies at most {@code count} bytes, never splitting a character.
         *
         * @param str   text to truncate; {@code null} is treated like an empty string
         * @param count maximum number of encoded bytes to keep
         * @return the prefix that fits (possibly all of {@code str}); an empty string when
         *         {@code str} is null or empty, when {@code count <= 0}, or when not even the
         *         first character fits; never {@code null}
         */
        public String spiltByByte(String str, int count) {
            return spiltByByte(str, count, java.nio.charset.Charset.defaultCharset());
        }

        /**
         * Returns the longest prefix of {@code str} whose encoding in {@code charset} occupies
         * at most {@code count} bytes, never splitting a character.
         *
         * <p>Works by "counting down": each code point is encoded on its own and its size is
         * subtracted from the remaining budget until the next one no longer fits. Because code
         * points are measured individually, charsets that emit a byte-order mark or other
         * per-call state (for example {@code UTF-16}) are over-counted; use a stateless
         * charset such as UTF-8, GBK or UTF-16BE.
         *
         * @param str     text to truncate; {@code null} is treated like an empty string
         * @param count   maximum number of encoded bytes to keep
         * @param charset charset used to measure the byte length; must not be {@code null}
         * @return the prefix that fits (possibly all of {@code str}); an empty string when
         *         {@code str} is null or empty, when {@code count <= 0}, or when not even the
         *         first character fits; never {@code null}
         */
        public String spiltByByte(String str, int count, java.nio.charset.Charset charset) {
            if (str == null || str.isEmpty() || count <= 0) {
                return "";
            }
            int remaining = count;
            int end = 0; // exclusive char index of the prefix accepted so far
            while (end < str.length()) {
                // Step by code point so a surrogate pair is measured and kept as one unit.
                int next = end + Character.charCount(str.codePointAt(end));
                int size = str.substring(end, next).getBytes(charset).length;
                if (size > remaining) {
                    break; // the next character would exceed the byte budget
                }
                remaining -= size;
                end = next;
            }
            return str.substring(0, end);
        }

码农公寓

相关文章