[No000040]取得一个文本文件的编码方式

2023-11-24 08:14:40
using System;

using System.IO;

using System.Text;

/// <summary>
/// Detects the text encoding (<see cref="Encoding"/>) of a text file.
/// </summary>
/// <remarks>
/// Two detection strategies are offered:
/// the <see cref="FileStream"/> / file-name overloads look only at the byte order mark (BOM)
/// at the start of the file and leave the stream open with its position restored;
/// the <see cref="Stream"/> overload additionally recognises UTF-8 content that has no BOM,
/// but it consumes and closes the stream it is given.
/// </remarks>
public class TxtFileEncoder
{
    // Number of bytes requested from the stream per read while scanning its content.
    private const int ScanBufferSize = 4096;

    /// <summary>
    /// Creates an instance. Every member of this class is static; the constructor is kept
    /// only so that existing code which instantiates the class continues to compile.
    /// </summary>
    public TxtFileEncoder()
    {
    }

    /// <summary>
    /// Detects the encoding of a text file from its byte order mark.
    /// <see cref="Encoding.Default"/> is returned when no known BOM is found.
    /// </summary>
    /// <param name="fileName">Path of the file to inspect.</param>
    /// <returns>The detected encoding, or <see cref="Encoding.Default"/>.</returns>
    public static Encoding GetEncoding(string fileName)
    {
        return GetEncoding(fileName, Encoding.Default);
    }

    /// <summary>
    /// Detects the encoding of a text file stream from its byte order mark.
    /// <see cref="Encoding.Default"/> is returned when no known BOM is found.
    /// </summary>
    /// <param name="stream">The file stream to inspect; its position is preserved.</param>
    /// <returns>The detected encoding, or <see cref="Encoding.Default"/>.</returns>
    public static Encoding GetEncoding(FileStream stream)
    {
        return GetEncoding(stream, Encoding.Default);
    }

    /// <summary>
    /// Detects the encoding of a text file from its byte order mark.
    /// </summary>
    /// <param name="fileName">Path of the file to inspect.</param>
    /// <param name="defaultEncoding">Encoding returned when the file does not start with a known BOM.</param>
    /// <returns>The detected encoding, or <paramref name="defaultEncoding"/>.</returns>
    public static Encoding GetEncoding(string fileName, Encoding defaultEncoding)
    {
        // Detection only reads, so request read access: this also works for read-only files.
        // The using block releases the handle even when detection throws.
        using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            return GetEncoding(fs, defaultEncoding);
        }
    }

    /// <summary>
    /// Detects the encoding of a text file stream from its byte order mark.
    /// The stream is left open and its position is the same after the call as before it.
    /// </summary>
    /// <param name="stream">The file stream to inspect. May be null.</param>
    /// <param name="defaultEncoding">
    /// Encoding returned when <paramref name="stream"/> is null, is shorter than two bytes,
    /// or does not start with a known BOM.
    /// </param>
    /// <returns>The detected encoding, or <paramref name="defaultEncoding"/>.</returns>
    public static Encoding GetEncoding(FileStream stream, Encoding defaultEncoding)
    {
        // The shortest BOM is two bytes long, so anything shorter cannot carry one.
        if (stream == null || stream.Length < 2)
        {
            return defaultEncoding;
        }

        // Remember where the caller was; Seek(0) would always report 0 here.
        long originalPosition = stream.Position;
        try
        {
            stream.Seek(0, SeekOrigin.Begin);

            byte[] head = new byte[3];
            int headLength = ReadBlock(stream, head);

            return DetectBom(head, headLength) ?? defaultEncoding;
        }
        finally
        {
            // Restore the caller's position even if reading failed.
            stream.Seek(originalPosition, SeekOrigin.Begin);
        }
    }

    /// <summary>
    /// Detects the encoding of a stream, including UTF-8 content that has no byte order mark.
    /// </summary>
    /// <remarks>
    /// Reading starts at the current position of <paramref name="fs"/> and continues to its end.
    /// The stream is closed before this method returns, whether it succeeds or throws.
    /// The content is scanned in fixed-size chunks, so the stream does not need to be seekable
    /// and is never loaded into memory as a whole.
    /// </remarks>
    /// <param name="fs">The stream to inspect.</param>
    /// <returns>
    /// <see cref="Encoding.BigEndianUnicode"/>, <see cref="Encoding.Unicode"/> or
    /// <see cref="Encoding.UTF8"/> when the matching BOM is present;
    /// <see cref="Encoding.UTF8"/> when there is no BOM but every byte forms well-structured UTF-8
    /// (this includes pure ASCII); otherwise <see cref="Encoding.Default"/>.
    /// An empty stream yields <see cref="Encoding.Default"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
    public static System.Text.Encoding GetEncoding(Stream fs)
    {
        if (fs == null)
        {
            throw new ArgumentNullException("fs");
        }

        try
        {
            byte[] buffer = new byte[ScanBufferSize];
            int count = ReadBlock(fs, buffer);
            if (count == 0)
            {
                // Nothing to examine: there is no evidence for any particular encoding.
                return Encoding.Default;
            }

            Encoding bomEncoding = DetectBom(buffer, count);
            if (bomEncoding != null)
            {
                return bomEncoding;
            }

            // No BOM: validate the whole content, first chunk included, as UTF-8.
            // pendingContinuations carries an unfinished character across chunk boundaries.
            int pendingContinuations = 0;
            while (count > 0)
            {
                if (!ScanUtf8Block(buffer, count, ref pendingContinuations))
                {
                    return Encoding.Default;
                }

                count = ReadBlock(fs, buffer);
            }

            // Data that ends in the middle of a character is not valid UTF-8.
            return pendingContinuations == 0 ? Encoding.UTF8 : Encoding.Default;
        }
        finally
        {
            fs.Close();
        }
    }

    /// <summary>
    /// Matches the start of a file against the UTF-8, UTF-16 BE and UTF-16 LE byte order marks.
    /// </summary>
    /// <param name="head">Buffer holding the first bytes of the file.</param>
    /// <param name="count">Number of valid bytes in <paramref name="head"/>.</param>
    /// <returns>The encoding announced by the BOM, or null when no known BOM is present.</returns>
    private static Encoding DetectBom(byte[] head, int count)
    {
        // UTF-8: EF BB BF
        if (count >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
        {
            return Encoding.UTF8;
        }

        if (count >= 2)
        {
            // UTF-16 big endian: FE FF
            if (head[0] == 0xFE && head[1] == 0xFF)
            {
                return Encoding.BigEndianUnicode;
            }

            // UTF-16 little endian: FF FE
            if (head[0] == 0xFF && head[1] == 0xFE)
            {
                return Encoding.Unicode;
            }
        }

        return null;
    }

    /// <summary>
    /// Reads from <paramref name="source"/> until <paramref name="buffer"/> is full or the stream ends.
    /// </summary>
    /// <param name="source">Stream to read from.</param>
    /// <param name="buffer">Destination buffer, filled from index 0.</param>
    /// <returns>Number of bytes stored in <paramref name="buffer"/>; 0 at end of stream.</returns>
    private static int ReadBlock(Stream source, byte[] buffer)
    {
        int total = 0;

        // A single Read call may return fewer bytes than requested, so keep asking.
        while (total < buffer.Length)
        {
            int read = source.Read(buffer, total, buffer.Length - total);
            if (read <= 0)
            {
                break;
            }

            total += read;
        }

        return total;
    }

    /// <summary>
    /// Checks that a chunk of bytes follows the UTF-8 lead-byte / continuation-byte structure.
    /// </summary>
    /// <remarks>
    /// This is a structural check only: lead bytes must be in the ranges C2-DF, E0-EF or F0-F4
    /// and must be followed by the matching number of 10xxxxxx bytes. Overlong three- and
    /// four-byte forms and encoded surrogates are not rejected.
    /// </remarks>
    /// <param name="data">Buffer holding the chunk.</param>
    /// <param name="count">Number of valid bytes in <paramref name="data"/>.</param>
    /// <param name="pendingContinuations">
    /// Continuation bytes still owed by the character in progress. Pass 0 for the first chunk
    /// and hand the updated value to the next call; a non-zero value after the last chunk
    /// means the data ended inside a character.
    /// </param>
    /// <returns>False as soon as a byte breaks the structure; true when the chunk is consistent.</returns>
    private static bool ScanUtf8Block(byte[] data, int count, ref int pendingContinuations)
    {
        for (int i = 0; i < count; i++)
        {
            byte current = data[i];

            if (pendingContinuations > 0)
            {
                // Inside a multi-byte character every byte must look like 10xxxxxx.
                if ((current & 0xC0) != 0x80)
                {
                    return false;
                }

                pendingContinuations--;
            }
            else if (current >= 0x80)
            {
                // Start of a multi-byte character: the lead byte fixes its length.
                if (current >= 0xC2 && current <= 0xDF)
                {
                    pendingContinuations = 1;
                }
                else if (current >= 0xE0 && current <= 0xEF)
                {
                    pendingContinuations = 2;
                }
                else if (current >= 0xF0 && current <= 0xF4)
                {
                    pendingContinuations = 3;
                }
                else
                {
                    // Stray continuation byte (80-BF) or a byte that never occurs in UTF-8.
                    return false;
                }
            }
        }

        return true;
    }
}
码农公寓

相关文章