Use case: Google's simhash algorithm
In extensive testing, simhash works well for comparing longer texts, say 500 characters and up: pairs at a Hamming distance below 3 are almost always similar, and the false-positive rate is low. In my experience, with N the size of each chunk and M the number of overlapping characters, N = 4 and M = 3 is the best choice.
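For example, with N = 4 and M = 3 the string "simhash" tokenises into the overlapping chunks "simh", "imha", "mhas" and "hash"; because adjacent chunks share three characters, a one-character edit disturbs only the handful of chunks that cover it, which is what keeps near-duplicate texts close in fingerprint space.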
using System;
using System.Collections.Generic;
using System.Linq;

public class SimHashAnalyser : IAnalyser
{
    private const int HashSize = 32;

    public float GetLikenessValue(string needle, string haystack)
    {
        var needleSimHash = this.DoCalculateSimHash(needle);
        var hayStackSimHash = this.DoCalculateSimHash(haystack);

        // Similarity is the fraction of the 32 fingerprint bits that agree.
        return (HashSize - GetHammingDistance(needleSimHash, hayStackSimHash)) / (float)HashSize;
    }

    private static IEnumerable<int> DoHashTokens(IEnumerable<string> tokens)
    {
        var hashedTokens = new List<int>();
        foreach (string token in tokens)
        {
            hashedTokens.Add(token.GetHashCode());
        }

        return hashedTokens;
    }

    private static int GetHammingDistance(int firstValue, int secondValue)
    {
        // XOR leaves a 1 wherever the two fingerprints differ; the Hamming
        // distance is the count of those bits.
        var hammingBits = firstValue ^ secondValue;
        var hammingValue = 0;
        for (int i = 0; i < HashSize; i++)
        {
            if (IsBitSet(hammingBits, i))
            {
                hammingValue += 1;
            }
        }

        return hammingValue;
    }

    private static bool IsBitSet(int b, int pos)
    {
        return (b & (1 << pos)) != 0;
    }

    private int DoCalculateSimHash(string input)
    {
        ITokeniser tokeniser = new OverlappingStringTokeniser(4, 3);
        var hashedTokens = DoHashTokens(tokeniser.Tokenise(input));

        // For each bit position, vote +1 for every token hash with that bit
        // set and -1 for every token hash with it clear. (C# arrays start
        // zero-initialized, so no explicit reset loop is needed.)
        var vector = new int[HashSize];
        foreach (var value in hashedTokens)
        {
            for (var j = 0; j < HashSize; j++)
            {
                if (IsBitSet(value, j))
                {
                    vector[j] += 1;
                }
                else
                {
                    vector[j] -= 1;
                }
            }
        }

        // The fingerprint takes a 1 bit wherever the vote came out positive.
        var fingerprint = 0;
        for (var i = 0; i < HashSize; i++)
        {
            if (vector[i] > 0)
            {
                fingerprint += 1 << i;
            }
        }

        return fingerprint;
    }
}

public interface IAnalyser
{
    float GetLikenessValue(string needle, string haystack);
}

public interface ITokeniser
{
    IEnumerable<string> Tokenise(string input);
}

public class FixedSizeStringTokeniser : ITokeniser
{
    private readonly ushort tokensize = 5;

    public FixedSizeStringTokeniser(ushort tokenSize)
    {
        if (tokenSize < 2 || tokenSize > 127)
        {
            throw new ArgumentException("Token size out of range");
        }

        this.tokensize = tokenSize;
    }

    // Consecutive, non-overlapping chunks; the last one may be shorter.
    public IEnumerable<string> Tokenise(string input)
    {
        var chunks = new List<string>();
        int offset = 0;
        while (offset < input.Length)
        {
            chunks.Add(new string(input.Skip(offset).Take(this.tokensize).ToArray()));
            offset += this.tokensize;
        }

        return chunks;
    }
}

public class OverlappingStringTokeniser : ITokeniser
{
    private readonly ushort chunkSize = 4;
    private readonly ushort overlapSize = 3;

    public OverlappingStringTokeniser(ushort chunkSize, ushort overlapSize)
    {
        if (chunkSize <= overlapSize)
        {
            throw new ArgumentException("Chunk size must be greater than overlap size");
        }

        this.overlapSize = overlapSize;
        this.chunkSize = chunkSize;
    }

    // Slide a window of chunkSize characters across the input, advancing by
    // (chunkSize - overlapSize) so consecutive chunks overlap.
    public IEnumerable<string> Tokenise(string input)
    {
        var result = new List<string>();
        int position = 0;
        // <= rather than <, so the final chunk that ends exactly at the end
        // of the input is not dropped.
        while (position <= input.Length - this.chunkSize)
        {
            result.Add(input.Substring(position, this.chunkSize));
            position += this.chunkSize - this.overlapSize;
        }

        return result;
    }
}
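One caveat about DoHashTokens above: it relies on string.GetHashCode(), which on modern .NET is randomized per process (and even on .NET Framework is not guaranteed stable across versions or platforms), so fingerprints computed in different runs cannot be compared or stored. If that matters, a deterministic hash can be substituted. A minimal sketch using 32-bit FNV-1a; the method name GetStableHash is my own, not part of the code above:

// Deterministic 32-bit FNV-1a over the UTF-16 code units of the token,
// as a drop-in replacement for token.GetHashCode() in DoHashTokens.
private static int GetStableHash(string token)
{
    unchecked
    {
        uint hash = 2166136261;      // FNV offset basis
        foreach (char c in token)
        {
            hash ^= c;
            hash *= 16777619;        // FNV prime
        }
        return (int)hash;
    }
}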
Usage:
const string HayStack = "中国香港………………";
const string Needle = "中国香港 2013………………";

IAnalyser analyser = new SimHashAnalyser();
var likeness = analyser.GetLikenessValue(Needle, HayStack);

Console.Clear();
Console.WriteLine("Likeness: {0}%", likeness * 100);
Console.ReadKey();
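Since GetLikenessValue returns only a ratio, the "distance below 3" rule of thumb from the note at the top has to be recovered from the score. A small follow-up sketch, reusing the likeness variable above:

// likeness = (HashSize - distance) / HashSize with HashSize = 32, so the
// raw Hamming distance can be recovered by inverting the formula.
var distance = (int)Math.Round(32 * (1 - likeness));
Console.WriteLine(distance < 3 ? "Near-duplicate" : "Different");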