我是靠谱客的博主俊秀台灯。这篇文章是我最近在开发中收集的，主要介绍 C# SimHash 文字相似度的实现，觉得挺不错的，现在分享给大家，希望可以做个参考。

概述

using System;
using System.Collections.Generic;
using System.Linq;
using System.Numerics;
using System.Text;

namespace chx
{
    /// <summary>
    /// Computes a SimHash fingerprint for a piece of text and compares fingerprints
    /// by Hamming distance. The text is split into tokens with <see cref="ChxTokenizer"/>;
    /// every token is hashed and each bit of the hash casts a +1/-1 vote on the
    /// corresponding bit of the fingerprint.
    /// </summary>
    public class SimHash
    {
        /// <summary>Fingerprint width used when the caller does not specify one.</summary>
        private const int DefaultHashBits = 128;

        /// <summary>Multiplier applied at every step of the per-token hash.</summary>
        private static readonly BigInteger HashMultiplier = new BigInteger(1000003);

        private readonly String tokens;
        private readonly BigInteger strSimHash;
        private readonly int hashbits = DefaultHashBits;

        // (2^hashbits - 1): keeps the running token hash within hashbits bits.
        private readonly BigInteger hashMask;

        /// <summary>
        /// The fingerprint computed from the text given to the constructor.
        /// It is non-negative and only its lowest <c>hashbits</c> bits can be set.
        /// </summary>
        public BigInteger StrSimHash
        {
            get
            {
                return strSimHash;
            }
        }

        /// <summary>
        /// Builds the fingerprint of <paramref name="tokens"/> using the given width.
        /// </summary>
        /// <param name="tokens">Text to fingerprint. Null is treated like empty text.</param>
        /// <param name="hashbits">Number of bits in the fingerprint; must be at least 1.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="hashbits"/> is less than 1.
        /// </exception>
        public SimHash(String tokens, int hashbits)
        {
            if (hashbits < 1)
            {
                throw new ArgumentOutOfRangeException("hashbits", hashbits, "hashbits must be at least 1.");
            }

            this.tokens = tokens;
            this.hashbits = hashbits;
            this.hashMask = (BigInteger.One << hashbits) - BigInteger.One;
            this.strSimHash = simHash();
        }

        /// <summary>
        /// Builds a 128-bit fingerprint of <paramref name="tokens"/>.
        /// </summary>
        /// <param name="tokens">Text to fingerprint. Null is treated like empty text.</param>
        public SimHash(String tokens)
            : this(tokens, DefaultHashBits)
        {
        }

        /// <summary>
        /// Tokenizes the text, hashes every token and folds the per-bit votes into
        /// the final fingerprint.
        /// </summary>
        /// <returns>
        /// A value whose bit i is set when at least as many token hashes had bit i
        /// set as had it clear. Text without tokens therefore yields all bits set.
        /// </returns>
        private BigInteger simHash()
        {
            // The single-bit masks are the same for every token, so build them once.
            BigInteger[] bitMasks = new BigInteger[this.hashbits];
            for (int i = 0; i < this.hashbits; i++)
            {
                bitMasks[i] = BigInteger.One << i;
            }

            int[] votes = new int[this.hashbits];
            ChxTokenizer stringTokens = new ChxTokenizer(this.tokens);
            while (stringTokens.hasMoreTokens())
            {
                BigInteger tokenHash = this.hash(stringTokens.nextToken());
                for (int i = 0; i < this.hashbits; i++)
                {
                    if ((tokenHash & bitMasks[i]).IsZero)
                    {
                        votes[i] -= 1;
                    }
                    else
                    {
                        votes[i] += 1;
                    }
                }
            }

            BigInteger fingerprint = BigInteger.Zero;
            for (int i = 0; i < this.hashbits; i++)
            {
                // Ties (including "no tokens at all") count as a set bit.
                if (votes[i] >= 0)
                {
                    fingerprint |= bitMasks[i];
                }
            }
            return fingerprint;
        }

        /// <summary>
        /// Hashes a single token with a multiply-and-XOR scheme over its UTF-16
        /// code units, truncated to <c>hashbits</c> bits after every step and
        /// finally XORed with the token length.
        /// </summary>
        /// <param name="source">Token to hash.</param>
        /// <returns>
        /// Zero for a null or empty token; otherwise a non-negative hash value.
        /// </returns>
        private BigInteger hash(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return BigInteger.Zero;
            }

            BigInteger x = new BigInteger(((long)source[0]) << 7);
            foreach (char item in source)
            {
                x = ((x * HashMultiplier) ^ new BigInteger((long)item)) & this.hashMask;
            }

            // x is non-negative after masking and the length is non-negative, so the
            // result can never be -1; no remapping of a -1 sentinel is required.
            return x ^ new BigInteger(source.Length);
        }

        /// <summary>
        /// Counts the bit positions in which this fingerprint differs from
        /// <paramref name="other"/>.
        /// </summary>
        /// <param name="other">Fingerprint to compare with; must use the same bit width.</param>
        /// <returns>The number of differing bits, between 0 and <c>hashbits</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="other"/> was built with a different bit width, in which case
        /// the two fingerprints are not comparable.
        /// </exception>
        public int HammingDistance(SimHash other)
        {
            if (other == null)
            {
                throw new ArgumentNullException("other");
            }
            if (other.hashbits != this.hashbits)
            {
                throw new ArgumentException("Both fingerprints must use the same number of hash bits.", "other");
            }

            // Both fingerprints are non-negative and fit in hashbits bits, so the XOR
            // needs no additional masking.
            BigInteger x = this.strSimHash ^ other.strSimHash;
            int tot = 0;
            while (!x.IsZero)
            {
                tot += 1;
                // Clears the lowest set bit, so the loop runs once per differing bit.
                x &= x - BigInteger.One;
            }
            return tot;
        }

    }

    // Simple tokenizer: splits the text into single characters, so Chinese text is
    // broken into individual Han characters. Any other tokenizer can be used instead.
    /// <summary>
    /// Forward-only tokenizer that yields one character per token. A UTF-16
    /// surrogate pair is returned as a single two-code-unit token so that
    /// characters outside the Basic Multilingual Plane are not torn apart.
    /// </summary>
    public class ChxTokenizer
    {
        private readonly string source;
        private readonly int length;
        private int index;

        /// <summary>
        /// Creates a tokenizer over <paramref name="source"/>.
        /// </summary>
        /// <param name="source">Text to tokenize. Null is treated as an empty string.</param>
        public ChxTokenizer(string source)
        {
            this.source = source ?? "";
            this.index = 0;
            this.length = this.source.Length;
        }

        /// <summary>
        /// Tells whether <see cref="nextToken"/> can be called again.
        /// </summary>
        /// <returns>True while unread characters remain.</returns>
        public bool hasMoreTokens()
        {
            return index < length;
        }

        /// <summary>
        /// Returns the next character of the text and advances past it.
        /// </summary>
        /// <returns>
        /// A string holding one UTF-16 code unit, or two when the current position
        /// starts a valid surrogate pair.
        /// </returns>
        /// <exception cref="InvalidOperationException">No tokens are left.</exception>
        public string nextToken()
        {
            if (index >= length)
            {
                throw new InvalidOperationException("No more tokens.");
            }

            // Keep a high surrogate together with its low surrogate; an unpaired
            // surrogate is still returned on its own.
            int tokenLength = 1;
            if (char.IsHighSurrogate(source[index])
                && index + 1 < length
                && char.IsLowSurrogate(source[index + 1]))
            {
                tokenLength = 2;
            }

            string s = source.Substring(index, tokenLength);
            index += tokenLength;
            return s;
        }
    }
}

最后

以上就是俊秀台灯为你收集整理的 C# SimHash 文字相似度的全部内容，希望文章能够帮你解决在实现 C# SimHash 文字相似度时遇到的程序开发问题。

如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。

本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
点赞(40)

评论列表共有 0 条评论

立即
投稿
返回
顶部