复制代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
using System;
using System.Collections.Generic;
using System.Linq;
using System.Numerics;
using System.Text;

namespace chx
{
    /// <summary>
    /// Computes a SimHash fingerprint for a piece of text and compares fingerprints
    /// by Hamming distance. Tokenization is delegated to <see cref="ChxTokenizer"/>.
    /// </summary>
    public class SimHash
    {
        /// <summary>Fingerprint width used by the single-argument constructor.</summary>
        private const int DefaultHashBits = 128;

        /// <summary>Multiplier of the per-token rolling hash.</summary>
        private static readonly BigInteger HashMultiplier = new BigInteger(1000003);

        private readonly string tokens;
        private readonly int hashbits;

        /// <summary>2^hashbits - 1; computed once instead of once per hashed token.</summary>
        private readonly BigInteger hashMask;

        private readonly BigInteger strSimHash;

        /// <summary>The fingerprint computed from the text given to the constructor.</summary>
        public BigInteger StrSimHash
        {
            get { return strSimHash; }
        }

        /// <summary>
        /// Builds the fingerprint of <paramref name="tokens"/> using
        /// <paramref name="hashbits"/> bits.
        /// </summary>
        /// <param name="tokens">Text to fingerprint. Null is treated like an empty text.</param>
        /// <param name="hashbits">Number of bits in the fingerprint; must not be negative.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="hashbits"/> is negative.
        /// </exception>
        public SimHash(String tokens, int hashbits)
        {
            if (hashbits < 0)
            {
                // Previously this surfaced as an OverflowException from the array allocation.
                throw new ArgumentOutOfRangeException(
                    "hashbits", hashbits, "The number of hash bits must not be negative.");
            }

            this.tokens = tokens;
            this.hashbits = hashbits;
            this.hashMask = (BigInteger.One << hashbits) - BigInteger.One;
            this.strSimHash = ComputeFingerprint();
        }

        /// <summary>
        /// Builds the fingerprint of <paramref name="tokens"/> using the default width of 128 bits.
        /// </summary>
        /// <param name="tokens">Text to fingerprint. Null is treated like an empty text.</param>
        public SimHash(String tokens)
            : this(tokens, DefaultHashBits)
        {
        }

        /// <summary>
        /// Hashes every token, lets each hash vote +1/-1 on every bit position, and
        /// sets the fingerprint bits whose vote total is not negative.
        /// </summary>
        /// <returns>The fingerprint, a non-negative value below 2^hashbits.</returns>
        private BigInteger ComputeFingerprint()
        {
            int[] votes = new int[hashbits];
            ChxTokenizer tokenizer = new ChxTokenizer(tokens);

            while (tokenizer.hasMoreTokens())
            {
                BigInteger tokenHash = HashToken(tokenizer.nextToken());
                for (int bit = 0; bit < hashbits; bit++)
                {
                    if ((tokenHash & (BigInteger.One << bit)).IsZero)
                    {
                        votes[bit] -= 1;
                    }
                    else
                    {
                        votes[bit] += 1;
                    }
                }
            }

            BigInteger fingerprint = BigInteger.Zero;
            for (int bit = 0; bit < hashbits; bit++)
            {
                // A tie (including "no tokens at all") counts as a set bit.
                if (votes[bit] >= 0)
                {
                    fingerprint |= BigInteger.One << bit;
                }
            }

            return fingerprint;
        }

        /// <summary>
        /// Multiplicative rolling hash of a single token, reduced to hashbits bits
        /// and then XOR-ed with the token length.
        /// </summary>
        /// <param name="source">Token to hash.</param>
        /// <returns>Zero for a null or empty token; otherwise a non-negative hash value.</returns>
        private BigInteger HashToken(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return BigInteger.Zero;
            }

            BigInteger x = new BigInteger(((long)source[0]) << 7);
            foreach (char item in source)
            {
                x = ((x * HashMultiplier) ^ new BigInteger((long)item)) & hashMask;
            }

            // x is masked (non-negative) and the length is positive, so the result can
            // never be -1; the former "-1 becomes -2" branch was unreachable and is gone.
            return x ^ new BigInteger(source.Length);
        }

        /// <summary>
        /// Counts the bit positions in which this fingerprint differs from
        /// <paramref name="other"/>, considering the low hashbits bits of this instance.
        /// </summary>
        /// <param name="other">Fingerprint to compare with.</param>
        /// <returns>The number of differing bits.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
        public int HammingDistance(SimHash other)
        {
            if (other == null)
            {
                throw new ArgumentNullException("other");
            }

            BigInteger diff = (this.strSimHash ^ other.strSimHash) & hashMask;
            int distance = 0;
            while (!diff.IsZero)
            {
                distance += 1;
                diff &= diff - BigInteger.One; // clears the lowest set bit
            }

            return distance;
        }
    }

    /// <summary>
    /// Simple tokenizer that splits the text directly into single characters
    /// (one token per character). It can be replaced by another tokenization method.
    /// A valid surrogate pair is returned as one token so that characters outside
    /// the Basic Multilingual Plane are not cut in half.
    /// </summary>
    public class ChxTokenizer
    {
        private readonly string source;
        private readonly int length;
        private int index;

        /// <summary>Creates a tokenizer over <paramref name="source"/>.</summary>
        /// <param name="source">Text to split. Null is treated as an empty text.</param>
        public ChxTokenizer(string source)
        {
            this.source = source ?? string.Empty;
            this.index = 0;
            this.length = this.source.Length;
        }

        /// <summary>Reports whether <see cref="nextToken"/> can return another token.</summary>
        /// <returns>True while unread characters remain.</returns>
        public bool hasMoreTokens()
        {
            return index < length;
        }

        /// <summary>Returns the next character of the text as a string and advances.</summary>
        /// <returns>A one-character token, or a two-character token for a surrogate pair.</returns>
        /// <exception cref="InvalidOperationException">No tokens are left.</exception>
        public string nextToken()
        {
            if (index >= length)
            {
                throw new InvalidOperationException("No more tokens are available.");
            }

            int tokenLength = 1;
            if (char.IsHighSurrogate(source[index])
                && index + 1 < length
                && char.IsLowSurrogate(source[index + 1]))
            {
                // Keep both UTF-16 code units of a supplementary character together.
                tokenLength = 2;
            }

            string token = source.Substring(index, tokenLength);
            index += tokenLength;
            return token;
        }
    }
}
最后
以上就是俊秀台灯最近收集整理的关于C# SimHash文字相似度的全部内容,更多相关C#内容请搜索靠谱客的其他文章。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复