我是靠谱客的博主 俊秀台灯,这篇文章主要介绍C# SimHash文字相似度,现在分享给大家,希望可以做个参考。

复制代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
using System;
using System.Collections.Generic;
using System.Linq;
using System.Numerics;
using System.Text;

namespace chx
{
    /// <summary>
    /// Computes a SimHash fingerprint for a piece of text and compares
    /// fingerprints by Hamming distance. The text is split into tokens by
    /// <see cref="ChxTokenizer"/>; every token is hashed, and each fingerprint
    /// bit is decided by a majority vote over the corresponding bit of all
    /// token hashes.
    /// </summary>
    public class SimHash
    {
        // Multiplier used by the per-token string hash.
        private static readonly BigInteger HashMultiplier = new BigInteger(1000003);

        private readonly String tokens;
        private readonly BigInteger strSimHash;
        private readonly int hashbits = 128;

        /// <summary>
        /// The fingerprint computed at construction time. It is always
        /// non-negative and uses at most <c>hashbits</c> bits.
        /// </summary>
        public BigInteger StrSimHash
        {
            get { return strSimHash; }
        }

        /// <summary>
        /// Builds the fingerprint of <paramref name="tokens"/> using
        /// <paramref name="hashbits"/> bits.
        /// </summary>
        /// <param name="tokens">Text to fingerprint. Null is treated like an empty text.</param>
        /// <param name="hashbits">Width of the fingerprint in bits.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="hashbits"/> is negative.
        /// </exception>
        public SimHash(String tokens, int hashbits)
        {
            if (hashbits < 0)
            {
                // Fail with a clear message instead of the OverflowException
                // that allocating a negative-length array would raise later.
                throw new ArgumentOutOfRangeException(
                    "hashbits", hashbits, "The fingerprint width must not be negative.");
            }

            this.tokens = tokens;
            this.hashbits = hashbits;
            this.strSimHash = simHash();
        }

        /// <summary>
        /// Builds the fingerprint of <paramref name="tokens"/> using the
        /// default width of 128 bits.
        /// </summary>
        /// <param name="tokens">Text to fingerprint. Null is treated like an empty text.</param>
        public SimHash(String tokens)
        {
            this.tokens = tokens;
            this.strSimHash = simHash();
        }

        /// <summary>
        /// Tokenizes the text, hashes every token and folds the hashes into
        /// one fingerprint: bit i is set when at least as many token hashes
        /// have bit i set as have it clear.
        /// </summary>
        private BigInteger simHash()
        {
            // Loop-invariant values: one single-bit mask per position and the
            // mask that keeps token hashes within hashbits bits.
            BigInteger[] bitmasks = new BigInteger[this.hashbits];
            for (int i = 0; i < this.hashbits; i++)
            {
                bitmasks[i] = BigInteger.One << i;
            }
            BigInteger hashMask = (BigInteger.One << this.hashbits) - BigInteger.One;

            // v[i] is the running vote for bit i: +1 per set bit, -1 per clear bit.
            int[] v = new int[this.hashbits];
            ChxTokenizer stringTokens = new ChxTokenizer(this.tokens);
            while (stringTokens.hasMoreTokens())
            {
                BigInteger t = hash(stringTokens.nextToken(), hashMask);
                for (int i = 0; i < this.hashbits; i++)
                {
                    if (!(t & bitmasks[i]).IsZero)
                    {
                        v[i] += 1;
                    }
                    else
                    {
                        v[i] -= 1;
                    }
                }
            }

            BigInteger fingerprint = BigInteger.Zero;
            for (int i = 0; i < this.hashbits; i++)
            {
                // Ties (including the all-zero votes of an empty text) set the bit.
                if (v[i] >= 0)
                {
                    fingerprint |= bitmasks[i];
                }
            }
            return fingerprint;
        }

        /// <summary>
        /// Hashes one token: starts from the first character shifted left by
        /// 7, then for every character multiplies by 1000003, XORs the
        /// character code and applies <paramref name="mask"/>; finally XORs
        /// the token length.
        /// </summary>
        /// <param name="source">Token to hash. Null or empty yields zero.</param>
        /// <param name="mask">Mask of <c>hashbits</c> one-bits applied after every step.</param>
        private static BigInteger hash(string source, BigInteger mask)
        {
            if (string.IsNullOrEmpty(source))
            {
                return BigInteger.Zero;
            }

            BigInteger x = new BigInteger(((long)source[0]) << 7);
            foreach (char item in source)
            {
                x = ((x * HashMultiplier) ^ new BigInteger((long)item)) & mask;
            }

            // x was just masked with a non-negative value and the length is
            // non-negative, so the result can never be negative; no special
            // case for -1 is needed.
            return x ^ new BigInteger(source.Length);
        }

        /// <summary>
        /// Counts the bit positions in which this fingerprint and
        /// <paramref name="other"/> differ. Both fingerprints are compared in
        /// full, so the result is the same in either direction even when the
        /// two instances were built with different widths.
        /// </summary>
        /// <param name="other">Fingerprint to compare with.</param>
        /// <returns>The number of differing bits.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
        public int HammingDistance(SimHash other)
        {
            if (other == null)
            {
                throw new ArgumentNullException("other");
            }

            // Fingerprints are non-negative and already limited to their own
            // width, so the XOR needs no extra masking.
            BigInteger x = this.strSimHash ^ other.strSimHash;
            int tot = 0;
            while (!x.IsZero)
            {
                tot += 1;
                x &= x - BigInteger.One; // clears the lowest set bit
            }
            return tot;
        }
    }

    // Simple tokenization: splits the text (e.g. Chinese) into individual
    // characters. Can be replaced by another word-segmentation method.
    /// <summary>
    /// Forward-only tokenizer that yields one character per token. A UTF-16
    /// surrogate pair is returned as a single two-char token so characters
    /// outside the Basic Multilingual Plane are not split in half.
    /// </summary>
    public class ChxTokenizer
    {
        private string source;
        private int index;
        private int length;

        /// <summary>Creates a tokenizer over <paramref name="source"/>.</summary>
        /// <param name="source">Text to tokenize. Null yields no tokens.</param>
        public ChxTokenizer(string source)
        {
            this.source = source;
            this.index = 0;
            this.length = (source ?? "").Length;
        }

        /// <summary>Returns true while at least one more token is available.</summary>
        public bool hasMoreTokens()
        {
            return index < length;
        }

        /// <summary>Returns the next token and advances past it.</summary>
        /// <exception cref="InvalidOperationException">No tokens are left.</exception>
        public string nextToken()
        {
            if (!hasMoreTokens())
            {
                throw new InvalidOperationException("No more tokens.");
            }

            // Keep a well-formed surrogate pair together; a lone surrogate is
            // still returned on its own.
            int width = 1;
            if (char.IsHighSurrogate(source[index])
                && index + 1 < length
                && char.IsLowSurrogate(source[index + 1]))
            {
                width = 2;
            }

            String s = source.Substring(index, width);
            index += width;
            return s;
        }
    }
}

最后

以上就是俊秀台灯最近收集整理的关于C# SimHash文字相似度的全部内容,更多相关C#内容请搜索靠谱客的其他文章。

本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
点赞(61)

评论列表共有 0 条评论

立即
投稿
返回
顶部