I am 阳光猎豹, a blogger at 靠谱客. This post walks through computing TF-IDF term weights for a set of text documents and then clustering them; I am sharing it here in the hope that it can serve as a reference.
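For orientation, this restates the weighting the script below computes by hand: for a term t in document d, with N documents in total and df(t) the number of documents that contain t,

$$ \mathrm{tfidf}(t, d) \;=\; \frac{\mathrm{count}(t, d)}{|d|} \cdot \log\frac{N}{\mathrm{df}(t)} $$

where |d| is the number of tokens kept for document d. For words listed in ./jieba/idf.txt, the manually assigned idf value replaces the computed log(N / df(t)).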

import jieba
import jieba.analyse
import math
import operator
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation, DBSCAN
from sklearn.cluster import MeanShift, estimate_bandwidth
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import os

#np.set_printoptions(threshold=np.inf)

# Load manually assigned idf values for certain words (one "word idf" pair per line)
def load_idf_file(path):
    idf_dict = {}
    handle = open(path, 'r', encoding='utf8')
    line = handle.readline()
    while line:
        line = line.strip()
        if len(line) > 0:
            line_arr = line.split(' ')
            idf_dict[line_arr[0]] = float(line_arr[1])
        line = handle.readline()
    handle.close()
    return idf_dict

# idf(word) = log(doc_num / number_of_docs_containing_word);
# manually assigned idf values override the computed ones
def cal_idf(data_set, idf_dict):
    doc_num = len(data_set)
    word_doc_count = defaultdict(int)
    for word_str in data_set:
        word_list = word_str.split(' ')
        word_list = list(set(word_list))
        for item in word_list:
            if item and item.strip() != '':
                word_doc_count[item] += 1
    word_idf = {}
    default_idf_keys = idf_dict.keys()
    for k, v in word_doc_count.items():
        idf = math.log(doc_num * 1.0 / v)
        if k in default_idf_keys:
            word_idf[k] = idf_dict[k]
        else:
            word_idf[k] = idf
    #path = "idf.txt"
    #save(word_idf, path)
    return word_idf

# tf-idf(word, doc) = (word_count_in_doc / doc_length) * idf(word)
def cal_tfidf(data_set, idf_ret):
    doc_word_tfidf = []
    for word_str in data_set:
        word_list = word_str.split(' ')
        doc_word_total = len(word_list)
        doc_word_dict = defaultdict(int)
        doc_word_tfidf_dict = defaultdict(int)
        for item in word_list:
            if item and item.strip() != '':
                doc_word_dict[item] += 1
        for k, v in doc_word_dict.items():
            doc_word_tfidf_dict[k] = (v / doc_word_total) * idf_ret[k]
        doc_word_tfidf.append(doc_word_tfidf_dict)
    return doc_word_tfidf

# Persist an idf dictionary to disk, one "word idf" pair per line
def save(idf_dict, path):
    f = open(path, 'w', encoding='utf8')
    for key in idf_dict.keys():
        f.write(str(key) + " " + str(idf_dict[key]) + "\n")
    f.close()

# Tokenize: extract the top keywords of each document with jieba,
# keep only the top 60% of them, and drop stop words and single characters.
# Relies on the globals `datas` (documents) and `lines` (stop words).
def jieba_tokenize():
    jieba_need = []
    for item in datas:
        temp_list1 = jieba.analyse.extract_tags(item, topK=10)
        w_len = len(temp_list1)
        if w_len > 2:
            w_num = math.ceil(float(w_len) * 0.6)
            temp_list1 = temp_list1[0:w_num]
        temp_list = [e for e in temp_list1 if e not in lines + [' '] and len(e) > 1]
        jieba_need.append(" ".join(temp_list))
    return jieba_need

# Dimensionality reduction: LSA (TruncatedSVD to 50 components) followed by L2 normalization
def reduction(matrix):
    svd = TruncatedSVD(50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(matrix)
    return X

jieba.analyse.set_idf_path("./jieba/idf.txt")
data_num = 10000
k_num = 200

# Read at most data_num documents, one per line
i = 0
output = open('../data.txt', 'r', encoding='utf8')
line = output.readline()
datas = []
while line:
    line = line.strip()
    if len(line) > 0:
        i = i + 1
        datas.append(line)
        if i >= data_num:
            break
    line = output.readline()
output.close()

npyfile = "data.npy"
if os.path.exists(npyfile):
    # Reuse the previously computed document-term matrix
    X = np.load("data.npy")
else:
    # Read stop words
    with open('./stop_words.txt', encoding='utf-8') as f:
        entities = list(f)
    lines = []
    for line in entities:
        lines.append(line.strip())
    # Load manually assigned idf values
    default_idf_dict = load_idf_file("./jieba/idf.txt")
    my_train = jieba_tokenize()
    idf_ret = cal_idf(my_train, default_idf_dict)
    tfidf_ret = cal_tfidf(my_train, idf_ret)
    doc_rows = len(datas)
    word_rows = len(idf_ret)
    # Turn the per-document tf-idf weights into a matrix of shape [num_docs, num_words]
    X = np.zeros([doc_rows, word_rows])
    for i in range(doc_rows):
        j = 0
        for k, v in idf_ret.items():
            X[i][j] = tfidf_ret[i][k]
            j = j + 1
    X = np.array(X)
    np.save("data.npy", X)

# Reduce dimensionality before clustering
X = reduction(X)

#cl = MiniBatchKMeans(n_clusters=k_num, init='k-means++', n_init=1, init_size=1000, batch_size=10000, verbose=False)
#cl = KMeans(n_clusters=k_num, init='k-means++', random_state=30, n_init=1, verbose=False)
cl = DBSCAN(eps=0.2, min_samples=30)
result = cl.fit_predict(X)
num_clusters = len(set(result))

# Write the documents of each cluster to its own file;
# DBSCAN noise points (label -1) end up in the last bucket
ret = [[] for y in range(num_clusters)]
for i in range(len(datas)):
    classid = result[i]
    ret[classid].append(datas[i])
for m in range(num_clusters):
    file = "./result/result_" + str(m) + ".txt"
    handle = open(file, 'w+', encoding='utf8')
    for n in range(len(ret[m])):
        handle.write(ret[m][n] + "\n")
    handle.close()
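For comparison only, here is a minimal sketch of how the same document-term matrix could be produced with the TfidfVectorizer that the script imports but never uses. This is an alternative, not the author's approach: the function cluster_with_vectorizer and its parameters are made up for illustration, the jieba tokenization and stop-word filtering would still happen beforehand exactly as in jieba_tokenize(), and sklearn's default smoothed idf means the weights will not match the hand-rolled version exactly.

# Sketch only: assumes tokenized_docs is the list of space-joined jieba tokens
# returned by jieba_tokenize(); manual idf overrides are not supported here.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.cluster import DBSCAN

def cluster_with_vectorizer(tokenized_docs, n_components=50, eps=0.2, min_samples=30):
    # treat every whitespace-separated token as a term
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\S+")
    matrix = vectorizer.fit_transform(tokenized_docs)
    # same LSA + L2-normalization step as reduction() above
    lsa = make_pipeline(TruncatedSVD(n_components), Normalizer(copy=False))
    reduced = lsa.fit_transform(matrix)
    # same DBSCAN parameters as the script above
    return DBSCAN(eps=eps, min_samples=min_samples).fit_predict(reduced)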

Finally

That is everything 阳光猎豹 has put together on computing TF-IDF term weights and clustering text; for more on this topic, please see the other articles on 靠谱客.
