I am 阳光猎豹, a blogger at 靠谱客. This post walks through computing TF-IDF term weights for a set of text documents and then clustering them; I am sharing it here in the hope that it can serve as a reference.
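For orientation, this restates the weighting the script below computes by hand: for a term t in document d, with N documents in total and df(t) the number of documents that contain t,

$$ \mathrm{tfidf}(t, d) \;=\; \frac{\mathrm{count}(t, d)}{|d|} \cdot \log\frac{N}{\mathrm{df}(t)} $$

where |d| is the number of tokens kept for document d. For words listed in ./jieba/idf.txt, the manually assigned idf value replaces the computed log(N / df(t)).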

import jieba
import jieba.analyse
import math
import operator
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation, DBSCAN
from sklearn.cluster import MeanShift, estimate_bandwidth
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import os

#np.set_printoptions(threshold=np.inf)

# Load manually assigned idf values for certain words (one "word idf" pair per line)
def load_idf_file(path):
    idf_dict = {}
    handle = open(path, 'r', encoding='utf8')
    line = handle.readline()
    while line:
        line = line.strip()
        if len(line) > 0:
            line_arr = line.split(' ')
            idf_dict[line_arr[0]] = float(line_arr[1])
        line = handle.readline()
    handle.close()
    return idf_dict

# idf(word) = log(doc_num / number_of_docs_containing_word);
# manually assigned idf values override the computed ones
def cal_idf(data_set, idf_dict):
    doc_num = len(data_set)
    word_doc_count = defaultdict(int)
    for word_str in data_set:
        word_list = word_str.split(' ')
        word_list = list(set(word_list))
        for item in word_list:
            if item and item.strip() != '':
                word_doc_count[item] += 1
    word_idf = {}
    default_idf_keys = idf_dict.keys()
    for k, v in word_doc_count.items():
        idf = math.log(doc_num * 1.0 / v)
        if k in default_idf_keys:
            word_idf[k] = idf_dict[k]
        else:
            word_idf[k] = idf
    #path = "idf.txt"
    #save(word_idf, path)
    return word_idf

# tf-idf(word, doc) = (word_count_in_doc / doc_length) * idf(word)
def cal_tfidf(data_set, idf_ret):
    doc_word_tfidf = []
    for word_str in data_set:
        word_list = word_str.split(' ')
        doc_word_total = len(word_list)
        doc_word_dict = defaultdict(int)
        doc_word_tfidf_dict = defaultdict(int)
        for item in word_list:
            if item and item.strip() != '':
                doc_word_dict[item] += 1
        for k, v in doc_word_dict.items():
            doc_word_tfidf_dict[k] = (v / doc_word_total) * idf_ret[k]
        doc_word_tfidf.append(doc_word_tfidf_dict)
    return doc_word_tfidf

# Persist an idf dictionary to disk, one "word idf" pair per line
def save(idf_dict, path):
    f = open(path, 'w', encoding='utf8')
    for key in idf_dict.keys():
        f.write(str(key) + " " + str(idf_dict[key]) + "\n")
    f.close()

# Tokenize: extract the top keywords of each document with jieba,
# keep only the top 60% of them, and drop stop words and single characters.
# Relies on the globals `datas` (documents) and `lines` (stop words).
def jieba_tokenize():
    jieba_need = []
    for item in datas:
        temp_list1 = jieba.analyse.extract_tags(item, topK=10)
        w_len = len(temp_list1)
        if w_len > 2:
            w_num = math.ceil(float(w_len) * 0.6)
            temp_list1 = temp_list1[0:w_num]
        temp_list = [e for e in temp_list1 if e not in lines + [' '] and len(e) > 1]
        jieba_need.append(" ".join(temp_list))
    return jieba_need

# Dimensionality reduction: LSA (TruncatedSVD to 50 components) followed by L2 normalization
def reduction(matrix):
    svd = TruncatedSVD(50)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(matrix)
    return X

jieba.analyse.set_idf_path("./jieba/idf.txt")
data_num = 10000
k_num = 200

# Read at most data_num documents, one per line
i = 0
output = open('../data.txt', 'r', encoding='utf8')
line = output.readline()
datas = []
while line:
    line = line.strip()
    if len(line) > 0:
        i = i + 1
        datas.append(line)
        if i >= data_num:
            break
    line = output.readline()
output.close()

npyfile = "data.npy"
if os.path.exists(npyfile):
    # Reuse the previously computed document-term matrix
    X = np.load("data.npy")
else:
    # Read stop words
    with open('./stop_words.txt', encoding='utf-8') as f:
        entities = list(f)
    lines = []
    for line in entities:
        lines.append(line.strip())
    # Load manually assigned idf values
    default_idf_dict = load_idf_file("./jieba/idf.txt")
    my_train = jieba_tokenize()
    idf_ret = cal_idf(my_train, default_idf_dict)
    tfidf_ret = cal_tfidf(my_train, idf_ret)
    doc_rows = len(datas)
    word_rows = len(idf_ret)
    # Turn the per-document tf-idf weights into a matrix of shape [num_docs, num_words]
    X = np.zeros([doc_rows, word_rows])
    for i in range(doc_rows):
        j = 0
        for k, v in idf_ret.items():
            X[i][j] = tfidf_ret[i][k]
            j = j + 1
    X = np.array(X)
    np.save("data.npy", X)

# Reduce dimensionality before clustering
X = reduction(X)

#cl = MiniBatchKMeans(n_clusters=k_num, init='k-means++', n_init=1, init_size=1000, batch_size=10000, verbose=False)
#cl = KMeans(n_clusters=k_num, init='k-means++', random_state=30, n_init=1, verbose=False)
cl = DBSCAN(eps=0.2, min_samples=30)
result = cl.fit_predict(X)
num_clusters = len(set(result))

# Write the documents of each cluster to its own file;
# DBSCAN noise points (label -1) end up in the last bucket
ret = [[] for y in range(num_clusters)]
for i in range(len(datas)):
    classid = result[i]
    ret[classid].append(datas[i])
for m in range(num_clusters):
    file = "./result/result_" + str(m) + ".txt"
    handle = open(file, 'w+', encoding='utf8')
    for n in range(len(ret[m])):
        handle.write(ret[m][n] + "\n")
    handle.close()
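For comparison only, here is a minimal sketch of how the same document-term matrix could be produced with the TfidfVectorizer that the script imports but never uses. This is an alternative, not the author's approach: the function cluster_with_vectorizer and its parameters are made up for illustration, the jieba tokenization and stop-word filtering would still happen beforehand exactly as in jieba_tokenize(), and sklearn's default smoothed idf means the weights will not match the hand-rolled version exactly.

# Sketch only: assumes tokenized_docs is the list of space-joined jieba tokens
# returned by jieba_tokenize(); manual idf overrides are not supported here.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.cluster import DBSCAN

def cluster_with_vectorizer(tokenized_docs, n_components=50, eps=0.2, min_samples=30):
    # treat every whitespace-separated token as a term
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\S+")
    matrix = vectorizer.fit_transform(tokenized_docs)
    # same LSA + L2-normalization step as reduction() above
    lsa = make_pipeline(TruncatedSVD(n_components), Normalizer(copy=False))
    reduced = lsa.fit_transform(matrix)
    # same DBSCAN parameters as the script above
    return DBSCAN(eps=eps, min_samples=min_samples).fit_predict(reduced)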

Finally

That is everything 阳光猎豹 has put together on computing TF-IDF term weights and clustering text; for more on this topic, please see the other articles on 靠谱客.
