TF-IDF for Article Classification

Overview

Reposted from http://blog.csdn.net/orlandowww/article/details/52706135
Downloads:
Stop-word list:
http://download.csdn.net/download/kevinelstri/9817721
Segmented training corpus:
http://download.csdn.net/download/github_36326955/9747927
1) TF-IDF
TF = (number of times the term appears in the document) / (total number of terms in the document)
IDF = log(total number of documents in the corpus / (number of documents containing the term + 1))
Computation:
See http://www.voidcn.com/article/p-bbabkmsv-pt.html
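To make the two formulas concrete, here is a minimal sketch (written for this post, not taken from the original code) that computes them directly. Note that scikit-learn's TfidfTransformer used below applies a smoothed IDF, roughly log((1+N)/(1+df)) + 1, so its numbers will not match this raw version exactly.

# coding:utf-8
import math

docs = [["we", "like", "apples"],
        ["we", "like", "bananas", "bananas"],
        ["we", "hate", "durian"]]

def tf(term, doc):
    # term frequency: occurrences of the term / total terms in the document
    return doc.count(term) / float(len(doc))

def idf(term, docs):
    # inverse document frequency: log(total documents / (documents containing the term + 1))
    df = sum(1 for d in docs if term in d)
    return math.log(len(docs) / float(df + 1))

print tf("bananas", docs[1]) * idf("bananas", docs)  # 0.5 * log(3/2), roughly 0.203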

# coding:utf-8
from sklearn.feature_extraction.text import CountVectorizer
# toy corpus
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
# turn the texts into a term-count matrix
vectorizer = CountVectorizer()
# count how often each term occurs
X = vectorizer.fit_transform(corpus)
# all terms in the bag of words
word = vectorizer.get_feature_names()
print word
# inspect the counts
print X.toarray()
from sklearn.feature_extraction.text import TfidfTransformer
# instantiate the transformer
transformer = TfidfTransformer()
print transformer
# turn the count matrix X into TF-IDF weights
tfidf = transformer.fit_transform(X)
# tfidf[i][j] is the TF-IDF weight of term j in document i
print tfidf.toarray()
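For this toy corpus the vocabulary should come out as ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'], so X.toarray() is a 4x9 count matrix and tfidf.toarray() is the corresponding 4x9 matrix of smoothed, normalized TF-IDF weights: one row per document, one column per term.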

2) Word segmentation
Chinese text has to be segmented into words first; this uses the Python package jieba (a small example of its output follows the script below).

# -*- encoding: utf-8 -*-
import sys
import os
import jieba
reload(sys)
sys.setdefaultencoding('utf-8')

# save a file
def savefile(savepath, content):
    with open(savepath, "wb") as fp:
        fp.write(content)

# read a file
def readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus_segment(corpus_path, seg_path):
    for parent, dirnames, filenames in os.walk(corpus_path):
        for filename in filenames:
            if ".txt" in filename:
                filepath = os.path.join(parent, filename)
                content = readfile(filepath)
                content = content.replace("\r\n", "")  # strip line breaks
                content = content.replace(" ", "")     # strip spaces
                content_seg = jieba.cut(content)       # segment the file content
                savefile(seg_path + filename, " ".join(content_seg))  # seg_path must already exist
    print "Finished segmenting the Chinese corpus\n"

if __name__ == "__main__":
    # segment the training corpus
    corpus_path = "./train_corpus/"
    seg_path = "./train_corpus_seg/"
    corpus_segment(corpus_path, seg_path)
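jieba.cut returns a generator of tokens, so joining it with spaces produces the space-separated text that the later TF-IDF steps expect. A quick illustration (added here for reference, not in the original post):

# -*- encoding: utf-8 -*-
import jieba
print " ".join(jieba.cut("我爱北京天安门"))  # prints something like: 我 爱 北京 天安门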

3) Converting the text files into a Bunch

# -*- encoding: utf-8 -*-
import sys
import os
import re
reload(sys)
sys.setdefaultencoding('utf-8')
import cPickle as pickle
from sklearn.datasets.base import Bunch

# read a file
def readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus2Bunch(wordbag_path, seg_path):
    # create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    # the class label is taken from the file name, assumed to look like "<something>-<label>.txt"
    regpat = re.compile(r'-(\w+\.txt)')
    for parent, dirnames, filenames in os.walk(seg_path):
        for filename in filenames:
            if ".txt" in filename:
                filepath = os.path.join(parent, filename)
                templabel = regpat.search(filename)
                templabel = templabel.group(1)[:-4]  # strip the ".txt" suffix
                bunch.label.append(templabel)
                if templabel not in bunch.target_name:
                    bunch.target_name.append(templabel)
                bunch.filenames.append(filepath)
                bunch.contents.append(readfile(filepath))
    print bunch.target_name
    print bunch.filenames[:10]
    print bunch.label[:10]
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print "Finished building the text objects\n"

if __name__ == "__main__":
    wordbag_path = "train_word_bag/train_set.dat"  # where the Bunch is saved
    seg_path = "train_corpus_seg/"                 # directory of the segmented corpus
    corpus2Bunch(wordbag_path, seg_path)
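The regular expression above assumes file names of the form <something>-<label>.txt. A small check (added for illustration; the file name is hypothetical) of what gets extracted as the label:

import re
regpat = re.compile(r'-(\w+\.txt)')
print regpat.search("C39-Sports.txt").group(1)[:-4]  # -> Sports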

4) Building the TF-IDF vector space

# -*- encoding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import cPickle as pickle
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer

# read a file
def readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

# load a pickled Bunch object
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# save a Bunch object with pickle
def writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

def vector_space(stopword_path, bunch_path, space_path):
    stpwrdlst = readfile(stopword_path).splitlines()  # load the stop words
    bunch = readbunchobj(bunch_path)                  # load the segmented-corpus Bunch
    # build the TF-IDF vector-space object
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filenames=bunch.filenames, tdm=[], vocabulary={})
    '''
    tdm holds the computed TF-IDF weight matrix.
    vocabulary is the index of the vector space. For example, if the vector space is
    (我, 喜欢, 相国大人), then vocabulary is the index dictionary
    vocabulary={"我": 0, "喜欢": 1, "相国大人": 2}. You can simply think of vocabulary
    as the coordinate axes of the vector space: the index says which dimension a
    term corresponds to.
    '''
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
    '''
    The only parameters you need to know about:
    stop_words:
        the stop-word list; vocabulary_ is later built from the texts with these words removed.
    vocabulary:
        explained above.
    sublinear_tf:
        sublinear TF scaling: instead of the raw term frequency tf, use 1 + log(tf).
    smooth_idf:
        when computing idf = log(numerator/denominator) the denominator can be 0;
        smooth_idf uses log(numerator/(1 + denominator)) instead. On by default, so
        you do not need to worry about it.
    norm:
        normalization. TF-IDF is TF*IDF; the TF part may or may not be normalized,
        and normalization is the usual choice. On by default.
    max_df:
        some terms have too high a document frequency (if a term appears in every
        document, it is useless for separating classes), so we set a threshold.
        A float such as 0.5 (range [0.0, 1.0]) means a term that appears in more than
        50% of the documents is treated as a temporary stop word. An int is also
        allowed, e.g. max_df=10: a term appearing in more than 10 documents is
        treated as a temporary stop word.
    min_df:
        the opposite of max_df. A lower document frequency seems more discriminative,
        but if it is too low (say the term occurs in only 1 of 10000 documents),
        adding a whole dimension to the vector space for that single document is
        not worth it.
    Note that max_df and min_df are ignored when the vocabulary parameter is given.
    '''
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    writebunchobj(space_path, tfidfspace)
    print "TF-IDF vector space created successfully!"

if __name__ == "__main__":
    stopword_path = "train_word_bag/stopword.txt"
    bunch_path = "train_word_bag/train_set.dat"    # path of the training-set Bunch
    space_path = "train_word_bag/tfidfspace.dat"   # where the vector space is saved
    vector_space(stopword_path, bunch_path, space_path)
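A quick way to sanity-check the saved vector space (added here for illustration; it assumes the file name used above):

# -*- encoding: utf-8 -*-
import cPickle as pickle
with open("train_word_bag/tfidfspace.dat", "rb") as f:
    space = pickle.load(f)
print space.tdm.shape        # (number of documents, vocabulary size)
print len(space.vocabulary)  # should match space.tdm.shape[1]
print space.target_name      # the class labels that were found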

5) Naive Bayes classifiers
Bayes' theorem states that, for events A and B, the probabilities satisfy:
P(A|B) = P(B|A) * P(A) / P(B)
Principle of naive Bayes classification: the figures from the original post are not reproduced here; the standard formulas are given below.
5.1 Gaussian naive Bayes
5.2 Multinomial naive Bayes
5.3 Bernoulli model
In the Bernoulli model, every feature takes the value 0 or 1.
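For reference (these are the textbook definitions, added because the original figures are missing): a naive Bayes classifier assumes the features are conditionally independent given the class, and predicts the class y that maximizes
P(y | x1, ..., xn) ∝ P(y) * P(x1|y) * ... * P(xn|y)
The three variants differ only in how P(xi|y) is modeled:
Gaussian naive Bayes: P(xi|y) = 1/sqrt(2*pi*σy²) * exp(-(xi - μy)² / (2σy²)), i.e. each feature is assumed to be normally distributed within each class.
Multinomial naive Bayes (the one used below): P(xi|y) = (Nyi + α) / (Ny + α*n), where Nyi is the total weight of feature i in class y, Ny is the sum over all features, n is the number of features, and α is the smoothing parameter (the alpha=0.001 passed to MultinomialNB in the next section).
Bernoulli naive Bayes: each feature is binary (present/absent), and P(xi|y) is the probability that feature i occurs in class y.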
6) The article classifier

# -*- encoding: utf-8 -*-
import sys
import numpy as np
reload(sys)
sys.setdefaultencoding('utf-8')
import cPickle as pickle
from sklearn.datasets.base import Bunch
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes

# load a pickled Bunch object
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# save an object with pickle
def writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

# load the training set
trainpath = "train_word_bag/tfidfspace.dat"
train_set = readbunchobj(trainpath)
#a = train_set.tdm.toarray()[0, :]
#indices = np.where(a != 0)
#print a[indices]

# train the classifier: input the bag-of-words vectors and the class labels, alpha=0.001
clf = MultinomialNB(alpha=0.001).fit(train_set.tdm, train_set.label)
clf_path = "train_word_bag/clf.m"  # where the Bayes model is saved
writebunchobj(clf_path, clf)
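A quick sanity check you can append to the script above (added here, not part of the original post); it reuses clf, train_set and np from the script and reports accuracy on the training set itself.

# accuracy on the training data (optimistic, but catches obvious mistakes)
train_pred = clf.predict(train_set.tdm)
print "training accuracy:", np.mean(train_pred == np.array(train_set.label))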

7) Prediction

# -*- encoding: utf-8 -*-
import sys
import os
import jieba
import re
import numpy as np
reload(sys)
sys.setdefaultencoding('utf-8')
import cPickle as pickle
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes

# save a file
def savefile(savepath, content):
    with open(savepath, "wb") as fp:
        fp.write(content)

# read a file
def readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus_segment(corpus_path, seg_path):
    for parent, dirnames, filenames in os.walk(corpus_path):
        for filename in filenames:
            if ".txt" in filename:
                filepath = os.path.join(parent, filename)
                content = readfile(filepath)
                content = content.replace("\r\n", "")  # strip line breaks
                content = content.replace(" ", "")     # strip spaces
                content_seg = jieba.cut(content)       # segment the file content
                savefile(seg_path + filename, " ".join(content_seg))
    print "Finished segmenting the Chinese corpus\n"

def corpus2Bunch(wordbag_path, seg_path):
    # create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    # the class label is taken from the file name, assumed to look like "<something>-<label>.txt"
    regpat = re.compile(r'-(\w+\.txt)')
    for parent, dirnames, filenames in os.walk(seg_path):
        for filename in filenames:
            if ".txt" in filename:
                filepath = os.path.join(parent, filename)
                templabel = regpat.search(filename)
                templabel = templabel.group(1)[:-4]  # strip the ".txt" suffix
                bunch.label.append(templabel)
                if templabel not in bunch.target_name:
                    bunch.target_name.append(templabel)
                bunch.filenames.append(filepath)
                bunch.contents.append(readfile(filepath))
    print bunch.target_name
    print bunch.filenames[:10]
    print bunch.label[:10]
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print "Finished building the text objects\n"

# load a pickled Bunch object
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# save an object with pickle
def writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path):
    stpwrdlst = readfile(stopword_path).splitlines()  # load the stop words
    bunch = readbunchobj(bunch_path)                  # load the segmented test-corpus Bunch
    # build the TF-IDF vector-space object
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filenames=bunch.filenames, tdm=[], vocabulary={})
    # reuse the training vocabulary so the test vectors live in the same space as the training vectors
    trainbunch = readbunchobj(train_tfidf_path)
    tfidfspace.vocabulary = trainbunch.vocabulary
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,
                                 vocabulary=trainbunch.vocabulary)
    # (the parameters are explained in section 4; max_df is ignored here because vocabulary is given)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    writebunchobj(space_path, tfidfspace)
    print "TF-IDF vector space created successfully!"

def metrics_result(actual, predict):
    from sklearn import metrics
    print 'precision:{0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted'))
    print 'recall:{0:0.3f}'.format(metrics.recall_score(actual, predict, average='weighted'))
    print 'f1-score:{0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted'))

if __name__ == "__main__":
    clf_path = "train_word_bag/clf.m"  # path of the saved Bayes model
    clf = readbunchobj(clf_path)
    # segment the test corpus
    corpus_path = "./test_corpus/"
    seg_path = "./test_corpus_seg/"
    corpus_segment(corpus_path, seg_path)
    wordbag_path = "test_word_bag/test_set.dat"  # where the test Bunch is saved
    seg_path = "test_corpus_seg/"                # directory of the segmented test corpus
    corpus2Bunch(wordbag_path, seg_path)
    stopword_path = "train_word_bag/stopword.txt"
    bunch_path = "test_word_bag/test_set.dat"    # path of the test Bunch
    space_path = "test_word_bag/tfidfspace.dat"  # where the test vector space is saved
    train_tfidf_path = "train_word_bag/tfidfspace.dat"  # training vector space saved in section 4
    vector_space(stopword_path, bunch_path, space_path, train_tfidf_path)
    # load the test set
    testpath = "test_word_bag/tfidfspace.dat"
    test_set = readbunchobj(testpath)

    # predict the class of each test document
    predicted = clf.predict(test_set.tdm)
    for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
        if flabel != expct_cate:
            print file_name, ": actual class:", flabel, " --> predicted class:", expct_cate
    print "Prediction finished!"

    # compute the classification metrics
    metrics_result(test_set.label, predicted)
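For completeness, a sketch (added here, not part of the original post) of how you could classify a single new article without going through the whole directory pipeline; "some_article.txt" is a hypothetical input file, and the paths are the ones used above:

# -*- encoding: utf-8 -*-
import jieba
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer

with open("train_word_bag/tfidfspace.dat", "rb") as f:
    trainspace = pickle.load(f)
with open("train_word_bag/clf.m", "rb") as f:
    clf = pickle.load(f)

text = open("some_article.txt", "rb").read()         # hypothetical new article
seg = " ".join(jieba.cut(text.replace("\r\n", "")))  # segment it like the training data
vec = TfidfVectorizer(sublinear_tf=True, vocabulary=trainspace.vocabulary)
tdm = vec.fit_transform([seg])                       # vectorize in the training vocabulary
print clf.predict(tdm)[0]                            # predicted class label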

The results are as follows (the screenshot of the output in the original post is not reproduced here).
