LDA主题模型从分词到词云再到模型

78 阅读 0 评论 52 点赞

我是靠谱客的博主热心老虎，最近开发中收集的这篇文章主要介绍LDA主题模型从分词到词云再到模型，觉得挺不错的，现在分享给大家，希望可以做个参考。

概述

这是我在做本科阶段内容时，从 Python 小白一路摸索到能够跑出计算结果的代码记录，文中的代码均能跑起来。部分内容可以参考我前面的文章，本章节主要是我的一些代码记录。

结巴分词

# -*- coding: utf-8 -*-
import jieba
import jieba.analyse
import jieba.posseg as pseg
import re
# Load the user dictionary from 'dict.txt' into jieba before any segmentation runs.
jieba.load_userdict('dict.txt')
# Stop-word file used by seg_sentence. This is the file the code has always
# actually opened, so the data that gets filtered out is unchanged.
_STOPWORDS_FILE = 'stop_words.txt'

# filepath -> set of stop words, filled on first use so that each file is read
# only once no matter how many sentences are segmented.
_STOPWORDS_CACHE = {}


def stopwordslist(filepath):
    """Return the stop words stored in *filepath* as a list.

    The file is read as UTF-8 with one stop word per line; surrounding
    whitespace is stripped from every line (blank lines become '').

    Args:
        filepath: Path of the stop-word file to read.

    Returns:
        A list with one stripped string per line of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]


def _stopword_set(filepath):
    """Return the stop words of *filepath* as a set, loading the file once."""
    if filepath not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[filepath] = set(stopwordslist(filepath))
    return _STOPWORDS_CACHE[filepath]


def seg_sentence(sentence):
    """Segment *sentence* with jieba and drop stop words.

    Args:
        sentence: Raw text; leading/trailing whitespace is stripped first.

    Returns:
        A string in which every kept token is followed by a single space
        (so a non-empty result ends with a space).
    """
    stopwords = _stopword_set(_STOPWORDS_FILE)
    kept = [
        word
        for word in jieba.cut(sentence.strip())
        # Skip tab tokens (written '\t'; a bare 't' would instead drop every
        # token that is exactly the letter t).
        if word not in stopwords and word != '\t'
    ]
    return ''.join(word + ' ' for word in kept)
# Segment the input document line by line and append the result to the output.
# './2019_wenben/0030.txt' is the file being read; 'jieba_out.txt' is the file
# being written. Mode 'a+' creates it when missing and appends otherwise, so
# running the script twice adds the same text twice.
# The `with` statement closes both files even if segmentation raises.
with open('./2019_wenben/0030.txt', 'r', encoding='utf-8') as inputs, \
        open('jieba_out.txt', 'a+', encoding='utf-8') as outputs:
    for line in inputs:
        # seg_sentence returns a string of space-separated tokens.
        line_seg = seg_sentence(line)
        # No newline is written, so all tokens end up on a single output line
        # separated by spaces. Write line_seg + '\n' instead to keep one output
        # line per input line.
        outputs.write(line_seg + ' ')

词频统计1

# # -*- encoding=utf-8 -*-
import jieba
from collections import Counter

# Read the whole text into memory; the `with` block closes the file afterwards.
with open(r'2018_jieba_output.txt', 'r', encoding='utf-8') as content:
    forest = content.read()

words = list(jieba.cut(forest))

# Count all tokens in a single pass instead of calling words.count() once per
# distinct token. Tokens of length 0 or 1 (which includes "" and " ") are
# ignored.
dic = Counter(token for token in words if len(token) > 1)

# (token, count) pairs, most frequent first.
lis = dic.most_common()

# Print the 12 most frequent tokens; slicing copes with fewer than 12 entries.
for entry in lis[:12]:
    print(entry)

词频统计2

这种方法如果直接逐字符遍历每一行（for w in line），只能统计“字”而不是“词”出现的频率；由于文本已经用空格分好词，想统计词频时应先用 line.split() 按空白切分，再对切分出的词计数。

import codecs
import matplotlib.pyplot as plt
from pylab import mpl
from collections import Counter

# Set the default sans-serif font to FangSong (presumably so that Chinese tick
# labels can be rendered).
mpl.rcParams['font.sans-serif'] = ['FangSong']
# Keep the minus sign '-' from being drawn as a box in saved figures.
mpl.rcParams['axes.unicode_minus'] = False

counter = Counter()
with codecs.open(r'2015_jieba_output.txt', 'r', encoding='utf-8') as fr:
    for line in fr:
        # The text is already segmented with whitespace between tokens, so
        # split() yields words; iterating the string itself would yield single
        # characters. Blank lines produce no tokens.
        counter.update(line.split())

# Distinct tokens in first-seen order.
word = list(counter)

# (token, count) pairs, most frequent first. Counter starts every token at 1,
# so the counts are true occurrence counts.
counter_list = counter.most_common()
print(counter_list[:50])

# Bar chart of the 50 most frequent tokens.
label = [token for token, _ in counter_list[:50]]
value = [count for _, count in counter_list[:50]]
plt.bar(range(len(value)), value, tick_label=label)
plt.show()

词云

import jieba.analyse
import jieba
import wordcloud
# Read the segmented text; the `with` block closes the file even on error.
with open('2018_jieba_output.txt', 'r', encoding="utf-8") as source:
    segmented_text = source.read()

# Configure the word cloud. To leave specific words out of the picture, pass
# e.g. stopwords={'创新', '创业'} as an additional keyword argument.
cloud = wordcloud.WordCloud(
    font_path='msyh.ttc',
    width=1000,
    height=700,
    background_color='white',
    collocations=False,
    min_font_size=10,
)
cloud.generate(segmented_text)
cloud.to_file('2018_grwordcloud.png')

LDA模型

# coding=utf-8
# Load the corpus: every line of the file is treated as one document.
print('读取语料')
# `corpus` is a list holding the stripped content of all_jieba_output_2.txt,
# one element per line. The `with` block makes sure the file gets closed.
with open('all_jieba_output_2.txt', 'r', encoding='utf-8') as corpus_file:
    corpus = [line.strip() for line in corpus_file]
# print(corpus)
from sklearn.feature_extraction.text import CountVectorizer

# Convert the documents into a term-count matrix: element a[i][j] is the
# number of times term j occurs in document i.
print('词语转换为词频矩阵')
vectorizer = CountVectorizer()
print(vectorizer)

# Learn the vocabulary and count the occurrences of each term per document.
X = vectorizer.fit_transform(corpus)

# Vocabulary terms, in column order. get_feature_names() was removed in
# scikit-learn 1.2; prefer get_feature_names_out() and fall back to the old
# method on versions that do not have it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    word = list(vectorizer.get_feature_names_out())
else:
    word = vectorizer.get_feature_names()

# Dense array holding the term counts.
weight = X.toarray()
print(X)
print(len(weight))
# First 5 rows and first 15 columns of the count matrix.
print(weight[:5, :15])
# Fit the LDA model on the term-count matrix.
print('------LDA算法------:')
import numpy as np
import lda
import lda.datasets

# n_topics is the number of topics and n_iter the number of training
# iterations.
model = lda.LDA(n_topics=5, n_iter=500, random_state=2)
model.fit(np.asarray(weight))
# model.fit_transform(X) is also available

# Topic-word distribution: the distribution of vocabulary terms in each topic.
print('------主题词分布------:')
topic_word = model.topic_word_
# '\n' puts the matrix on its own line (the backslash was missing before, so a
# literal "n" was printed).
print("topic-word:\n", topic_word)

# Document-topic distribution.
print('------文档-主题（Document-Topic）分布------:')
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))

# Most probable topic for the first five documents (fewer if the corpus is
# smaller, instead of failing with an IndexError).
print('------输出5篇文章最可能的Topic------:')
label = []
for n in range(min(5, len(doc_topic))):
    topic_most_pr = doc_topic[n].argmax()
    label.append(topic_most_pr)
    print("doc: {} topic: {}".format(n, topic_most_pr))

# Top-N terms of every topic.
print('------计算topN关词------:')
n = 10
for i, word_weight in enumerate(topic_word):
    # Indices that sort the weights in ascending order.
    distIndexArr = np.argsort(word_weight)
    # Walk backwards from the end to take the n largest weights, biggest first.
    topN_index = distIndexArr[:-(n + 1):-1]
    topN_words = np.array(word)[topN_index]
    print(u'*Topic {}\n- {}'.format(i, ' '.join(topN_words)))
# Draw the document-topic distribution: one stem plot per document, for
# documents 0 through 4, stacked in five rows of an 8x8 figure.
import matplotlib.pyplot as plt

f, ax = plt.subplots(5, 1, figsize=(8, 8), sharex=False)
for i, (panel, k) in enumerate(zip(ax, range(5))):
    panel.stem(doc_topic[k, :], linefmt='r-', markerfmt='ro', basefmt='w-')
    # Axis ranges: x is the topic index, y the probability.
    panel.set_xlim(-1, 5)
    panel.set_ylim(0, 0.8)
    panel.set_title("Document {}".format(k))
    panel.set_ylabel("Prob")
# Only the last row carries the x-axis label.
ax[4].set_xlabel("Topic")
plt.tight_layout()
plt.show()
# Draw the topic-word distribution: one stem plot per topic, for topics 0
# through 4.
import matplotlib.pyplot as plt

f, ax = plt.subplots(5, 1, figsize=(6, 6), sharex=True)
# The x axis runs over the columns of topic_word, so size it from the data
# rather than from a fixed number.
vocab_size = topic_word.shape[1]
for i, k in enumerate([0, 1, 2, 3, 4]):
    ax[i].stem(topic_word[k, :], linefmt='b-',
               markerfmt='bo', basefmt='w-')
    ax[i].set_xlim(0, vocab_size)
    ax[i].set_ylim(0, 0.15)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("topic {}".format(k))
# Put the x-axis label under the bottom subplot rather than in the middle of
# the figure.
ax[-1].set_xlabel("word")
plt.tight_layout()
plt.show()

以上只是一个简单的代码实战，有些思路还需要梳理，我会在后面的文章中慢慢写到。所有代码均可正常运行；如无法正常运行，请检查文件路径是否正确、所需的包是否已经安装。