Overview
import warnings
# suppress gensim's UserWarning messages
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.models import word2vec
import jieba
import tensorflow as tf
import numpy as np
import time
# the random module provides functions for generating random numbers;
# randint() returns an integer within a specified range
from random import randint
# shuffle() randomly reorders the elements of a sequence in place
from random import shuffle
#----------------------------------
# build the stop-word list by reading the file 停用词.txt
def makeStopWord():
    with open('停用词.txt','r',encoding = 'utf-8') as f:
        lines = f.readlines()
    stopWord = []
    for line in lines:
        words = jieba.lcut(line,cut_all = False)
        for word in words:
            stopWord.append(word)
    return stopWord
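# A minimal illustration of jieba's precise-mode segmentation (the output shown
# is indicative; the actual split depends on jieba's dictionary):
#   jieba.lcut('这部电影很好看', cut_all=False)  ->  ['这部', '电影', '很', '好看']
# Every token produced from every line of 停用词.txt is appended to the list.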
def words2Array(lineList):
    linesArray=[]
    wordsArray=[]
    steps = []
    for line in lineList:
        t = 0
        p = 0
        for i in range(MAX_SIZE):# maximum words kept per review (25: longer reviews are truncated, shorter ones padded)
            if i<len(line):
                try:# append the word vector of each word in the line
                    wordsArray.append(model.wv.word_vec(line[i]))
                    p = p + 1
                except KeyError:# the word is not in the word2vec vocabulary
                    t=t+1
                    continue
            else:# reviews shorter than 25 words are padded with 200-dim zero vectors
                wordsArray.append(np.array([0.0]*dimsh))
        for i in range(t):# pad one more zero vector per out-of-vocabulary word
            wordsArray.append(np.array([0.0]*dimsh))
        steps.append(p)# the number of valid (in-vocabulary) words in this review
        linesArray.append(wordsArray)# each review becomes a 25x200 matrix
        wordsArray = []
    linesArray = np.array(linesArray)# 3-D array
    steps = np.array(steps)# valid-word count per review; one element per review
    return linesArray, steps
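# Shape sketch for three hypothetical tokenized reviews of 12, 40 and 7
# in-vocabulary words (with MAX_SIZE=25 and dimsh=200):
#   linesArray.shape -> (3, 25, 200)       # every review is exactly 25 rows
#   steps            -> array([12, 25, 7]) # valid-word counts, capped at 25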
def convert2Data(posArray, negArray, posStep, negStep):
    randIt = []
    data = []
    steps = []
    labels = []
    for i in range(len(posArray)):
        # positive review: 25x200 matrix, valid-word count, label;
        # with 3 classes the label would be [1,0,0]
        randIt.append([posArray[i], posStep[i], [1,0]])
    for i in range(len(negArray)):# negative review: 25x200 matrix, valid-word count, label
        randIt.append([negArray[i], negStep[i], [0,1]])
    shuffle(randIt)# shuffle positives and negatives together
    for i in range(len(randIt)):
        data.append(randIt[i][0])# the 25x200 matrix of each review goes into data
        steps.append(randIt[i][1])# the valid-word count of each review goes into steps
        labels.append(randIt[i][2])# the one-hot label of each review goes into labels
    data = np.array(data)
    steps = np.array(steps)
    return data, steps, labels
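# An illustration (hypothetical values) of the entries before shuffling:
#   [pos_matrix_0, 14, [1, 0]]   # positive review with 14 valid words
#   [neg_matrix_0,  9, [0, 1]]   # negative review with  9 valid words
# shuffle(randIt) randomizes the order, but each matrix stays aligned with its
# step count and label because the three travel together in one list entry.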
def getWords(file):
    wordList = []
    trans = []
    lineList = []
    with open(file,'r',encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        # strip the trailing newline before segmenting
        trans = jieba.lcut(line.replace('\n',''), cut_all = False)
        for word in trans:
            if word not in stopWord:
                wordList.append(word)
        lineList.append(wordList)
        wordList = []
    return lineList
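# Return-value sketch (tokens are hypothetical):
#   [['这部', '电影', '好看'], ['剧情', '拖沓'], ...]
# i.e. one token list per input line, with stop words already removed.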
def makeData(posPath,negPath):
    # tokenize; the return type is [[word1,word2,...],[word1,word2,...],...]
    pos = getWords(posPath)
    print("The positive data's length is :",len(pos))
    neg = getWords(negPath)
    print("The negative data's length is :",len(neg))
    # convert the reviews into arrays
    posArray, posSteps = words2Array(pos)
    negArray, negSteps = words2Array(neg)
    # mix the positive and negative data, shuffle, and build the dataset
    Data, Steps, Labels = convert2Data(posArray, negArray, posSteps, negSteps)
    return Data, Steps, Labels
#----------------------------------------------
# Word60.model     60-dimensional
# word2vec.model   200-dimensional
timeA=time.time()
word2vec_path = 'word2vec/word2vec.model'
model=gensim.models.Word2Vec.load(word2vec_path)
dimsh=model.vector_size
MAX_SIZE=25
stopWord = makeStopWord()
print("In train data:")
# trainSteps is a 1-D array; len(trainSteps) equals the number of reviews,
# and each element is the valid-word count of one review
# trainData has shape 19130x25x200
# trainLabels has shape 19130x2
trainData, trainSteps, trainLabels = makeData('data/B/Pos-train.txt',
                                              'data/B/Neg-train.txt')
print("In test data:")
testData, testSteps, testLabels = makeData('data/B/Pos-test.txt',
                                           'data/B/Neg-test.txt')
trainLabels = np.array(trainLabels)
del model
print("-"*30)
print("The trainData's shape is:",trainData.shape)
print("The testData's shape is:",testData.shape)
print("The trainSteps's shape is:",trainSteps.shape)
print("The testSteps's shape is:",testSteps.shape)
print("The trainLabels's shape is:",trainLabels.shape)
print("The testLabels's shape is:",np.array(testLabels).shape)
num_nodes = 128
batch_size = 16
output_size = 2

graph = tf.Graph()
with graph.as_default():
    # the placeholder must have the same 3-D shape as trainData
    tf_train_dataset = tf.placeholder(tf.float32,shape=(batch_size,MAX_SIZE,dimsh))
    # the valid-word count of every sample in the batch
    tf_train_steps = tf.placeholder(tf.int32,shape=(batch_size))
    # tf_train_labels has the same 2-D shape as trainLabels
    tf_train_labels = tf.placeholder(tf.float32,shape=(batch_size,output_size))
    tf_test_dataset = tf.constant(testData,tf.float32)# constant
    tf_test_steps = tf.constant(testSteps,tf.int32)# constant
    # tf.nn.rnn_cell.BasicLSTMCell defines a single basic LSTM cell;
    # num_units is the number of hidden units.
    # With state_is_tuple=True the state is a tuple, state=(c,h);
    # with False it is c and h concatenated into one tensor, state=tf.concat(1,[c,h]).
    # The forget_bias argument is the bias added to the forget gate (default 1.0).
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = num_nodes,
                                             state_is_tuple=True)
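    # State-shape sketch for this configuration (batch_size=16, num_nodes=128):
    # after a batch is processed, the state tuple is (c, h) with
    # c.shape == h.shape == (16, 128).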
    # tf.truncated_normal(shape, mean, stddev): shape gives the dimensions of the
    # generated tensor, mean the mean, and stddev the standard deviation.
    # It draws w1's initial values from a truncated normal distribution.
    w1 = tf.Variable(tf.truncated_normal([num_nodes,num_nodes // 2], stddev=0.1))
    # a random 1-D array with num_nodes // 2 elements
    b1 = tf.Variable(tf.truncated_normal([num_nodes // 2], stddev=0.1))
    # for a 3-class problem w2 would have 3 columns
    w2 = tf.Variable(tf.truncated_normal([num_nodes // 2, 2], stddev=0.1))
    # a random 1-D array with output_size (=2) elements; 3 for a 3-class problem
    b2 = tf.Variable(tf.truncated_normal([2], stddev=0.1))
    # during training, each word fed to the model is one step
    def model(dataset, steps):
        # dynamic_rnn returns two values: the output at every step, and the final state.
        # sequence_length is the valid-word count of each review, passed in via steps
        outputs, last_states = tf.nn.dynamic_rnn(cell = lstm_cell,
                                                 dtype = tf.float32,
                                                 sequence_length = steps,
                                                 inputs = dataset)
        # the RNN produces one state per word (one cell step per word); only the
        # state after the last valid word is used here.
        # last_states[1]=last_states.h is the final hidden state, shape [batch_size,num_nodes]
        # last_states[0]=last_states.c is the final cell state, shape [batch_size,num_nodes]
        # outputs stacks the h of every step, shape [batch_size,step,num_nodes]
        hidden = last_states[1]
        hidden = tf.matmul(hidden, w1) + b1
        logits = tf.matmul(hidden, w2) + b2
        # logits is the output of the network's last layer; with a batch its shape
        # is [batch_size,num_classes]: one unnormalized score per class
        return logits
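    # Shape walk-through for one training batch (batch_size=16, MAX_SIZE=25,
    # dimsh=200, num_nodes=128):
    #   dataset (16, 25, 200) -> dynamic_rnn -> outputs (16, 25, 128)
    #   last_states.h (16, 128) -> w1,b1 -> hidden (16, 64) -> w2,b2 -> logits (16, 2)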
    train_logits = model(tf_train_dataset, tf_train_steps)
    # tf.reduce_mean(x) averages over all elements;
    # tf.reduce_mean(x, axis=0) averages over each column;
    # tf.reduce_mean(x, axis=1) averages over each row
    loss = tf.reduce_mean(
        # logits: the last layer's output; with a batch the shape is
        #         [batch_size,num_classes], for a single sample [num_classes]
        # labels: the true labels, with the same shape as logits.
        # Step 1: softmax-normalize the logits into per-class probabilities
        # Step 2: take the cross-entropy against the true labels, giving one value per sample
        # Step 3: reduce_mean averages those values over the batch
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels,
                                                logits=train_logits))
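    # Worked numeric example (hypothetical values): for one sample with
    # logits [2.0, 0.5] and label [1, 0]:
    #   softmax: [e^2.0, e^0.5] / (e^2.0 + e^0.5) ~= [0.818, 0.182]
    #   cross-entropy: -ln(0.818) ~= 0.201
    # The loss is the mean of this value over the 16 samples in the batch.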
    # learning rate 0.01
    optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
    # test_prediction holds the model's class probabilities for the test set,
    # one row per sample, e.g.
    # [[0.6,0.4],
    #  [0.8,0.2],
    #  [0.4,0.6]]
    test_prediction = tf.nn.softmax(model(tf_test_dataset, tf_test_steps))
summary_frequency = 500

with tf.Session(graph = graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    # slide a window of batch_size over the training data, starting from the
    # first review; the offset wraps around once it reaches the end
    for data_index in range(len(trainData)+1):
        offset = (data_index * batch_size) % (len(trainLabels)-batch_size)
        # feed_dict assigns values to the tensors created with tf.placeholder;
        # each step feeds batch_size consecutive samples starting at offset
        feed_dict={tf_train_dataset:trainData[offset:offset + batch_size],
                   tf_train_labels:trainLabels[offset:offset + batch_size],
                   tf_train_steps:trainSteps[offset:offset + batch_size]}
        _, l = session.run([optimizer,loss],
                           feed_dict = feed_dict)
        mean_loss += l
        if data_index >0 and data_index % summary_frequency == 0:
            mean_loss = mean_loss / summary_frequency
            print("The data_index is: %d"%(data_index))
            print("In train data,the loss is:%.4f"%(mean_loss))
            mean_loss = 0
            acrc = 0
            prediction = session.run(test_prediction)
            for i in range(len(prediction)):
                # testLabels[i].index(1) is the position of the 1 in the i-th
                # one-hot label, i.e. the true class index.
                # testLabels looks like [[0,1],
                #                        [1,0],
                #                        [0,1], ...]
                # prediction has one column per class.
                # For a 3-class problem the label rows would look like [0,1,0]
                # and the prediction rows like [0.6,0.2,0.2].
                # A probability above 0.5 at the true index counts as a correct
                # classification (for 3 classes, above 1/3 would be the analogue)
                if prediction[i][testLabels[i].index(1)] > 0.5:
                    acrc = acrc + 1
            print("In test data,the accuracy is:%.2f%%"%((acrc/len(testLabels))*100))
#####################################
timeB=time.time()
# print the total running time
print("time cost:",int(timeB-timeA))
The training output is as follows:
In train data:
The positive data's length is : 9701
The negative data's length is : 9429
In test data:
The positive data's length is : 995
The negative data's length is : 999
------------------------------
The trainData's shape is: (19130, 25, 200)
The testData's shape is: (1994, 25, 200)
The trainSteps's shape is: (19130,)
The testSteps's shape is: (1994,)
The trainLabels's shape is: (19130, 2)
The testLabels's shape is: (1994, 2)
Initialized
The data_index is: 500
In train data,the loss is:0.6226
In test data,the accuracy is:69.56%
The data_index is: 1000
In train data,the loss is:0.5280
In test data,the accuracy is:74.17%
The data_index is: 1500
In train data,the loss is:0.4674
In test data,the accuracy is:77.68%
The data_index is: 2000
In train data,the loss is:0.4277
In test data,the accuracy is:80.59%
The data_index is: 2500
In train data,the loss is:0.3888
In test data,the accuracy is:82.15%
The data_index is: 3000
In train data,the loss is:0.3724
In test data,the accuracy is:83.05%
The data_index is: 3500
In train data,the loss is:0.3435
In test data,the accuracy is:83.75%
The data_index is: 4000
In train data,the loss is:0.3362
In test data,the accuracy is:86.06%
The data_index is: 4500
In train data,the loss is:0.3024
In test data,the accuracy is:87.36%
The data_index is: 5000
In train data,the loss is:0.2957
In test data,the accuracy is:88.11%
The data_index is: 5500
In train data,the loss is:0.2755
In test data,the accuracy is:89.27%
The data_index is: 6000
In train data,the loss is:0.2520
In test data,the accuracy is:89.82%
The data_index is: 6500
In train data,the loss is:0.2400
In test data,the accuracy is:88.67%
The data_index is: 7000
In train data,the loss is:0.2213
In test data,the accuracy is:91.02%
The data_index is: 7500
In train data,the loss is:0.2174
In test data,the accuracy is:92.43%
The data_index is: 8000
In train data,the loss is:0.1927
In test data,the accuracy is:92.68%
The data_index is: 8500
In train data,the loss is:0.1801
In test data,the accuracy is:93.28%
The data_index is: 9000
In train data,the loss is:0.1639
In test data,the accuracy is:92.73%
The data_index is: 9500
In train data,the loss is:0.1471
In test data,the accuracy is:94.68%
The data_index is: 10000
In train data,the loss is:0.1404
In test data,the accuracy is:93.83%
The data_index is: 10500
In train data,the loss is:0.1251
In test data,the accuracy is:94.13%
The data_index is: 11000
In train data,the loss is:0.1196
In test data,the accuracy is:94.38%
The data_index is: 11500
In train data,the loss is:0.0992
In test data,the accuracy is:94.73%
The data_index is: 12000
In train data,the loss is:0.0894
In test data,the accuracy is:94.73%
The data_index is: 12500
In train data,the loss is:0.0773
In test data,the accuracy is:94.68%
The data_index is: 13000
In train data,the loss is:0.0678
In test data,the accuracy is:95.49%
The data_index is: 13500
In train data,the loss is:0.0688
In test data,the accuracy is:96.24%
The data_index is: 14000
In train data,the loss is:0.0527
In test data,the accuracy is:94.78%
The data_index is: 14500
In train data,the loss is:0.0508
In test data,the accuracy is:95.79%
The data_index is: 15000
In train data,the loss is:0.0432
In test data,the accuracy is:95.54%
The data_index is: 15500
In train data,the loss is:0.0585
In test data,the accuracy is:94.73%
The data_index is: 16000
In train data,the loss is:0.0418
In test data,the accuracy is:95.79%
The data_index is: 16500
In train data,the loss is:0.0318
In test data,the accuracy is:95.44%
The data_index is: 17000
In train data,the loss is:0.0397
In test data,the accuracy is:96.04%
The data_index is: 17500
In train data,the loss is:0.0287
In test data,the accuracy is:95.74%
The data_index is: 18000
In train data,the loss is:0.0215
In test data,the accuracy is:95.14%
The data_index is: 18500
In train data,the loss is:0.0464
In test data,the accuracy is:95.09%
The data_index is: 19000
In train data,the loss is:0.0259
In test data,the accuracy is:95.59%
time cost: 662
The dataset, the stop-word list, and the trained word2vec model are available on request; join the QQ group (group number: 228735640) and ask me.