LSTM模型---情感分析(文本评价分类)


import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.models import word2vec
import jieba
import tensorflow as tf
import numpy as np
import time
from random import randint
#shuffle() 方法将序列的所有元素随机排序。
from random import shuffle
def makeStopWord():
    """Load the stop-word list from ``停用词.txt`` and tokenize it with jieba.

    Every line of the file is segmented with ``jieba.lcut`` in precise mode
    (``cut_all=False``) and all resulting tokens are collected.

    Returns:
        list[str]: every token produced from the stop-word file, in file
        order (duplicates are kept).
    """
    stopWord = []
    with open('停用词.txt', 'r', encoding='utf-8') as f:
        for line in f:
            # Collect every token of the line; without this step the
            # function would always return an empty list.
            stopWord.extend(jieba.lcut(line, cut_all=False))
    return stopWord

def words2Array(lineList):
    """Convert tokenized sentences into zero-padded word-vector matrices.

    Only the first ``MAX_SIZE`` tokens of each sentence are considered.
    Tokens known to the word2vec ``model`` are stored first, in sentence
    order; every remaining row is an all-zero vector of length ``dimsh``, so
    each sentence always yields exactly ``MAX_SIZE`` rows.

    Relies on the module-level names ``MAX_SIZE``, ``dimsh`` and ``model``
    being defined before the call.

    Args:
        lineList: list of sentences, each a list of word tokens.

    Returns:
        tuple: ``(linesArray, steps)`` where ``linesArray`` is a 3-D array of
        shape ``(len(lineList), MAX_SIZE, dimsh)`` and ``steps`` is a 1-D
        array holding, per sentence, the number of in-vocabulary tokens that
        were stored.
    """
    linesArray = []
    steps = []
    for line in lineList:
        # Start from an all-zero matrix so padding needs no extra work.
        sentence = np.zeros((MAX_SIZE, dimsh))
        valid = 0
        for word in line[:MAX_SIZE]:
            try:
                # NOTE(review): the vector lookup was lost from the source;
                # `model.wv[word]` is assumed (it raises KeyError for
                # out-of-vocabulary words) -- confirm against the gensim
                # version in use.
                vector = model.wv[word]
            except KeyError:
                # Out-of-vocabulary word: skip it, its slot stays zero at
                # the end of the matrix.
                continue
            sentence[valid] = vector
            valid += 1
        steps.append(valid)
        linesArray.append(sentence)
    linesArray = np.array(linesArray)  # 3-D: sentences x MAX_SIZE x dimsh
    steps = np.array(steps)  # one valid-word count per sentence
    return linesArray, steps

def convert2Data(posArray, negArray, posStep, negStep):
    """Merge positive and negative samples into one shuffled, labelled set.

    Each positive sample is labelled ``[1, 0]`` and each negative sample
    ``[0, 1]``. Samples are shuffled together so that a sample's matrix,
    valid-word count and label stay aligned.

    Args:
        posArray: sequence of per-sentence matrices for positive reviews.
        negArray: sequence of per-sentence matrices for negative reviews.
        posStep: valid-word counts, parallel to ``posArray``.
        negStep: valid-word counts, parallel to ``negArray``.

    Returns:
        tuple: ``(data, steps, labels)`` -- ``data`` and ``steps`` are numpy
        arrays, ``labels`` is a plain list of two-element one-hot lists, all
        in the same shuffled order.
    """
    # A fresh label list is created per sample so no two samples share one.
    randIt = [(matrix, count, [1, 0])
              for matrix, count in zip(posArray, posStep)]
    randIt.extend((matrix, count, [0, 1])
                  for matrix, count in zip(negArray, negStep))
    # Mix the two classes; otherwise all positives would precede negatives.
    shuffle(randIt)

    data = np.array([sample[0] for sample in randIt])
    steps = np.array([sample[1] for sample in randIt])
    labels = [sample[2] for sample in randIt]
    return data, steps, labels

def getWords(file):
    """Read a UTF-8 text file and tokenize it line by line.

    Each line is segmented with ``jieba.lcut`` in precise mode and tokens
    contained in the module-level ``stopWord`` collection are dropped.

    Args:
        file: path of the text file; one review per line.

    Returns:
        list[list[str]]: one token list per line of the file, in file order.
    """
    lineList = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Remove the newline character ('\n'), not the letter 'n'.
            trans = jieba.lcut(line.replace('\n', ''), cut_all=False)
            lineList.append([word for word in trans if word not in stopWord])
    return lineList

def makeData(posPath,negPath):
    """Build a labelled data set from a positive and a negative review file.

    Both files are tokenized with ``getWords`` (their line counts are
    printed), vectorized with ``words2Array`` and finally combined by
    ``convert2Data``.

    Args:
        posPath: path of the file with positive reviews.
        negPath: path of the file with negative reviews.

    Returns:
        The ``(data, steps, labels)`` triple produced by ``convert2Data``.
    """
    positive_lines = getWords(posPath)
    print("The positive data's length is :",len(positive_lines))
    negative_lines = getWords(negPath)
    print("The negative data's length is :",len(negative_lines))

    pos_matrix, pos_counts = words2Array(positive_lines)
    neg_matrix, neg_counts = words2Array(negative_lines)

    merged = convert2Data(pos_matrix, neg_matrix, pos_counts, neg_counts)
    data_part, step_part, label_part = merged
    return data_part, step_part, label_part

# Word60.model      60-dimensional vectors
# word2vec.model    200-dimensional vectors

word2vec_path = 'word2vec/word2vec.model'
# NOTE(review): the statements defining `model`, `dimsh` and `MAX_SIZE` were
# missing although all three names are used elsewhere in this file; the
# definitions below are inferred -- confirm before relying on them.
model = gensim.models.Word2Vec.load(word2vec_path)
dimsh = model.vector_size  # embedding width
MAX_SIZE = 25  # maximum number of words kept per review
stopWord = makeStopWord()

# NOTE(review): the second argument of both makeData calls was truncated in
# the source; the negative-review paths are assumed to mirror the positive
# ones -- confirm the actual file names.
print("In train data:")
trainData, trainSteps, trainLabels = makeData('data/B/Pos-train.txt',
                                              'data/B/Neg-train.txt')
print("In test data:")
testData, testSteps, testLabels = makeData('data/B/Pos-test.txt',
                                           'data/B/Neg-test.txt')
# Training labels are sliced per batch below, so they become an array;
# test labels stay a list of lists because `.index(1)` is called on them.
trainLabels = np.array(trainLabels)

# The word2vec model is no longer needed once the data is vectorized.
del model

print("The trainData's shape is:",trainData.shape)
print("The testData's shape is:",testData.shape)
print("The trainSteps's shape is:",trainSteps.shape)
print("The testSteps's shape is:",testSteps.shape)
print("The trainLabels's shape is:",trainLabels.shape)
print("The testLabels's shape is:",np.array(testLabels).shape)

num_nodes = 128  # LSTM hidden units
batch_size = 16  # samples per training step
output_size = 2  # number of classes

graph = tf.Graph()
with graph.as_default():
    # Training inputs are fed batch by batch; the placeholder shapes must
    # match the slices of trainData / trainSteps / trainLabels that are fed.
    tf_train_dataset = tf.placeholder(tf.float32,shape=(batch_size,MAX_SIZE,dimsh))
    tf_train_steps = tf.placeholder(tf.int32,shape=(batch_size,))
    tf_train_labels = tf.placeholder(tf.float32,shape=(batch_size,output_size))

    # The whole test set is embedded in the graph as constants.
    tf_test_dataset = tf.constant(testData,tf.float32)
    tf_test_steps = tf.constant(testSteps,tf.int32)

    # NOTE(review): the end of this call was truncated in the source;
    # `state_is_tuple = True` is assumed because model() reads
    # `last_states[1]` below -- confirm.
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units = num_nodes,
                                             state_is_tuple = True)

    # tf.truncated_normal(shape, mean, stddev) draws the initial values:
    # `shape` is the tensor shape, `stddev` the standard deviation.
    w1 = tf.Variable(tf.truncated_normal([num_nodes,num_nodes // 2], stddev=0.1))
    b1 = tf.Variable(tf.truncated_normal([num_nodes // 2], stddev=0.1))
    w2 = tf.Variable(tf.truncated_normal([num_nodes // 2, 2], stddev=0.1))
    b2 = tf.Variable(tf.truncated_normal([2], stddev=0.1))

    def model(dataset, steps):
        """Return class logits for a batch of padded sentences.

        Args:
            dataset: float tensor of sentence matrices fed to the LSTM.
            steps: int tensor passed to ``dynamic_rnn`` as
                ``sequence_length`` (valid words per sentence).

        Returns:
            Tensor of logits with two columns (one per class).
        """
        outputs, last_states = tf.nn.dynamic_rnn(cell = lstm_cell,
                                                 dtype = tf.float32,
                                                 sequence_length = steps,
                                                 inputs = dataset)
        # Second element of the final state tuple, pushed through two
        # linear layers (no activation in between).
        hidden = last_states[1]
        hidden = tf.matmul(hidden, w1) + b1
        logits = tf.matmul(hidden, w2) + b2
        return logits

    train_logits = model(tf_train_dataset, tf_train_steps)
    # tf.reduce_mean without an axis averages over all elements, i.e. over
    # the per-sample losses of the batch.
    # NOTE(review): the argument of reduce_mean was truncated in the source;
    # softmax cross-entropy between the one-hot labels and the logits is
    # assumed -- confirm.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels = tf_train_labels,
                                                logits = train_logits))
    optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
    # Softmax turns the test logits into per-class probabilities, one row
    # per test sentence, e.g. [[0.8, 0.2], [0.4, 0.6]].
    test_prediction = tf.nn.softmax(model(tf_test_dataset, tf_test_steps))

summary_frequency = 500  # report loss/accuracy every this many steps

# NOTE(review): the assignments of timeA/timeB were missing from the source
# although both are printed at the end; wall-clock timing around the session
# is assumed.
timeA = time.time()
with tf.Session(graph = graph) as session:
    # NOTE(review): no variable initialisation was visible in the source;
    # it is required before the first training step.
    session.run(tf.global_variables_initializer())
    mean_loss = 0
    steps_in_window = 0
    for data_index in range(len(trainData)+1):
        # Walk through the training set in batch_size strides, wrapping
        # around so that a full batch is always available.
        offset = (data_index * batch_size) % (len(trainLabels)-batch_size)
        feed_dict={tf_train_dataset:trainData[offset:offset + batch_size],
                   tf_train_labels:trainLabels[offset:offset + batch_size],
                   tf_train_steps:trainSteps[offset:offset + batch_size]}
        _, l = session.run([optimizer,loss],
                           feed_dict = feed_dict)
        mean_loss += l
        steps_in_window += 1
        if data_index >0 and data_index % summary_frequency == 0:
            # Divide by the number of losses actually accumulated: the first
            # window also contains step 0, i.e. summary_frequency + 1 values.
            mean_loss = mean_loss / steps_in_window
            print("The data_index is: %d"%(data_index))
            print("In train data,the loss is:%.4f"%(mean_loss))
            mean_loss = 0
            steps_in_window = 0
            acrc = 0
            prediction = session.run(test_prediction)
            # A test sentence counts as correct when the probability given
            # to its true class (the position of the 1 in its one-hot label)
            # exceeds 0.5. This matches "pick the most probable class" only
            # for two classes; a multi-class setup would need an argmax.
            for i in range(len(prediction)):
                if prediction[i][testLabels[i].index(1)] > 0.5:
                    acrc = acrc + 1
            print("In test data,the accuracy is:%.2f%%"%((acrc/len(testLabels))*100))
timeB = time.time()
print("time cost:",int(timeB-timeA))


In train data:
The positive data's length is : 9701
The negative data's length is : 9429
In test data:
The positive data's length is : 995
The negative data's length is : 999
The trainData's shape is: (19130, 25, 200)
The testData's shape is: (1994, 25, 200)
The trainSteps's shape is: (19130,)
The testSteps's shape is: (1994,)
The trainLabels's shape is: (19130, 2)
The testLabels's shape is: (1994, 2)
The data_index is: 500
In train data,the loss is:0.6226
In test data,the accuracy is:69.56%
The data_index is: 1000
In train data,the loss is:0.5280
In test data,the accuracy is:74.17%
The data_index is: 1500
In train data,the loss is:0.4674
In test data,the accuracy is:77.68%
The data_index is: 2000
In train data,the loss is:0.4277
In test data,the accuracy is:80.59%
The data_index is: 2500
In train data,the loss is:0.3888
In test data,the accuracy is:82.15%
The data_index is: 3000
In train data,the loss is:0.3724
In test data,the accuracy is:83.05%
The data_index is: 3500
In train data,the loss is:0.3435
In test data,the accuracy is:83.75%
The data_index is: 4000
In train data,the loss is:0.3362
In test data,the accuracy is:86.06%
The data_index is: 4500
In train data,the loss is:0.3024
In test data,the accuracy is:87.36%
The data_index is: 5000
In train data,the loss is:0.2957
In test data,the accuracy is:88.11%
The data_index is: 5500
In train data,the loss is:0.2755
In test data,the accuracy is:89.27%
The data_index is: 6000
In train data,the loss is:0.2520
In test data,the accuracy is:89.82%
The data_index is: 6500
In train data,the loss is:0.2400
In test data,the accuracy is:88.67%
The data_index is: 7000
In train data,the loss is:0.2213
In test data,the accuracy is:91.02%
The data_index is: 7500
In train data,the loss is:0.2174
In test data,the accuracy is:92.43%
The data_index is: 8000
In train data,the loss is:0.1927
In test data,the accuracy is:92.68%
The data_index is: 8500
In train data,the loss is:0.1801
In test data,the accuracy is:93.28%
The data_index is: 9000
In train data,the loss is:0.1639
In test data,the accuracy is:92.73%
The data_index is: 9500
In train data,the loss is:0.1471
In test data,the accuracy is:94.68%
The data_index is: 10000
In train data,the loss is:0.1404
In test data,the accuracy is:93.83%
The data_index is: 10500
In train data,the loss is:0.1251
In test data,the accuracy is:94.13%
The data_index is: 11000
In train data,the loss is:0.1196
In test data,the accuracy is:94.38%
The data_index is: 11500
In train data,the loss is:0.0992
In test data,the accuracy is:94.73%
The data_index is: 12000
In train data,the loss is:0.0894
In test data,the accuracy is:94.73%
The data_index is: 12500
In train data,the loss is:0.0773
In test data,the accuracy is:94.68%
The data_index is: 13000
In train data,the loss is:0.0678
In test data,the accuracy is:95.49%
The data_index is: 13500
In train data,the loss is:0.0688
In test data,the accuracy is:96.24%
The data_index is: 14000
In train data,the loss is:0.0527
In test data,the accuracy is:94.78%
The data_index is: 14500
In train data,the loss is:0.0508
In test data,the accuracy is:95.79%
The data_index is: 15000
In train data,the loss is:0.0432
In test data,the accuracy is:95.54%
The data_index is: 15500
In train data,the loss is:0.0585
In test data,the accuracy is:94.73%
The data_index is: 16000
In train data,the loss is:0.0418
In test data,the accuracy is:95.79%
The data_index is: 16500
In train data,the loss is:0.0318
In test data,the accuracy is:95.44%
The data_index is: 17000
In train data,the loss is:0.0397
In test data,the accuracy is:96.04%
The data_index is: 17500
In train data,the loss is:0.0287
In test data,the accuracy is:95.74%
The data_index is: 18000
In train data,the loss is:0.0215
In test data,the accuracy is:95.14%
The data_index is: 18500
In train data,the loss is:0.0464
In test data,the accuracy is:95.09%
The data_index is: 19000
In train data,the loss is:0.0259
In test data,the accuracy is:95.59%
time cost: 662







