文本情感识别项目：Hinge Loss梯度下降公式推导和代码实现，doc2Vec和TF-IDF句子特征提取

68 阅读 0 评论 45 点赞

我是靠谱客的博主忧虑山水，这篇文章主要介绍文本情感识别项目：Hinge Loss梯度下降公式推导和代码实现，doc2Vec和TF-IDF句子特征提取，现在分享给大家，希望可以做个参考。

1、Hinge Loss权重参数更新公式推导和实现

1.1 推导梯度更新公式

损失函数：
$$L(\pmb{w}) = \max\{0,\ 1 - \pmb{w}\cdot\phi(\pmb{x})\,y\}$$
求梯度：
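$$\nabla_{\pmb{w}} L(\pmb{w}) = \begin{cases} -\phi(\pmb{x})\,y, & \pmb{w}\cdot\phi(\pmb{x})\,y < 1 \\ 0, & \pmb{w}\cdot\phi(\pmb{x})\,y \geq 1 \end{cases}$$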
使用梯度下降更新权重：
$$\begin{aligned} \pmb{w}_{t+1} &= \pmb{w}_{t} - \eta \nabla L(\pmb{w}_t) \end{aligned}$$

1.2 梯度更新公式代码实现

（1）实现梯度下降更新权重；

（2）实现带动量的梯度下降法更新权重；

复制代码

def learnPredictor(trainExamples, testExamples, featureExtractor, numIters, eta):
    """
    Train a linear classifier on the hinge loss with per-sample gradient descent.

    For one example with feature vector x and label y the loss is
    ``max(0, 1 - (w . x) * y)``. Its (sub)gradient with respect to w is
    ``-x * y`` while the margin ``(w . x) * y`` is below 1 and zero otherwise.
    Weights are updated after every example, optionally with momentum.

    Side effects: prints the mean loss of every epoch and the final training
    accuracy, and saves the weights to ``../result/result_500_300.npy``.

    @param trainExamples: training data, handed to ``featureExtractor``
    @param testExamples: test data; accepted for interface compatibility, unused
    @param featureExtractor: called as ``featureExtractor(trainExamples, 'doc2vec')``;
        must return an iterable of ``(feature, label)`` pairs
    @param numIters: number of epochs (full passes over the training data)
    @param eta: learning rate
    @return: the learned weight vector
    @raise ValueError: if the feature extractor returns no training examples
    """
    import os  # local import: only used to create the output directory

    train_feature_labels = list(featureExtractor(trainExamples, 'doc2vec'))
    data_num = len(train_feature_labels)
    if data_num == 0:
        # Fail early instead of hitting an opaque IndexError / ZeroDivisionError below.
        raise ValueError('featureExtractor returned no training examples')
    feature_len = len(train_feature_labels[0][0])
    # init_w is defined outside this block; presumably it returns an array of
    # shape (feature_len, 1) whose singleton axis is dropped here - TODO confirm.
    weights = np.squeeze(init_w(feature_len, 1))

    use_momentum = False  # set to True to use gradient descent with momentum
    momentum = 0.9
    velocity_W = 0  # initial velocity of the momentum update

    for epoch in range(1, numIters + 1):
        sample_losses = []
        for sample in train_feature_labels:
            feature, label = sample[0], sample[1]
            # The margin drives both the loss and the gradient: compute it once.
            margin = np.dot(feature, weights) * label
            sample_losses.append(max(0, 1 - margin))
            if margin < 1:
                grad_w = -feature * label
            else:
                grad_w = np.zeros(feature_len)
            if use_momentum:
                # A zero gradient still lets the accumulated velocity move the weights.
                velocity_W = momentum * velocity_W - eta * grad_w
                weights += velocity_W
            else:
                weights -= eta * grad_w
        print('epoch:{}, loss:{}'.format(epoch, np.mean(sample_losses)))

    # Training-set accuracy with the final weights.
    # NOTE(review): assumed to run once after training rather than once per
    # epoch - confirm against the original author's layout.
    right_count = 0
    for sample in train_feature_labels:
        y_hat = 1 if np.dot(sample[0], weights) >= 0 else -1
        if y_hat == sample[1]:
            right_count += 1
    print('correct_rate:{}'.format(right_count / data_num))

    # Create the target directory first so a finished training run is not lost
    # to a missing folder.
    output_path = "../result/result_500_300.npy"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    np.save(output_path, weights)
    return weights

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def learnPredictor(trainExamples, testExamples, featureExtractor, numIters, eta):
    """
    Train a linear classifier on the hinge loss with per-sample gradient descent.

    For one example with feature vector x and label y the loss is
    ``max(0, 1 - (w . x) * y)``. Its (sub)gradient with respect to w is
    ``-x * y`` while the margin ``(w . x) * y`` is below 1 and zero otherwise.
    Weights are updated after every example, optionally with momentum.

    Side effects: prints the mean loss of every epoch and the final training
    accuracy, and saves the weights to ``../result/result_500_300.npy``.

    @param trainExamples: training data, handed to ``featureExtractor``
    @param testExamples: test data; accepted for interface compatibility, unused
    @param featureExtractor: called as ``featureExtractor(trainExamples, 'doc2vec')``;
        must return an iterable of ``(feature, label)`` pairs
    @param numIters: number of epochs (full passes over the training data)
    @param eta: learning rate
    @return: the learned weight vector
    @raise ValueError: if the feature extractor returns no training examples
    """
    import os  # local import: only used to create the output directory

    train_feature_labels = list(featureExtractor(trainExamples, 'doc2vec'))
    data_num = len(train_feature_labels)
    if data_num == 0:
        # Fail early instead of hitting an opaque IndexError / ZeroDivisionError below.
        raise ValueError('featureExtractor returned no training examples')
    feature_len = len(train_feature_labels[0][0])
    # init_w is defined outside this block; presumably it returns an array of
    # shape (feature_len, 1) whose singleton axis is dropped here - TODO confirm.
    weights = np.squeeze(init_w(feature_len, 1))

    use_momentum = False  # set to True to use gradient descent with momentum
    momentum = 0.9
    velocity_W = 0  # initial velocity of the momentum update

    for epoch in range(1, numIters + 1):
        sample_losses = []
        for sample in train_feature_labels:
            feature, label = sample[0], sample[1]
            # The margin drives both the loss and the gradient: compute it once.
            margin = np.dot(feature, weights) * label
            sample_losses.append(max(0, 1 - margin))
            if margin < 1:
                grad_w = -feature * label
            else:
                grad_w = np.zeros(feature_len)
            if use_momentum:
                # A zero gradient still lets the accumulated velocity move the weights.
                velocity_W = momentum * velocity_W - eta * grad_w
                weights += velocity_W
            else:
                weights -= eta * grad_w
        print('epoch:{}, loss:{}'.format(epoch, np.mean(sample_losses)))

    # Training-set accuracy with the final weights.
    # NOTE(review): assumed to run once after training rather than once per
    # epoch - confirm against the original author's layout.
    right_count = 0
    for sample in train_feature_labels:
        y_hat = 1 if np.dot(sample[0], weights) >= 0 else -1
        if y_hat == sample[1]:
            right_count += 1
    print('correct_rate:{}'.format(right_count / data_num))

    # Create the target directory first so a finished training run is not lost
    # to a missing folder.
    output_path = "../result/result_500_300.npy"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    np.save(output_path, weights)
    return weights

2、特征提取

（1）实现doc2vec提取文本特征；

（2）实现tfidf提取文本特征；

2.1 doc2vec

doc2vec是一种提取句子和段落特征的方法，相比word2vec有所改进。论文《Distributed Representations of Sentences and Documents》提出了两种表示方式：PV-DM与PV-DBOW。

具体代码实现：

复制代码

from util import *
from gensim import corpora
from gensim.models.tfidfmodel import TfidfModel
import gensim
import numpy as np
def doc2vec_read_corpus(sentence_s, tokens_only=False):
    """
    Tokenize sentences for doc2vec, optionally tagging each one with its index.

    Every sentence is run through ``gensim.utils.simple_preprocess``. Per the
    gensim documentation that lowercases and tokenizes the text and drops very
    short or very long tokens; it does not remove stop words.

    @param sentence_s: iterable of sentence strings
    @param tokens_only: if True, yield plain token lists; otherwise yield
        ``TaggedDocument(tokens, [i])`` where ``i`` is the sentence's position
    @return: generator of token lists or ``TaggedDocument`` objects
    """
    for i, line in enumerate(sentence_s):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # Doc2Vec training needs a tag per document; the position is used.
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
def doc2vec_feature(sentence_s, data_type='train', vector_size=500, min_count=2, epochs=300):
    """
    Extract one doc2vec feature vector per sentence.

    A new Doc2Vec model is trained on the given sentences and then used to
    infer a vector for each of them.

    @param sentence_s: list of input sentences
    @param data_type: 'train' or 'test'. It is validated only; both values use
        the same pipeline, because ``build_vocab``/``train`` need tagged
        documents either way (the untagged 'test' path used to crash)
    @param vector_size: dimensionality of the feature vectors
    @param min_count: words with a lower total frequency are ignored by the model
    @param epochs: number of training epochs
    @return: list of feature vectors, one per sentence, in input order
    @raise ValueError: if ``data_type`` is neither 'train' nor 'test'
    """
    if data_type not in ('train', 'test'):
        raise ValueError("data_type must be 'train' or 'test', got {!r}".format(data_type))

    corpus = list(doc2vec_read_corpus(sentence_s, tokens_only=False))
    if not corpus:
        # Nothing to train on; skip building a model over an empty corpus.
        return []

    # Train the model on the tagged documents.
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # Infer a feature vector for every sentence with the trained model.
    return [model.infer_vector(doc.words) for doc in corpus]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from util import *
from gensim import corpora
from gensim.models.tfidfmodel import TfidfModel
import gensim
import numpy as np
def doc2vec_read_corpus(sentence_s, tokens_only=False):
    """
    Tokenize sentences for doc2vec, optionally tagging each one with its index.

    Every sentence is run through ``gensim.utils.simple_preprocess``. Per the
    gensim documentation that lowercases and tokenizes the text and drops very
    short or very long tokens; it does not remove stop words.

    @param sentence_s: iterable of sentence strings
    @param tokens_only: if True, yield plain token lists; otherwise yield
        ``TaggedDocument(tokens, [i])`` where ``i`` is the sentence's position
    @return: generator of token lists or ``TaggedDocument`` objects
    """
    for i, line in enumerate(sentence_s):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # Doc2Vec training needs a tag per document; the position is used.
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
def doc2vec_feature(sentence_s, data_type='train', vector_size=500, min_count=2, epochs=300):
    """
    Extract one doc2vec feature vector per sentence.

    A new Doc2Vec model is trained on the given sentences and then used to
    infer a vector for each of them.

    @param sentence_s: list of input sentences
    @param data_type: 'train' or 'test'. It is validated only; both values use
        the same pipeline, because ``build_vocab``/``train`` need tagged
        documents either way (the untagged 'test' path used to crash)
    @param vector_size: dimensionality of the feature vectors
    @param min_count: words with a lower total frequency are ignored by the model
    @param epochs: number of training epochs
    @return: list of feature vectors, one per sentence, in input order
    @raise ValueError: if ``data_type`` is neither 'train' nor 'test'
    """
    if data_type not in ('train', 'test'):
        raise ValueError("data_type must be 'train' or 'test', got {!r}".format(data_type))

    corpus = list(doc2vec_read_corpus(sentence_s, tokens_only=False))
    if not corpus:
        # Nothing to train on; skip building a model over an empty corpus.
        return []

    # Train the model on the tagged documents.
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # Infer a feature vector for every sentence with the trained model.
    return [model.infer_vector(doc.words) for doc in corpus]

2.2 TF-IDF

TF-IDF的主要思想是：如果某个单词在一篇文章中出现的频率TF高，并且在其他文章中很少出现，则认为此词或者短语具有很好的类别区分能力。 TF-IDF是一种统计方法，用以评估一字词对于一个文件集或一个语料库中的其中一份文件的重要程度。字词的重要性随着它在文件中出现的次数成正比增加，但同时会随着它在语料库中出现的频率成反比下降。

具体的代码实现：

需要注意的是，gensim得到的是一种稀疏矩阵的形式，训练的话需要转换成向量表示，由于较稀疏，使用PCA降低维度，保留重要的特征构建训练的向量列表。

复制代码

def tfidf_feature(sentence_s, data_type='train', is_sparse=False, n_components=500):
    """
    Turn a list of sentences into TF-IDF feature vectors.

    The sentences are tokenized, mapped to bag-of-words with a gensim
    ``Dictionary`` and weighted by a ``TfidfModel`` fitted on those same
    sentences.

    @param sentence_s: list of sentences
    @param data_type: 'train' or 'test'. It is validated only; both values use
        the same pipeline (the former 'test' path crashed on ``.words``)
    @param is_sparse: if True, return gensim's sparse form, one list of
        ``(term_id, weight)`` pairs per sentence; otherwise return a dense
        matrix reduced with PCA
    @param n_components: target dimensionality of the PCA reduction; capped at
        the number of sentences and the vocabulary size, which PCA cannot exceed
    @return: list of sparse vectors, or a 2-D numpy array with one row per sentence
    @raise ValueError: if ``data_type`` is neither 'train' nor 'test'
    """
    if data_type not in ('train', 'test'):
        raise ValueError("data_type must be 'train' or 'test', got {!r}".format(data_type))

    # Only the tokens are needed here, so no document tags are generated.
    sentence_processed = list(doc2vec_read_corpus(sentence_s, tokens_only=True))
    if not sentence_processed:
        return [] if is_sparse else np.zeros((0, 0))

    # Fit the TF-IDF model on the bag-of-words corpus.
    sentences_dict = corpora.Dictionary(sentence_processed)
    bow_corpus = [sentences_dict.doc2bow(tokens) for tokens in sentence_processed]
    model = TfidfModel(bow_corpus, normalize=True)

    if is_sparse:
        return [model[bow] for bow in bow_corpus]

    # Expand the sparse (term_id, weight) pairs into a dense document-term matrix.
    feature_s = np.zeros((len(bow_corpus), len(sentences_dict)))
    for row, bow in enumerate(bow_corpus):
        for term_id, weight in model[bow]:
            feature_s[row, term_id] = weight

    # Reduce dimensionality with PCA and keep the transformed matrix; the result
    # of fit_transform used to be discarded, so no reduction took place.
    # NOTE(review): PCA is not imported in the visible imports - presumably
    # sklearn.decomposition.PCA provided via `from util import *`; confirm.
    k = min(n_components, *feature_s.shape)
    if k > 0:
        feature_s = PCA(n_components=k).fit_transform(feature_s)
    return feature_s

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def tfidf_feature(sentence_s, data_type='train', is_sparse=False, n_components=500):
    """
    Turn a list of sentences into TF-IDF feature vectors.

    The sentences are tokenized, mapped to bag-of-words with a gensim
    ``Dictionary`` and weighted by a ``TfidfModel`` fitted on those same
    sentences.

    @param sentence_s: list of sentences
    @param data_type: 'train' or 'test'. It is validated only; both values use
        the same pipeline (the former 'test' path crashed on ``.words``)
    @param is_sparse: if True, return gensim's sparse form, one list of
        ``(term_id, weight)`` pairs per sentence; otherwise return a dense
        matrix reduced with PCA
    @param n_components: target dimensionality of the PCA reduction; capped at
        the number of sentences and the vocabulary size, which PCA cannot exceed
    @return: list of sparse vectors, or a 2-D numpy array with one row per sentence
    @raise ValueError: if ``data_type`` is neither 'train' nor 'test'
    """
    if data_type not in ('train', 'test'):
        raise ValueError("data_type must be 'train' or 'test', got {!r}".format(data_type))

    # Only the tokens are needed here, so no document tags are generated.
    sentence_processed = list(doc2vec_read_corpus(sentence_s, tokens_only=True))
    if not sentence_processed:
        return [] if is_sparse else np.zeros((0, 0))

    # Fit the TF-IDF model on the bag-of-words corpus.
    sentences_dict = corpora.Dictionary(sentence_processed)
    bow_corpus = [sentences_dict.doc2bow(tokens) for tokens in sentence_processed]
    model = TfidfModel(bow_corpus, normalize=True)

    if is_sparse:
        return [model[bow] for bow in bow_corpus]

    # Expand the sparse (term_id, weight) pairs into a dense document-term matrix.
    feature_s = np.zeros((len(bow_corpus), len(sentences_dict)))
    for row, bow in enumerate(bow_corpus):
        for term_id, weight in model[bow]:
            feature_s[row, term_id] = weight

    # Reduce dimensionality with PCA and keep the transformed matrix; the result
    # of fit_transform used to be discarded, so no reduction took place.
    # NOTE(review): PCA is not imported in the visible imports - presumably
    # sklearn.decomposition.PCA provided via `from util import *`; confirm.
    k = min(n_components, *feature_s.shape)
    if k > 0:
        feature_s = PCA(n_components=k).fit_transform(feature_s)
    return feature_s