Overview
Contents
- 1. Data Preprocessing
- 2. Defining the Network Structure
- 3. Data Splitting and Network Training
- 4. Code Listing
1. Data Preprocessing
The main difficulty in preprocessing is lists nested inside lists, sometimes up to five levels deep. Multi-dimensional data like this is easy to get wrong, especially inside for loops and when converting to a NumPy array. The nesting arises because each entity pair may carry several labels, and each (entity pair, label) combination may in turn correspond to several sentences.
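To make the nesting concrete, here is a toy illustration of the two dictionaries that the script below builds. The variable names match the script, but the values are invented and the sentences are shortened (real sentences hold 70 triples each), so treat it only as a sketch of the structure:

# train_ans: entity pair -> list of one-hot labels, one entry per distinct relation seen for that pair
train_ans = {('EntityA', 'EntityB'): [[0, 1, 0],   # first relation seen for this pair
                                      [0, 0, 1]]}  # second relation seen for this pair
# train_sen: entity pair -> per-label list -> list of sentences -> list of [word_id, rel_pos1, rel_pos2]
train_sen = {('EntityA', 'EntityB'): [
    [[[12, 61, 59], [34, 62, 60]]],                              # one sentence expressing the first relation
    [[[12, 58, 61], [9, 59, 62]], [[7, 60, 63], [12, 61, 64]]],  # two sentences expressing the second relation
]}
# Counting the dict itself, that is already five levels of nesting:
# pair -> label group -> sentence -> character -> [word_id, rel_pos1, rel_pos2]
print(len(train_sen[('EntityA', 'EntityB')][1]))  # 2 sentences under the second label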
Three features are built for every character in a sentence: the character embedding index, the position of the character relative to entity 1, and the position relative to entity 2. Because sentences differ in length (number of characters), a fixed length has to be chosen: sentences longer than this are truncated, and shorter ones are padded at the end with the special character BLANK, whose vector is randomly initialized.
The code is shown below:
import numpy as np

# Encode the distance between a character and an entity as a position id
def pos_embed(x):
    if x < -60:
        return 0
    if -60 <= x <= 60:
        return x + 61
    if x > 60:
        return 122

# Return the index of x in list y (used to look up a label), or -1 if not found
def find_index(x, y):
    flag = -1
    for i in range(len(y)):
        if x != y[i]:
            continue
        else:
            return i
    return flag
# Build vec & word2id
vec = []      # the list of word vectors
word2id = {}  # word -> index
f = open('./origin_data/vec.txt', encoding='utf-8')
content = f.readline()
content = content.strip().split()
dim = int(content[1])  # dimension of the word vectors
while True:
    content = f.readline()
    if content == '':
        break
    content = content.strip().split()
    word2id[content[0]] = len(word2id)  # index of the current word (starting from 0)
    content = content[1:]
    content = [(float)(i) for i in content]  # the word vector
    vec.append(content)
f.close()

# Further processing of word2id & vec
word2id['UNK'] = len(word2id)    # add unknown
word2id['BLANK'] = len(word2id)  # add blank
vec.append(np.random.normal(size=dim, loc=0, scale=0.05))  # vector for UNK
vec.append(np.random.normal(size=dim, loc=0, scale=0.05))  # vector for BLANK
vec = np.array(vec, dtype=np.float32)  # convert vec to an array
# Build relation2id
relation2id = {}
f = open('./origin_data/relation2id.txt', 'r', encoding='utf-8')
while True:
    content = f.readline()
    if content == '':
        break
    content = content.strip().split()
    relation2id[content[0]] = int(content[1])
f.close()
# Build train_sen & train_ans
# Case 1: one entity pair may have several labels, i.e. several samples (sentences) share the same entity pair
# Case 2: several samples may share both the entity pair and the label, i.e. one (entity pair, label) maps to several sentences
fixlen = 70  # every sentence is fixed to 70 characters
maxlen = 60  # relative distances beyond 60 are clipped by pos_embed
train_sen = {}  # case 2: for each entity pair, the value groups sentences by label; each sentence is an `output` list
train_ans = {}  # {entity pair: [[label1], [label2], ...]}, each label is a one-hot vector; case 1: the labels of each entity pair
print('reading train data...')
f = open('./origin_data/train.txt', 'r', encoding='utf-8')
while True:
    content = f.readline()  # process the file line by line
    if content == '':
        break
    content = content.strip().split()
    # get entity names
    en1 = content[0]  # entity 1
    en2 = content[1]  # entity 2
    relation = 0      # relation initialized to 0
    if content[2] not in relation2id:  # unknown relations fall back to 'NA'
        relation = relation2id['NA']
    else:
        relation = relation2id[content[2]]  # the relation of this sentence (as an index)
    # put the sentences of the same entity pair into one dict entry
    tup = (en1, en2)  # the entity pair of this sentence
    label_tag = 0     # index of this label within train_ans[tup] (a list whose elements are one-hot labels)
    if tup not in train_sen:  # entity pair not seen before
        train_sen[tup] = []
        train_sen[tup].append([])
        y_id = relation
        label_tag = 0
        label = [0 for i in range(len(relation2id))]
        label[y_id] = 1
        train_ans[tup] = []
        train_ans[tup].append(label)
    else:  # entity pair seen before
        y_id = relation
        label_tag = 0
        label = [0 for i in range(len(relation2id))]
        label[y_id] = 1
        temp = find_index(label, train_ans[tup])  # the index of this label in train_ans[tup]
        if temp == -1:  # entity pair seen before, but this label is new
            train_ans[tup].append(label)
            label_tag = len(train_ans[tup]) - 1  # when not found, use the new last index in train_ans[tup]
            train_sen[tup].append([])
        else:  # entity pair and label both seen before
            label_tag = temp
    sentence = content[3]  # the sentence text
    en1pos = 0
    en2pos = 0
    # For Chinese
    en1pos = sentence.find(en1)  # position (index) of entity 1 in the sentence
    if en1pos == -1:
        en1pos = 0
    en2pos = sentence.find(en2)
    if en2pos == -1:
        en2pos = 0
    output = []
    # Embed the position
    for i in range(fixlen):
        word = word2id['BLANK']
        rel_e1 = pos_embed(i - en1pos)  # distance to entity 1: 0 when far to the left (beyond 60), 61 when on the entity
        rel_e2 = pos_embed(i - en2pos)  # distance to entity 2: 62-122 to the right, 122 when far to the right (beyond 60)
        output.append([word, rel_e1, rel_e2])  # initializer: a nested list whose elements are [word, rel_e1, rel_e2]
    for i in range(min(fixlen, len(sentence))):
        word = 0
        if sentence[i] not in word2id:
            word = word2id['UNK']
        else:
            word = word2id[sentence[i]]  # index of the character in word2id
        output[i][0] = word  # refresh: replace BLANK with the actual character index
    train_sen[tup][label_tag].append(output)
# Build test_sen & test_ans
test_sen = {}  # unlike train: one level less, the value is simply the list of all sentences of the entity pair
test_ans = {}  # unlike train: the label is N-hot, so the value of each entity pair is a single N-hot list
f = open('./origin_data/test.txt', 'r', encoding='utf-8')
while True:
    content = f.readline()
    if content == '':
        break
    content = content.strip().split()
    en1 = content[0]
    en2 = content[1]
    relation = 0
    if content[2] not in relation2id:
        relation = relation2id['NA']
    else:
        relation = relation2id[content[2]]
    tup = (en1, en2)
    if tup not in test_sen:
        test_sen[tup] = []
        y_id = relation
        label_tag = 0
        label = [0 for i in range(len(relation2id))]
        label[y_id] = 1
        test_ans[tup] = label
    else:
        y_id = relation
        test_ans[tup][y_id] = 1
    sentence = content[3]
    en1pos = 0
    en2pos = 0
    # For Chinese
    en1pos = sentence.find(en1)
    if en1pos == -1:
        en1pos = 0
    en2pos = sentence.find(en2)
    if en2pos == -1:
        en2pos = 0
    output = []
    for i in range(fixlen):
        word = word2id['BLANK']
        rel_e1 = pos_embed(i - en1pos)
        rel_e2 = pos_embed(i - en2pos)
        output.append([word, rel_e1, rel_e2])
    for i in range(min(fixlen, len(sentence))):
        word = 0
        if sentence[i] not in word2id:
            word = word2id['UNK']
        else:
            word = word2id[sentence[i]]
        output[i][0] = word
    test_sen[tup].append(output)
train_x = []
train_y = []
test_x = []
test_y = []
f = open('./data/train_q&a.txt', 'w', encoding='utf-8')
temp = 0  # line counter
for i in train_sen:  # i is an entity pair
    if len(train_ans[i]) != len(train_sen[i]):
        print('ERROR')
    lenth = len(train_ans[i])  # number of labels of this entity pair
    for j in range(lenth):  # j indexes the labels
        train_x.append(train_sen[i][j])  # the sentences of the j-th label of entity pair i (3-dimensional)
        train_y.append(train_ans[i][j])  # the j-th label of entity pair i
        f.write(str(temp) + '\t' + i[0] + '\t' + i[1] + '\t' + str(np.argmax(train_ans[i][j])) + '\n')
        temp += 1
f.close()
f = open('./data/test_q&a.txt', 'w', encoding='utf-8')
temp = 0
for i in test_sen:
    test_x.append(test_sen[i])
    test_y.append(test_ans[i])
    tempstr = ''
    for j in range(len(test_ans[i])):
        if test_ans[i][j] != 0:
            tempstr = tempstr + str(j) + '\t'
    f.write(str(temp) + '\t' + i[0] + '\t' + i[1] + '\t' + tempstr + '\n')
    temp += 1
f.close()
print(train_x[0:2])
train_x = np.array(train_x)
train_y = np.array(train_y)
test_x = np.array(test_x)
test_y = np.array(test_y)
np.save('./data/vec.npy', vec)
np.save('./data/train_x.npy', train_x)
np.save('./data/train_y.npy', train_y)
np.save('./data/testall_x.npy', test_x)
np.save('./data/testall_y.npy', test_y)
x_train = np.load('./data/train_x.npy', allow_pickle=True)
train_word = []
train_pos1 = []
train_pos2 = []
print('separating train data')
for i in range(len(x_train)):  # each (entity pair, label) group
    word = []
    pos1 = []
    pos2 = []
    for j in x_train[i]:  # each sentence
        temp_word = []
        temp_pos1 = []
        temp_pos2 = []
        for k in j:  # each element [word, rel_e1, rel_e2] of the sentence
            temp_word.append(k[0])
            temp_pos1.append(k[1])
            temp_pos2.append(k[2])
        word.append(temp_word)
        pos1.append(temp_pos1)
        pos2.append(temp_pos2)
    train_word.append(word)  # the indices of all characters
    train_pos1.append(pos1)  # rel_pos1 of all characters
    train_pos2.append(pos2)  # rel_pos2 of all characters
train_word = np.array(train_word)  # indices of all characters
train_pos1 = np.array(train_pos1)  # all rel_pos1
train_pos2 = np.array(train_pos2)  # all rel_pos2
print(train_word.shape)
print(train_pos1.shape)
print(train_pos2.shape)
print(train_word[0:2])
np.save('./data/train_word.npy', train_word)
np.save('./data/train_pos1.npy', train_pos1)
np.save('./data/train_pos2.npy', train_pos2)
print('separating test all data')
x_test = np.load('./data/testall_x.npy', allow_pickle=True)
test_word = []
test_pos1 = []
test_pos2 = []
for i in range(len(x_test)):
    word = []
    pos1 = []
    pos2 = []
    for j in x_test[i]:
        temp_word = []
        temp_pos1 = []
        temp_pos2 = []
        for k in j:
            temp_word.append(k[0])
            temp_pos1.append(k[1])
            temp_pos2.append(k[2])
        word.append(temp_word)
        pos1.append(temp_pos1)
        pos2.append(temp_pos2)
    test_word.append(word)
    test_pos1.append(pos1)
    test_pos2.append(pos2)
test_word = np.array(test_word)
test_pos1 = np.array(test_pos1)
test_pos2 = np.array(test_pos2)
print(test_word[0:2])
print(test_word.shape)
np.save('./data/testall_word.npy', test_word)
np.save('./data/testall_pos1.npy', test_pos1)
np.save('./data/testall_pos2.npy', test_pos2)
test_y = np.load('./data/testall_y.npy')
eval_y = []
for i in test_y:
    eval_y.append(i[1:])  # drop the first (NA) class: [label1, label2, ...]
allans = np.reshape(eval_y, (-1))  # flatten to one dimension, NA excluded
print(allans.shape)
print(allans[0:10])
np.save('./data/allans.npy', allans)
fwrite = open('./data/metadata.tsv', 'w', encoding='utf-8')
f = open('./origin_data/vec.txt', encoding='utf-8')
f.readline()
while True:
    content = f.readline().strip()
    if content == '':
        break
    name = content.split()[0]  # the word on this line
    fwrite.write(name + '\n')
f.close()
fwrite.close()
2. Defining the Network Structure
The network in the paper has two main parts. The first is a bidirectional LSTM, so the data produced above first has to be split into batches and reshaped into the input the LSTM expects. The LSTM outputs are then passed through the second part, the attention layer; put simply, attention assigns different weights to different positions so that the model focuses on the most informative ones. Defining the network itself is fairly straightforward and consists mostly of TensorFlow API calls and writing out the formulas.
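As a quick illustration of the word-level attention idea, here is a minimal NumPy sketch that follows the same shapes and operations as the attention_r computation in the listing below (random values stand in for real LSTM outputs; this is not the TensorFlow graph itself):

import numpy as np

num_steps, lstm_size = 70, 230                 # same sizes as in the code below
H = np.random.randn(num_steps, lstm_size)      # summed forward+backward outputs for one sentence (output_h)
w = np.random.randn(lstm_size, 1)              # attention weight vector (attention_omega)

scores = np.tanh(H) @ w                        # (70, 1): one unnormalized score per position
alpha = np.exp(scores) / np.exp(scores).sum()  # softmax over the 70 positions
r = (alpha * H).sum(axis=0)                    # (230,): attention-weighted sentence representation
print(alpha.shape, r.shape)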
3. Data Splitting and Network Training
Chapter 1 preprocessed the data, but it is still not in the form the LSTM expects as input. In addition, neural network training usually splits the data into many batches; this code uses batch_size = 50. So before training, the data still has to be converted and split. The network is trained with the Adam optimizer and evaluated with accuracy.
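The batch construction in section 4 can look confusing, so here is a toy sketch of the flattening it performs (sizes invented; the real code uses big_num = 50 entity pairs and 70-character sentences). Each entity pair contributes a variable number of sentences, so all sentences are stacked into one matrix and total_shape records the cumulative counts that let the network slice out the rows belonging to each pair:

import numpy as np

batch = [np.zeros((3, 70), dtype=np.int32),   # pair 0 has 3 sentences
         np.zeros((1, 70), dtype=np.int32),   # pair 1 has 1 sentence
         np.zeros((2, 70), dtype=np.int32)]   # pair 2 has 2 sentences

total_shape, total_num, total_word = [], 0, []
for sentences in batch:
    total_shape.append(total_num)   # offset where this pair's sentences start
    total_num += len(sentences)
    total_word.extend(sentences)
total_shape.append(total_num)       # final element = total number of sentences

print(total_shape)                  # [0, 3, 4, 6]
print(np.array(total_word).shape)   # (6, 70): all sentences of the batch stacked together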
4. Code Listing
The remaining main parts of the code are shown below:
import tensorflow as tf
import numpy as np
import time
import datetime
from tensorflow.contrib.tensorboard.plugins import projector

save_path = './model/'
# print('reading wordembedding')
wordembedding = np.load('./data/vec.npy')  # the vec array saved by the preprocessing step
# print(wordembedding.shape)
len(wordembedding)

print('reading training data')
train_y = np.load('./data/train_y.npy', allow_pickle=True)
train_word = np.load('./data/train_word.npy', allow_pickle=True)
train_pos1 = np.load('./data/train_pos1.npy', allow_pickle=True)
train_pos2 = np.load('./data/train_pos2.npy', allow_pickle=True)

vocab_size = 16691  # total number of words
num_steps = 70      # number of characters per sentence
num_epochs = 10     # number of training epochs
num_classes = 12    # number of relation classes
lstm_size = 230     # number of hidden units (230 cells)
keep_prob = 0.5     # dropout keep probability
num_layers = 1      # number of hidden layers
pos_size = 5        # dimension of the position embedding
pos_num = 123       # number of distinct position ids (0-122 from pos_embed)
big_num = 50        # batch size
# Define the network structure
class lstm:
    def __init__(self, is_training, word_embeddings):  # is_training=True, word_embeddings=wordembedding
        self.num_steps = num_steps
        self.vocab_size = vocab_size
        self.num_classes = num_classes
        self.lstm_size = lstm_size
        self.big_num = big_num
        self.input_word = tf.placeholder(dtype=tf.int32, shape=[None, num_steps], name='input_word')  # placeholders
        self.input_pos1 = tf.placeholder(dtype=tf.int32, shape=[None, num_steps], name='input_pos1')
        self.input_pos2 = tf.placeholder(dtype=tf.int32, shape=[None, num_steps], name='input_pos2')
        self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name='input_y')
        self.total_shape = tf.placeholder(dtype=tf.int32, shape=[big_num + 1], name='total_shape')  # 1-D array of cumulative sentence counts
        total_num = self.total_shape[-1]  # the last element, i.e. the total number of sentences in the batch
        word_embedding = tf.get_variable(initializer=word_embeddings, name='word_embedding')  # vec, shape (16691, 100)
        pos1_embedding = tf.get_variable('pos1_embedding', [pos_num, pos_size])  # shape (123, 5)
        pos2_embedding = tf.get_variable('pos2_embedding', [pos_num, pos_size])  # shape (123, 5)
        attention_w = tf.get_variable('attention_omega', [lstm_size, 1])  # word-level attention weights, shape (230, 1)
        sen_a = tf.get_variable('attention_A', [lstm_size])  # shape (230,)
        sen_r = tf.get_variable('query_r', [lstm_size, 1])   # shape (230, 1)
        relation_embedding = tf.get_variable('relation_embedding', [self.num_classes, lstm_size])  # shape (12, 230)
        sen_d = tf.get_variable('bias_d', [self.num_classes])  # shape (12,)
        lstm_cell_forward = tf.contrib.rnn.LSTMCell(lstm_size)   # forward LSTM cell
        lstm_cell_backward = tf.contrib.rnn.LSTMCell(lstm_size)  # backward LSTM cell
        if is_training and keep_prob < 1:
            lstm_cell_forward = tf.contrib.rnn.DropoutWrapper(lstm_cell_forward, output_keep_prob=keep_prob)  # dropout
            lstm_cell_backward = tf.contrib.rnn.DropoutWrapper(lstm_cell_backward, output_keep_prob=keep_prob)
        cell_forward = tf.contrib.rnn.MultiRNNCell([lstm_cell_forward] * num_layers)    # forward LSTM
        cell_backward = tf.contrib.rnn.MultiRNNCell([lstm_cell_backward] * num_layers)  # backward LSTM
        sen_repre = []
        sen_alpha = []
        sen_s = []
        sen_out = []
        self.prob = []
        self.predictions = []
        self.loss = []
        self.accuracy = []
        self.total_loss = 0.0
        self._initial_state_forward = cell_forward.zero_state(total_num, tf.float32)    # initial state
        self._initial_state_backward = cell_backward.zero_state(total_num, tf.float32)  # initial state
        # embedding layer
        inputs_forward = tf.concat(axis=2, values=[tf.nn.embedding_lookup(word_embedding, self.input_word),
                                                   tf.nn.embedding_lookup(pos1_embedding, self.input_pos1),
                                                   tf.nn.embedding_lookup(pos2_embedding, self.input_pos2)])
        inputs_backward = tf.concat(axis=2,
                                    values=[tf.nn.embedding_lookup(word_embedding, tf.reverse(self.input_word, [1])),
                                            tf.nn.embedding_lookup(pos1_embedding, tf.reverse(self.input_pos1, [1])),
                                            tf.nn.embedding_lookup(pos2_embedding,
                                                                   tf.reverse(self.input_pos2, [1]))])
        outputs_forward = []
        state_forward = self._initial_state_forward
        # Bi-LSTM layer
        with tf.variable_scope('lstm_FORWARD') as scope:
            for step in range(num_steps):
                if step > 0:
                    scope.reuse_variables()
                (cell_output_forward, state_forward) = cell_forward(inputs_forward[:, step, :], state_forward)
                outputs_forward.append(cell_output_forward)
        outputs_backward = []
        state_backward = self._initial_state_backward
        with tf.variable_scope('lstm_BACKWARD') as scope:
            for step in range(num_steps):
                if step > 0:
                    scope.reuse_variables()
                (cell_output_backward, state_backward) = cell_backward(inputs_backward[:, step, :], state_backward)
                outputs_backward.append(cell_output_backward)
        # shape: (number of sentences of the 50 entity pairs, 70, 230)
        output_forward = tf.reshape(tf.concat(axis=1, values=outputs_forward), [total_num, num_steps, lstm_size])
        output_backward = tf.reverse(
            tf.reshape(tf.concat(axis=1, values=outputs_backward), [total_num, num_steps, lstm_size]),
            [1])
        # word-level attention layer
        output_h = tf.add(output_forward, output_backward)  # sum of forward and backward outputs, shape (total_num, 70, 230)
        attention_r = tf.reshape(tf.matmul(tf.reshape(tf.nn.softmax(
            tf.reshape(tf.matmul(tf.reshape(tf.tanh(output_h), [total_num * num_steps, lstm_size]), attention_w),
                       [total_num, num_steps])), [total_num, 1, num_steps]), output_h), [total_num, lstm_size])
        # attention_r: shape (total_num, 230), one weighted representation per sentence
        # tf.summary.scalar('loss',self.total_loss)
        # tf.scalar_summary(['loss'],[self.total_loss])
        # note: in the full network definition this block sits inside a per-entity-pair loop
        # (for i in range(big_num)) together with the sentence-level attention and loss;
        # that part is not included in this excerpt, which is why i is not defined here
        with tf.name_scope("accuracy"):
            self.accuracy.append(
                tf.reduce_mean(tf.cast(tf.equal(self.predictions[i], tf.argmax(self.input_y[i], 0)), "float"),
                               name="accuracy"))
        # tf.summary.scalar('loss',self.total_loss)
        tf.summary.scalar('loss', self.total_loss)
        # regularization
        self.l2_loss = tf.contrib.layers.apply_regularization(regularizer=tf.contrib.layers.l2_regularizer(0.0001),
                                                              weights_list=tf.trainable_variables())
        self.final_loss = self.total_loss + self.l2_loss
        tf.summary.scalar('l2_loss', self.l2_loss)
        tf.summary.scalar('final_loss', self.final_loss)
with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        initializer = tf.contrib.layers.xavier_initializer()
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = lstm(is_training=True, word_embeddings=wordembedding)  # build the network
        # the loss was defined inside the class above and is accessed as m.final_loss
        global_step = tf.Variable(0, name="global_step", trainable=False)  # passed to optimizer.minimize
        optimizer = tf.train.AdamOptimizer(0.0005)  # Adam optimizer
        train_op = optimizer.minimize(m.final_loss, global_step=global_step)  # training op: minimize the final loss
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=None)  # for saving the model
        merged_summary = tf.summary.merge_all()   # for TensorBoard
        summary_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train_loss', sess.graph)  # for TensorBoard; FLAGS is not defined in this excerpt (it comes from tf.app.flags in the original script)
        for one_epoch in range(num_epochs):
            temp_order = list(range(len(train_word)))  # the list 0..966; the training set has 967 entity pairs
            np.random.shuffle(temp_order)              # shuffle the order
            for i in range(int(len(temp_order) / float(big_num))):  # 967 // 50 iterations per epoch
                # ---- build a random batch of big_num (50) entity pairs ----
                temp_word = []
                temp_pos1 = []
                temp_pos2 = []
                temp_y = []
                temp_input = temp_order[i * big_num:(i + 1) * big_num]  # the indices of the 50 entity pairs of this batch
                for k in temp_input:  # for each of the 50 entity pairs
                    temp_word.append(train_word[k])  # 50 (shuffled) entries of train_word
                    temp_pos1.append(train_pos1[k])  # 50 (shuffled) entries of train_pos1
                    temp_pos2.append(train_pos2[k])  # 50 (shuffled) entries of train_pos2
                    temp_y.append(train_y[k])        # 50 (shuffled) entries of train_y
                num = 0
                for single_word in temp_word:  # total number of label-sentences of the 50 entity pairs
                    num += len(single_word)    # len(single_word) = number of label-sentences of one entity pair
                if num > 1500:
                    print('out of range')
                    continue
                temp_word = np.array(temp_word)  # convert to arrays
                temp_pos1 = np.array(temp_pos1)
                temp_pos2 = np.array(temp_pos2)
                temp_y = np.array(temp_y)
                # ---- batch selection done; now flatten it and build the feed_dict ----
                feed_dict = {}
                total_shape = []  # element 0 is 0; element k is the cumulative number of label-sentences of the first k entity pairs (51 numbers in total)
                total_num = 0     # running total of label-sentences
                total_word = []   # rows: all label-sentences of the 50 entity pairs; columns: the character indices of each sentence
                total_pos1 = []   # same rows; columns: rel_pos1 of every character
                total_pos2 = []   # same rows; columns: rel_pos2 of every character
                for j in range(len(temp_word)):  # loop over the 50 entity pairs (j, to avoid shadowing the batch index i)
                    total_shape.append(total_num)
                    total_num += len(temp_word[j])  # number of label-sentences of the j-th entity pair
                    for word in temp_word[j]:
                        total_word.append(word)
                    for pos1 in temp_pos1[j]:
                        total_pos1.append(pos1)
                    for pos2 in temp_pos2[j]:
                        total_pos2.append(pos2)
                total_shape.append(total_num)
                total_shape = np.array(total_shape)
                total_word = np.array(total_word)
                total_pos1 = np.array(total_pos1)
                total_pos2 = np.array(total_pos2)
                feed_dict[m.total_shape] = total_shape
                feed_dict[m.input_word] = total_word
                feed_dict[m.input_pos1] = total_pos1
                feed_dict[m.input_pos2] = total_pos2
                feed_dict[m.input_y] = temp_y
                # ---- feed_dict for the batch is ready; run one training step ----
                temp, step, loss, accuracy, summary, l2_loss, final_loss = sess.run(
                    [train_op, global_step, m.total_loss, m.accuracy, merged_summary, m.l2_loss, m.final_loss],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()  # timestamp
                accuracy = np.reshape(np.array(accuracy), (big_num))  # per-entity-pair accuracy, defined in the network class
                acc = np.mean(accuracy)  # mean accuracy over the batch
                summary_writer.add_summary(summary, step)  # TensorBoard
                if step % 50 == 0:
                    tempstr = "{}: step {}, softmax_loss {:g}, acc {:g}".format(time_str, step, loss, acc)
                    print(tempstr)
                current_step = tf.train.global_step(sess, global_step)
                if current_step > 8000 and current_step % 100 == 0:
                    print('saving model')
                    path = saver.save(sess, save_path + 'ATT_lstm_model', global_step=current_step)
                    tempstr = 'have saved model to ' + path
                    print(tempstr)