Overview
Dear reader: I am glad this article is being read, but writing and editing original content is not easy, so if you repost it, please cite the source and include a hyperlink to this article as well as to my blog: https://blog.csdn.net/vensmallzeng. If you find the article helpful, a like would be a great encouragement; I thank every reader. To get in touch, please use the email zengzenghe@gmail.com. Thank you for your cooperation!
I have recently been interning at a company, where I was handed a requirement to classify family (parent-child) review texts. To meet it, I looked through various resources and downloaded several implementations from GitHub, and found that TextCNN really does have an edge in text classification. While I have the time, I will walk through the code in detail; as for how well it works, you can try it yourself.
1. Main program: train.py
The important parts of the code are annotated with comments.
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helpers
from text_cnn import TextCNN
from tensorflow.contrib import learn
import logging
# Parameter settings
# ==================================================
# Data loading parameters
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("family_nofamily_comments", "./data/rt-polaritydata/family_nofamily_comments_1w.txt", "Data source for the positive data.")
#tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the negative data.")
# Model hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")
# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 2, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")
# Misc parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
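# Note: every DEFINE_* above becomes a command-line flag that is parsed when
# tf.app.run() is called, so the defaults can be overridden at launch time,
# for example (hypothetical values):
#     python train.py --num_epochs=10 --batch_size=128 --filter_sizes=2,3,4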
# ==================================================
# Data preprocessing
# ==================================================
def preprocess():
# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.family_nofamily_comments)
# Build the vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
# Randomly shuffle the data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
# Split into train and dev sets
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
# Free memory
del x, y, x_shuffled, y_shuffled
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
return x_train, y_train, vocab_processor, x_dev, y_dev
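# Note: learn.preprocessing.VocabularyProcessor maps every token to an integer id
# and pads/truncates each sentence to max_document_length. A hypothetical example
# with max_document_length = 4 (actual ids depend on the order words first appear):
#     "good movie"       -> [1, 2, 0, 0]
#     "not a good movie" -> [3, 4, 1, 2]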
# Training procedure
# ==================================================
def train(x_train, y_train, vocab_processor, x_dev, y_dev):
with tf.Graph().as_default():
session_conf = tf.ConfigProto(
allow_soft_placement=FLAGS.allow_soft_placement,
log_device_placement=FLAGS.log_device_placement)
sess = tf.Session(config=session_conf)
with sess.as_default():
cnn = TextCNN(
sequence_length=x_train.shape[1],
num_classes=y_train.shape[1],
vocab_size=len(vocab_processor.vocabulary_),
embedding_size=FLAGS.embedding_dim,
filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
num_filters=FLAGS.num_filters,
l2_reg_lambda=FLAGS.l2_reg_lambda)
# Define the training procedure
global_step = tf.Variable(0, name="global_step", trainable=False)
optimizer = tf.train.AdamOptimizer(1e-3)
grads_and_vars = optimizer.compute_gradients(cnn.loss)
train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
# Keep track of gradient values and sparsity (optional)
grad_summaries = []
for g, v in grads_and_vars:
if g is not None:
grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
grad_summaries.append(grad_hist_summary)
grad_summaries.append(sparsity_summary)
grad_summaries_merged = tf.summary.merge(grad_summaries)
# Output directory for models and summaries
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))
# Summaries for loss and evaluation metrics
loss_summary = tf.summary.scalar("loss", cnn.loss)
acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
rec_summary = tf.summary.scalar("recall", cnn.recall)
pre_summary = tf.summary.scalar("precision", cnn.precision)
# Train summaries
train_summary_op = tf.summary.merge([loss_summary, acc_summary, rec_summary, pre_summary, grad_summaries_merged])
train_summary_dir = os.path.join(out_dir, "summaries", "train")
train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
# Dev summaries
dev_summary_op = tf.summary.merge([loss_summary, acc_summary, rec_summary, pre_summary])
dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
# Checkpoint directory. TensorFlow assumes this directory already exists, so we need to create it first
checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
checkpoint_prefix = os.path.join(checkpoint_dir, "model")
if not os.path.exists(checkpoint_dir):
os.makedirs(checkpoint_dir)
saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
# Write the vocabulary to disk
vocab_processor.save(os.path.join(out_dir, "vocab"))
# Initialize all variables
sess.run(tf.global_variables_initializer())
def train_step(x_batch, y_batch):
"""
Run a single training step on one batch.
"""
feed_dict = {
cnn.input_x: x_batch,
cnn.input_y: y_batch,
cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
}
_, step, summaries, loss, accuracy, recall, precision = sess.run(
[train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy, cnn.recall, cnn.precision],
feed_dict)
time_str = datetime.datetime.now().isoformat()
log.info("{}: step {}, loss {:g}, acc {:g}, rec {:g}, pre {:g}".format(time_str, step, loss, accuracy, recall, precision))
train_summary_writer.add_summary(summaries, step)
def dev_step(x_batch, y_batch, writer=None):
"""
Evaluate the model on the dev set.
"""
batches_dev = data_helpers.batch_iter(
list(zip(x_batch, y_batch)), FLAGS.batch_size, 1)
# Generate batches over the dev set for one epoch
total_loss = 0
total_accuracy = 0
total_recall = 0
total_precision = 0
batch_number = 0
for batch_dev in batches_dev:
batch_number = batch_number + 1
x_batch_dev, y_batch_dev = zip(*batch_dev)
feed_dict = {
cnn.input_x: x_batch_dev,
cnn.input_y: y_batch_dev,
cnn.dropout_keep_prob: 1.0
}
step, summaries, loss, accuracy, recall, precision = sess.run(
[global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.recall, cnn.precision],
feed_dict)
total_loss = total_loss + loss
total_accuracy = total_accuracy + accuracy
total_recall = total_recall + recall
total_precision = total_precision + precision
total_loss = total_loss / batch_number
total_accuracy = total_accuracy / batch_number
total_recall = total_recall / batch_number
total_precision = total_precision / batch_number
time_str = datetime.datetime.now().isoformat()
log.info("{}: step {}, loss {:g}, acc {:g}, rec {:g}, pre {:g}".format(time_str, step, total_loss, total_accuracy, total_recall, total_precision))
if writer:
writer.add_summary(summaries, step)
# Generate training batches
batches = data_helpers.batch_iter(
list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
# Capture TensorFlow's own log output during training (the messages normally printed in red)
log = logging.getLogger('tensorflow')
log.setLevel(logging.DEBUG)
# Create a file handler for debug messages and attach a formatter to it
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh = logging.FileHandler('tensorflow.log')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)
for batch in batches:
x_batch, y_batch = zip(*batch)
train_step(x_batch, y_batch)
current_step = tf.train.global_step(sess, global_step)
if current_step % FLAGS.evaluate_every == 0:
log.info("nEvaluation:")
dev_step(x_dev, y_dev, writer=dev_summary_writer)
log.info("")
if current_step % FLAGS.checkpoint_every == 0:
path = saver.save(sess, checkpoint_prefix, global_step=current_step)
log.info("Saved model checkpoint to {}n".format(path))
def main(argv=None):
x_train, y_train, vocab_processor, x_dev, y_dev = preprocess()
train(x_train, y_train, vocab_processor, x_dev, y_dev)
if __name__ == '__main__':
tf.app.run()
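The training loop and dev_step above both rely on data_helpers.batch_iter to cut the data into shuffled mini-batches. A typical implementation, in the spirit of the well-known dennybritz cnn-text-classification-tf code, looks roughly like this (a sketch under that assumption, not necessarily identical to the author's version):
import numpy as np

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield mini-batches of `data` for `num_epochs` epochs (sketch)."""
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        # Reshuffle the data at the start of each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]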
2. Model definition: text_cnn.py
The important parts of the code are annotated with comments.
import tensorflow as tf
import numpy as np
from sklearn import metrics
class TextCNN(object):
"""
A CNN for text classification.
Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
"""
def __init__(
self, sequence_length, num_classes, vocab_size,
embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):
# Placeholders for input, output and dropout
self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
# Keep track of the L2 regularization loss (optional)
l2_loss = tf.constant(0.0)
# Embedding layer
with tf.name_scope("embedding"):
self.W = tf.Variable(
tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
name="W")
self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
# Create a convolution + max-pooling layer for each filter size
pooled_outputs = []
for i, filter_size in enumerate(filter_sizes):
with tf.name_scope("conv-maxpool-%s" % filter_size):
# Convolution layer
filter_shape = [filter_size, embedding_size, 1, num_filters]
W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
conv = tf.nn.conv2d(
self.embedded_chars_expanded,
W,
strides=[1, 1, 1, 1],
padding="VALID",
name="conv")
# Apply the nonlinearity
h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
# Max-pooling over the outputs
pooled = tf.nn.max_pool(
h,
ksize=[1, sequence_length - filter_size + 1, 1, 1],
strides=[1, 1, 1, 1],
padding='VALID',
name="pool")
pooled_outputs.append(pooled)
# Combine all the pooled features
num_filters_total = num_filters * len(filter_sizes)
self.h_pool = tf.concat(pooled_outputs, 3)
self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
# Add dropout
with tf.name_scope("dropout"):
self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
# Final (unnormalized) scores and predictions
with tf.name_scope("output"):
W = tf.get_variable(
"W",
shape=[num_filters_total, num_classes],
initializer=tf.contrib.layers.xavier_initializer())
b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
l2_loss += tf.nn.l2_loss(W)
l2_loss += tf.nn.l2_loss(b)
self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
self.predictions = tf.argmax(self.scores, 1, name="predictions")
# Calculate the mean cross-entropy loss
with tf.name_scope("loss"):
losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
# Calculate accuracy
with tf.name_scope("accuracy"):
correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
prediction = self.predictions
print("prediction")
print(prediction)
actuals = tf.argmax(self.input_y, 1)
print("actuals")
print(actuals)
ones_like_actuals = tf.ones_like(actuals)
zeros_like_actuals = tf.zeros_like(actuals)
ones_like_predictions = tf.ones_like(prediction)
zeros_like_predictions = tf.zeros_like(prediction)
tp = tf.reduce_sum(
tf.cast(
tf.logical_and(
tf.equal(actuals, ones_like_actuals),
tf.equal(prediction, ones_like_predictions)
),
"float"
)
)
tn = tf.reduce_sum(
tf.cast(
tf.logical_and(
tf.equal(actuals, zeros_like_actuals),
tf.equal(prediction, zeros_like_predictions)
),
"float"
)
)
fp = tf.reduce_sum(
tf.cast(
tf.logical_and(
tf.equal(actuals, zeros_like_actuals),
tf.equal(prediction, ones_like_predictions)
),
"float"
)
)
fn = tf.reduce_sum(
tf.cast(
tf.logical_and(
tf.equal(actuals, ones_like_actuals),
tf.equal(prediction, zeros_like_predictions)
),
"float"
)
)
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
fnr = fn / (tp + fn)
# Calculate precision, recall, F1 and accuracy
accuracy = (tp + tn) / (tp + fp + fn + tn)
self.recall = tpr
self.precision = tp / (tp + fp)
f1_score = (2 * (self.precision * self.recall)) / (self.precision + self.recall)
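To make the tensor shapes in the model concrete, here is a small illustrative check (the sizes are hypothetical, not taken from the author's data):
import tensorflow as tf
from text_cnn import TextCNN

# Hypothetical sizes: 56-token sentences, 18,000-word vocabulary, 2 classes
with tf.Graph().as_default():
    cnn = TextCNN(sequence_length=56, num_classes=2, vocab_size=18000,
                  embedding_size=128, filter_sizes=[3, 4, 5], num_filters=128)
    # embedding lookup + expand_dims:            [batch, 56, 128, 1]
    # conv with filter_size=3 (VALID padding):   [batch, 56-3+1, 1, 128] = [batch, 54, 1, 128]
    # max_pool over the remaining positions:     [batch, 1, 1, 128]
    # concat over the 3 filter sizes + flatten:  [batch, 3 * 128] = [batch, 384]
    print(cnn.embedded_chars_expanded.shape)  # (?, 56, 128, 1)
    print(cnn.h_pool_flat.shape)              # (?, 384)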
3. Data processing: data_helpers.py
The important parts of the code are annotated with comments.
import numpy as np
import re
import jieba
# Clean up raw text sentences
def clean_str(string):
string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)
string = re.sub(r"'s", " 's", string)
string = re.sub(r"'ve", " 've", string)
string = re.sub(r"n't", " n't", string)
string = re.sub(r"'re", " 're", string)
string = re.sub(r"'d", " 'd", string)
string = re.sub(r"'ll", " 'll", string)
string = re.sub(r",", " , ", string)
string = re.sub(r"!", " ! ", string)
string = re.sub(r"(", " ( ", string)
string = re.sub(r")", " ) ", string)
string = re.sub(r"?", " ? ", string)
string = re.sub(r"s{2,}", " ", string)
return string.strip().lower()
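# Illustrative example of what clean_str produces (hypothetical input):
#     clean_str("I can't wait... (really?)")
#     -> "i ca n't wait ( really ? )"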
# Load the positive/negative family-comment samples; return the sentences and labels
def load_data_and_labels(family_nofamily_comments):
# Path to the data file
filename = family_nofamily_comments
family_texts = []
no_family_texts = []
# Build the stop-word list
def stopwordslist():
stopwords = [line.strip() for line in open('stopwords', encoding='UTF-8').readlines()]
return stopwords
# Segment a Chinese sentence with jieba
def seg_depart(sentence):
# Segment each line of the document
# print("segmenting...")
sentence_depart = jieba.cut(sentence.strip())
# Load the stop-word list
stopwords = stopwordslist()
# Accumulate the segmented result in outstr
outstr = ''
# Remove stop words
for word in sentence_depart:
if " " in word or len(word) == 1:
continue
word = re.sub('[a-zA-Z0-9]', '', word)
if word not in stopwords:
if word != 't':
outstr += word
outstr += " "
return outstr
with open(filename) as txtData:
lines = txtData.readlines()
for line in lines:
lines = line.strip().split('