Pytorch实例----NLP之文本分类

137 阅读 0 评论 91 点赞

我是靠谱客的博主朴实小蚂蚁，最近开发中收集的这篇文章主要介绍Pytorch实例----NLP之文本分类，觉得挺不错的，现在分享给大家，希望可以做个参考。

概述

介绍：

使用Y Yelp dataset (提取码：eqy3)，对餐馆评价做训练，预测新的评价是否属于积极评价。

整体流程：

数据集预处理：对json数据集解析csv格式，并进行数据集划分；

import csv
import json
import pandas as pd
def json2csv(csv_file_path, json_file_path, n = 10):
with open(json_file_path,'r',encoding='utf-8') as fin:
for line in fin:
line_contents = json.loads(line)
headers=line_contents.keys()
break
print(headers)
i = 0
with open(csv_file_path, 'w', newline='',encoding='utf-8') as fout:
title = (['stars', 'text'])
writer=csv.DictWriter(fout, title)
writer.writeheader()
with open(json_file_path, 'r', encoding='utf-8') as fin:
for line in fin:
i += 1
line_contents = json.loads(line)
writer.writerow({'stars':int(line_contents['stars']), 'text':line_contents['text']})
if i == n:
break
fin.close()
fout.close()
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
by_rating[row.stars].append(row.to_dict())
# #split dataset
final_list = []
np.random.seed(arg.seed)
for _, item_list in sorted(by_rating.items()):
np.random.shuffle(item_list)
n_total = len(item_list)
n_train = int(arg.train_ratio * n_total)
n_val = int(arg.val_ratio * n_total)
n_test = int(arg.test_ratio * n_total)
for item in item_list[:n_train]:
item['split'] = 'train'
for item in item_list[n_train:n_train+n_val]:
item['split'] = 'val'
for item in item_list[n_train+n_val:n_train+n_val+n_test]:
item['split'] = 'test'
final_list.extend(item_list)
print(len(final_list))
final_review = pd.DataFrame(final_list)

词表生成：构建review和rating双向字典查找表；

class Vocabulary(object):
"""Class to process text and extract vocabulary for mapping"""
def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
"""
Args:
token_to_idx (dict): a pre-existing map of tokens to indices
add_unk (bool): a flag that indicates whether to add the UNK token
unk_token (str): the UNK token to add into the Vocabulary
"""
if token_to_idx is None:
token_to_idx = {}
self._token_to_idx = token_to_idx
self._idx_to_token = {idx: token
for token, idx in self._token_to_idx.items()}
self._add_unk = add_unk
self._unk_token = unk_token
self.unk_index = -1
if add_unk:
self.unk_index = self.add_token(unk_token)
def to_serializable(self):
""" returns a dictionary that can be serialized """
return {'token_to_idx': self._token_to_idx,
'add_unk': self._add_unk,
'unk_token': self._unk_token}
@classmethod
def from_serializable(cls, contents):
""" instantiates the Vocabulary from a serialized dictionary """
return cls(**contents)
def add_token(self, token):
"""Update mapping dicts based on the token.
Args:
token (str): the item to add into the Vocabulary
Returns:
index (int): the integer corresponding to the token
"""
if token in self._token_to_idx:
index = self._token_to_idx[token]
else:
index = len(self._token_to_idx)
self._token_to_idx[token] = index
self._idx_to_token[index] = token
return index
def add_many(self, tokens):
"""Add a list of tokens into the Vocabulary
Args:
tokens (list): a list of string tokens
Returns:
indices (list): a list of indices corresponding to the tokens
"""
return [self.add_token(token) for token in tokens]
def lookup_token(self, token):
"""Retrieve the index associated with the token
or the UNK index if token isn't present.
Args:
token (str): the token to look up
Returns:
index (int): the index corresponding to the token
Notes:
`unk_index` needs to be >=0 (having been added into the Vocabulary)
for the UNK functionality
"""
if self.unk_index >= 0:
return self._token_to_idx.get(token, self.unk_index)
else:
return self._token_to_idx[token]
def lookup_index(self, index):
"""Return the token associated with the index
Args:
index (int): the index to look up
Returns:
token (str): the token corresponding to the index
Raises:
KeyError: if the index is not in the Vocabulary
"""
if index not in self._idx_to_token:
raise KeyError("the index (%d) is not in the Vocabulary" % index)
return self._idx_to_token[index]
def __str__(self):
return "<Vocabulary(size=%d)>" % len(self)
def __len__(self):
return len(self._token_to_idx)

输入数据向量化：对review和rating进行向量化匹配；

class ReviewVectorizer(object):
""" The Vectorizer which coordinates the Vocabularies and puts them to use"""
def __init__(self, review_vocab, rating_vocab):
"""
Args:
review_vocab (Vocabulary): maps words to integers
rating_vocab (Vocabulary): maps class labels to integers
"""
self.review_vocab = review_vocab
self.rating_vocab = rating_vocab
def vectorize(self, review):
"""Create a collapsed one-hit vector for the review
Args:
review (str): the review
Returns:
one_hot (np.ndarray): the collapsed one-hot encoding
"""
one_hot = np.zeros(len(self.review_vocab), dtype=np.float32)
for token in review.split(" "):
if token not in string.punctuation:
one_hot[self.review_vocab.lookup_token(token)] = 1
return one_hot
@classmethod
def from_dataframe(cls, review_df, cutoff=25):
"""Instantiate the vectorizer from the dataset dataframe
Args:
review_df (pandas.DataFrame): the review dataset
cutoff (int): the parameter for frequency-based filtering
Returns:
an instance of the ReviewVectorizer
"""
review_vocab = Vocabulary(add_unk=True)
rating_vocab = Vocabulary(add_unk=False)
# Add ratings
for rating in sorted(set(review_df.rating)):
rating_vocab.add_token(rating)
# Add top words if count > provided count
word_counts = Counter()
for review in review_df.review:
for word in review.split(" "):
if word not in string.punctuation:
word_counts[word] += 1
for word, count in word_counts.items():
if count > cutoff:
review_vocab.add_token(word)
return cls(review_vocab, rating_vocab)
@classmethod
def from_serializable(cls, contents):
"""Instantiate a ReviewVectorizer from a serializable dictionary
Args:
contents (dict): the serializable dictionary
Returns:
an instance of the ReviewVectorizer class
"""
review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
rating_vocab =
Vocabulary.from_serializable(contents['rating_vocab'])
return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)
def to_serializable(self):
"""Create the serializable dictionary for caching
Returns:
contents (dict): the serializable dictionary
"""
return {'review_vocab': self.review_vocab.to_serializable(),
'rating_vocab': self.rating_vocab.to_serializable()}

数据封装：加dataset封装成PyTorch Dataloader格式；

class ReviewDataset(Dataset):
def __init__(self, review_df, vectorizer):
"""
Args:
review_df (pandas.DataFrame): the dataset
vectorizer (ReviewVectorizer): vectorizer instantiated from dataset
"""
self.review_df = review_df
self._vectorizer = vectorizer
self.train_df = self.review_df[self.review_df.split=='train']
self.train_size = len(self.train_df)
self.val_df = self.review_df[self.review_df.split=='val']
self.validation_size = len(self.val_df)
self.test_df = self.review_df[self.review_df.split=='test']
self.test_size = len(self.test_df)
self._lookup_dict = {'train': (self.train_df, self.train_size),
'val': (self.val_df, self.validation_size),
'test': (self.test_df, self.test_size)}
self.set_split('train')
@classmethod
def load_dataset_and_make_vectorizer(cls, review_csv):
"""Load dataset and make a new vectorizer from scratch
Args:
review_csv (str): location of the dataset
Returns:
an instance of ReviewDataset
"""
review_df = pd.read_csv(review_csv)
train_review_df = review_df[review_df.split=='train']
return cls(review_df, ReviewVectorizer.from_dataframe(train_review_df))
@classmethod
def load_dataset_and_load_vectorizer(cls, review_csv, vectorizer_filepath):
"""Load dataset and the corresponding vectorizer.
Used in the case in the vectorizer has been cached for re-use
Args:
review_csv (str): location of the dataset
vectorizer_filepath (str): location of the saved vectorizer
Returns:
an instance of ReviewDataset
"""
review_df = pd.read_csv(review_csv)
vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
return cls(review_df, vectorizer)
@staticmethod
def load_vectorizer_only(vectorizer_filepath):
"""a static method for loading the vectorizer from file
Args:
vectorizer_filepath (str): the location of the serialized vectorizer
Returns:
an instance of ReviewVectorizer
"""
with open(vectorizer_filepath) as fp:
return ReviewVectorizer.from_serializable(json.load(fp))
def save_vectorizer(self, vectorizer_filepath):
"""saves the vectorizer to disk using json
Args:
vectorizer_filepath (str): the location to save the vectorizer
"""
with open(vectorizer_filepath, "w") as fp:
json.dump(self._vectorizer.to_serializable(), fp)
def get_vectorizer(self):
""" returns the vectorizer """
return self._vectorizer
def set_split(self, split="train"):
""" selects the splits in the dataset using a column in the dataframe
Args:
split (str): one of "train", "val", or "test"
"""
self._target_split = split
self._target_df, self._target_size = self._lookup_dict[split]
def __len__(self):
return self._target_size
def __getitem__(self, index):
"""the primary entry point method for PyTorch datasets
Args:
index (int): the index to the data point
Returns:
a dictionary holding the data point's features (x_data) and label (y_target)
"""
row = self._target_df.iloc[index]
review_vector = 
self._vectorizer.vectorize(row.review)
rating_index = 
self._vectorizer.rating_vocab.lookup_token(row.rating)
return {'x_data': review_vector,
'y_target': rating_index}
def get_num_batches(self, batch_size):
"""Given a batch size, return the number of batches in the dataset
Args:
batch_size (int)
Returns:
number of batches in the dataset
"""
return len(self) // batch_size
def generate_batches(dataset, batch_size, shuffle=True,
drop_last=True, device="cpu"):
"""
A generator function which wraps the PyTorch DataLoader. It will
ensure each tensor is on the write device location.
"""
dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
shuffle=shuffle, drop_last=drop_last)
for data_dict in dataloader:
out_data_dict = {}
for name, tensor in data_dict.items():
out_data_dict[name] = data_dict[name].to(device)
yield out_data_dict

模型定义：使用简单的Precision模型进行初步尝试；

class ReviewClassifier(nn.Module):
""" a simple perceptron based classifier """
def __init__(self, num_features):
"""
Args:
num_features (int): the size of the input feature vector
"""
super(ReviewClassifier, self).__init__()
self.fc1 = nn.Linear(in_features=num_features,
out_features=1)
def forward(self, x_in, apply_sigmoid=False):
"""The forward pass of the classifier
Args:
x_in (torch.Tensor): an input data tensor.
x_in.shape should be (batch, num_features)
apply_sigmoid (bool): a flag for the sigmoid activation
should be false if used with the Cross Entropy losses
Returns:
the resulting tensor. tensor.shape should be (batch,)
"""
y_out = self.fc1(x_in).squeeze()
if apply_sigmoid:
y_out = torch.sigmoid(y_out)
return y_out

模型训练：对每一个review进行on-hot编码，作为网络输入，输出positive/negative结果；

def train(args):
if args.expand_filepaths_to_save_dir:
args.vectorizer_file = os.path.join(args.save_dir,
args.vectorizer_file)
args.model_state_file = os.path.join(args.save_dir,
args.model_state_file)
print("Expanded filepaths: ")
print("t{}".format(args.vectorizer_file))
print("t{}".format(args.model_state_file))
# Check CUDA
if not torch.cuda.is_available():
args.cuda = False
print("Using CUDA: {}".format(args.cuda))
args.device = torch.device("cuda" if args.cuda else "cpu")
# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)
# handle dirs
handle_dirs(args.save_dir)
if args.reload_from_files:
# training from a checkpoint
print("Loading dataset and vectorizer")
dataset = ReviewDataset.load_dataset_and_load_vectorizer(args.review_csv,
args.vectorizer_file)
else:
print("Loading dataset and creating vectorizer")
# create dataset and vectorizer
dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()
classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))
#training loop
classifier = classifier.to(args.device)
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
mode='min', factor=0.5,
patience=1)
train_state = make_train_state(args)
epoch_bar = tqdm_notebook(desc='training routine',
total=args.num_epochs,
position=0)
dataset.set_split('train')
train_bar = tqdm_notebook(desc='split=train',
total=dataset.get_num_batches(args.batch_size),
position=1,
leave=True)
dataset.set_split('val')
val_bar = tqdm_notebook(desc='split=val',
total=dataset.get_num_batches(args.batch_size),
position=1,
leave=True)

模型测试：输入review，输出评价结果。

def test(classifier, vectorizer, review):
classifier = classifier.cpu()
prediction = predict_rating(test_review, classifier, vectorizer, decision_threshold=0.5)
print("{} -> {}".format(test_review, prediction))

整体代码见：NLP/chapter3

reference:PyTorchNLPBook