BERT training (personal notes)

A PyTorch script that fine-tunes bert-base-uncased for binary text classification: stratified train/validation/test splits, linear warmup scheduling, per-epoch validation with ROC/PR plots, and checkpointing of the best encoder by validation F1.
#%% Imports
from transformers import BertTokenizer, BertModel
from transformers import get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.optim import AdamW  # transformers' AdamW is deprecated; use the torch implementation
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import time
import os, sys
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import shuffle
from plt_score import plt_roc_pr  # project-local plotting helper (see the stand-in sketch below)

#%% Load data
os.chdir(sys.path[0])  # run relative to the script's own directory
# df = pd.read_excel('../../data/label_data_language_en.xlsx')
df = pd.read_excel('../../data/label_data_language_en_aug.xlsx')  # augmented dataset
print(df['gf_review_fix'].value_counts())
df['content'] = df['content'].str.lower().fillna('test')  # lowercase all text
print(df.isnull().sum())
df = shuffle(df, random_state=1115)  # shuffle twice
df = shuffle(df, random_state=930)

#%% Tokenization
start_token = time.time()
x = list(df['content'])
y = list(df['gf_review_fix'])
# 80/10/10 stratified train/validation/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1634, stratify=y)
x_vali, x_test, y_vali, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=1111, stratify=y_test)

tokenizer = BertTokenizer.from_pretrained('../../model-pytorch/bert-base-uncased')
train_encoding = tokenizer(x_train, add_special_tokens=True, padding=True,
                           truncation=True, max_length=300, return_tensors="pt")
vali_encoding = tokenizer(x_vali, add_special_tokens=True, padding=True,
                          truncation=True, max_length=300, return_tensors="pt")
test_encoding = tokenizer(x_test, add_special_tokens=True, padding=True,
                          truncation=True, max_length=300, return_tensors="pt")

#%% Dataset class
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    # idx lets DataLoader batch the samples: it maps an index to one example
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encoding, y_train)
vali_dataset = NewsDataset(vali_encoding, y_vali)
test_dataset = NewsDataset(test_encoding, y_test)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# shuffle must stay False here: validation() collects predictions in loader order,
# and that order has to line up with y_vali / y_test when the metrics are computed
# (the original shuffled these loaders, which scrambled the metric alignment)
vali_loader = DataLoader(vali_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

end_token = time.time()
print('data token batch Running time:{} Seconds'.format(end_token - start_token))
# peek at one batch:
# batch = next(iter(train_loader))
# print(batch)
# print(batch['input_ids'].shape)

#%% Model
torch.cuda.empty_cache()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class BertClassificationModel(nn.Module):
    def __init__(self):
        super(BertClassificationModel, self).__init__()
        pretrained_weights = "../../model-pytorch/bert-base-uncased/"
        # to resume from an earlier checkpoint:
        # pretrained_weights = '../model_save/uncase_bert_stratify_19'
        # note: .cuda() assumes a GPU is available, as in the original script
        self.bert = torch.nn.DataParallel(BertModel.from_pretrained(pretrained_weights).cuda())
        # prediction head: BERT's hidden size is 768, two outputs for binary classification.
        # It returns raw logits; the original appended nn.Softmax(dim=1) here, but
        # CrossEntropyLoss applies log-softmax internally and expects logits
        self.predictor = nn.Linear(768, 2).cuda()

    def forward(self, input_ids, attention_mask):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_cls_hidden_state = bert_output[0][:, 0, :]  # hidden state of the [CLS] token
        logits = self.predictor(bert_cls_hidden_state)
        return logits

#%% Instantiate model, loss, optimizer, scheduler
torch.cuda.empty_cache()
bert_classifier_model = BertClassificationModel()

# hyperparameters -----------------
epochs = 10
lr = 1e-4

# optimizer, scheduler, loss
# switched from Adam to AdamW; the frozen-layer filter
# (filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5) was dropped
optimizer = AdamW(bert_classifier_model.parameters(), lr=lr)
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # default value in run_glue.py
                                            num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss().cuda()

# bookkeeping -----------------
os.makedirs('./result', exist_ok=True)  # output directory for the CSV logs and plots
# per-batch training loss
batch_loss = pd.DataFrame(columns=['epoch', 'batch', 'loss'])
# per-epoch validation scores
epoch_scores = pd.DataFrame(columns=['epoch', 'prescore', 'accscore', 'recascore', 'f1_score'])
# per-epoch test scores
epoch_test_scores = pd.DataFrame(columns=['epoch', 'prescore', 'accscore', 'recascore', 'f1_score'])
# best model so far
max_f1 = 0
max_epoch = 0

#%% Training function
def train():
    global batch_loss
    bert_classifier_model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        # forward pass
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = bert_classifier_model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_train_loss += loss.item()

        # backward pass and update
        loss.backward()
        nn.utils.clip_grad_norm_(bert_classifier_model.parameters(), 1.0)  # gradient clipping
        optimizer.step()
        scheduler.step()

        # running average loss, additionally scaled by batch size (logging only, as in the original)
        loss_each = total_train_loss / (iter_num + 1) / batch_size
        df1 = pd.DataFrame([[epoch, iter_num, loss_each]], columns=['epoch', 'batch', 'loss'])
        batch_loss = pd.concat([batch_loss, df1], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
        iter_num += 1
        if iter_num % 100 == 0:
            print("epoch: %d, iter_num: %d, loss: %.4f, %.2f%%"
                  % (epoch, iter_num, loss_each, iter_num / total_iter * 100))
            batch_loss.to_csv('./result/batch_loss.csv')
    print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss / len(train_loader)))

#%% Evaluation functions
def validation(test_dataloader):
    bert_classifier_model.eval()
    pre_list = []
    prob_list = []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = bert_classifier_model(input_ids, attention_mask=attention_mask)
            # probability of the positive class (softmax over the logits)
            prob = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
            prob_list.extend(prob.tolist())
            _, predicted = torch.max(outputs, 1)
            pre_list.extend(predicted.cpu().numpy().tolist())
    return pre_list, prob_list

def metric_show(pre_list, targets, prob_list, epoch, figlist):
    prescore = precision_score(targets, pre_list)
    accscore = accuracy_score(targets, pre_list)
    recascore = recall_score(targets, pre_list)
    f1score = f1_score(targets, pre_list)
    print("precision:{}".format(prescore))
    print("accuracy:{}".format(accscore))
    print("recall:{}".format(recascore))
    print("f1:{}".format(f1score))
    save_path = './result/bert'  # was commented out in the original, which raised a NameError below
    plt_roc_pr(targets, prob_list, name=str(epoch), save_path=save_path, figlist=figlist)
    return prescore, accscore, recascore, f1score

#%% Training loop
start = time.time()
for epoch in range(epochs):
    print("------------Epoch: %d ----------------" % epoch)
    start_epoch = time.time()
    train()

    # pick the best epoch on the validation set
    pre_list, prob_list = validation(vali_loader)
    prescore, accscore, recascore, f1score = metric_show(pre_list, y_vali, prob_list, epoch, figlist=[0, 1])
    epoch_scores.loc[epoch] = [epoch, prescore, accscore, recascore, f1score]
    epoch_scores.to_csv('./result/epoch_scores.csv')

    # track the best F1 and its epoch; save the model when it improves
    if epoch > 5 and max_f1 < f1score:
        max_f1 = f1score
        max_epoch = epoch
        print("Best F1 so far: {} at epoch {}".format(max_f1, max_epoch))
        save_directory = '../model_save/uncase_bert_stratify_best'
        # saves only the BERT encoder (unwrapped from DataParallel); the linear head is not saved
        bert_classifier_model.bert.module.save_pretrained(save_directory)

    # record test-set scores
    pre_list, prob_list = validation(test_loader)
    t_prescore, t_accscore, t_recascore, t_f1score = metric_show(pre_list, y_test, prob_list,
                                                                 epoch='t' + str(epoch), figlist=[2, 3])
    epoch_test_scores.loc[epoch] = [epoch, t_prescore, t_accscore, t_recascore, t_f1score]
    epoch_test_scores.to_csv('./result/epoch_test_scores.csv')

    end_epoch = time.time()
    print('epoch Running time:{} Seconds'.format(end_epoch - start_epoch))

end = time.time()
print('total model Running time:{} Seconds'.format(end - start))
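plt_score is a project-local module that is not shown in the post. A minimal stand-in sketch, assuming plt_roc_pr overlays one ROC curve and one PR curve per epoch on a shared pair of matplotlib figures (figlist holding the two figure numbers is how the [0, 1] / [2, 3] arguments above read) and saves them under save_path; the file-naming scheme here is an assumption:

# plt_score.py -- hypothetical stand-in for the project-local helper
import os
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve, auc

def plt_roc_pr(y_true, y_prob, name, save_path, figlist):
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
    # ROC curve, accumulated on figure figlist[0] across epochs
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    plt.figure(figlist[0])
    plt.plot(fpr, tpr, label='{} (AUC={:.3f})'.format(name, auc(fpr, tpr)))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend()
    plt.savefig('{}_roc_{}.png'.format(save_path, figlist[0]))  # assumed naming scheme
    # PR curve, accumulated on figure figlist[1] across epochs
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    plt.figure(figlist[1])
    plt.plot(recall, precision, label=name)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.savefig('{}_pr_{}.png'.format(save_path, figlist[1]))

Note also that save_pretrained above persists only the BERT encoder, so the saved checkpoint cannot classify on its own: the linear head is lost. A minimal sketch for persisting and restoring the complete classifier with a plain state dict instead (the checkpoint path is illustrative, and constructing BertClassificationModel still assumes a GPU):

# save encoder + head together; the DataParallel wrapper is rebuilt by the
# constructor, so the state-dict keys ('bert.module.*', 'predictor.*') match
torch.save(bert_classifier_model.state_dict(), '../model_save/bert_classifier_best.pt')

# later / in another process:
model = BertClassificationModel()  # rebuilds encoder and head
state = torch.load('../model_save/bert_classifier_best.pt', map_location=device)
model.load_state_dict(state)
model.eval()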