BERT training (personal notes)

A PyTorch script that fine-tunes bert-base-uncased for binary text classification: stratified train/validation/test splits, linear warmup scheduling, per-epoch validation with ROC/PR plots, and checkpointing of the best encoder by validation F1.
#%% Imports
from transformers import BertTokenizer, BertModel
from transformers import get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.optim import AdamW  # transformers' AdamW is deprecated; use the torch implementation
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import time
import os, sys
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import shuffle
from plt_score import plt_roc_pr  # project-local plotting helper (see the stand-in sketch below)

#%% Load data
os.chdir(sys.path[0])  # run relative to the script's own directory
# df = pd.read_excel('../../data/label_data_language_en.xlsx')
df = pd.read_excel('../../data/label_data_language_en_aug.xlsx')  # augmented dataset
print(df['gf_review_fix'].value_counts())
df['content'] = df['content'].str.lower().fillna('test')  # lowercase all text
print(df.isnull().sum())
df = shuffle(df, random_state=1115)  # shuffle twice
df = shuffle(df, random_state=930)

#%% Tokenization
start_token = time.time()
x = list(df['content'])
y = list(df['gf_review_fix'])
# 80/10/10 stratified train/validation/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1634, stratify=y)
x_vali, x_test, y_vali, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=1111, stratify=y_test)

tokenizer = BertTokenizer.from_pretrained('../../model-pytorch/bert-base-uncased')
train_encoding = tokenizer(x_train, add_special_tokens=True, padding=True,
                           truncation=True, max_length=300, return_tensors="pt")
vali_encoding = tokenizer(x_vali, add_special_tokens=True, padding=True,
                          truncation=True, max_length=300, return_tensors="pt")
test_encoding = tokenizer(x_test, add_special_tokens=True, padding=True,
                          truncation=True, max_length=300, return_tensors="pt")

#%% Dataset class
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    # idx lets DataLoader batch the samples: it maps an index to one example
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encoding, y_train)
vali_dataset = NewsDataset(vali_encoding, y_vali)
test_dataset = NewsDataset(test_encoding, y_test)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# shuffle must stay False here: validation() collects predictions in loader order,
# and that order has to line up with y_vali / y_test when the metrics are computed
# (the original shuffled these loaders, which scrambled the metric alignment)
vali_loader = DataLoader(vali_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

end_token = time.time()
print('data token batch Running time:{} Seconds'.format(end_token - start_token))
# peek at one batch:
# batch = next(iter(train_loader))
# print(batch)
# print(batch['input_ids'].shape)

#%% Model
torch.cuda.empty_cache()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class BertClassificationModel(nn.Module):
    def __init__(self):
        super(BertClassificationModel, self).__init__()
        pretrained_weights = "../../model-pytorch/bert-base-uncased/"
        # to resume from an earlier checkpoint:
        # pretrained_weights = '../model_save/uncase_bert_stratify_19'
        # note: .cuda() assumes a GPU is available, as in the original script
        self.bert = torch.nn.DataParallel(BertModel.from_pretrained(pretrained_weights).cuda())
        # prediction head: BERT's hidden size is 768, two outputs for binary classification.
        # It returns raw logits; the original appended nn.Softmax(dim=1) here, but
        # CrossEntropyLoss applies log-softmax internally and expects logits
        self.predictor = nn.Linear(768, 2).cuda()

    def forward(self, input_ids, attention_mask):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_cls_hidden_state = bert_output[0][:, 0, :]  # hidden state of the [CLS] token
        logits = self.predictor(bert_cls_hidden_state)
        return logits

#%% Instantiate model, loss, optimizer, scheduler
torch.cuda.empty_cache()
bert_classifier_model = BertClassificationModel()

# hyperparameters -----------------
epochs = 10
lr = 1e-4

# optimizer, scheduler, loss
# switched from Adam to AdamW; the frozen-layer filter
# (filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5) was dropped
optimizer = AdamW(bert_classifier_model.parameters(), lr=lr)
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # default value in run_glue.py
                                            num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss().cuda()

# bookkeeping -----------------
os.makedirs('./result', exist_ok=True)  # output directory for the CSV logs and plots
# per-batch training loss
batch_loss = pd.DataFrame(columns=['epoch', 'batch', 'loss'])
# per-epoch validation scores
epoch_scores = pd.DataFrame(columns=['epoch', 'prescore', 'accscore', 'recascore', 'f1_score'])
# per-epoch test scores
epoch_test_scores = pd.DataFrame(columns=['epoch', 'prescore', 'accscore', 'recascore', 'f1_score'])
# best model so far
max_f1 = 0
max_epoch = 0

#%% Training function
def train():
    global batch_loss
    bert_classifier_model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        # forward pass
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = bert_classifier_model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_train_loss += loss.item()

        # backward pass and update
        loss.backward()
        nn.utils.clip_grad_norm_(bert_classifier_model.parameters(), 1.0)  # gradient clipping
        optimizer.step()
        scheduler.step()

        # running average loss, additionally scaled by batch size (logging only, as in the original)
        loss_each = total_train_loss / (iter_num + 1) / batch_size
        df1 = pd.DataFrame([[epoch, iter_num, loss_each]], columns=['epoch', 'batch', 'loss'])
        batch_loss = pd.concat([batch_loss, df1], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
        iter_num += 1
        if iter_num % 100 == 0:
            print("epoch: %d, iter_num: %d, loss: %.4f, %.2f%%"
                  % (epoch, iter_num, loss_each, iter_num / total_iter * 100))
            batch_loss.to_csv('./result/batch_loss.csv')
    print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss / len(train_loader)))

#%% Evaluation functions
def validation(test_dataloader):
    bert_classifier_model.eval()
    pre_list = []
    prob_list = []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = bert_classifier_model(input_ids, attention_mask=attention_mask)
            # probability of the positive class (softmax over the logits)
            prob = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
            prob_list.extend(prob.tolist())
            _, predicted = torch.max(outputs, 1)
            pre_list.extend(predicted.cpu().numpy().tolist())
    return pre_list, prob_list

def metric_show(pre_list, targets, prob_list, epoch, figlist):
    prescore = precision_score(targets, pre_list)
    accscore = accuracy_score(targets, pre_list)
    recascore = recall_score(targets, pre_list)
    f1score = f1_score(targets, pre_list)
    print("precision:{}".format(prescore))
    print("accuracy:{}".format(accscore))
    print("recall:{}".format(recascore))
    print("f1:{}".format(f1score))
    save_path = './result/bert'  # was commented out in the original, which raised a NameError below
    plt_roc_pr(targets, prob_list, name=str(epoch), save_path=save_path, figlist=figlist)
    return prescore, accscore, recascore, f1score

#%% Training loop
start = time.time()
for epoch in range(epochs):
    print("------------Epoch: %d ----------------" % epoch)
    start_epoch = time.time()
    train()

    # pick the best epoch on the validation set
    pre_list, prob_list = validation(vali_loader)
    prescore, accscore, recascore, f1score = metric_show(pre_list, y_vali, prob_list, epoch, figlist=[0, 1])
    epoch_scores.loc[epoch] = [epoch, prescore, accscore, recascore, f1score]
    epoch_scores.to_csv('./result/epoch_scores.csv')

    # track the best F1 and its epoch; save the model when it improves
    if epoch > 5 and max_f1 < f1score:
        max_f1 = f1score
        max_epoch = epoch
        print("Best F1 so far: {} at epoch {}".format(max_f1, max_epoch))
        save_directory = '../model_save/uncase_bert_stratify_best'
        # saves only the BERT encoder (unwrapped from DataParallel); the linear head is not saved
        bert_classifier_model.bert.module.save_pretrained(save_directory)

    # record test-set scores
    pre_list, prob_list = validation(test_loader)
    t_prescore, t_accscore, t_recascore, t_f1score = metric_show(pre_list, y_test, prob_list,
                                                                 epoch='t' + str(epoch), figlist=[2, 3])
    epoch_test_scores.loc[epoch] = [epoch, t_prescore, t_accscore, t_recascore, t_f1score]
    epoch_test_scores.to_csv('./result/epoch_test_scores.csv')

    end_epoch = time.time()
    print('epoch Running time:{} Seconds'.format(end_epoch - start_epoch))

end = time.time()
print('total model Running time:{} Seconds'.format(end - start))
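plt_score is a project-local module that is not shown in the post. A minimal stand-in sketch, assuming plt_roc_pr overlays one ROC curve and one PR curve per epoch on a shared pair of matplotlib figures (figlist holding the two figure numbers is how the [0, 1] / [2, 3] arguments above read) and saves them under save_path; the file-naming scheme here is an assumption:

# plt_score.py -- hypothetical stand-in for the project-local helper
import os
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve, auc

def plt_roc_pr(y_true, y_prob, name, save_path, figlist):
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
    # ROC curve, accumulated on figure figlist[0] across epochs
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    plt.figure(figlist[0])
    plt.plot(fpr, tpr, label='{} (AUC={:.3f})'.format(name, auc(fpr, tpr)))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend()
    plt.savefig('{}_roc_{}.png'.format(save_path, figlist[0]))  # assumed naming scheme
    # PR curve, accumulated on figure figlist[1] across epochs
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    plt.figure(figlist[1])
    plt.plot(recall, precision, label=name)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.savefig('{}_pr_{}.png'.format(save_path, figlist[1]))

Note also that save_pretrained above persists only the BERT encoder, so the saved checkpoint cannot classify on its own: the linear head is lost. A minimal sketch for persisting and restoring the complete classifier with a plain state dict instead (the checkpoint path is illustrative, and constructing BertClassificationModel still assumes a GPU):

# save encoder + head together; the DataParallel wrapper is rebuilt by the
# constructor, so the state-dict keys ('bert.module.*', 'predictor.*') match
torch.save(bert_classifier_model.state_dict(), '../model_save/bert_classifier_best.pt')

# later / in another process:
model = BertClassificationModel()  # rebuilds encoder and head
state = torch.load('../model_save/bert_classifier_best.pt', map_location=device)
model.load_state_dict(state)
model.eval()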