Short-Text Classification on TNEWS (Toutiao Chinese News)
- Data
  - Content
  - Data processing
- Model construction
- Training configuration
- Model training and prediction
  - Defining the evaluation function
  - Training
  - Prediction
Data
Content
The TNEWS dataset is drawn from the news section of Toutiao (今日头条) and covers 15 news categories, including travel, education, finance, and military. Dataset sizes: training set 53,360 examples, dev set 10,000, test set 10,000.
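The code in this section assumes that train_ds, dev_ds, tokenizer, and pretrained_model already exist (typically created in an earlier cell). A minimal sketch of one way to set them up, assuming an ERNIE backbone ('ernie-1.0' is an illustrative choice, not necessarily the original notebook's):

from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import ErnieModel, ErnieTokenizer

task_name = 'tnews'
# Load the train and dev splits of the CLUE TNEWS task
train_ds, dev_ds = load_dataset('clue', task_name, splits=['train', 'dev'])
# Tokenizer and backbone; any PaddleNLP pretrained model with a pooled output works
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
pretrained_model = ErnieModel.from_pretrained('ernie-1.0')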
Data processing
Convert the text data into token ids.
import numpy as np
import paddle
from functools import partial

# Function that converts a single example into ids
def convert_example(example, tokenizer):
    encoded_inputs = tokenizer(text=example["sentence"],
                               max_seq_len=128,
                               pad_to_max_seq_len=True)
    return tuple([np.array(x, dtype="int64") for x in [
        encoded_inputs["input_ids"],
        encoded_inputs["token_type_ids"],
        [example["label"]]]])

# Convert the training set to ids
train_ds = train_ds.map(partial(convert_example, tokenizer=tokenizer))
# Convert the dev set to ids
dev_ds = dev_ds.map(partial(convert_example, tokenizer=tokenizer))

# Build the training dataloader
train_batch_size = 32
dev_batch_size = 32
train_batch_sampler = paddle.io.DistributedBatchSampler(
    dataset=train_ds, batch_size=train_batch_size, shuffle=True)
train_data_loader = paddle.io.DataLoader(
    dataset=train_ds, batch_sampler=train_batch_sampler, return_list=True)

# The dev set is evaluated on a single card, so paddle.io.BatchSampler suffices
dev_batch_sampler = paddle.io.BatchSampler(
    dev_ds, batch_size=dev_batch_size, shuffle=False)
dev_data_loader = paddle.io.DataLoader(
    dataset=dev_ds, batch_sampler=dev_batch_sampler, return_list=True)
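A quick sanity check on the converted data; the shapes are fixed by max_seq_len=128 and padding:

# After map(), each example is an (input_ids, token_type_ids, label) tuple of int64 arrays
input_ids, token_type_ids, label = train_ds[0]
print(input_ids.shape, token_type_ids.shape, label)  # (128,) (128,) [label_id]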
Model construction
import paddle.nn as nn

class ShortTextClassification(nn.Layer):
    def __init__(self, pretrained_model, num_class, dropout=None):
        super().__init__()
        self.ptm = pretrained_model
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
        # Project the pooled [CLS] embedding onto the num_class labels
        self.classifier = nn.Linear(self.ptm.config["hidden_size"], num_class)

    def forward(self,
                input_ids,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None):
        # The second output of the backbone is the pooled [CLS] representation
        _, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids,
                                    attention_mask)
        cls_embedding = self.dropout(cls_embedding)
        logits = self.classifier(cls_embedding)
        return logits

model = ShortTextClassification(pretrained_model, num_class=len(train_ds.label_list))
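A minimal shape check, assuming the backbone accepts input_ids alone (token type ids then default to None):

# A fake batch of 2 sequences of length 128 should produce [2, num_class] logits
fake_ids = paddle.randint(low=1, high=1000, shape=[2, 128], dtype='int64')
print(model(input_ids=fake_ids).shape)  # [2, 15] for TNEWS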
Training configuration
import os
import time
from paddlenlp.transformers import LinearDecayWithWarmup

epochs = 3
num_training_steps = len(train_data_loader) * epochs
# Define the learning-rate scheduler that adjusts the lr during training
lr_scheduler = LinearDecayWithWarmup(2E-5, num_training_steps, 0.0)
# Directory in which model parameters are stored after training
save_dir = "checkpoint"
# Create the save directory
os.makedirs(save_dir, exist_ok=True)
# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
# Define the optimizer (weight_decay set to 0.01 so that decay_params actually takes effect)
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=0.01,
    apply_decay_param_fun=lambda x: x in decay_params)
# Cross-entropy loss
criterion = paddle.nn.loss.CrossEntropyLoss()
# Use accuracy as the evaluation metric
metric = paddle.metric.Accuracy()
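With a warmup proportion of 0.0, LinearDecayWithWarmup simply decays the lr linearly from 2e-5 to 0 over num_training_steps. A quick illustration of the schedule (plain arithmetic, not a scheduler call):

# Expected lr at the halfway point of training: about half the peak lr
peak_lr = 2E-5
step = num_training_steps // 2
print(peak_lr * (1 - step / num_training_steps))  # ~1e-5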
Model training and prediction
Defining the evaluation function
# The model is evaluated on the dev set during training, so we define the evaluation function first
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader, phase="dev"):
    model.eval()
    metric.reset()
    losses = []
    for batch in data_loader:
        input_ids, token_type_ids, labels = batch
        probs = model(input_ids=input_ids, token_type_ids=token_type_ids)
        # Compute the loss
        loss = criterion(probs, labels)
        losses.append(loss.numpy())
        # Compute the accuracy
        correct = metric.compute(probs, labels)
        # Update the accuracy metric
        metric.update(correct)
    accu = metric.accumulate()
    print("eval {} loss: {:.5}, accu: {:.5}".format(phase,
                                                    np.mean(losses), accu))
    model.train()
    metric.reset()
    return np.mean(losses), accu
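An optional sanity check before training: evaluating the untrained model should give roughly chance-level accuracy (about 1/15 for TNEWS):

evaluate(model, criterion, metric, dev_data_loader, phase="dev")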
Training
def do_train(model, criterion, metric, dev_data_loader, train_data_loader):
    global_step = 0
    tic_train = time.time()
    best_accuracy = 0.0
    for epoch in range(1, epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            probs = model(input_ids=input_ids, token_type_ids=token_type_ids)
            loss = criterion(probs, labels)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()
            global_step += 1
            # Log training metrics every 100 steps
            if global_step % 100 == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc,
                       100 / (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            # Evaluate on the dev set every 500 steps and keep the best checkpoint
            if global_step % 500 == 0:
                eval_loss, eval_accu = evaluate(model, criterion, metric,
                                                dev_data_loader, "dev")
                if best_accuracy < eval_accu:
                    best_accuracy = eval_accu
                    # Save the model parameters
                    save_param_path = os.path.join(save_dir, 'model_best.pdparams')
                    paddle.save(model.state_dict(), save_param_path)
                    # Save the tokenizer
                    tokenizer.save_pretrained(save_dir)

do_train(model, criterion, metric, dev_data_loader, train_data_loader)
Prediction
# Load the best checkpoint saved during training
state_dict = paddle.load('checkpoint/model_best.pdparams')
model.load_dict(state_dict)
# The test split can be either test or test1.0
test_ds = load_dataset('clue', task_name, splits=['test1.0'])
from tqdm import tqdm

def do_predict(model, example):
    # Convert the text into input_ids and token_type_ids
    encoded_text = tokenizer(text=example["sentence"],
                             max_seq_len=512,
                             pad_to_max_seq_len=True)
    # Turn input_ids into a paddle tensor
    input_ids = paddle.to_tensor([encoded_text['input_ids']])
    # Turn token_type_ids into a paddle tensor
    segment_ids = paddle.to_tensor([encoded_text['token_type_ids']])
    # Model forward pass; the output is the classifier logits
    logits = model(input_ids, segment_ids)
    # Take the index with the highest logit
    out2 = paddle.argmax(logits, axis=1)
    # print('Predicted label: {}'.format(out2.numpy()[0]))
    # print('True label: {}'.format(test_ds[0]['label']))
    return out2.numpy()[0]

# Switch off dropout before predicting
model.eval()
predict_label = []
for i in tqdm(range(len(test_ds))):
    example = test_ds[i]
    label_pred = do_predict(model, example)
    predict_label.append(label_pred)
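A quick look at the prediction distribution can catch obvious failures, such as the model predicting a single class for everything:

import collections
# Show the five most frequent predicted label ids
print(collections.Counter(predict_label).most_common(5))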
import json

output_submit_file = "tnews10_predict.json"
# Map label ids back to the original label strings
label_map = {i: label for i, label in enumerate(train_ds.label_list)}
# Write the predictions in CLUE submission format, one JSON object per line
with open(output_submit_file, "w") as writer:
    for i, pred in enumerate(predict_label):
        json_d = {'id': i, 'label': str(label_map[pred])}
        writer.write(json.dumps(json_d) + '\n')
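Each line of the resulting file is a standalone JSON object; reading the first line back is a cheap format check (the label value shown is illustrative):

with open(output_submit_file) as f:
    print(f.readline().strip())  # e.g. {"id": 0, "label": "108"}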