Qwen2.5-0.5B-Instruct is an instruction-tuned model in the Qwen2.5 series of language models developed by Alibaba Cloud's Qwen team. It is a causal language model with 0.5B parameters that has been through both a pretraining and a post-training stage. The model is good at generating content relevant to its input, but on classification tasks it may not strictly output a category label and can produce content unrelated to the task. Applying it directly to review sentiment prediction therefore runs into the following problems:
- Output format: the model does not reliably restrict its output to a sentiment label and may generate content unrelated to the task.
- Accuracy: prediction accuracy on the test set is low and does not meet the business requirement.
To improve its performance, we fine-tune the model so that it better fits our needs. Two fine-tuning approaches are worth considering:
- Use Qwen2.5-0.5B-Instruct as a feature extractor, followed by a classification head that maps the extracted features onto the predefined sentiment labels.
- Prompt Qwen2.5-0.5B-Instruct with a carefully designed instruction so that it generates the sentiment label directly.
We take the second approach. With a well-designed prompt, the model generates the category label directly from the input review, which plays to its strength as a generative model and should improve classification accuracy. The sketch below shows what one such training sample looks like once it has been rendered through the chat template.
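A minimal sketch for inspecting a rendered training sample, assuming the stock chat template shipped with the Qwen2.5 tokenizer; the system prompt is the one used for fine-tuning in section 2, and the example review is taken from the inference demo below:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')

message = [{'role': 'system', 'content': '你是一个专业的情感分类助手。你的任务是对输入的文本进行情感分析,判断其情感倾向并输出 "好评" 或 "差评" 两个词之一,不要输出任何其他额外的信息或解释。'},
           {'role': 'user', 'content': '今天心情真好啊'},
           {'role': 'assistant', 'content': '好评'}]

# tokenize=False returns the rendered prompt text instead of token ids,
# which makes it easy to see exactly what the model is trained on.
text = tokenizer.apply_chat_template(message, add_generation_prompt=False, tokenize=False)
print(text)

For Qwen2.5, each turn is wrapped in <|im_start|>role ... <|im_end|> markers, and the assistant turn carries the target label the model learns to emit.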
Hugging Face: https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct
Runtime environment: Ubuntu 22.04 + Python 3.10 + RTX 3060 (12 GB VRAM) + driver 550.144.03 + CUDA 12.4
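Before training, a quick sanity check of the GPU and library stack can save time. The snippet below only reports versions; this walkthrough does not pin specific package versions:

import torch
import transformers

# Report the library versions in use and verify that PyTorch can see the GPU.
print('torch:', torch.__version__, '| CUDA build:', torch.version.cuda)
print('transformers:', transformers.__version__)
print('GPU available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('device:', torch.cuda.get_device_name(0))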
1. Preparing the Data
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter


def show_info(data):
    review_sizes = []
    for label, review in data.to_numpy().tolist():
        review_sizes.append(len(review))
    print('Max length:', max(review_sizes))
    print('Min length:', min(review_sizes))
    print('Mean length:', int(sum(review_sizes) / len(review_sizes)))
    print('-' * 50)


def demo():
    # data = pd.read_csv('ChnSentiCorp_htl_8k/ChnSentiCorp_htl_8k.csv')
    data = pd.read_csv('weibo_senti_100k/weibo_senti_100k.csv')
    data['label'] = np.where(data['label'] == 1, '好评', '差评')
    print('Label distribution:', Counter(data['label']))
    print('-' * 50)

    # Drop reviews that are too short or too long
    data = data[data['review'].apply(lambda x: len(x) > 10 and len(x) < 300)]
    show_info(data)

    # Split the full dataset, stratified by label
    train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=42)
    print('Full training set:', train_data.shape)
    print('Full test set:', test_data.shape)
    print('-' * 50)

    # Sample a subset for fine-tuning
    sample_num = 5000
    train_data = train_data.sample(int(sample_num * 0.8), random_state=42)
    test_data = test_data.sample(int(sample_num * 0.2), random_state=52)
    print('Final training set:', train_data.shape)
    print('Final test set:', test_data.shape)

    # Convert to lists of record dicts
    train_data = train_data.to_dict(orient='records')
    test_data = test_data.to_dict(orient='records')

    # Persist both splits to disk
    pickle.dump(train_data, open('weibo_senti_100k/01-训练集.pkl', 'wb'))
    pickle.dump(test_data, open('weibo_senti_100k/02-测试集.pkl', 'wb'))


if __name__ == '__main__':
    demo()
Label distribution: Counter({'差评': 59995, '好评': 59993})
--------------------------------------------------
Max length: 260
Min length: 11
Mean length: 68
--------------------------------------------------
Full training set: (92480, 2)
Full test set: (23121, 2)
--------------------------------------------------
Final training set: (4000, 2)
Final test set: (1000, 2)
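The two pickle files are what the later steps consume. As a quick sanity check that they can be read back, reusing the paths from the script above (the printed record shape is illustrative):

import pickle
from collections import Counter

train_data = pickle.load(open('weibo_senti_100k/01-训练集.pkl', 'rb'))
print(train_data[0])                            # a dict such as {'label': ..., 'review': ...}
print(Counter(d['label'] for d in train_data))  # labels stay roughly balanced after sampling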
2. Fine-Tuning the Model
import torch
import pickle
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import Trainer
from transformers import DataCollatorForLanguageModeling
from transformers import Qwen2Tokenizer
from transformers import Qwen2ForCausalLM

# https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def get_dataset(tokenizer):
    comm_data = pickle.load(open('weibo_senti_100k/01-训练集.pkl', 'rb'))
    result_data = []
    for data in comm_data:
        message = [{'role': 'system', 'content': '你是一个专业的情感分类助手。你的任务是对输入的文本进行情感分析,判断其情感倾向并输出 "好评" 或 "差评" 两个词之一,不要输出任何其他额外的信息或解释。'},
                   {'role': 'user', 'content': data['review']},
                   {'role': 'assistant', 'content': data['label']}]
        # Render the whole conversation, including the assistant answer, into token ids
        inputs = tokenizer.apply_chat_template(message, add_generation_prompt=False, tokenize=True)
        result_data.append(inputs)
    return result_data


# watch -n 1 nvidia-smi
def demo():
    estimator: Qwen2ForCausalLM = AutoModelForCausalLM.from_pretrained('Qwen2.5-0.5B-Instruct').to(device)
    tokenizer: Qwen2Tokenizer = AutoTokenizer.from_pretrained('Qwen2.5-0.5B-Instruct')

    arguments = TrainingArguments(output_dir='Qwen2.5-0.5B-Instruct-SFT',
                                  per_device_train_batch_size=2,
                                  optim='adamw_torch',
                                  num_train_epochs=5,
                                  learning_rate=2e-5,
                                  eval_strategy='no',
                                  save_strategy='epoch',
                                  logging_strategy='epoch',
                                  gradient_accumulation_steps=4,
                                  save_total_limit=5,
                                  load_best_model_at_end=False)

    train_data = get_dataset(tokenizer)
    trainer = Trainer(model=estimator,
                      train_dataset=train_data,
                      args=arguments,
                      # mlm=False -> plain causal-LM objective; the collator pads and builds labels
                      data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False))
    trainer.train()


if __name__ == '__main__':
    demo()
{'loss': 1.9545, 'grad_norm': 6.6124982833862305, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}
{'loss': 1.3967, 'grad_norm': 8.334535598754883, 'learning_rate': 1.2e-05, 'epoch': 2.0}
{'loss': 0.8885, 'grad_norm': 8.637581825256348, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.0}
{'loss': 0.5098, 'grad_norm': 7.636900424957275, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}
{'loss': 0.2875, 'grad_norm': 5.367765426635742, 'learning_rate': 0.0, 'epoch': 5.0}
{'train_runtime': 2040.111, 'train_samples_per_second': 9.803, 'train_steps_per_second': 1.225, 'train_loss': 1.007375665283203, 'epoch': 5.0}
100%|███████████████████████████████████████| 2500/2500 [34:00<00:00, 1.23it/s]
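The step count in the progress bar follows directly from the training arguments; a quick check of the arithmetic:

# Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
effective_batch = 2 * 4                      # = 8
steps_per_epoch = 4000 // effective_batch    # = 500 optimizer steps per epoch
total_steps = steps_per_epoch * 5            # = 2500, matching the progress bar above
print(effective_batch, steps_per_epoch, total_steps)

With save_strategy='epoch', a checkpoint is written every 500 steps; if a run is interrupted, trainer.train(resume_from_checkpoint=True) resumes from the newest checkpoint in output_dir.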
3. Model Inference
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import Qwen2Tokenizer
from transformers import Qwen2ForCausalLM

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def demo():
    model_path = 'Qwen2.5-0.5B-Instruct-SFT/checkpoint-2500'
    estimator: Qwen2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path).to(device)
    # The checkpoint directory does not include tokenizer files, so load them from the base model
    tokenizer: Qwen2Tokenizer = AutoTokenizer.from_pretrained('Qwen2.5-0.5B-Instruct')

    system = '你是一个专业的情感分类专家,请对以下文本进行情感分类,并输出 "好评" 或 "差评" 两个词之一。'
    while True:
        comment = input('Enter a review: ')
        message = [{'role': 'system', 'content': system},
                   {'role': 'user', 'content': comment}]
        inputs = tokenizer.apply_chat_template(message,
                                               add_generation_prompt=True,
                                               tokenize=True,
                                               return_tensors='pt',
                                               return_dict=True).to(device)
        inputs_length = len(inputs['input_ids'][0])
        with torch.no_grad():
            outputs = estimator.generate(**inputs, max_length=512)
        # Strip the prompt tokens and decode only what the model generated
        output = outputs[0]
        y_pred = tokenizer.decode(output[inputs_length:], skip_special_tokens=True).strip()
        print('Predicted label:', y_pred)
        print('-' * 50)


if __name__ == '__main__':
    demo()
Enter a review: 今天心情真好啊
Predicted label: 好评
--------------------------------------------------
Enter a review: 今天心情很差,想打人
Predicted label: 差评
--------------------------------------------------
Enter a review: 我也不知道为什么,我就留言评论了一下,她就骂我
Predicted label: 差评
--------------------------------------------------
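Since the fine-tuned model only ever needs to emit one of two short labels, generation can be capped much tighter than max_length=512. A drop-in variant of the generate() call above; max_new_tokens and do_sample are standard generate() parameters, and the cap of 8 new tokens is an assumption that comfortably covers either label:

with torch.no_grad():
    outputs = estimator.generate(**inputs,
                                 max_new_tokens=8,  # '好评' / '差评' are only a few tokens long
                                 do_sample=False)   # greedy decoding gives deterministic labels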
4. Model Evaluation
import numpy as np
import pickle
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import Qwen2Tokenizer
from transformers import Qwen2ForCausalLM

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def evaluate(model_path):
    # Load the model and tokenizer
    estimator: Qwen2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path).to(device)
    # Left padding so that generated tokens always start right after the prompt
    tokenizer: Qwen2Tokenizer = AutoTokenizer.from_pretrained('Qwen2.5-0.5B-Instruct', padding_side='left')

    # Load the test set
    test_data = pickle.load(open('weibo_senti_100k/02-测试集.pkl', 'rb'))

    # Data loader
    system = '你是一个专业的情感分类专家,请对以下文本进行情感分类,并输出 "好评" 或 "差评" 两个词之一。'

    def collate_fn(batch_data):
        inputs, labels = [], []
        for data in batch_data:
            message = [{'role': 'system', 'content': system},
                       {'role': 'user', 'content': data['review']}]
            inputs.append(message)
            labels.append(data['label'])
        inputs = tokenizer.apply_chat_template(inputs,
                                               add_generation_prompt=True,
                                               tokenize=True,
                                               return_tensors='pt',
                                               padding=True,
                                               return_dict=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        return inputs, labels

    dataloader = DataLoader(test_data, batch_size=8, shuffle=True, collate_fn=collate_fn)

    # Predict and score
    true_labels, pred_labels, wrong = [], [], 0
    description = 'Evaluating - malformed outputs: %d'
    progress = tqdm(range(len(dataloader)), desc=description % wrong)
    for inputs, labels in dataloader:
        with torch.no_grad():
            outputs = estimator.generate(**inputs, max_length=512)
        progress.update()
        # Decode only the generated part of each sequence
        for output, input, y_true in zip(outputs, inputs['input_ids'], labels):
            y_pred = tokenizer.decode(output[len(input):], skip_special_tokens=True).strip()
            # Count predictions that are not one of the two expected labels
            if y_pred not in ['好评', '差评']:
                wrong += 1
                progress.set_description(description % wrong)
                continue
            pred_labels.append(y_pred)
            true_labels.append(y_true)
    progress.close()

    return np.sum(np.array(true_labels) == np.array(pred_labels)) / len(true_labels)


def demo():
    model_path = 'Qwen2.5-0.5B-Instruct'
    acc = evaluate(model_path)
    print('Accuracy before fine-tuning: %.3f' % acc)

    model_path = 'Qwen2.5-0.5B-Instruct-SFT/checkpoint-2500'
    acc = evaluate(model_path)
    print('Accuracy after fine-tuning: %.3f' % acc)


if __name__ == '__main__':
    demo()
Evaluating - malformed outputs: 1: 100%|███████████████████████| 125/125 [00:26<00:00, 4.66it/s]
Accuracy before fine-tuning: 0.770
Evaluating - malformed outputs: 0: 100%|███████████████████████| 125/125 [00:26<00:00, 4.73it/s]
Accuracy after fine-tuning: 0.983
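Accuracy alone hides per-class behavior. If evaluate() is extended to also return the collected true_labels and pred_labels lists, per-class precision and recall come for free from scikit-learn (already used in the data-preparation step). A hedged sketch with placeholder label lists:

from sklearn.metrics import classification_report

# Placeholder values for illustration; in practice, pass the lists collected inside evaluate().
true_labels = ['好评', '差评', '好评', '差评']
pred_labels = ['好评', '差评', '差评', '差评']
print(classification_report(true_labels, pred_labels, digits=3))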