Efficient Fine-Tuning for a Classification Task with LoRA

Direct (full) fine-tuning updates all of the model's parameters, optimizing every weight against the target dataset. Because the entire model adapts to the new task, this approach can in principle achieve the best performance, but it requires large amounts of compute and training time.

LoRA (Low-Rank Adaptation) instead adjusts only a small part of the model: it injects trainable low-rank matrices into selected layers, and during fine-tuning only these low-rank matrices are updated while the vast majority of the original weights stay frozen. This dramatically reduces the number of trainable parameters and the compute required.
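
Conceptually, each frozen weight matrix W is augmented with a trainable update ΔW = B·A, where A and B have a small rank r. The snippet below is a minimal, self-contained sketch of that idea; the LoRALinear class, its dimensions, and the scaling follow the usual LoRA formulation and are illustrative only, not part of this article's training code.

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Illustrative only: a frozen linear layer plus a trainable low-rank update."""
    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 4):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)                      # freeze the original weights
        self.lora_A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(base.out_features, r))
        self.scaling = alpha / r                         # LoRA scaling factor

    def forward(self, x):
        # y = W x + (alpha / r) * B A x
        return self.base(x) + (x @ self.lora_A.T @ self.lora_B.T) * self.scaling

layer = LoRALinear(nn.Linear(1024, 1024), r=8, alpha=4)
trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
total = sum(p.numel() for p in layer.parameters())
print(f'trainable: {trainable} / total: {total}')        # only A and B are trained

With r = 8 and a 1024×1024 layer, the trainable update amounts to about 16K parameters against roughly a million frozen ones; full fine-tuning would update all of them. The environment used in this walkthrough is set up as follows: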

conda create -n fine-env python=3.10

pip install transformers==4.46.0 -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install torch==2.6.0+cu126 --index-url https://download.pytorch.org/whl/cu126
pip install peft==0.14.0 -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install pandas==2.2.3 -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install scikit-learn==1.6.1 -i https://pypi.tuna.tsinghua.edu.cn/simple/
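
After installation, a quick sanity check confirms that the CUDA build of PyTorch and the expected library versions were picked up. This is just a minimal sketch; the printed values depend on your environment.

import torch, transformers, peft, sklearn

print('torch:', torch.__version__, '| CUDA available:', torch.cuda.is_available())
print('transformers:', transformers.__version__)
print('peft:', peft.__version__, '| scikit-learn:', sklearn.__version__)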

1. Data Preparation

import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter


def show_info(data):

    review_sizes = []
    for label, review in data.to_numpy().tolist():
        review_sizes.append(len(review))

    print('最大长度:', max(review_sizes))
    print('最小长度:', min(review_sizes))
    print('平均长度:', int(sum(review_sizes) / len(review_sizes)))
    print('-' * 50)


def demo():
    # data = pd.read_csv('ChnSentiCorp_htl_8k/ChnSentiCorp_htl_8k.csv')
    data = pd.read_csv('weibo_senti_100k/weibo_senti_100k.csv')
    data['label'] = np.where(data['label'] == 1, '好评', '差评')

    print('数据标签分布:', Counter(data['label']))
    print('-' * 50)

    # Keep reviews between 10 and 300 characters (drops ones that are too short or too long)
    data = data[data['review'].apply(lambda x: len(x) > 10 and len(x) < 300)]
    show_info(data)

    # Split the original data into train / validation / test
    train_data, test_data  = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=42)
    train_data, valid_data  = train_test_split(train_data, test_size=0.2, stratify=train_data['label'], random_state=42)

    print('原始训练集数量:', train_data.shape)
    print('原始测试集数量:', test_data.shape)
    print('-' * 50)

    # Sample a subset of the data
    sample_num = 5000
    train_data = train_data.sample(int(sample_num * 0.7), random_state=42)
    valid_data = valid_data.sample(int(sample_num * 0.1), random_state=42)
    test_data  = test_data.sample(int(sample_num * 0.2),  random_state=52)

    print('最终训练集数量:', train_data.shape)
    print('最终验证集数量:', valid_data.shape)
    print('最终测试集数量:', test_data.shape)

    # Convert the DataFrames to lists of dicts
    train_data = train_data.to_dict(orient='records')
    valid_data = valid_data.to_dict(orient='records')
    test_data  = test_data.to_dict(orient='records')

    # Save the splits to disk
    pickle.dump(train_data, open('weibo_senti_100k/01-训练集.pkl', 'wb'))
    pickle.dump(valid_data, open('weibo_senti_100k/02-验证集.pkl', 'wb'))
    pickle.dump(test_data,  open('weibo_senti_100k/03-测试集.pkl', 'wb'))


if __name__ == '__main__':
    demo()
数据标签分布: Counter({'差评': 59995, '好评': 59993})
--------------------------------------------------
最大长度: 260
最小长度: 11
平均长度: 68
--------------------------------------------------
原始训练集数量: (73984, 2)
原始测试集数量: (23121, 2)
--------------------------------------------------
最终训练集数量: (3500, 2)
最终验证集数量: (500, 2)
最终测试集数量: (1000, 2)
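
Each pickle file now stores a plain list of {'label': ..., 'review': ...} dicts. A quick check, sketched below against the paths created above, verifies the stored format before moving on to fine-tuning.

import pickle
from collections import Counter

train_data = pickle.load(open('weibo_senti_100k/01-训练集.pkl', 'rb'))
print(len(train_data))                                # 3500 records
print(train_data[0].keys())                           # dict_keys(['label', 'review'])
print(Counter(item['label'] for item in train_data))  # roughly balanced 好评 / 差评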

2. Model Fine-Tuning

import torch
import pickle
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import Trainer
from transformers import DataCollatorForLanguageModeling
from transformers import Qwen2Tokenizer
from transformers import Qwen2ForCausalLM
from peft import LoraConfig
from peft import TaskType
from peft import get_peft_model

# https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_dataset(data_type, tokenizer):
    data_name = '01-训练集.pkl' if data_type == 'train' else '02-验证集.pkl'
    comm_data = pickle.load(open('weibo_senti_100k/' + data_name, 'rb'))
    result_data = []
    for data in comm_data:
        message = [{'role': 'system', 'content': '你是一个专业的情感分类助手。你的任务是对输入的文本进行情感分析,判断其情感倾向并输出 "好评" 或 "差评" 两个词之一,不要输出任何其他额外的信息或解释。'},
                   {'role': 'user', 'content': data['review']},
                   {'role': 'assistant', 'content': data['label']}]
        inputs = tokenizer.apply_chat_template(message, add_generation_prompt=False, tokenize=True)
        result_data.append(inputs)

    return result_data


# Monitor GPU usage during training with: watch -n 1 nvidia-smi
def demo():

    estimator: Qwen2ForCausalLM = AutoModelForCausalLM.from_pretrained('Qwen2.5-0.5B-Instruct').to(device)
    tokenizer: Qwen2Tokenizer = AutoTokenizer.from_pretrained('Qwen2.5-0.5B-Instruct', padding_side='left')

    lora_config = LoraConfig(r=8,
                             lora_alpha=4,
                             lora_dropout=0.3,
                             bias='none',
                             task_type=TaskType.CAUSAL_LM,
                             inference_mode=False)

    estimator = get_peft_model(estimator, peft_config=lora_config)

    arguments = TrainingArguments(output_dir='Qwen2.5-0.5B-Instruct-LoRA',
                                  per_device_train_batch_size=8,
                                  optim='adamw_torch',
                                  num_train_epochs=10,
                                  learning_rate=1e-3,
                                  eval_strategy='no',
                                  save_strategy='epoch',
                                  logging_strategy='epoch',
                                  gradient_accumulation_steps=1,
                                  save_total_limit=5,
                                  load_best_model_at_end=False)

    train_data = get_dataset('train', tokenizer)
    valid_data = get_dataset('valid', tokenizer)
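    # Note: eval_strategy='no' and eval_dataset is commented out below, so the validation set is not used here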
    trainer = Trainer(model=estimator,
                      train_dataset=train_data,
                      # eval_dataset=valid_data,
                      args=arguments,
                      data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False))

    trainer.train()


if __name__ == '__main__':
    demo()
{'loss': 2.144, 'grad_norm': 0.631549060344696, 'learning_rate': 0.0009000000000000001, 'epoch': 1.0}
{'loss': 1.9552, 'grad_norm': 0.7571874856948853, 'learning_rate': 0.0008, 'epoch': 2.0}
{'loss': 1.9151, 'grad_norm': 0.671574056148529, 'learning_rate': 0.0007, 'epoch': 3.0}
{'loss': 1.8882, 'grad_norm': 0.6083812117576599, 'learning_rate': 0.0006, 'epoch': 4.0}
{'loss': 1.863, 'grad_norm': 0.6990318894386292, 'learning_rate': 0.0005, 'epoch': 5.0}
{'loss': 1.8428, 'grad_norm': 0.7220743894577026, 'learning_rate': 0.0004, 'epoch': 6.0}
{'loss': 1.8226, 'grad_norm': 0.8619930744171143, 'learning_rate': 0.0003, 'epoch': 7.0}
{'loss': 1.8029, 'grad_norm': 0.7418396472930908, 'learning_rate': 0.0002, 'epoch': 8.0}
{'loss': 1.7894, 'grad_norm': 0.7201936841011047, 'learning_rate': 0.0001, 'epoch': 9.0}
{'loss': 1.7793, 'grad_norm': 0.738139808177948, 'learning_rate': 0.0, 'epoch': 10.0}
{'train_runtime': 2092.9564, 'train_samples_per_second': 16.723, 'train_steps_per_second': 2.093, 'train_loss': 1.8802502218446775, 'epoch': 10.0}
100%|███████████████████████████████████████| 4380/4380 [34:52<00:00,  2.09it/s]
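
Each checkpoint above stores only the LoRA adapter weights rather than a full copy of the model. During training, calling estimator.print_trainable_parameters() right after get_peft_model reports how small the trainable fraction is. For deployment it can also be convenient to merge the adapter back into the base weights so the result loads like an ordinary model; the sketch below reuses the final checkpoint path from the run above.

from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained('Qwen2.5-0.5B-Instruct')
model = PeftModel.from_pretrained(base, 'Qwen2.5-0.5B-Instruct-LoRA/checkpoint-4380')

# Fold W + (alpha / r) * B A into a single weight matrix and drop the PEFT wrapper
merged = model.merge_and_unload()
merged.save_pretrained('Qwen2.5-0.5B-Instruct-LoRA-merged')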

3. Model Inference

import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from peft import PeftModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



def demo():

    estimator = AutoModelForCausalLM.from_pretrained('Qwen2.5-0.5B-Instruct')
    tokenizer = AutoTokenizer.from_pretrained('Qwen2.5-0.5B-Instruct')
    estimator = PeftModel.from_pretrained(estimator, 'Qwen2.5-0.5B-Instruct-LoRA/checkpoint-4380').to(device)
    
    system = '你是一个专业的情感分类专家,请对以下文本进行情感分类,并输出 "好评" 或 "差评" 两个词之一。'
    while True:
        comment = input('请输入评论内容:')
        message = [{'role': 'system', 'content': system}, {'role': 'user', 'content': comment}]
        inputs = tokenizer.apply_chat_template(message,
                                               add_generation_prompt=True,
                                               tokenize=True,
                                               return_tensors='pt',
                                               return_dict=True).to(device)
        inputs_length = len(inputs['input_ids'][0])
        with torch.no_grad():
            outputs = estimator.generate(**inputs, max_length=128)
        output = outputs[0]
        y_pred = tokenizer.decode(output[inputs_length:], skip_special_tokens=True).strip()
        print('预测标签:', y_pred)
        print('-' * 50)


if __name__ == '__main__':
    demo()

4. Model Evaluation

import numpy as np
import pickle
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import Qwen2Tokenizer
from transformers import Qwen2ForCausalLM
from peft import PeftModel


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def evaluate(estimator, tokenizer):
    # Load the test set
    test_data = pickle.load(open('weibo_senti_100k/03-测试集.pkl', 'rb'))

    # Build the batched data loader
    system = '你是一个专业的情感分类专家,请对以下文本进行情感分类,并输出 "好评" 或 "差评" 两个词之一。'
    def collate_fn(batch_data):
        inputs, labels = [], []
        for data in batch_data:
            message = [{'role': 'system', 'content': system}, {'role': 'user', 'content': data['review']}]
            inputs.append(message)
            labels.append(data['label'])

        inputs = tokenizer.apply_chat_template(inputs,
                                               add_generation_prompt=True,
                                               tokenize=True,
                                               return_tensors='pt',
                                               padding=True,
                                               return_dict=True).to(device)
        return inputs, labels

    dataloader = DataLoader(test_data, batch_size=8, shuffle=True, collate_fn=collate_fn)

    # Run predictions and score them
    true_labels, pred_labels, wrong = [], [], 0
    description = '评估-输出错误: %d'
    progress = tqdm(range(len(dataloader)), desc=description % wrong)
    for inputs, labels in dataloader:
        with torch.no_grad():
            outputs = estimator.generate(**inputs, max_length=512)
        progress.update()

        # Decode only the generated continuation for each sample
        for output, input_ids, y_true in zip(outputs, inputs['input_ids'], labels):
            y_pred = tokenizer.decode(output[len(input_ids):], skip_special_tokens=True).strip()
            if y_pred not in ['好评', '差评']:
                wrong += 1
                progress.set_description(description % wrong)
                continue

            pred_labels.append(y_pred)
            true_labels.append(y_true)

    progress.close()

    return np.sum(np.array(true_labels) == np.array(pred_labels)) / len(true_labels)


def demo():
    estimator = AutoModelForCausalLM.from_pretrained('Qwen2.5-0.5B-Instruct').to(device)
    tokenizer: Qwen2Tokenizer = AutoTokenizer.from_pretrained('Qwen2.5-0.5B-Instruct', padding_side='left')
    acc = evaluate(estimator, tokenizer)
    print('模型微调前: %.3f' % acc)

    estimator = PeftModel.from_pretrained(estimator, 'Qwen2.5-0.5B-Instruct-LoRA/checkpoint-4380').to(device)
    acc = evaluate(estimator, tokenizer)
    print('模型微调后: %.3f' % acc)


if __name__ == '__main__':
    demo()
评估-输出错误: 1: 100%|███████████████████████| 125/125 [00:22<00:00,  5.66it/s]
模型微调前: 0.766
评估-输出错误: 0: 100%|███████████████████████| 125/125 [00:23<00:00,  5.36it/s]
模型微调后: 0.982