Dataset: 中文酒店评论分类语料.csv (a Chinese hotel-review classification corpus)
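The scripts below expect the CSV to contain at least a label column (0 = negative, 1 = positive, matching the class names used later in classification_report) and a review column, roughly like this (the sample rows are invented for illustration):

label,review
1,房间很干净 服务也热情
0,隔音太差 半夜被走廊的声音吵醒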
1. Data preprocessing
from datasets import Dataset
import pandas as pd
import re
from pyhanlp import JClass
from symspellpy.symspellpy import SymSpell
import time
import datasets

datasets.disable_progress_bar()


def timer(func):
    def inner(*args, **kwargs):
        print('function [%s] starts running' % func.__name__)
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        print('function [%s] executed for %.2f seconds' % (func.__name__, end - start))
        print('-' * 51)
        return result
    return inner


sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=7)
sym_spell.load_dictionary('data/frequency_dictionary_en_82_765.txt', term_index=0, count_index=1)
normalizer = JClass('com.hankcs.hanlp.dictionary.other.CharTable')


def clean_data(label, inputs):
    # Normalize characters with HanLP's CharTable (full-width to half-width, etc.)
    inputs = normalizer.convert(inputs)
    # Re-segment run-together English words with SymSpell
    inputs = sym_spell.word_segmentation(inputs).corrected_string
    # Collapse any character repeated four or more times to a single occurrence
    inputs = re.sub(r'(.)\1{3,}', r'\1', inputs)
    # Remove all whitespace
    inputs = ''.join(inputs.split())
    return {'label': label, 'review': inputs}


@timer
def preprocess():
    train_data = pd.read_csv('data/中文酒店评论分类语料.csv')
    train_data = train_data[['label', 'review']]
    train_data = train_data.dropna()
    # Convert the DataFrame to a Dataset object
    train_data = Dataset.from_pandas(train_data)
    # Clean the text
    train_data = train_data.map(clean_data, input_columns=['label', 'review'], num_proc=6)
    # Filter out near-empty reviews
    train_data = train_data.filter(lambda x: len(x['review']) > 5)
    # Split into train/validation sets
    train_data = train_data.train_test_split(test_size=0.2)
    train_data['valid'] = train_data.pop('test')
    print(train_data)
    # Save the processed dataset
    train_data.save_to_disk('data/senti-dataset')


if __name__ == '__main__':
    preprocess()
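As a quick sanity check of the two regex-based steps in clean_data, the minimal sketch below runs them in isolation (the HanLP and SymSpell steps are skipped here, since they need the Java runtime and the English frequency dictionary; the sample review is made up):

import re

sample = '房间真的太太太太好了   服务 很棒'
# Collapse any character repeated four or more times
sample = re.sub(r'(.)\1{3,}', r'\1', sample)
# Strip all whitespace
sample = ''.join(sample.split())
print(sample)  # 房间真的太好了服务很棒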
2. Model training and evaluation
import torch
import torch.nn as nn
from transformers import BertTokenizer
from datasets import load_from_disk
from torch.nn.utils.rnn import pack_padded_sequence
import numpy as np
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import time
from sklearn.metrics import classification_report

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class SentimentClassification(nn.Module):

    def __init__(self, vocab_size):
        super(SentimentClassification, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 128)
        self.bilstm = nn.LSTM(128, 256, num_layers=1, bidirectional=True, batch_first=True)
        self.output_logits = nn.Linear(256 * 2, 2)

    def forward(self, sorted_inputs, lengths=None):
        inputs_embedding = self.embedding(sorted_inputs)
        if lengths is not None:
            inputs_embedding = pack_padded_sequence(inputs_embedding, lengths, batch_first=True)
        output, (hn, cn) = self.bilstm(inputs_embedding)
        # Concatenate the last hidden states of both directions:
        # hn (num_directions=2, batch, 256) -> (batch, 2, 256) -> (batch, 512)
        hn = hn.transpose(0, 1)
        last_hidden_state = hn.reshape(hn.shape[0], -1)
        logits = self.output_logits(last_hidden_state)
        return logits, last_hidden_state


def train():
    # Training data
    train_data = load_from_disk('data/senti-dataset')['train']
    # Tokenizer
    tokenizer = BertTokenizer.from_pretrained('data/bert-base-chinese')
    # Classification model
    model = SentimentClassification(tokenizer.vocab_size).to(device)
    # Loss function
    criterion = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = optim.AdamW(model.parameters(), lr=5e-5)
    # Learning-rate scheduler
    scheduler = StepLR(optimizer, step_size=1, gamma=0.9)

    def train_step(batch_labels, batch_inputs):
        # Tokenize and encode the inputs
        param = {'padding': 'longest', 'return_token_type_ids': False, 'return_attention_mask': False}
        batch_inputs = tokenizer(batch_inputs, **param)['input_ids']
        # Collect the sequence lengths
        input_length = []
        for inputs in batch_inputs:
            input_length.append(len(inputs))
        # Indices sorted by descending length, as required by pack_padded_sequence
        sorted_length = np.argsort(-np.array(input_length))
        # Reorder inputs and labels accordingly
        sorted_inputs = []
        sorted_labels = []
        length = []
        for index in sorted_length:
            sorted_inputs.append(batch_inputs[index])
            sorted_labels.append(batch_labels[index])
            length.append(input_length[index])
        sorted_inputs = torch.tensor(sorted_inputs, device=device)
        sorted_labels = torch.tensor(sorted_labels, device=device)
        # Forward pass
        logits, last_hidden_state = model(sorted_inputs, length)
        # Compute the loss
        loss = criterion(logits, sorted_labels)
        # Zero the gradients
        optimizer.zero_grad()
        # Backward pass
        loss.backward()
        # Update the parameters
        optimizer.step()
        nonlocal total_loss, total_iter
        total_loss += loss.item() * len(sorted_labels)
        total_iter += len(sorted_labels)

    epochs = 100
    prev = 10000
    for epoch in range(epochs):
        total_loss = 0.0
        total_iter = 0
        start = time.time()
        # load_from_cache_file=False forces train_step to run every epoch,
        # since it is used for its side effects rather than its return value
        train_data.map(train_step, input_columns=['label', 'review'],
                       batched=True, batch_size=32, desc='train',
                       load_from_cache_file=False)
        # Decay the learning rate once per epoch
        scheduler.step()
        loss = total_loss / total_iter
        print('loss: %.5f time: %.2fs' % (loss, time.time() - start))
        if loss < prev:
            torch.save(model.state_dict(), 'model/sentimentclassification.pth')
            evaluate(epoch)
            prev = loss


def evaluate(epoch):
    # Validation data
    valid_data = load_from_disk('data/senti-dataset')['valid']
    # Tokenizer
    tokenizer = BertTokenizer.from_pretrained('data/bert-base-chinese')
    # Classification model
    model = SentimentClassification(tokenizer.vocab_size).to(device)
    model.load_state_dict(torch.load('model/sentimentclassification.pth', map_location=device))
    model.eval()
    y_true, y_pred = [], []

    def predict(batch_labels, batch_inputs):
        # Tokenize and encode the inputs (padded but not packed here)
        param = {'padding': 'longest', 'return_token_type_ids': False, 'return_attention_mask': False}
        batch_inputs = tokenizer(batch_inputs, **param)['input_ids']
        batch_inputs = torch.tensor(batch_inputs, device=device)
        with torch.no_grad():
            logits, last_hidden_state = model(batch_inputs)
        predictions = torch.argmax(logits, dim=-1).tolist()
        y_pred.extend(predictions)
        y_true.extend(batch_labels)

    valid_data.map(predict, input_columns=['label', 'review'],
                   batched=True, batch_size=32, desc='evaluate',
                   load_from_cache_file=False)
    report_result = classification_report(y_true, y_pred, labels=[0, 1],
                                          target_names=['negative', 'positive'])
    print('---epoch %d---------------------------------------------' % epoch)
    print(report_result)


if __name__ == '__main__':
    train()
    # evaluate(10)
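With the checkpoint saved by train(), single-review inference looks roughly like the sketch below (it reuses the SentimentClassification class above; the review text is made up):

import torch
from transformers import BertTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('data/bert-base-chinese')
model = SentimentClassification(tokenizer.vocab_size).to(device)
model.load_state_dict(torch.load('model/sentimentclassification.pth', map_location=device))
model.eval()

# Encode a single review (no padding needed for a batch of one)
inputs = tokenizer(['房间很干净 服务也热情 下次还会再来'],
                   return_token_type_ids=False, return_attention_mask=False)['input_ids']
with torch.no_grad():
    logits, _ = model(torch.tensor(inputs, device=device))
print('positive' if logits.argmax(dim=-1).item() == 1 else 'negative')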